0xDEADBEEF

RSS odkazy

In-vector sorting (hminpos)

4. 3. 2021 #kód
import core.simd;
import core.bitop;
import gcc.builtins;

import std.stdio;

// Takes the same `ref ushort[8]` argument as the sort routines below but
// leaves the array untouched.
// NOTE(review): presumably the baseline for measuring call overhead in a
// benchmark — confirm against the caller, which is not visible here.
void doNothing(ref ushort[8] arr) {
  // Ask the compiler not to inline this function.
  pragma(inline, false);
  // Empty inline-asm statement; presumably here so the body is not treated
  // as trivially empty by the optimiser — TODO confirm.
  asm { ""; }
}

// Thin wrappers around GCC x86 builtins that hide the signed/unsigned vector
// casts the builtins require, so the sort routines can work with ushort8.

// Horizontal unsigned minimum (PHMINPOSUW). Callers in this file read lane 0
// of the result as the minimum value and lane 1 as its lane index.
private alias hminposuw = (a) => cast(ushort8) __builtin_ia32_phminposuw128(cast(short8) a);
// Compares every lane of `mask` with the scalar `i` (PCMPEQW) after
// initialising a vector from `i`. Callers OR the result into the data vector
// to overwrite the lane whose id equals `i`; per the PCMPEQW definition the
// matching lanes are all-ones (0xFFFF) and the rest are zero.
private alias onesMask = (mask, i) {
  ushort8 v = i;
  return cast(ushort8) __builtin_ia32_pcmpeqw128(cast(short8) mask, cast(short8) v);
};
// Loads a 128-bit vector from `vec.ptr` with the unaligned-load builtin, so
// the source array does not need 16-byte alignment.
private alias unalignedLoad = (vec) => cast(ushort8) __builtin_ia32_loaddqu(cast(char*) vec.ptr);
// Byte-wise shuffle (PSHUFB) of `vec` using the byte indices in `mask`.
private alias shuffle = (vec, mask) => cast(ushort8) __builtin_ia32_pshufb128(cast(ubyte16) vec, mask);


// Sorts the eight values of `vec` in ascending order, in place, using eight
// horizontal-minimum steps.
//
// After each minimum is found, (minimum + 1) is subtracted from every lane.
// The lane that held the minimum wraps around to 0xFFFF and therefore stops
// being the smallest; the running total of everything subtracted so far is
// added back when the next minimum is written out.
//
// Precondition: the eight input values must be distinct. Every lane equal to
// the extracted minimum wraps to 0xFFFF in the same step, so a duplicate is
// emitted later and out of order (and may displace other values).
void minposSortSub(ref ushort[8] vec) {
  pragma(inline, false);

  ushort[8] output = void;
  ushort removed = 0;                  // total subtracted from the lanes so far
  ushort8 lanes = unalignedLoad(vec);

  static foreach (slot; 0 .. 8) {{
    ushort8 hmin  = hminposuw(lanes);
    ushort lowest = hmin.array[0];
    // Arithmetic is deliberately modulo 2^16.
    ushort step   = cast(ushort) (lowest+1);

    output[slot] = cast(ushort) (lowest+removed);
    lanes   -= step;
    removed += step;
  }}

  vec = output;
}

// Sorts the eight values of `vec` in ascending order, in place, working from
// both ends at once: four steps, each producing the next-smallest and the
// next-largest value.
//
// The largest values are found as minima of the bitwise complement of the
// input; the complement is undone when the value is stored. Both vectors use
// the subtract-(minimum + 1) trick, so the extracted lane wraps to 0xFFFF,
// with a separate running offset per side.
//
// Precondition: the eight input values must be distinct. Lanes equal to an
// extracted value wrap together, so duplicates are not emitted correctly.
void minposSortSub2(ref ushort[8] vec) {
  pragma(inline, false);

  ushort[8] output = void;
  ushort lowOffset = 0, highOffset = 0;
  ushort8 ascending  = unalignedLoad(vec);
  ushort8 descending = ~ascending;

  static foreach (k; 0 .. 4) {{
    ushort8 lowResult  = hminposuw(ascending);
    ushort8 highResult = hminposuw(descending);

    // Front half: next-smallest value.
    ushort lowest  = lowResult.array[0];
    ushort lowStep = cast(ushort) (lowest+1);
    output[k]  =  cast(ushort) (lowest+lowOffset);
    ascending -= lowStep;
    lowOffset += lowStep;

    // Back half: next-largest value, tracked in complemented form.
    ushort highest  = highResult.array[0];
    ushort highStep = cast(ushort) (highest+1);
    output[7-k] =  ~(cast(ushort) (highest+highOffset));
    descending -= highStep;
    highOffset += highStep;
  }}

  vec = output;
}

// Same algorithm as the scalar-subtract variant, but the per-step subtraction
// stays in vector registers: lane 0 of the horizontal-minimum result is
// copied to all eight lanes with a byte shuffle, and (that + 1) is subtracted
// from the data vector. Only the output value and running offset are handled
// as scalars.
//
// Sorts `vec` ascending, in place.
//
// Precondition: the eight input values must be distinct. Every lane equal to
// the extracted minimum wraps to 0xFFFF in the same step, so duplicates are
// not emitted in sorted order.
void minposSortSubVec(ref ushort[8] vec) {
  pragma(inline, false);

  // Byte indices (0,1) repeated: copies 16-bit lane 0 into every lane.
  ubyte16 broadcastLane0 = [0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1];
  ushort[8] output = void;
  ushort removed = 0;                  // total subtracted from the lanes so far
  ushort8 lanes = unalignedLoad(vec);

  static foreach (slot; 0 .. 8) {{
    ushort8 hmin = hminposuw(lanes);

    // Scalar side: emit the value and advance the running offset.
    ushort lowest = hmin.array[0];
    output[slot] = cast(ushort) (lowest+removed);
    removed += cast(ushort) (lowest+1);

    // Vector side: subtract (minimum + 1) from every lane.
    auto splat = shuffle(hmin, broadcastLane0);
    lanes -= (splat + 1);
  }}

  vec = output;
}

// Two-ended version of the vector-subtract sort: four steps, each producing
// the next-smallest value (from the input) and the next-largest value (from
// the bitwise complement of the input, un-complemented on store).
//
// For each side, lane 0 of the horizontal-minimum result is copied to all
// lanes with a byte shuffle and (that + 1) is subtracted from the vector;
// the matching scalar offset is accumulated separately.
//
// Sorts `vec` ascending, in place.
//
// Precondition: the eight input values must be distinct. Lanes equal to an
// extracted value wrap together, so duplicates are not emitted correctly.
void minposSortSubVec2(ref ushort[8] vec) {
  pragma(inline, false);

  // Byte indices (0,1) repeated: copies 16-bit lane 0 into every lane.
  ubyte16 broadcastLane0 = [0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1];
  ushort[8] output = void;
  ushort lowOffset = 0, highOffset = 0;
  ushort8 ascending  = unalignedLoad(vec);
  ushort8 descending = ~ascending;

  static foreach (k; 0 .. 4) {{
    ushort8 lowResult  = hminposuw(ascending);
    ushort8 highResult = hminposuw(descending);

    // Front half: next-smallest value.
    auto lowSplat = shuffle(lowResult, broadcastLane0);
    ascending -= (lowSplat + 1);
    ushort lowest = lowResult.array[0];
    output[k] = cast(ushort) (lowest+lowOffset);
    lowOffset += cast(ushort) (lowest+1);

    // Back half: next-largest value, tracked in complemented form.
    auto highSplat = shuffle(highResult, broadcastLane0);
    descending -= (highSplat + 1);
    ushort highest = highResult.array[0];
    output[7-k] = ~(cast(ushort) (highest+highOffset));
    highOffset  += cast(ushort) (highest+1);
  }}

  vec = output;
}

// Sorts the eight values of `vec` in ascending order, in place, using eight
// horizontal-minimum steps.
//
// Each step reads the minimum (lane 0 of the result) and the lane index it
// came from (lane 1), stores the minimum, and then ORs all-ones into that one
// source lane so it becomes 0xFFFF and is not picked again. Only the reported
// lane is overwritten, so equal values in other lanes are extracted by later
// steps.
void minposSort(ref ushort[8] vec) {
  pragma(inline, false);

  // Lane ids, compared against the reported index to build a one-lane mask.
  ushort8 laneIds = [0,1,2,3,4,5,6,7];
  ushort[8] output = void;
  ushort8 lanes = unalignedLoad(vec);

  static foreach (slot; 0 .. 8) {{
    ushort8 hmin = hminposuw(lanes);
    ushort idx   = hmin.array[1] & 0b111;
    output[slot] = hmin.array[0];

    // Vector form of `lanes.array[idx] = 0xffff`.
    lanes |= onesMask(laneIds, idx);
  }}

  vec = output;
}

// Two-ended version of the index-masking sort: four steps, each producing the
// next-smallest value (from the input) and the next-largest value (as the
// minimum of the bitwise complement of the input, un-complemented on store).
//
// After each extraction the source lane reported by the horizontal minimum is
// forced to 0xFFFF in the corresponding vector so it is not selected again.
//
// Sorts `vec` ascending, in place.
void minposSort2(ref ushort[8] vec) {
  pragma(inline, false);

  // Lane ids, compared against the reported index to build a one-lane mask.
  ushort8 laneIds = [0,1,2,3,4,5,6,7];
  ushort[8] output = void;
  ushort8 ascending  = unalignedLoad(vec);
  ushort8 descending = ~ascending;

  static foreach (k; 0 .. 4) {{
    ushort8 lowResult  = hminposuw(ascending);
    ushort8 highResult = hminposuw(descending);

    // Front half: store the minimum and retire its lane.
    ushort lowIdx = lowResult.array[1] & 0b111;
    output[k]     = lowResult.array[0];
    ascending |= onesMask(laneIds, lowIdx);

    // Back half: store the maximum (complement undone) and retire its lane.
    ushort highIdx = highResult.array[1] & 0b111;
    output[7-k]    = ~highResult.array[0];
    descending |= onesMask(laneIds, highIdx);
  }}

  vec = output;
}
píše k47 (@kaja47, k47)