In-vector sorting (hminpos)
4. 3. 2021 #kód
import core.simd;
import core.bitop;
import gcc.builtins;
import std.stdio;

/// Never-inlined function whose body is a single empty asm statement.
/// NOTE(review): presumably a baseline / optimisation barrier for
/// benchmarking the sort routines below - confirm against the caller.
void doNothing(ref ushort[8] arr)
{
    pragma(inline, false);
    asm { ""; }
}

/// PHMINPOSUW wrapper: the callers below read lane 0 of the result as the
/// minimum of the eight unsigned 16-bit lanes and lane 1 as its index.
private alias hminposuw = (a) =>
    cast(ushort8) __builtin_ia32_phminposuw128(cast(short8) a);

/// Broadcasts `i` to all lanes and compares it with `mask` lane by lane.
/// With `mask == [0,1,...,7]` the result is 0xFFFF in lane `i`, 0 elsewhere.
private alias onesMask = (mask, i) {
    ushort8 v = i;
    return cast(ushort8) __builtin_ia32_pcmpeqw128(cast(short8) mask, cast(short8) v);
};

/// Loads eight ushorts from `vec` with no alignment requirement.
private alias unalignedLoad = (vec) =>
    cast(ushort8) __builtin_ia32_loaddqu(cast(char*) vec.ptr);

/// Byte shuffle (PSHUFB) of `vec` by the byte indices in `mask`.
private alias shuffle = (vec, mask) =>
    cast(ushort8) __builtin_ia32_pshufb128(cast(ubyte16) vec, mask);

/**
 * Sorts `vec` ascending in place by eight rounds of "take the minimum, then
 * subtract (minimum + 1) from every lane".
 *
 * The subtraction wraps the extracted lane around to 0xFFFF, which takes it
 * out of play; `sub` accumulates what has been subtracted so far so that the
 * original value can be reconstructed.
 *
 * Precondition: all eight values must be distinct. Equal elements wrap
 * around together in the same round, so every copy but the first is emitted
 * late and the output is not sorted (e.g. 5, 5, 7 comes out as 5, 7, 5).
 */
void minposSortSub(ref ushort[8] vec)
{
    pragma(inline, false);
    ushort8 v = unalignedLoad(vec);
    ushort[8] res = void;
    ushort sub = 0;
    static foreach (i; 0 .. 8)
    {{
        ushort8 minAndPos = hminposuw(v);
        ushort minval = minAndPos.array[0];
        res[i] = cast(ushort) (minval + sub);
        v -= cast(ushort) (minval + 1);
        sub += cast(ushort) (minval + 1);
    }}
    vec = res;
}

/**
 * Same subtract-based scheme as `minposSortSub`, but runs two independent
 * chains for four rounds: one over the values (yielding the four smallest,
 * written to res[0..4]) and one over their bitwise complement (yielding the
 * four largest, written to res[7] down to res[4]).
 *
 * Precondition: all eight values must be distinct (see `minposSortSub`).
 */
void minposSortSub2(ref ushort[8] vec)
{
    pragma(inline, false);
    ushort8 pos = unalignedLoad(vec);
    ushort8 neg = ~pos;
    ushort subPos = 0, subNeg = 0;
    ushort[8] res = void;
    static foreach (i; 0 .. 4)
    {{
        ushort8 minAndPos = hminposuw(pos);
        ushort8 maxAndPos = hminposuw(neg);

        ushort minval = minAndPos.array[0];
        res[i] = cast(ushort) (minval + subPos);
        pos -= cast(ushort) (minval + 1);
        subPos += cast(ushort) (minval + 1);

        ushort maxval = maxAndPos.array[0];
        // Cast after the complement: unary ~ yields int under D's integer
        // promotion rules, which does not implicitly narrow to ushort.
        res[7 - i] = cast(ushort) ~(maxval + subNeg);
        neg -= cast(ushort) (maxval + 1);
        subNeg += cast(ushort) (maxval + 1);
    }}
    vec = res;
}

/**
 * Variant of `minposSortSub` that keeps the subtraction in the vector
 * domain: the minimum (lane 0 of the PHMINPOSUW result) is broadcast to all
 * lanes with a byte shuffle instead of going through a scalar.
 *
 * Precondition: all eight values must be distinct (see `minposSortSub`).
 */
void minposSortSubVec(ref ushort[8] vec)
{
    pragma(inline, false);
    ushort8 v = unalignedLoad(vec);
    ushort[8] res = void;
    ushort sub = 0;
    // Byte indices 0,1 repeated: copies 16-bit lane 0 into every lane.
    ubyte16 mask = [0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1];
    static foreach (i; 0 .. 8)
    {{
        ushort8 minAndPos = hminposuw(v);
        auto mins = shuffle(minAndPos, mask);
        v -= (mins + 1);

        ushort minval = minAndPos.array[0];
        res[i] = cast(ushort) (minval + sub);
        sub += cast(ushort) (minval + 1);
    }}
    vec = res;
}

/**
 * Two-chain (min / complemented-max) version of `minposSortSubVec`: four
 * rounds, filling the result from both ends.
 *
 * Precondition: all eight values must be distinct (see `minposSortSub`).
 */
void minposSortSubVec2(ref ushort[8] vec)
{
    pragma(inline, false);
    ushort8 pos = unalignedLoad(vec);
    ushort8 neg = ~pos;
    ushort subPos = 0, subNeg = 0;
    ushort[8] res = void;
    // Byte indices 0,1 repeated: copies 16-bit lane 0 into every lane.
    ubyte16 mask = [0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1];
    static foreach (i; 0 .. 4)
    {{
        ushort8 minAndPos = hminposuw(pos);
        ushort8 maxAndPos = hminposuw(neg);
        auto mins = shuffle(minAndPos, mask);
        auto maxs = shuffle(maxAndPos, mask);
        pos -= (mins + 1);
        neg -= (maxs + 1);

        ushort minval = minAndPos.array[0];
        res[i] = cast(ushort) (minval + subPos);
        subPos += cast(ushort) (minval + 1);

        ushort maxval = maxAndPos.array[0];
        // Cast after the complement (unary ~ promotes to int).
        res[7 - i] = cast(ushort) ~(maxval + subNeg);
        subNeg += cast(ushort) (maxval + 1);
    }}
    vec = res;
}

/**
 * Sorts `vec` ascending in place by eight rounds of "take the minimum, then
 * overwrite the lane it came from with 0xFFFF".
 *
 * Because only the single reported lane is retired per round, inputs with
 * repeated values (including 0xFFFF itself) are handled correctly.
 */
void minposSort(ref ushort[8] vec)
{
    pragma(inline, false);
    ushort8 v = unalignedLoad(vec);
    ushort[8] res = void;
    ushort8 mask = [0,1,2,3,4,5,6,7];
    static foreach (i; 0 .. 8)
    {{
        ushort8 minAndPos = hminposuw(v);
        res[i] = minAndPos.array[0];
        ushort pos = minAndPos.array[1] & 0b111;
        // Vector-op form of `v.array[pos] = 0xffff`.
        v |= onesMask(mask, pos);
    }}
    vec = res;
}

/**
 * Two-chain version of `minposSort`: four rounds over the values give the
 * four smallest (res[0..4]); four rounds over the complemented values give
 * the four largest (res[7] down to res[4]). Each chain retires its own
 * lanes independently.
 */
void minposSort2(ref ushort[8] vec)
{
    pragma(inline, false);
    ushort8 pos = unalignedLoad(vec);
    ushort8 neg = ~pos;
    ushort[8] res = void;
    ushort8 mask = [0,1,2,3,4,5,6,7];
    static foreach (i; 0 .. 4)
    {{
        ushort8 minAndPos = hminposuw(pos);
        ushort8 maxAndPos = hminposuw(neg);

        res[i] = minAndPos.array[0];
        ushort minpos = minAndPos.array[1] & 0b111;
        pos |= onesMask(mask, minpos);

        // Minimum of the complemented lanes is the complement of the
        // maximum; cast because unary ~ promotes to int.
        res[7 - i] = cast(ushort) ~maxAndPos.array[0];
        ushort maxpos = maxAndPos.array[1] & 0b111;
        neg |= onesMask(mask, maxpos);
    }}
    vec = res;
}