Bitonic sort
#kód
import core.simd; import gcc.builtins; private enum swapHalves = 0b01001110; // 1234 -> 3412 private enum swapPairs = 0b10110001; // 1234 -> 2143 private alias min4 = __builtin_ia32_pminud128; private alias max4 = __builtin_ia32_pmaxud128; private alias shuffle = __builtin_ia32_pshufd; private alias blend = __builtin_ia32_pblendw128; private alias reverse = (x) => shuffle(x, 0b00011011); void bitonicMergeSort20(int[] _input, int[] _output) { auto input = cast(int4*) _input.ptr; auto output = cast(int4*) _output.ptr; auto a = input[0], b = input[1], c = input[2], d = input[3], e = input[4]; sort4x4(a, b, c, d); e = sort4(e); auto ab = merge8(a, b); auto cd = merge8(c, d); a = ab[0]; b = ab[1]; c = reverse(cd[1]); d = reverse(cd[0]); auto acmin = min4(a, c); auto acmax = max4(a, c); auto bdmin = min4(b, d); auto bdmax = max4(b, d); auto o12 = merge8WithoutReverse(acmin, bdmin); auto o34 = merge8WithoutReverse(acmax, bdmax); // insertion sort the rest auto pair = merge8(o12[0], e); *output = pair[0]; output++; pair = merge8(o12[1], pair[1]); *output = pair[0]; output++; pair = merge8(o34[0], pair[1]); *output = pair[0]; output++; pair = merge8(o34[1], pair[1]); *output = pair[0]; output++; *output = pair[1]; } void sort4x4(ref int4 a, ref int4 b, ref int4 c, ref int4 d) { pragma(inline, true); int4 x; int4 y; x = min4(a, c); y = max4(a, c); a = x; c = y; x = min4(b, d); y = max4(b, d); b = x; d = y; x = min4(a, b); y = max4(a, b); a = x; b = y; x = min4(c, d); y = max4(c, d); c = x; d = y; x = min4(b, c); y = max4(b, c); b = x; c = y; auto aa = cast(long2) __builtin_ia32_punpckldq128(a, b); auto bb = cast(long2) __builtin_ia32_punpckhdq128(a, b); auto cc = cast(long2) __builtin_ia32_punpckldq128(c, d); auto dd = cast(long2) __builtin_ia32_punpckhdq128(c, d); a = cast(int4) __builtin_ia32_punpcklqdq128(aa, cc); b = cast(int4) __builtin_ia32_punpckhqdq128(aa, cc); c = cast(int4) __builtin_ia32_punpcklqdq128(bb, dd); d = cast(int4) __builtin_ia32_punpckhqdq128(bb, dd); } int4 sort4(int4 v) { auto v2 = shuffle(v, swapPairs); auto vmin = min4(v, v2); auto vmax = max4(v, v2); v = cast(int4) blend(cast(short8)vmin, cast(short8)vmax, 0b11000011); return merge4(v); } int4 merge4(int4 v) { auto v2 = shuffle(v, swapHalves); auto vmin = min4(v, v2); auto vmax = max4(v, v2); v = cast(int4) blend(cast(short8)vmin, cast(short8)vmax, 0b11110000); v2 = shuffle(v, swapPairs); vmin = min4(v, v2); vmax = max4(v, v2); v = cast(int4) blend(cast(short8)vmin, cast(short8)vmax, 0b11001100); return v; } int4[2] merge8(int4 a, int4 b) { return merge8WithoutReverse(a, reverse(b)); } int4[2] merge8WithoutReverse(int4 a, int4 b) { pragma(inline, true); auto vmin = min4(a, b); auto vmax = max4(a, b); vmin = merge4(vmin); vmax = merge4(vmax); return [vmin, vmax]; }