0xDEADBEEF

RSS odkazy english edition

Bitonic sort

import core.simd;
import gcc.builtins;

private enum swapHalves = 0b01001110; // 1234 -> 3412
private enum swapPairs  = 0b10110001; // 1234 -> 2143

private alias min4    = __builtin_ia32_pminud128;
private alias max4    = __builtin_ia32_pmaxud128;
private alias shuffle = __builtin_ia32_pshufd;
private alias blend   = __builtin_ia32_pblendw128;
private alias reverse = (x) => shuffle(x, 0b00011011);

void bitonicMergeSort20(int[] _input, int[] _output) {
  auto input  = cast(int4*) _input.ptr;
  auto output = cast(int4*) _output.ptr;

  auto a = input[0], b = input[1], c = input[2], d = input[3], e = input[4];

  sort4x4(a, b, c, d);
  e = sort4(e);

  auto ab = merge8(a, b);
  auto cd = merge8(c, d);

  a = ab[0];
  b = ab[1];

  c = reverse(cd[1]);
  d = reverse(cd[0]);

  auto acmin = min4(a, c);
  auto acmax = max4(a, c);

  auto bdmin = min4(b, d);
  auto bdmax = max4(b, d);

  auto o12 = merge8WithoutReverse(acmin, bdmin);
  auto o34 = merge8WithoutReverse(acmax, bdmax);

  // insertion sort the rest
  auto pair = merge8(o12[0], e);
  *output = pair[0]; output++;

  pair = merge8(o12[1], pair[1]);
  *output = pair[0]; output++;

  pair = merge8(o34[0], pair[1]);
  *output = pair[0]; output++;

  pair = merge8(o34[1], pair[1]);
  *output = pair[0]; output++;
  *output = pair[1];
}



void sort4x4(ref int4 a, ref int4 b, ref int4 c, ref int4 d) {
  pragma(inline, true);

  int4 x;
  int4 y;

  x = min4(a, c);
  y = max4(a, c);
  a = x;
  c = y;

  x = min4(b, d);
  y = max4(b, d);
  b = x;
  d = y;

  x = min4(a, b);
  y = max4(a, b);
  a = x;
  b = y;

  x = min4(c, d);
  y = max4(c, d);
  c = x;
  d = y;

  x = min4(b, c);
  y = max4(b, c);
  b = x;
  c = y;

  auto aa = cast(long2) __builtin_ia32_punpckldq128(a, b);
  auto bb = cast(long2) __builtin_ia32_punpckhdq128(a, b);
  auto cc = cast(long2) __builtin_ia32_punpckldq128(c, d);
  auto dd = cast(long2) __builtin_ia32_punpckhdq128(c, d);

  a = cast(int4) __builtin_ia32_punpcklqdq128(aa, cc);
  b = cast(int4) __builtin_ia32_punpckhqdq128(aa, cc);
  c = cast(int4) __builtin_ia32_punpcklqdq128(bb, dd);
  d = cast(int4) __builtin_ia32_punpckhqdq128(bb, dd);
}


int4 sort4(int4 v) {
  auto v2 = shuffle(v, swapPairs);
  auto vmin = min4(v, v2);
  auto vmax = max4(v, v2);
  v = cast(int4) blend(cast(short8)vmin, cast(short8)vmax, 0b11000011);
  return merge4(v);
}


int4 merge4(int4 v) {
  auto v2 = shuffle(v, swapHalves);
  auto vmin = min4(v, v2);
  auto vmax = max4(v, v2);
  v = cast(int4) blend(cast(short8)vmin, cast(short8)vmax, 0b11110000);

  v2 = shuffle(v, swapPairs);
  vmin = min4(v, v2);
  vmax = max4(v, v2);
  v = cast(int4) blend(cast(short8)vmin, cast(short8)vmax, 0b11001100);

  return v;
}


int4[2] merge8(int4 a, int4 b) {
  return merge8WithoutReverse(a, reverse(b));
}

int4[2] merge8WithoutReverse(int4 a, int4 b) {
  pragma(inline, true);
  auto vmin = min4(a, b);
  auto vmax = max4(a, b);
  vmin = merge4(vmin);
  vmax = merge4(vmax);
  return [vmin, vmax];
}
píše k47 (@kaja47, k47)