0xDEADBEEF

RSS odkazy english edition

Intersection size SSE

10. 8. 2020
/**
 * Counts the elements common to two `ushort` arrays using the SSE4.2
 * PCMPESTRM instruction (8 x 16-bit lanes per comparison).
 *
 * The block-advance logic compares the last element of each 8-element block
 * to decide which side to move forward, so the inputs are expected to be
 * sorted in ascending order and free of duplicates.
 * NOTE(review): that precondition is inferred from the algorithm; confirm
 * against callers.
 *
 * Params:
 *   a = first array
 *   b = second array
 *
 * Returns: the number of values present in both `a` and `b`.
 */
uint intersectionSizeSSE(ushort[] a, ushort[] b) {
  import gcc.builtins;
  import core.simd;
  import core.bitop;

  uint count = 0;
  size_t ai = 0, bi = 0;

  // Main phase: compare full 8-element blocks of both arrays.
  // The length guard keeps `length-7` from wrapping around on short inputs.
  if (a.length > 7 && b.length > 7) {
    while (ai < a.length-7 && bi < b.length-7) {
      ushort8 av = cast(ushort8) __builtin_ia32_loaddqu(cast(char*)&a.ptr[ai]);
      ushort8 bv = cast(ushort8) __builtin_ia32_loaddqu(cast(char*)&b.ptr[bi]);

      // Bit i of the result mask is set when av[i] equals any element of bv.
      int4 res = cast(int4) __builtin_ia32_pcmpestrm128(
        cast(ubyte16)bv, 8, cast(ubyte16)av, 8,
        0x01 // _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK
      );

      uint r    = res.array[0];
      ushort a7 = av.array[7];
      ushort b7 = bv.array[7];
      // Advance whichever block ends with the smaller value (both on a tie).
      ai += (a7 <= b7) * 8;
      bi += (a7 >= b7) * 8;
      count += popcnt(r);
    }
  }

  if (ai >= a.length || bi >= b.length) return count;

  // Tail phase: `c` is the side with the shorter remainder, `d` the other.
  // After the main phase (or when it was skipped) at least one side has
  // fewer than 8 elements left, so the shorter remainder fits in one vector.
  ushort[] c, d;
  size_t ci, di;

  bool aIsShorter = (a.length-ai) < (b.length-bi);
  if (aIsShorter) {
    c  = a;
    ci = ai;
    d  = b;
    di = bi;
  } else {
    c  = b;
    ci = bi;
    d  = a;
    di = ai;
  }

  ushort8 cvec;
  uint clen = cast(uint) (c.length-ci);
  cvec.array[0 .. clen] = c[ci .. $];

  while (di < d.length) {
    size_t remaining = d.length - di;
    ushort8 dvec;
    uint dlen;

    if (remaining >= 8) {
      dvec = cast(ushort8) __builtin_ia32_loaddqu(cast(char*)&d.ptr[di]);
      dlen = 8;
    } else {
      // Fewer than 8 elements left: a 16-byte load here would read past the
      // end of `d`, so copy just the remainder into the local vector instead.
      dvec.array[0 .. remaining] = d[di .. $];
      dlen = cast(uint) remaining;
    }

    // Explicit lengths keep the unused lanes of cvec/dvec out of the match.
    int4 res = cast(int4) __builtin_ia32_pcmpestrm128(
      cast(ubyte16) cvec, clen, cast(ubyte16) dvec, dlen,
      0x01 // _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK
    );

    di += 8;
    uint r = res.array[0];
    count += popcnt(r);
  }

  return count;
}
píše k47 (@kaja47, k47)