Intersection size SSE
10. 8. 2020 #kód
/**
 * Counts how many elements two arrays of 16-bit values have in common,
 * comparing eight elements at a time with the SSE4.2 PCMPESTRM instruction
 * (reached through GDC's `gcc.builtins`).
 *
 * The block-advance rule below compares only the last element of each
 * 8-element block, so the result is meaningful only when both inputs are
 * sorted in ascending order and contain no duplicates.
 * NOTE(review): callers are not visible here - confirm they uphold this.
 *
 * Params:
 *   a = first array
 *   b = second array
 *
 * Returns: the number of values present in both `a` and `b`.
 */
uint intersectionSizeSSE(ushort[] a, ushort[] b)
{
    import gcc.builtins;
    import core.simd;
    import core.bitop;

    enum size_t lanes = 8; // ushort elements per 128-bit vector

    uint count = 0;
    size_t ai = 0, bi = 0;

    // Main phase: both inputs still hold at least one full 8-element block.
    while (ai + lanes <= a.length && bi + lanes <= b.length)
    {
        ushort8 av = cast(ushort8) __builtin_ia32_loaddqu(cast(char*) &a.ptr[ai]);
        ushort8 bv = cast(ushort8) __builtin_ia32_loaddqu(cast(char*) &b.ptr[bi]);

        // Bit i of the result mask is set when av[i] equals any element of bv.
        int4 res = cast(int4) __builtin_ia32_pcmpestrm128(
            cast(ubyte16) bv, 8,
            cast(ubyte16) av, 8,
            0x01 /* _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK */
        );
        count += popcnt(cast(uint) res.array[0]);

        // Advance whichever block ends on the smaller value; on a tie both
        // blocks are fully consumed.
        const ushort aLast = av.array[7];
        const ushort bLast = bv.array[7];
        if (aLast <= bLast)
            ai += lanes;
        if (aLast >= bLast)
            bi += lanes;
    }

    // Nothing left on one side means there is nothing more to match.
    if (ai >= a.length || bi >= b.length)
        return count;

    // Tail phase: `c` is the side with fewer elements left (b on a tie) and
    // is held in a single vector; `d` is scanned block by block against it.
    const bool aIsShorter = (a.length - ai) < (b.length - bi);
    ushort[] c = aIsShorter ? a[ai .. $] : b[bi .. $];
    ushort[] d = aIsShorter ? b[bi .. $] : a[ai .. $];

    // The main loop only stops once one side has fewer than 8 elements left,
    // so the shorter remainder always fits in one vector.
    assert(c.length < lanes);

    ushort8 cvec;
    cvec.array[0 .. c.length] = c[];
    const int clen = cast(int) c.length;

    for (size_t di = 0; di < d.length; di += lanes)
    {
        const size_t remaining = d.length - di;
        ushort8 dvec;
        int dlen;

        if (remaining >= lanes)
        {
            dvec = cast(ushort8) __builtin_ia32_loaddqu(cast(char*) &d.ptr[di]);
            dlen = cast(int) lanes;
        }
        else
        {
            // Partial final block: copy it into the local vector instead of
            // issuing a 16-byte load that would read past the end of `d`.
            dvec.array[0 .. remaining] = d[di .. $];
            dlen = cast(int) remaining;
        }

        // Explicit lengths make the instruction ignore the unused lanes.
        int4 res = cast(int4) __builtin_ia32_pcmpestrm128(
            cast(ubyte16) cvec, clen,
            cast(ubyte16) dvec, dlen,
            0x01 /* _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK */
        );
        count += popcnt(cast(uint) res.array[0]);
    }

    return count;
}