nearest-neighbor.c
29. 7. 2015 #kód
#include <stdio.h> #include <string.h> #include <time.h> #include <math.h> #include <stdlib.h> #include <stdint.h> #include <immintrin.h> #include "sys/param.h" float euclidean_distance_sq(float *vecs, int veclen, int a, int b) { float d = 0; for (int i = 0; i < veclen; i++) { float t = vecs[veclen * a + i] - vecs[veclen * b + i]; d += t * t; } return d; } double euclidean_distance_sq_vec(double *vecs, int veclen, int a, int b) { __m256d d = _mm256_set1_pd(0.0); for (int i = 0; i < veclen; i+=4) { __m256d aa = _mm256_load_pd(vecs + veclen * a + i); __m256d bb = _mm256_load_pd(vecs + veclen * b + i); __m256d t = _mm256_sub_pd(aa, bb); d = _mm256_add_pd(d, _mm256_mul_pd(t, t)); } __m256d sum = _mm256_hadd_pd(d, d); return ((double*)&sum)[0] + ((double*)&sum)[2]; } float euclidean_distance_sq_vec_fl(float *vecs, int veclen, int a, int b) { __m256 d = _mm256_set1_ps(0.0); for (int i = 0; i < veclen; i+=8) { __m256 aa = _mm256_load_ps(vecs + veclen * a + i); __m256 bb = _mm256_load_ps(vecs + veclen * b + i); __m256 t = _mm256_sub_ps(aa, bb); d = _mm256_add_ps(d, _mm256_mul_ps(t, t)); } d = _mm256_hadd_ps(d, d); d = _mm256_hadd_ps(d, d); return ((float*)&d)[0] + ((float*)&d)[4]; } float euclidean_distance_sq_ssse(float *vecs, int veclen, int a, int b) { __m128 d = _mm_set1_ps(0.0f); for (int i = 0; i < veclen; i+=4) { __m128 aa = _mm_load_ps(vecs + veclen * a + i); __m128 bb = _mm_load_ps(vecs + veclen * b + i); __m128 t = _mm_sub_ps(aa, bb); d = _mm_add_ps(d, _mm_mul_ps(t, t)); } __m128 sum = _mm_hadd_ps(d, d); return ((float*)&sum)[0] + ((float*)&sum)[1]; } int nn(float *vecs, int veccnt, int veclen, int idx) { int nearest = -1; float dist = INFINITY; for (int i = 0; i < veccnt; i++) { float d = euclidean_distance_sq(vecs, veclen, idx, i); if (i != idx && d < dist) { nearest = i; dist = d; } } return nearest; } struct idxdist { int idx; float dist; }; int main(int argc, char *argv[]) { int veccnt = atoi(argv[1]); int veclen = atoi(argv[2]); if (veclen % 8 != 0) { printf("veclen not divisible by 8\n"); return 1; } printf("count: %d, vector length: %d\n", veccnt, veclen); float *vecs; posix_memalign((void**)&vecs, 32, sizeof(float) * veccnt * veclen); struct idxdist *idxdists = malloc(sizeof(struct idxdist) * veccnt); srand(47); for (int i = 0; i < veccnt * veclen; i++) { vecs[i] = ((float) rand() / (float) RAND_MAX); } for (int i = 0; i < veccnt; i++) { idxdists[i].idx = -1; idxdists[i].dist = INFINITY; } /* for (int i = 0; i < veccnt; i++) { idxdists[i].idx = nn(vecs, veccnt, veclen, i); } */ int tile = 64; #pragma omp parallel for for (int ti = 0; ti < veccnt; ti += tile) { for (int tj = 0; tj < veccnt; tj += tile) { int maxi = MIN(ti+tile, veccnt); for (int i = ti; i < maxi; i++) { // current vec idx int maxj = MIN(tj+tile, veccnt); for (int j = tj; j < maxj; j++) { float d = euclidean_distance_sq_vec_fl(vecs, veclen, i, j); if (j != i && d < idxdists[i].dist) { idxdists[i].idx = j; idxdists[i].dist = d; } } } } } for (int i = 0; i < MIN(32, veccnt); i++) { printf("%d ", idxdists[i].idx); } printf("\n"); }