0xDEADBEEF

RSS odkazy english edition

nearest-neighbor.c

29. 7. 2015 #kód
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <stdint.h>
#include <immintrin.h>
#include "sys/param.h"


float euclidean_distance_sq(float *vecs, int veclen, int a, int b) {
  float d = 0;
  for (int i = 0; i < veclen; i++) {
    float t = vecs[veclen * a + i] - vecs[veclen * b + i];
    d += t * t;
  }
  return d;
}


double euclidean_distance_sq_vec(double *vecs, int veclen, int a, int b) {
  __m256d d = _mm256_set1_pd(0.0);
  for (int i = 0; i < veclen; i+=4) {
    __m256d aa = _mm256_load_pd(vecs + veclen * a + i);
    __m256d bb = _mm256_load_pd(vecs + veclen * b + i);
    __m256d t = _mm256_sub_pd(aa, bb);
    d = _mm256_add_pd(d, _mm256_mul_pd(t, t));
  }

  __m256d sum = _mm256_hadd_pd(d, d);
  return ((double*)&sum)[0] + ((double*)&sum)[2];
}


float euclidean_distance_sq_vec_fl(float *vecs, int veclen, int a, int b) {
  __m256 d = _mm256_set1_ps(0.0);
  for (int i = 0; i < veclen; i+=8) {
    __m256 aa = _mm256_load_ps(vecs + veclen * a + i);
    __m256 bb = _mm256_load_ps(vecs + veclen * b + i);
    __m256 t = _mm256_sub_ps(aa, bb);
    d = _mm256_add_ps(d, _mm256_mul_ps(t, t));
  }

  d = _mm256_hadd_ps(d, d);
  d = _mm256_hadd_ps(d, d);
  return ((float*)&d)[0] + ((float*)&d)[4];
}


float euclidean_distance_sq_ssse(float *vecs, int veclen, int a, int b) {
  __m128 d = _mm_set1_ps(0.0f);
  for (int i = 0; i < veclen; i+=4) {
    __m128 aa = _mm_load_ps(vecs + veclen * a + i);
    __m128 bb = _mm_load_ps(vecs + veclen * b + i);
    __m128 t = _mm_sub_ps(aa, bb);
    d = _mm_add_ps(d, _mm_mul_ps(t, t));
  }

  __m128 sum = _mm_hadd_ps(d, d);
  return ((float*)&sum)[0] + ((float*)&sum)[1];
}


int nn(float *vecs, int veccnt, int veclen, int idx) {
  int nearest = -1;
  float dist = INFINITY;
  for (int i = 0; i < veccnt; i++) {
    float d = euclidean_distance_sq(vecs, veclen, idx, i);
    if (i != idx && d < dist) {
      nearest = i;
      dist = d;
    }
  }
  return nearest;
}


struct idxdist {
  int idx;
  float dist;
};


int main(int argc, char *argv[]) {

  int veccnt = atoi(argv[1]);
  int veclen = atoi(argv[2]);

  if (veclen % 8 != 0) {
    printf("veclen not divisible by 8\n");
    return 1;
  }

  printf("count: %d, vector length: %d\n", veccnt, veclen);

  float *vecs;
  posix_memalign((void**)&vecs, 32, sizeof(float) * veccnt * veclen);

  struct idxdist *idxdists = malloc(sizeof(struct idxdist) * veccnt);

  srand(47);
  for (int i = 0; i < veccnt * veclen; i++) {
    vecs[i] = ((float) rand() / (float) RAND_MAX);
  }

  for (int i = 0; i < veccnt; i++) {
    idxdists[i].idx = -1;
    idxdists[i].dist = INFINITY;
  }

  /*
  for (int i = 0; i < veccnt; i++) {
    idxdists[i].idx = nn(vecs, veccnt, veclen, i);
  }
  */


  int tile = 64;

  #pragma omp parallel for
  for (int ti = 0; ti < veccnt; ti += tile) {

    for (int tj = 0; tj < veccnt; tj += tile) {
      int maxi = MIN(ti+tile, veccnt);
      for (int i = ti; i < maxi; i++) { // current vec idx
        int maxj = MIN(tj+tile, veccnt);
        for (int j = tj; j < maxj; j++) {

          float d = euclidean_distance_sq_vec_fl(vecs, veclen, i, j);
          if (j != i && d < idxdists[i].dist) {
            idxdists[i].idx = j;
            idxdists[i].dist = d;
          }

        }
      }
    }
  }

  for (int i = 0; i < MIN(32, veccnt); i++) {
    printf("%d ", idxdists[i].idx);
  }
  printf("\n");

}
píše k47 (@kaja47, k47)