0xDEADBEEF (⇉english edition⇉)

[RSS] [odkazy]

CSH medoid

27. 8. 2020

D ver­sion of pri­mor­dial python im­plemn­tation of Ultra Fast Medoid Iden­ti­fi­cation via Corre­la­ted Sequen­tial Hal­ving.

ulong medoidIndex(alias DistFunc, T, alias less = (a, b) => a < b)(T[] data, double armBudget) {
  assert(!data.empty, "must not be empty");
  const n         = data.length;
  const budget    = armBudget * n;
  const numRounds = log2(n).to!ulong + 1;
  auto pullCount  = 0; // number of times each arm is pulled
  auto estimates  = new double[n]; estimates[] = 0.0;
  auto candidates = iota(0, n).array;

  double mean(R)(R r) { return (r.length == 0) ? 0.0 : r.sum / double(r.length); }

  /*  Pulls the given arms pullsPerArm times. Updates the estimates */
  auto pullArm(ulong[] arms, ulong pullsPerArm) {
    auto others  = randomSample(iota(0, n), pullsPerArm).array;

    foreach (a; arms) {
      auto meanDist = mean(others.map!(o => DistFunc(data[a], data[o])));
      estimates[a] = (estimates[a]*pullCount + meanDist*pullsPerArm) / double(pullCount + pullsPerArm);

      if (pullsPerArm == n) {
        estimates[a] = meanDist;
      }
      assert(!isNaN(meanDist) && !isNaN(estimates[a]));
    }
    pullCount += pullsPerArm;
  }

  while (candidates.length > 1) {
    auto pullsPerArm = clamp((budget / candidates.length / numRounds).to!ulong, 1, n);
    pullArm(candidates, pullsPerArm);

    auto lst = indexed(estimates, candidates);

    if (pullsPerArm == n) { // calculated exactly
      candidates = [candidates[lst.minIndex]];
      break;
    }

    auto locs = new ulong[lst.length/2];
    topNIndex!less(lst, locs);
    auto median = estimates[candidates[locs[$-1]]];

    candidates = indexed(candidates, locs).array;
  }

  assert(candidates.length == 1);
  return candidates[0];
}
píše k47 (@kaja47, k47)