1818
#include <algorithm>

#include <faiss/utils/Heap.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/hamming.h> // BitstringWriter
#include <faiss/utils/utils.h>
2425
extern "C" {

// general matrix multiplication
// Fortran BLAS binding: computes C = alpha * op(A) * op(B) + beta * C with
// column-major storage; transa/transb select op() ("T"ransposed / "N"ot).
// NOTE(review): FINTEGER is presumably #define'd earlier in this file to the
// platform BLAS integer type — confirm against the top of the file.
int sgemm_(
        const char* transa,
        const char* transb,
        FINTEGER* m,
        FINTEGER* n,
        FINTEGER* k,
        const float* alpha,
        const float* a,
        FINTEGER* lda,
        const float* b,
        FINTEGER* ldb,
        float* beta,
        float* c,
        FINTEGER* ldc);
}
44+
2545namespace {
2646
2747// c and a and b can overlap
@@ -31,6 +51,12 @@ void fvec_add(size_t d, const float* a, const float* b, float* c) {
3151 }
3252}
3353
// Add the scalar b to every component of a, writing the result into c.
// c may alias a.
void fvec_add(size_t d, const float* a, float b, float* c) {
    for (size_t j = 0; j != d; ++j) {
        c[j] = a[j] + b;
    }
}
59+
3460} // namespace
3561
3662namespace faiss {
@@ -48,6 +74,7 @@ void AdditiveQuantizer::set_derived_values() {
4874 is_byte_aligned = false ;
4975 }
5076 }
77+ total_codebook_size = codebook_offsets[M];
5178 // convert bits to bytes
5279 code_size = (tot_bits + 7 ) / 8 ;
5380}
@@ -93,4 +120,151 @@ void AdditiveQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
93120
94121AdditiveQuantizer::~AdditiveQuantizer () {}
95122
/***************************************************************************
 * Support for fast distance computations and search with additive quantizer
 ***************************************************************************/
126+
127+ void AdditiveQuantizer::compute_centroid_norms (float * norms) const {
128+ size_t ntotal = (size_t )1 << tot_bits;
129+ // TODO: make tree of partial sums
130+ #pragma omp parallel
131+ {
132+ std::vector<float > tmp (d);
133+ #pragma omp for
134+ for (int64_t i = 0 ; i < ntotal; i++) {
135+ decode_64bit (i, tmp.data ());
136+ norms[i] = fvec_norm_L2sqr (tmp.data (), d);
137+ }
138+ }
139+ }
140+
/** Decode one code packed in the low tot_bits bits of `bits` into the
 * d-dimensional vector xi. The decoded vector is the sum of one codebook
 * entry per sub-quantizer; sub-codes are consumed LSB-first, nbits[m] bits
 * for sub-quantizer m. */
void AdditiveQuantizer::decode_64bit(idx_t bits, float* xi) const {
    for (int m = 0; m < M; m++) {
        // extract the low nbits[m] bits as the index into codebook m
        idx_t idx = bits & (((size_t)1 << nbits[m]) - 1);
        bits >>= nbits[m];
        const float* c = codebooks.data() + d * (codebook_offsets[m] + idx);
        if (m == 0) {
            // first term initializes xi
            memcpy(xi, c, sizeof(*xi) * d);
        } else {
            // accumulate the remaining codebook entries
            fvec_add(d, xi, c, xi);
        }
    }
}
153+
/** Compute look-up tables of inner products between the n query vectors xq
 * (size n * d) and all codebook centroids. On output, LUT has size
 * n * total_codebook_size, one contiguous table of total_codebook_size
 * entries per query. */
void AdditiveQuantizer::compute_LUT(size_t n, const float* xq, float* LUT)
        const {
    // in all cases, it is large matrix multiplication
    // (column-major BLAS view: LUT = codebooks^T * xq, hence "T" / "N")

    FINTEGER ncenti = total_codebook_size;
    FINTEGER di = d;
    FINTEGER nqi = n;
    float one = 1, zero = 0;

    sgemm_("Transposed",
           "Not transposed",
           &ncenti,
           &nqi,
           &di,
           &one,
           codebooks.data(),
           &di,
           xq,
           &di,
           &zero,
           LUT,
           &ncenti);
}
177+
namespace {

/* Compute the inner products between one query and all 2^aq.tot_bits
 * decodable codes, from the per-centroid LUT of that query.
 * ips must have room for 2^aq.tot_bits floats. The table is expanded in
 * place: after processing sub-quantizer m, the prefix
 * ips[0 .. 2^(nbits[0]+...+nbits[m])) holds the partial sums. */
void compute_inner_prod_with_LUT(
        const AdditiveQuantizer& aq,
        const float* LUT,
        float* ips) {
    size_t prev_size = 1;
    for (int m = 0; m < aq.M; m++) {
        const float* LUTm = LUT + aq.codebook_offsets[m];
        int nb = aq.nbits[m];
        size_t nc = (size_t)1 << nb;

        if (m == 0) {
            // seed the table with the first sub-quantizer's inner products
            memcpy(ips, LUT, sizeof(*ips) * nc);
        } else {
            // expand in place; iterate i downward so the source prefix
            // ips[0..prev_size) is read before slot i*prev_size overwrites it
            for (int64_t i = nc - 1; i >= 0; i--) {
                float v = LUTm[i];
                fvec_add(prev_size, ips, v, ips + i * prev_size);
            }
        }
        prev_size *= nc;
    }
}

} // anonymous namespace
203+
/** Exhaustive k-nearest-neighbor search for maximum inner product:
 * enumerates all 2^tot_bits decodable codes for each query.
 *
 * @param n          number of query vectors
 * @param xq         queries, size n * d
 * @param k          number of neighbors to return
 * @param distances  output inner products, size n * k, largest first
 * @param labels     output code ids, size n * k
 */
void AdditiveQuantizer::knn_exact_inner_product(
        idx_t n,
        const float* xq,
        idx_t k,
        float* distances,
        idx_t* labels) const {
    std::unique_ptr<float[]> LUT(new float[n * total_codebook_size]);
    compute_LUT(n, xq, LUT.get());
    size_t ntotal = (size_t)1 << tot_bits;

// parallelize over queries only when there are enough of them
#pragma omp parallel if (n > 100)
    {
        // per-thread buffer of inner products with all ntotal codes
        std::vector<float> dis(ntotal);
#pragma omp for
        for (idx_t i = 0; i < n; i++) {
            const float* LUTi = LUT.get() + i * total_codebook_size;
            compute_inner_prod_with_LUT(*this, LUTi, dis.data());
            float* distances_i = distances + i * k;
            idx_t* labels_i = labels + i * k;
            // keep the k largest inner products with a min-heap;
            // ids == nullptr: labels are presumably the sequential code
            // indices 0..ntotal-1 (Heap.h addn convention — confirm)
            minheap_heapify(k, distances_i, labels_i);
            minheap_addn(k, distances_i, labels_i, dis.data(), nullptr, ntotal);
            minheap_reorder(k, distances_i, labels_i);
        }
    }
}
229+
/** Exhaustive k-nearest-neighbor search in squared L2 distance:
 * enumerates all 2^tot_bits decodable codes for each query.
 *
 * @param n          number of query vectors
 * @param xq         queries, size n * d
 * @param k          number of neighbors to return
 * @param distances  output squared L2 distances, size n * k, smallest first
 * @param labels     output code ids, size n * k
 * @param norms      precomputed squared norms of all 2^tot_bits centroids
 *                   (presumably from compute_centroid_norms — confirm)
 */
void AdditiveQuantizer::knn_exact_L2(
        idx_t n,
        const float* xq,
        idx_t k,
        float* distances,
        idx_t* labels,
        const float* norms) const {
    std::unique_ptr<float[]> LUT(new float[n * total_codebook_size]);
    compute_LUT(n, xq, LUT.get());
    std::unique_ptr<float[]> q_norms(new float[n]);
    fvec_norms_L2sqr(q_norms.get(), xq, d, n);
    size_t ntotal = (size_t)1 << tot_bits;

// parallelize over queries only when there are enough of them
#pragma omp parallel if (n > 100)
    {
        // per-thread buffer of inner products with all ntotal codes
        std::vector<float> dis(ntotal);
#pragma omp for
        for (idx_t i = 0; i < n; i++) {
            const float* LUTi = LUT.get() + i * total_codebook_size;
            float* distances_i = distances + i * k;
            idx_t* labels_i = labels + i * k;

            compute_inner_prod_with_LUT(*this, LUTi, dis.data());

            // update distances using
            // ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * <x,y>

            // keep the k smallest distances with a max-heap
            maxheap_heapify(k, distances_i, labels_i);
            for (idx_t j = 0; j < ntotal; j++) {
                float disj = q_norms[i] + norms[j] - 2 * dis[j];
                if (disj < distances_i[0]) {
                    heap_replace_top<CMax<float, int64_t>>(
                            k, distances_i, labels_i, disj, j);
                }
            }
            maxheap_reorder(k, distances_i, labels_i);
        }
    }
}
269+
96270} // namespace faiss
0 commit comments