From b3c02270654f9afbba866a584a7461bf7e008cff Mon Sep 17 00:00:00 2001
From: Gagan Bhullar
Date: Thu, 19 Sep 2024 11:32:46 -0600
Subject: [PATCH] docs: hnsw documentation (#1640)

PR closes #1627

---------

Co-authored-by: Will Jones
---
 nodejs/lancedb/indices.ts      | 208 ++++++++++++++++++++++++++++++++-
 python/python/lancedb/index.py | 191 +++++++++++++++++++++++++++++-
 rust/lancedb/src/index.rs      |   2 +
 3 files changed, 397 insertions(+), 4 deletions(-)

diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts
index 601d719d..e7ce10b9 100644
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -113,22 +113,218 @@ export interface IvfPqOptions {
   sampleRate?: number;
 }
 
+/**
+ * Options to create an `HNSW_PQ` index
+ */
 export interface HnswPqOptions {
+  /**
+   * The distance metric used to train the index.
+   *
+   * Default value is "l2".
+   *
+   * The following distance types are available:
+   *
+   * "l2" - Euclidean distance. This is a very common distance metric that
+   * accounts for both magnitude and direction when determining the distance
+   * between vectors. L2 distance has a range of [0, ∞).
+   *
+   * "cosine" - Cosine distance. Cosine distance is a distance metric
+   * calculated from the cosine similarity between two vectors. Cosine
+   * similarity is a measure of similarity between two non-zero vectors of an
+   * inner product space. It is defined to equal the cosine of the angle
+   * between them. Unlike L2, the cosine distance is not affected by the
+   * magnitude of the vectors. Cosine distance has a range of [0, 2].
+   *
+   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+   * L2 norm is 1), then dot distance is equivalent to the cosine distance.
+   */
   distanceType?: "l2" | "cosine" | "dot";
+
+  /**
+   * The number of IVF partitions to create.
+   *
+   * For HNSW, we recommend a small number of partitions. Setting this to 1 works
+   * well for most tables. For very large tables, training just one HNSW graph
+   * will require too much memory. Each partition becomes its own HNSW graph, so
+   * setting this value higher reduces the peak memory use of training.
+   *
+   */
   numPartitions?: number;
+
+  /**
+   * Number of sub-vectors of PQ.
+   *
+   * This value controls how much the vector is compressed during the quantization step.
+   * The more sub-vectors there are, the less the vector is compressed. The default is
+   * the dimension of the vector divided by 16. If the dimension is not evenly divisible
+   * by 16, we use the dimension divided by 8.
+   *
+   * The above two cases are highly preferred. Having 8 or 16 values per sub-vector allows
+   * us to use efficient SIMD instructions.
+   *
+   * If the dimension is not divisible by 8, we use 1 sub-vector. This is not ideal and
+   * will likely result in poor performance.
+   *
+   */
   numSubVectors?: number;
+
+  /**
+   * Max iterations to train kmeans.
+   *
+   * The default value is 50.
+   *
+   * When training an IVF index we use kmeans to calculate the partitions. This parameter
+   * controls how many iterations of kmeans to run.
+   *
+   * Increasing this might improve the quality of the index but in most cases the parameter
+   * is unused because kmeans will converge with fewer iterations. The parameter is only
+   * used in cases where kmeans does not appear to converge. In those cases it is unlikely
+   * that setting this larger will lead to the index converging anyway.
+   *
+   */
   maxIterations?: number;
+
+  /**
+   * The rate used to calculate the number of training vectors for kmeans.
+   *
+   * Default value is 256.
+   *
+   * When an IVF index is trained, we need to calculate partitions. These are groups
+   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
+   *
+   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
+   * random sample of the data. This parameter controls the size of the sample. The total
+   * number of vectors used to train the index is `sample_rate * num_partitions`.
+   *
+   * Increasing this value might improve the quality of the index but in most cases the
+   * default should be sufficient.
+   *
+   */
   sampleRate?: number;
+
+  /**
+   * The number of neighbors to select for each vector in the HNSW graph.
+   *
+   * The default value is 20.
+   *
+   * This value controls the tradeoff between search speed and accuracy.
+   * The higher the value, the more accurate the search but the slower it will be.
+   *
+   */
   m?: number;
+
+  /**
+   * The number of candidates to evaluate during the construction of the HNSW graph.
+   *
+   * The default value is 300.
+   *
+   * This value controls the tradeoff between build speed and accuracy.
+   * The higher the value, the more accurate the build but the slower it will be.
+   * 150 to 300 is the typical range. 100 is a minimum for good quality search
+   * results. In most cases, there is no benefit to setting this higher than 500.
+   * This value should be set to a value that is not less than `ef` in the search phase.
+   *
+   */
   efConstruction?: number;
 }
 
+/**
+ * Options to create an `HNSW_SQ` index
+ */
 export interface HnswSqOptions {
+  /**
+   * The distance metric used to train the index.
+   *
+   * Default value is "l2".
+   *
+   * The following distance types are available:
+   *
+   * "l2" - Euclidean distance. This is a very common distance metric that
+   * accounts for both magnitude and direction when determining the distance
+   * between vectors. L2 distance has a range of [0, ∞).
+   *
+   * "cosine" - Cosine distance. Cosine distance is a distance metric
+   * calculated from the cosine similarity between two vectors. Cosine
+   * similarity is a measure of similarity between two non-zero vectors of an
+   * inner product space. It is defined to equal the cosine of the angle
+   * between them. Unlike L2, the cosine distance is not affected by the
+   * magnitude of the vectors. Cosine distance has a range of [0, 2].
+   *
+   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+   * L2 norm is 1), then dot distance is equivalent to the cosine distance.
+   */
   distanceType?: "l2" | "cosine" | "dot";
+
+  /**
+   * The number of IVF partitions to create.
+   *
+   * For HNSW, we recommend a small number of partitions. Setting this to 1 works
+   * well for most tables. For very large tables, training just one HNSW graph
+   * will require too much memory. Each partition becomes its own HNSW graph, so
+   * setting this value higher reduces the peak memory use of training.
+   *
+   */
   numPartitions?: number;
+
+  /**
+   * Max iterations to train kmeans.
+   *
+   * The default value is 50.
+   *
+   * When training an IVF index we use kmeans to calculate the partitions. This parameter
+   * controls how many iterations of kmeans to run.
+   *
+   * Increasing this might improve the quality of the index but in most cases the parameter
+   * is unused because kmeans will converge with fewer iterations. The parameter is only
+   * used in cases where kmeans does not appear to converge. In those cases it is unlikely
+   * that setting this larger will lead to the index converging anyway.
+   *
+   */
   maxIterations?: number;
+
+  /**
+   * The rate used to calculate the number of training vectors for kmeans.
+   *
+   * Default value is 256.
+   *
+   * When an IVF index is trained, we need to calculate partitions. These are groups
+   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
+   *
+   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
+   * random sample of the data. This parameter controls the size of the sample. The total
+   * number of vectors used to train the index is `sample_rate * num_partitions`.
+   *
+   * Increasing this value might improve the quality of the index but in most cases the
+   * default should be sufficient.
+   *
+   */
   sampleRate?: number;
+
+  /**
+   * The number of neighbors to select for each vector in the HNSW graph.
+   *
+   * The default value is 20.
+   *
+   * This value controls the tradeoff between search speed and accuracy.
+   * The higher the value, the more accurate the search but the slower it will be.
+   *
+   */
   m?: number;
+
+  /**
+   * The number of candidates to evaluate during the construction of the HNSW graph.
+   *
+   * The default value is 300.
+   *
+   * This value controls the tradeoff between build speed and accuracy.
+   * The higher the value, the more accurate the build but the slower it will be.
+   * 150 to 300 is the typical range. 100 is a minimum for good quality search
+   * results. In most cases, there is no benefit to setting this higher than 500.
+   * This value should be set to a value that is not less than `ef` in the search phase.
+   *
+   */
   efConstruction?: number;
 }
@@ -249,7 +445,11 @@ export class Index {
   /**
    *
-   * Create a hnswpq index
+   * Create an HNSW-PQ index
+   *
+   * HNSW-PQ stands for Hierarchical Navigable Small World - Product Quantization.
+   * It is a variant of the HNSW algorithm that uses product quantization to compress
+   * the vectors.
    *
    */
   static hnswPq(options?: Partial<HnswPqOptions>) {
@@ -268,7 +468,11 @@ export class Index {
   /**
    *
-   * Create a hnswsq index
+   * Create an HNSW-SQ index
+   *
+   * HNSW-SQ stands for Hierarchical Navigable Small World - Scalar Quantization.
+   * It is a variant of the HNSW algorithm that uses scalar quantization to compress
+   * the vectors.
    *
    */
   static hnswSq(options?: Partial<HnswSqOptions>) {
diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py
index bedbb097..78160ac6 100644
--- a/python/python/lancedb/index.py
+++ b/python/python/lancedb/index.py
@@ -83,7 +83,108 @@ class FTS:
 
 
 class HnswPq:
-    """Describe a Hnswpq index configuration."""
+    """Describe a HNSW-PQ index configuration.
+
+    HNSW-PQ stands for Hierarchical Navigable Small World - Product Quantization.
+    It is a variant of the HNSW algorithm that uses product quantization to compress
+    the vectors. To create an HNSW-PQ index, you can specify the following parameters:
+
+    Parameters
+    ----------
+
+    distance_type: str, default "L2"
+
+        The distance metric used to train the index.
+
+        The following distance types are available:
+
+        "l2" - Euclidean distance. This is a very common distance metric that
+        accounts for both magnitude and direction when determining the distance
+        between vectors. L2 distance has a range of [0, ∞).
+
+        "cosine" - Cosine distance. Cosine distance is a distance metric
+        calculated from the cosine similarity between two vectors. Cosine
+        similarity is a measure of similarity between two non-zero vectors of an
+        inner product space. It is defined to equal the cosine of the angle
+        between them. Unlike L2, the cosine distance is not affected by the
+        magnitude of the vectors. Cosine distance has a range of [0, 2].
+
+        "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+        distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+        L2 norm is 1), then dot distance is equivalent to the cosine distance.
+
+    num_partitions, default sqrt(num_rows)
+
+        The number of IVF partitions to create.
+
+        For HNSW, we recommend a small number of partitions. Setting this to 1 works
+        well for most tables. For very large tables, training just one HNSW graph
+        will require too much memory. Each partition becomes its own HNSW graph, so
+        setting this value higher reduces the peak memory use of training.
+
+    num_sub_vectors, default is vector dimension / 16
+
+        Number of sub-vectors of PQ.
+
+        This value controls how much the vector is compressed during the
+        quantization step. The more sub-vectors there are, the less the vector is
+        compressed. The default is the dimension of the vector divided by 16.
+        If the dimension is not evenly divisible by 16, we use the dimension
+        divided by 8.
+
+        The above two cases are highly preferred. Having 8 or 16 values per
+        sub-vector allows us to use efficient SIMD instructions.
+
+        If the dimension is not divisible by 8, we use 1 sub-vector. This is not
+        ideal and will likely result in poor performance.
+
+    max_iterations, default 50
+
+        Max iterations to train kmeans.
+
+        When training an IVF index we use kmeans to calculate the partitions. This
+        parameter controls how many iterations of kmeans to run.
+
+        Increasing this might improve the quality of the index but in most cases the
+        parameter is unused because kmeans will converge with fewer iterations. The
+        parameter is only used in cases where kmeans does not appear to converge. In
+        those cases it is unlikely that setting this larger will lead to the index
+        converging anyway.
+
+    sample_rate, default 256
+
+        The rate used to calculate the number of training vectors for kmeans.
+
+        When an IVF index is trained, we need to calculate partitions. These are
+        groups of vectors that are similar to each other. To do this we use an
+        algorithm called kmeans.
+
+        Running kmeans on a large dataset can be slow. To speed this up we
+        run kmeans on a random sample of the data. This parameter controls the
+        size of the sample. The total number of vectors used to train the index
+        is `sample_rate * num_partitions`.
+
+        Increasing this value might improve the quality of the index but in
+        most cases the default should be sufficient.
+
+    m, default 20
+
+        The number of neighbors to select for each vector in the HNSW graph.
+
+        This value controls the tradeoff between search speed and accuracy.
+        The higher the value, the more accurate the search but the slower it will be.
+
+    ef_construction, default 300
+
+        The number of candidates to evaluate during the construction of the HNSW graph.
+
+        This value controls the tradeoff between build speed and accuracy.
+        The higher the value, the more accurate the build but the slower it will be.
+        150 to 300 is the typical range. 100 is a minimum for good quality search
+        results. In most cases, there is no benefit to setting this higher than 500.
+        This value should be set to a value that is not less than `ef` in the
+        search phase.
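+
+    Examples
+    --------
+
+    A minimal configuration sketch, assuming keyword arguments that match the
+    parameters documented above; the values shown are illustrative, not
+    recommendations:
+
+    >>> from lancedb.index import HnswPq
+    >>> config = HnswPq(
+    ...     distance_type="cosine",
+    ...     num_partitions=1,
+    ...     m=20,
+    ...     ef_construction=300,
+    ... )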
+ """ def __init__( self, @@ -108,7 +209,93 @@ class HnswPq: class HnswSq: - """Describe a HNSW-SQ index configuration.""" + """Describe a HNSW-SQ index configuration. + + HNSW-SQ stands for Hierarchical Navigable Small World - Scalar Quantization. + It is a variant of the HNSW algorithm that uses scalar quantization to compress + the vectors. + + Parameters + ---------- + + distance_type: str, default "L2" + + The distance metric used to train the index. + + The following distance types are available: + + "l2" - Euclidean distance. This is a very common distance metric that + accounts for both magnitude and direction when determining the distance + between vectors. L2 distance has a range of [0, ∞). + + "cosine" - Cosine distance. Cosine distance is a distance metric + calculated from the cosine similarity between two vectors. Cosine + similarity is a measure of similarity between two non-zero vectors of an + inner product space. It is defined to equal the cosine of the angle + between them. Unlike L2, the cosine distance is not affected by the + magnitude of the vectors. Cosine distance has a range of [0, 2]. + + "dot" - Dot product. Dot distance is the dot product of two vectors. Dot + distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their + L2 norm is 1), then dot distance is equivalent to the cosine distance. + + num_partitions, default sqrt(num_rows) + + The number of IVF partitions to create. + + For HNSW, we recommend a small number of partitions. Setting this to 1 works + well for most tables. For very large tables, training just one HNSW graph + will require too much memory. Each partition becomes its own HNSW graph, so + setting this value higher reduces the peak memory use of training. + + max_iterations, default 50 + + Max iterations to train kmeans. + + When training an IVF index we use kmeans to calculate the partitions. + This parameter controls how many iterations of kmeans to run. + + Increasing this might improve the quality of the index but in most cases + the parameter is unused because kmeans will converge with fewer iterations. + The parameter is only used in cases where kmeans does not appear to converge. + In those cases it is unlikely that setting this larger will lead to + the index converging anyways. + + sample_rate, default 256 + + The rate used to calculate the number of training vectors for kmeans. + + When an IVF index is trained, we need to calculate partitions. These + are groups of vectors that are similar to each other. To do this + we use an algorithm called kmeans. + + Running kmeans on a large dataset can be slow. To speed this up we + run kmeans on a random sample of the data. This parameter controls the + size of the sample. The total number of vectors used to train the index + is `sample_rate * num_partitions`. + + Increasing this value might improve the quality of the index but in + most cases the default should be sufficient. + + m, default 20 + + The number of neighbors to select for each vector in the HNSW graph. + + This value controls the tradeoff between search speed and accuracy. + The higher the value the more accurate the search but the slower it will be. + + ef_construction, default 300 + + The number of candidates to evaluate during the construction of the HNSW graph. + + This value controls the tradeoff between build speed and accuracy. + The higher the value the more accurate the build but the slower it will be. + 150 to 300 is the typical range. 100 is a minimum for good quality search + results. 
In most cases, there is no benefit to setting this higher than 500. + This value should be set to a value that is not less than `ef` in the search + phase. + + """ def __init__( self, diff --git a/rust/lancedb/src/index.rs b/rust/lancedb/src/index.rs index 4540b427..8af6b3de 100644 --- a/rust/lancedb/src/index.rs +++ b/rust/lancedb/src/index.rs @@ -59,9 +59,11 @@ pub enum Index { IvfPq(IvfPqIndexBuilder), /// IVF-HNSW index with Product Quantization + /// It is a variant of the HNSW algorithm that uses product quantization to compress the vectors. IvfHnswPq(IvfHnswPqIndexBuilder), /// IVF-HNSW index with Scalar Quantization + /// It is a variant of the HNSW algorithm that uses scalar quantization to compress the vectors. IvfHnswSq(IvfHnswSqIndexBuilder), }
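
Usage sketch: a minimal example of how the HNSW index configurations documented
above can be applied from Python. The database path and the table and column
names below are hypothetical, and `connect_async`, `open_table`, and
`create_index(..., config=...)` are assumed to be the existing async lancedb
entry points; the parameter values are illustrative, not recommendations:

    import asyncio

    import lancedb
    from lancedb.index import HnswSq

    async def main():
        # Hypothetical local database and table; the table is assumed to have
        # a "vector" column of fixed-size float vectors.
        db = await lancedb.connect_async("./example-lancedb")
        tbl = await db.open_table("vectors")
        # A single partition keeps one HNSW graph; raise num_partitions only
        # for very large tables (see the num_partitions docs above).
        await tbl.create_index(
            "vector",
            config=HnswSq(
                distance_type="l2",
                num_partitions=1,
                m=20,
                ef_construction=300,
            ),
        )

    asyncio.run(main())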