feat: support IVF_HNSW_PQ (#1314)

This also simplifies the index-builder code by generating the shared setters with macros. --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2026-01-05 19:32:56 +00:00 · 2024-05-24 18:32:00 +08:00
parent def087fc85
commit fd1a5ce788
3 changed files with 335 additions and 161 deletions
--- a/rust/lancedb/src/index.rs
+++ b/rust/lancedb/src/index.rs
@@ -21,7 +21,7 @@ use crate::{table::TableInternal, Result};

 use self::{
    scalar::BTreeIndexBuilder,
-    vector::{IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
+    vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
 };

 pub mod scalar;
@@ -31,6 +31,7 @@ pub enum Index {
    Auto,
    BTree(BTreeIndexBuilder),
    IvfPq(IvfPqIndexBuilder),
+    IvfHnswPq(IvfHnswPqIndexBuilder),
    IvfHnswSq(IvfHnswSqIndexBuilder),
 }

@@ -72,6 +73,7 @@ impl IndexBuilder {
 #[derive(Debug, Clone, PartialEq)]
 pub enum IndexType {
    IvfPq,
+    IvfHnswPq,
    IvfHnswSq,
    BTree,
 }
--- a/rust/lancedb/src/index/vector.rs
+++ b/rust/lancedb/src/index/vector.rs
@@ -44,6 +44,120 @@ impl VectorIndex {
    }
 }

+macro_rules! impl_distance_type_setter {
+    () => {
+        /// [DistanceType] to use to build the index.
+        ///
+        /// Default value is [DistanceType::L2].
+        ///
+        /// This is used when training the index to calculate the IVF partitions (vectors are
+        /// grouped in partitions with similar vectors according to this distance type) and to
+        /// calculate a subvector's code during quantization.
+        ///
+        /// The metric type used to train an index MUST match the metric type used to search the
+        /// index.  Failure to do so will yield inaccurate results.
+        pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
+            self.distance_type = distance_type;
+            self
+        }
+    };
+}
+
+macro_rules! impl_ivf_params_setter {
+    () => {
+        /// The number of IVF partitions to create.
+        ///
+        /// This value should generally scale with the number of rows in the dataset.  By default
+        /// the number of partitions is the square root of the number of rows.
+        ///
+        /// If this value is too large then the first part of the search (picking the right partition)
+        /// will be slow.  If this value is too small then the second part of the search (searching
+        /// within a partition) will be slow.
+        pub fn num_partitions(mut self, num_partitions: u32) -> Self {
+            self.num_partitions = Some(num_partitions);
+            self
+        }
+
+        /// The rate used to calculate the number of training vectors for kmeans.
+        ///
+        /// When an IVF index is trained, we need to calculate partitions.  These are groups
+        /// of vectors that are similar to each other.  To do this we use an algorithm called kmeans.
+        ///
+        /// Running kmeans on a large dataset can be slow.  To speed this up we run kmeans on a
+        /// random sample of the data.  This parameter controls the size of the sample.  The total
+        /// number of vectors used to train the index is `sample_rate * num_partitions`.
+        ///
+        /// Increasing this value might improve the quality of the index but in most cases the
+        /// default should be sufficient.
+        ///
+        /// The default value is 256.
+        pub fn sample_rate(mut self, sample_rate: u32) -> Self {
+            self.sample_rate = sample_rate;
+            self
+        }
+
+        /// Max iterations to train kmeans.
+        ///
+        /// When training an IVF index we use kmeans to calculate the partitions.  This parameter
+        /// controls how many iterations of kmeans to run.
+        ///
+        /// Increasing this might improve the quality of the index but in most cases the parameter
+        /// is unused because kmeans will converge with fewer iterations.  The parameter is only
+        /// used in cases where kmeans does not appear to converge.  In those cases it is unlikely
+        /// that setting this larger will lead to the index converging anyways.
+        ///
+        /// The default value is 50.
+        pub fn max_iterations(mut self, max_iterations: u32) -> Self {
+            self.max_iterations = max_iterations;
+            self
+        }
+    };
+}
+
+macro_rules! impl_pq_params_setter {
+    () => {
+        /// Number of sub-vectors of PQ.
+        ///
+        /// This value controls how much the vector is compressed during the quantization step.
+        /// The more sub vectors there are the less the vector is compressed.  The default is
+        /// the dimension of the vector divided by 16.  If the dimension is not evenly divisible
+        /// by 16 we use the dimension divided by 8.
+        ///
+        /// The above two cases are highly preferred.  Having 8 or 16 values per subvector allows
+        /// us to use efficient SIMD instructions.
+        ///
+        /// If the dimension is not divisible by 8 then we use 1 subvector.  This is not ideal and
+        /// will likely result in poor performance.
+        pub fn num_sub_vectors(mut self, num_sub_vectors: u32) -> Self {
+            self.num_sub_vectors = Some(num_sub_vectors);
+            self
+        }
+    };
+}
+
+macro_rules! impl_hnsw_params_setter {
+    () => {
+        /// The number of neighbors to select for each vector in the HNSW graph.
+        /// This value controls the tradeoff between search speed and accuracy.
+        /// The higher the value the more accurate the search but the slower it will be.
+        /// The default value is 20.
+        pub fn num_edges(mut self, m: u32) -> Self {
+            self.m = m;
+            self
+        }
+
+        /// The number of candidates to evaluate during the construction of the HNSW graph.
+        /// This value controls the tradeoff between build speed and accuracy.
+        /// The higher the value the more accurate the build but the slower it will be.
+        /// This value should be set to a value that is not less than `ef` in the search phase.
+        /// The default value is 300.
+        pub fn ef_construction(mut self, ef_construction: u32) -> Self {
+            self.ef_construction = ef_construction;
+            self
+        }
+    };
+}
+
 /// Builder for an IVF PQ index.
 ///
 /// This index stores a compressed (quantized) copy of every vector.  These vectors
@@ -90,84 +204,9 @@ impl Default for IvfPqIndexBuilder {
 }

 impl IvfPqIndexBuilder {
-    /// [DistanceType] to use to build the index.
-    ///
-    /// Default value is [DistanceType::L2].
-    ///
-    /// This is used when training the index to calculate the IVF partitions (vectors are
-    /// grouped in partitions with similar vectors according to this distance type) and to
-    /// calculate a subvector's code during quantization.
-    ///
-    /// The metric type used to train an index MUST match the metric type used to search the
-    /// index.  Failure to do so will yield inaccurate results.
-    pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
-        self.distance_type = distance_type;
-        self
-    }
-
-    /// The number of IVF partitions to create.
-    ///
-    /// This value should generally scale with the number of rows in the dataset.  By default
-    /// the number of partitions is the square root of the number of rows.
-    ///
-    /// If this value is too large then the first part of the search (picking the right partition)
-    /// will be slow.  If this value is too small then the second part of the search (searching
-    /// within a partition) will be slow.
-    pub fn num_partitions(mut self, num_partitions: u32) -> Self {
-        self.num_partitions = Some(num_partitions);
-        self
-    }
-
-    /// Number of sub-vectors of PQ.
-    ///
-    /// This value controls how much the vector is compressed during the quantization step.
-    /// The more sub vectors there are the less the vector is compressed.  The default is
-    /// the dimension of the vector divided by 16.  If the dimension is not evenly divisible
-    /// by 16 we use the dimension divded by 8.
-    ///
-    /// The above two cases are highly preferred.  Having 8 or 16 values per subvector allows
-    /// us to use efficient SIMD instructions.
-    ///
-    /// If the dimension is not visible by 8 then we use 1 subvector.  This is not ideal and
-    /// will likely result in poor performance.
-    pub fn num_sub_vectors(mut self, num_sub_vectors: u32) -> Self {
-        self.num_sub_vectors = Some(num_sub_vectors);
-        self
-    }
-
-    /// The rate used to calculate the number of training vectors for kmeans.
-    ///
-    /// When an IVF PQ index is trained, we need to calculate partitions.  These are groups
-    /// of vectors that are similar to each other.  To do this we use an algorithm called kmeans.
-    ///
-    /// Running kmeans on a large dataset can be slow.  To speed this up we run kmeans on a
-    /// random sample of the data.  This parameter controls the size of the sample.  The total
-    /// number of vectors used to train the index is `sample_rate * num_partitions`.
-    ///
-    /// Increasing this value might improve the quality of the index but in most cases the
-    /// default should be sufficient.
-    ///
-    /// The default value is 256.
-    pub fn sample_rate(mut self, sample_rate: u32) -> Self {
-        self.sample_rate = sample_rate;
-        self
-    }
-
-    /// Max iterations to train kmeans.
-    ///
-    /// When training an IVF PQ index we use kmeans to calculate the partitions.  This parameter
-    /// controls how many iterations of kmeans to run.
-    ///
-    /// Increasing this might improve the quality of the index but in most cases the parameter
-    /// is unused because kmeans will converge with fewer iterations.  The parameter is only
-    /// used in cases where kmeans does not appear to converge.  In those cases it is unlikely
-    /// that setting this larger will lead to the index converging anyways.
-    ///
-    /// The default value is 50.
-    pub fn max_iterations(mut self, max_iterations: u32) -> Self {
-        self.max_iterations = max_iterations;
-        self
-    }
+    impl_distance_type_setter!();
+    impl_ivf_params_setter!();
+    impl_pq_params_setter!();
 }

 pub(crate) fn suggested_num_partitions(rows: usize) -> u32 {
@@ -190,6 +229,51 @@ pub(crate) fn suggested_num_sub_vectors(dim: u32) -> u32 {
    }
 }

+/// Builder for an IVF HNSW PQ index.
+///
+/// This index is a combination of IVF and HNSW.
+/// The IVF part is the same as the IVF PQ index.
+/// For each IVF partition, this builds an HNSW graph, which is used to
+/// quickly find the closest vectors to a query vector.
+///
+/// The PQ (product quantizer) is used to compress the vectors, the same as in IVF PQ.
+#[derive(Debug, Clone)]
+pub struct IvfHnswPqIndexBuilder {
+    // IVF
+    pub(crate) distance_type: DistanceType,
+    pub(crate) num_partitions: Option<u32>,
+    pub(crate) sample_rate: u32,
+    pub(crate) max_iterations: u32,
+
+    // HNSW
+    pub(crate) m: u32,
+    pub(crate) ef_construction: u32,
+
+    // PQ
+    pub(crate) num_sub_vectors: Option<u32>,
+}
+
+impl Default for IvfHnswPqIndexBuilder {
+    fn default() -> Self {
+        Self {
+            distance_type: DistanceType::L2,
+            num_partitions: None,
+            num_sub_vectors: None,
+            sample_rate: 256,
+            max_iterations: 50,
+            m: 20,
+            ef_construction: 300,
+        }
+    }
+}
+
+impl IvfHnswPqIndexBuilder {
+    impl_distance_type_setter!();
+    impl_ivf_params_setter!();
+    impl_hnsw_params_setter!();
+    impl_pq_params_setter!();
+}
+
 /// Builder for an IVF_HNSW_SQ index.
 ///
 /// This index is a combination of IVF and HNSW.
@@ -228,85 +312,7 @@ impl Default for IvfHnswSqIndexBuilder {
 }

 impl IvfHnswSqIndexBuilder {
-    /// [DistanceType] to use to build the index.
-    ///
-    /// Default value is [DistanceType::L2].
-    ///
-    /// This is used when training the index to calculate the IVF partitions (vectors are
-    /// grouped in partitions with similar vectors according to this distance type)
-    ///
-    /// The metric type used to train an index MUST match the metric type used to search the
-    /// index.  Failure to do so will yield inaccurate results.
-    ///
-    /// Now IVF_HNSW_SQ only supports L2 and Cosine distance types.
-    pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
-        self.distance_type = distance_type;
-        self
-    }
-
-    /// The number of IVF partitions to create.
-    ///
-    /// This value should generally scale with the number of rows in the dataset.  By default
-    /// the number of partitions is the square root of the number of rows.
-    ///
-    /// If this value is too large then the first part of the search (picking the right partition)
-    /// will be slow.  If this value is too small then the second part of the search (searching
-    /// within a partition) will be slow.
-    pub fn num_partitions(mut self, num_partitions: u32) -> Self {
-        self.num_partitions = Some(num_partitions);
-        self
-    }
-
-    /// The rate used to calculate the number of training vectors for kmeans and SQ.
-    ///
-    /// When an IVF_HNSW_SQ index is trained, we need to calculate partitions and min/max value of vectors.  These are groups
-    /// of vectors that are similar to each other.  To do this we use an algorithm called kmeans.
-    ///
-    /// Running kmeans on a large dataset can be slow.  To speed this up we run kmeans on a
-    /// random sample of the data.  This parameter controls the size of the sample.  The total
-    /// number of vectors used to train the IVF is `sample_rate * num_partitions`.
-    ///
-    /// The total number of vectors used to train the SQ is `sample_rate * 2^{num_bits}`.
-    ///
-    /// Increasing this value might improve the quality of the index but in most cases the
-    /// default should be sufficient.
-    ///
-    /// The default value is 256.
-    pub fn sample_rate(mut self, sample_rate: u32) -> Self {
-        self.sample_rate = sample_rate;
-        self
-    }
-
-    /// Max iterations to train kmeans.
-    ///
-    /// When training an IVF index we use kmeans to calculate the partitions.  This parameter
-    /// controls how many iterations of kmeans to run.
-    ///
-    /// Increasing this might improve the quality of the index but in most cases the parameter
-    /// is unused because kmeans will converge with fewer iterations.  The parameter is only
-    /// used in cases where kmeans does not appear to converge.  In those cases it is unlikely
-    /// that setting this larger will lead to the index converging anyways.
-    ///
-    /// The default value is 50.
-    pub fn max_iterations(mut self, max_iterations: u32) -> Self {
-        self.max_iterations = max_iterations;
-        self
-    }
-
-    /// The number of neighbors to select for each vector in the HNSW graph.
-    /// Bumping this number will increase the recall of the search but also increase the build/search time.
-    /// The default value is 20.
-    pub fn m(mut self, m: u32) -> Self {
-        self.m = m;
-        self
-    }
-
-    /// The number of candidates to evaluate during the construction of the HNSW graph.
-    /// Bumping this number will increase the recall of the search but also increase the build/search time.
-    /// This value should be not less than `ef` in the search phase.
-    /// The default value is 300.
-    pub fn ef_construction(mut self, ef_construction: u32) -> Self {
-        self.ef_construction = ef_construction;
-        self
-    }
+    impl_distance_type_setter!();
+    impl_ivf_params_setter!();
+    impl_hnsw_params_setter!();
 }
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -37,6 +37,7 @@ use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatch
 use lance::io::WrappingObjectStore;
 use lance_index::vector::hnsw::builder::HnswBuildParams;
 use lance_index::vector::ivf::IvfBuildParams;
+use lance_index::vector::pq::PQBuildParams;
 use lance_index::vector::sq::builder::SQBuildParams;
 use lance_index::DatasetIndexExt;
 use lance_index::IndexType;
@@ -48,7 +49,9 @@ use crate::arrow::IntoArrow;
 use crate::connection::NoData;
 use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
 use crate::error::{Error, Result};
-use crate::index::vector::{IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex};
+use crate::index::vector::{
+    IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
+};
 use crate::index::IndexConfig;
 use crate::index::IndexStatistics;
 use crate::index::{
@@ -1315,6 +1318,69 @@ impl NativeTable {
        Ok(())
    }

+    async fn create_ivf_hnsw_pq_index(
+        &self,
+        index: IvfHnswPqIndexBuilder,
+        field: &Field,
+        replace: bool,
+    ) -> Result<()> {
+        if !Self::supported_vector_data_type(field.data_type()) {
+            return Err(Error::InvalidInput {
+                message: format!(
+                    "An IVF HNSW PQ index cannot be created on the column `{}` which has data type {}",
+                    field.name(),
+                    field.data_type()
+                ),
+            });
+        }
+
+        let num_partitions = if let Some(n) = index.num_partitions {
+            n
+        } else {
+            suggested_num_partitions(self.count_rows(None).await?)
+        };
+        let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
+            n
+        } else {
+            match field.data_type() {
+                arrow_schema::DataType::FixedSizeList(_, n) => {
+                    Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
+                }
+                _ => Err(Error::Schema {
+                    message: format!("Column '{}' is not a FixedSizeList", field.name()),
+                }),
+            }?
+        };
+
+        let mut dataset = self.dataset.get_mut().await?;
+        let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
+        ivf_params.sample_rate = index.sample_rate as usize;
+        ivf_params.max_iters = index.max_iterations as usize;
+        let hnsw_params = HnswBuildParams::default()
+            .num_edges(index.m as usize)
+            .ef_construction(index.ef_construction as usize);
+        let pq_params = PQBuildParams {
+            num_sub_vectors: num_sub_vectors as usize,
+            ..Default::default()
+        };
+        let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_pq_params(
+            index.distance_type.into(),
+            ivf_params,
+            hnsw_params,
+            pq_params,
+        );
+        dataset
+            .create_index(
+                &[field.name()],
+                IndexType::Vector,
+                None,
+                &lance_idx_params,
+                replace,
+            )
+            .await?;
+        Ok(())
+    }
+
    async fn create_ivf_hnsw_sq_index(
        &self,
        index: IvfHnswSqIndexBuilder,
@@ -1609,6 +1675,10 @@ impl TableInternal for NativeTable {
            Index::Auto => self.create_auto_index(field, opts).await,
            Index::BTree(_) => self.create_btree_index(field, opts).await,
            Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
+            Index::IvfHnswPq(ivf_hnsw_pq) => {
+                self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
+                    .await
+            }
            Index::IvfHnswSq(ivf_hnsw_sq) => {
                self.create_ivf_hnsw_sq_index(ivf_hnsw_sq, field, opts.replace)
                    .await
@@ -2591,6 +2661,102 @@ mod tests {
        );
    }

+    #[tokio::test]
+    async fn test_create_index_ivf_hnsw_pq() {
+        use arrow_array::RecordBatch;
+        use arrow_schema::{DataType, Field, Schema as ArrowSchema};
+        use rand;
+        use std::iter::repeat_with;
+
+        use arrow_array::Float32Array;
+
+        let tmp_dir = tempdir().unwrap();
+        let uri = tmp_dir.path().to_str().unwrap();
+        let conn = connect(uri).execute().await.unwrap();
+
+        let dimension = 16;
+        let schema = Arc::new(ArrowSchema::new(vec![Field::new(
+            "embeddings",
+            DataType::FixedSizeList(
+                Arc::new(Field::new("item", DataType::Float32, true)),
+                dimension,
+            ),
+            false,
+        )]));
+
+        let mut rng = rand::thread_rng();
+        let float_arr = Float32Array::from(
+            repeat_with(|| rng.gen::<f32>())
+                .take(512 * dimension as usize)
+                .collect::<Vec<f32>>(),
+        );
+
+        let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
+        let batches = RecordBatchIterator::new(
+            vec![RecordBatch::try_new(schema.clone(), vec![vectors.clone()]).unwrap()]
+                .into_iter()
+                .map(Ok),
+            schema,
+        );
+
+        let table = conn.create_table("test", batches).execute().await.unwrap();
+
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_indexed_rows("my_index")
+                .await
+                .unwrap(),
+            None
+        );
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_unindexed_rows("my_index")
+                .await
+                .unwrap(),
+            None
+        );
+
+        let index = IvfHnswPqIndexBuilder::default();
+        table
+            .create_index(&["embeddings"], Index::IvfHnswPq(index))
+            .execute()
+            .await
+            .unwrap();
+
+        let index_configs = table.list_indices().await.unwrap();
+        assert_eq!(index_configs.len(), 1);
+        let index = index_configs.into_iter().next().unwrap();
+        assert_eq!(index.index_type, crate::index::IndexType::IvfPq);
+        assert_eq!(index.columns, vec!["embeddings".to_string()]);
+        assert_eq!(table.count_rows(None).await.unwrap(), 512);
+        assert_eq!(table.name(), "test");
+
+        let indices = table.as_native().unwrap().load_indices().await.unwrap();
+        let index_uuid = &indices[0].index_uuid;
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_indexed_rows(index_uuid)
+                .await
+                .unwrap(),
+            Some(512)
+        );
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_unindexed_rows(index_uuid)
+                .await
+                .unwrap(),
+            Some(0)
+        );
+    }
+
    fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
        let list_type = DataType::FixedSizeList(
            Arc::new(Field::new("item", values.data_type().clone(), true)),