feat: support IVF_FLAT, binary vectors and hamming distance (#1955)

Binary vectors and Hamming distance are only supported by IVF_FLAT, so all three are introduced together in this PR. --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2026-01-10 22:02:58 +00:00 · 2024-12-25 02:36:20 +08:00
parent ac0068b80e
commit e70fd4fecc
14 changed files with 390 additions and 35 deletions
--- a/python/python/lancedb/index.py
+++ b/python/python/lancedb/index.py
@@ -355,6 +355,97 @@ class HnswSq:
    ef_construction: int = 300


+@dataclass
+class IvfFlat:
+    """Describes an IVF Flat Index
+
+    This index stores raw vectors.
+    These vectors are grouped into partitions of similar vectors.
+    Each partition keeps track of a centroid which is
+    the average value of all vectors in the group.
+
+    Attributes
+    ----------
+    distance_type: str, default "l2"
+        The distance metric used to train the index
+
+        This is used when training the index to calculate the IVF partitions
+        (vectors are grouped in partitions with similar vectors according to this
+        distance type). IVF Flat stores raw vectors, so no quantization is applied.
+
+        The distance type used to train an index MUST match the distance type used
+        to search the index.  Failure to do so will yield inaccurate results.
+
+        The following distance types are available:
+
+        "l2" - Euclidean distance. This is a very common distance metric that
+        accounts for both magnitude and direction when determining the distance
+        between vectors. L2 distance has a range of [0, ∞).
+
+        "cosine" - Cosine distance.  Cosine distance is a distance metric
+        calculated from the cosine similarity between two vectors. Cosine
+        similarity is a measure of similarity between two non-zero vectors of an
+        inner product space. It is defined to equal the cosine of the angle
+        between them.  Unlike L2, the cosine distance is not affected by the
+        magnitude of the vectors.  Cosine distance has a range of [0, 2].
+
+        Note: the cosine distance is undefined when one (or both) of the vectors
+        are all zeros (there is no direction).  These vectors are invalid and may
+        never be returned from a vector search.
+
+        "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+        distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+        L2 norm is 1), then dot distance is equivalent to the cosine distance.
+
+        "hamming" - Hamming distance. Only available for binary vectors. It is the
+        number of bit positions at which two vectors differ, so it has a range of
+        [0, number of bits in the vector].
+
+    num_partitions: int, default sqrt(num_rows)
+        The number of IVF partitions to create.
+
+        This value should generally scale with the number of rows in the dataset.
+        By default the number of partitions is the square root of the number of
+        rows.
+
+        If this value is too large then the first part of the search (picking the
+        right partition) will be slow.  If this value is too small then the second
+        part of the search (searching within a partition) will be slow.
+
+    max_iterations: int, default 50
+        The maximum number of iterations used to train kmeans.
+
+        When training an IVF Flat index we use kmeans to calculate the partitions.
+        This parameter controls how many iterations of kmeans to run.
+
+        Increasing this might improve the quality of the index but in most cases
+        these extra iterations have diminishing returns.
+
+        The default value is 50.
+    sample_rate: int, default 256
+        The rate used to calculate the number of training vectors for kmeans.
+
+        When an IVF Flat index is trained, we need to calculate partitions.  These
+        are groups of vectors that are similar to each other.  To do this we use an
+        algorithm called kmeans.
+
+        Running kmeans on a large dataset can be slow.  To speed this up we run
+        kmeans on a random sample of the data.  This parameter controls the size of
+        the sample.  The total number of vectors used to train the index is
+        `sample_rate * num_partitions`.
+
+        Increasing this value might improve the quality of the index but in most
+        cases the default should be sufficient.
+
+        The default value is 256.
+    """
+
+    distance_type: Literal["l2", "cosine", "dot", "hamming"] = "l2"
+    num_partitions: Optional[int] = None
+    max_iterations: int = 50
+    sample_rate: int = 256
+
+
@dataclass
 class IvfPq:
    """Describes an IVF PQ Index
@@ -477,4 +568,4 @@ class IvfPq:
    sample_rate: int = 256


-__all__ = ["BTree", "IvfPq", "HnswPq", "HnswSq", "IndexConfig"]
+__all__ = ["BTree", "IvfFlat", "IvfPq", "HnswPq", "HnswSq", "IndexConfig"]
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -34,7 +34,7 @@ from lance.dependencies import _check_for_hugging_face

 from .common import DATA, VEC, VECTOR_COLUMN_NAME
 from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
-from .index import BTree, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
+from .index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
 from .merge import LanceMergeInsertBuilder
 from .pydantic import LanceModel, model_to_dict
 from .query import (
@@ -433,7 +433,9 @@ class Table(ABC):
        accelerator: Optional[str] = None,
        index_cache_size: Optional[int] = None,
        *,
-        index_type: Literal["IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"] = "IVF_PQ",
+        index_type: Literal[
+            "IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
+        ] = "IVF_PQ",
        num_bits: int = 8,
        max_iterations: int = 50,
        sample_rate: int = 256,
@@ -446,8 +448,9 @@ class Table(ABC):
        ----------
        metric: str, default "L2"
            The distance metric to use when creating the index.
-            Valid values are "L2", "cosine", or "dot".
+            Valid values are "L2", "cosine", "dot", or "hamming".
            L2 is euclidean distance.
+            Hamming is available only for binary vectors.
        num_partitions: int, default 256
            The number of IVF partitions to use when creating the index.
            Default is 256.
@@ -1408,7 +1411,9 @@ class LanceTable(Table):
        accelerator: Optional[str] = None,
        index_cache_size: Optional[int] = None,
        num_bits: int = 8,
-        index_type: Literal["IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"] = "IVF_PQ",
+        index_type: Literal[
+            "IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
+        ] = "IVF_PQ",
        max_iterations: int = 50,
        sample_rate: int = 256,
        m: int = 20,
@@ -1432,6 +1437,13 @@ class LanceTable(Table):
            )
            self.checkout_latest()
            return
+        elif index_type == "IVF_FLAT":
+            config = IvfFlat(
+                distance_type=metric,
+                num_partitions=num_partitions,
+                max_iterations=max_iterations,
+                sample_rate=sample_rate,
+            )
        elif index_type == "IVF_PQ":
            config = IvfPq(
                distance_type=metric,
@@ -2619,7 +2631,7 @@ class AsyncTable:
        *,
        replace: Optional[bool] = None,
        config: Optional[
-            Union[IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
+            Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
        ] = None,
    ):
        """Create an index to speed up queries
@@ -2648,7 +2660,7 @@ class AsyncTable:
        """
        if config is not None:
            if not isinstance(
-                config, (IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS)
+                config, (IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS)
            ):
                raise TypeError(
                    "config must be an instance of IvfPq, HnswPq, HnswSq, BTree,"
--- a/python/python/tests/docs/test_binary_vector.py
+++ b/python/python/tests/docs/test_binary_vector.py
@@ -0,0 +1,44 @@
+import shutil
+
+# --8<-- [start:imports]
+import lancedb
+import numpy as np
+import pytest
+# --8<-- [end:imports]
+
+shutil.rmtree("data/binary_lancedb", ignore_errors=True)
+
+
+def test_binary_vector():
+    # --8<-- [start:sync_binary_vector]
+    db = lancedb.connect("data/binary_lancedb")
+    data = [
+        {
+            "id": i,
+            "vector": np.random.randint(0, 256, size=16),
+        }
+        for i in range(1024)
+    ]
+    tbl = db.create_table("my_binary_vectors", data=data)
+    query = np.random.randint(0, 256, size=16)
+    tbl.search(query).to_arrow()
+    # --8<-- [end:sync_binary_vector]
+    db.drop_table("my_binary_vectors")
+
+
+@pytest.mark.asyncio
+async def test_binary_vector_async():
+    # --8<-- [start:async_binary_vector]
+    db = await lancedb.connect_async("data/binary_lancedb")
+    data = [
+        {
+            "id": i,
+            "vector": np.random.randint(0, 256, size=16),
+        }
+        for i in range(1024)
+    ]
+    tbl = await db.create_table("my_binary_vectors", data=data)
+    query = np.random.randint(0, 256, size=16)
+    await tbl.query().nearest_to(query).to_arrow()
+    # --8<-- [end:async_binary_vector]
+    await db.drop_table("my_binary_vectors")
--- a/python/python/tests/test_index.py
+++ b/python/python/tests/test_index.py
@@ -8,7 +8,7 @@ import pyarrow as pa
 import pytest
 import pytest_asyncio
 from lancedb import AsyncConnection, AsyncTable, connect_async
-from lancedb.index import BTree, IvfPq, Bitmap, LabelList, HnswPq, HnswSq
+from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq


@pytest_asyncio.fixture
@@ -42,6 +42,27 @@ async def some_table(db_async):
    )


+@pytest_asyncio.fixture
+async def binary_table(db_async):
+    data = [
+        {
+            "id": i,
+            "vector": [i] * 128,
+        }
+        for i in range(NROWS)
+    ]
+    return await db_async.create_table(
+        "binary_table",
+        data,
+        schema=pa.schema(
+            [
+                pa.field("id", pa.int64()),
+                pa.field("vector", pa.list_(pa.uint8(), 128)),
+            ]
+        ),
+    )
+
+
@pytest.mark.asyncio
 async def test_create_scalar_index(some_table: AsyncTable):
    # Can create
@@ -143,3 +164,27 @@ async def test_create_hnswsq_index(some_table: AsyncTable):
    await some_table.create_index("vector", config=HnswSq(num_partitions=10))
    indices = await some_table.list_indices()
    assert len(indices) == 1
+
+
+@pytest.mark.asyncio
+async def test_create_index_with_binary_vectors(binary_table: AsyncTable):
+    await binary_table.create_index(
+        "vector", config=IvfFlat(distance_type="hamming", num_partitions=10)
+    )
+    indices = await binary_table.list_indices()
+    assert len(indices) == 1
+    assert indices[0].index_type == "IvfFlat"
+    assert indices[0].columns == ["vector"]
+    assert indices[0].name == "vector_idx"
+
+    stats = await binary_table.index_stats("vector_idx")
+    assert stats.index_type == "IVF_FLAT"
+    assert stats.distance_type == "hamming"
+    assert stats.num_indexed_rows == await binary_table.count_rows()
+    assert stats.num_unindexed_rows == 0
+    assert stats.num_indices == 1
+
+    # the dataset contains vectors with all values from 0 to 255
+    for v in range(256):
+        res = await binary_table.query().nearest_to([v] * 128).to_arrow()
+        assert res["id"][0].as_py() == v
--- a/python/src/index.rs
+++ b/python/src/index.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use lancedb::index::vector::IvfFlatIndexBuilder;
 use lancedb::index::{
    scalar::{BTreeIndexBuilder, FtsIndexBuilder, TokenizerConfig},
    vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
@@ -59,6 +60,18 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
                opts.tokenizer_configs = inner_opts;
                Ok(LanceDbIndex::FTS(opts))
            },
+            "IvfFlat" => {
+                let params = source.extract::<IvfFlatParams>()?;
+                let distance_type = parse_distance_type(params.distance_type)?;
+                let mut ivf_flat_builder = IvfFlatIndexBuilder::default()
+                    .distance_type(distance_type)
+                    .max_iterations(params.max_iterations)
+                    .sample_rate(params.sample_rate);
+                if let Some(num_partitions) = params.num_partitions {
+                    ivf_flat_builder = ivf_flat_builder.num_partitions(num_partitions);
+                }
+                Ok(LanceDbIndex::IvfFlat(ivf_flat_builder))
+            },
            "IvfPq" => {
                let params = source.extract::<IvfPqParams>()?;
                let distance_type = parse_distance_type(params.distance_type)?;
@@ -129,6 +142,14 @@ struct FtsParams {
    ascii_folding: bool,
 }

+#[derive(FromPyObject)]
+struct IvfFlatParams {
+    distance_type: String,
+    num_partitions: Option<u32>,
+    max_iterations: u32,
+    sample_rate: u32,
+}
+
 #[derive(FromPyObject)]
 struct IvfPqParams {
    distance_type: String,
--- a/python/src/util.rs
+++ b/python/src/util.rs
@@ -43,8 +43,9 @@ pub fn parse_distance_type(distance_type: impl AsRef<str>) -> PyResult<DistanceT
        "l2" => Ok(DistanceType::L2),
        "cosine" => Ok(DistanceType::Cosine),
        "dot" => Ok(DistanceType::Dot),
+        "hamming" => Ok(DistanceType::Hamming),
        _ => Err(PyValueError::new_err(format!(
-            "Invalid distance type '{}'.  Must be one of l2, cosine, or dot",
+            "Invalid distance type '{}'.  Must be one of l2, cosine, dot, or hamming",
            distance_type.as_ref()
        ))),
    }