Basic full text search capabilities (#62)

This is v1 of integrating a full-text search index into LanceDB. # API The query API is roughly the same as before, except that if the input is text instead of a vector we assume it is a full-text search (FTS) query. ## Example If `table` is a LanceDB LanceTable, then: Build index: `table.create_fts_index("text")` Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()` # Implementation Here we use the tantivy-py package to build the index. We use the row ids as the full-text-search index's doc ids, and then do a Take operation to fetch the matching rows. # Limitations 1. Incremental row appends are not supported yet; new data won't show up in search. 2. Local filesystem only. 3. Requires building tantivy explicitly. --------- Co-authored-by: Chang She <chang@lancedb.com>
2025-12-27 23:12:58 +00:00 · 2023-05-24 22:25:31 -06:00
parent f923cfe47f
commit f485378ea4
9 changed files with 319 additions and 6 deletions
--- a/python/lancedb/fts.py
+++ b/python/lancedb/fts.py
@@ -0,0 +1,122 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""Full text search index using tantivy-py"""
+import os
+from typing import List, Tuple
+
+import pyarrow as pa
+import tantivy
+
+from .table import LanceTable
+
+
+def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
+    """
+    Build a tantivy index object backed by the given directory.
+
+    The schema has one stored integer field named ``doc_id`` plus one
+    stored text field per entry of ``text_fields``. The directory is
+    created when it does not exist yet. No documents are written here.
+
+    Parameters
+    ----------
+    index_path : str
+        Directory in which the index is kept
+    text_fields : List[str]
+        Names of the text fields to add to the schema
+
+    Returns
+    -------
+    index : tantivy.Index
+        The index object (not populated by this function)
+    """
+    builder = tantivy.SchemaBuilder()
+    # reserved field that is filled with the row id at indexing time
+    builder.add_integer_field("doc_id", stored=True)
+    # one stored text field per requested column
+    for field_name in text_fields:
+        builder.add_text_field(field_name, stored=True)
+    index_schema = builder.build()
+    os.makedirs(index_path, exist_ok=True)
+    return tantivy.Index(index_schema, path=index_path)
+
+
+def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
+    """
+    Populate an index with data from a LanceTable
+
+    One document is written per table row. Its ``doc_id`` is a counter that
+    starts at 0 and increases by one per row in the order the batches are
+    read, and it carries one text value per entry of ``fields``. Null cells
+    are skipped, so a row whose indexed cells are all null still gets a
+    document (with only ``doc_id``) and the counter stays aligned with the
+    row position.
+
+    Parameters
+    ----------
+    index : tantivy.Index
+        The index object
+    table : LanceTable
+        The table to index
+    fields : List[str]
+        List of fields to index
+
+    Returns
+    -------
+    num_rows : int
+        The number of documents (rows) written to the index
+
+    Raises
+    ------
+    TypeError
+        If one of ``fields`` is not a string or large string column
+    """
+    # first check the fields exist and are string or large string type
+    for name in fields:
+        f = table.schema.field(name)  # raises KeyError if not found
+        if not (pa.types.is_string(f.type) or pa.types.is_large_string(f.type)):
+            raise TypeError(f"Field {name} is not a string type")
+
+    # create a tantivy writer
+    writer = index.writer()
+    # write data into index
+    dataset = table.to_lance()
+    row_id = 0
+    for batch in dataset.to_batches(columns=fields):
+        # convert each column once per batch instead of boxing one Arrow
+        # scalar per cell
+        columns = [(name, batch[name].to_pylist()) for name in fields]
+        for i in range(batch.num_rows):
+            doc = tantivy.Document()
+            doc.add_integer("doc_id", row_id)
+            for name, values in columns:
+                value = values[i]
+                # a null cell has no text to index; add_text needs a str
+                if value is not None:
+                    doc.add_text(name, value)
+            writer.add_document(doc)
+            row_id += 1
+    # commit changes
+    writer.commit()
+    return row_id
+
+
+def search_index(
+    index: tantivy.Index, query: str, limit: int = 10
+) -> Tuple[Tuple[int, ...], Tuple[float, ...]]:
+    """
+    Search an index for a query
+
+    Parameters
+    ----------
+    index : tantivy.Index
+        The index object
+    query : str
+        The query string
+    limit : int
+        The maximum number of results to return
+
+    Returns
+    -------
+    ids_and_score: tuple[tuple[int, ...], tuple[float, ...]]
+        A tuple of two equally long tuples, the first containing the
+        ``doc_id`` of each hit and the second containing the matching
+        scores. Both are empty when nothing matches.
+    """
+    searcher = index.searcher()
+    parsed_query = index.parse_query(query)
+    # get top results
+    results = searcher.search(parsed_query, limit)
+    if not results.hits:
+        # zip(*[]) collapses to (), which callers cannot unpack into
+        # (ids, scores); keep the two-tuple shape for the no-match case
+        return (), ()
+    doc_ids, scores = zip(
+        *(
+            (searcher.doc(doc_address)["doc_id"][0], score)
+            for score, doc_address in results.hits
+        )
+    )
+    return doc_ids, scores
--- a/python/lancedb/query.py
+++ b/python/lancedb/query.py
@@ -14,6 +14,7 @@ from __future__ import annotations

 import numpy as np
 import pandas as pd
+import pyarrow as pa

 from .common import VECTOR_COLUMN_NAME

@@ -131,7 +132,6 @@ class LanceQueryBuilder:
        vector and the returned vector.
        """
        ds = self._table.to_lance()
-        # TODO indexed search
        tbl = ds.to_table(
            columns=self._columns,
            filter=self._where,
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
            },
        )
        return tbl.to_pandas()
+
+
+class LanceFtsQueryBuilder(LanceQueryBuilder):
+    """Query builder that answers a text query from the table's tantivy index."""
+
+    def to_df(self) -> pd.DataFrame:
+        """
+        Run the full-text query and return the matching rows.
+
+        The index found at the table's FTS index path is opened and
+        searched, the returned ids are fetched from the dataset with
+        ``take``, and the scores are appended as a ``score`` column.
+
+        Returns
+        -------
+        df : pd.DataFrame
+            The taken rows (restricted to the selected columns, if any)
+            plus a ``score`` column. No rows when nothing matches.
+
+        Raises
+        ------
+        ImportError
+            If ``tantivy`` is not installed
+        """
+        try:
+            import tantivy
+        except ImportError:
+            raise ImportError(
+                "You need to install the `lancedb[fts]` extra to use this method."
+            )
+
+        from .fts import search_index
+
+        # get the index path
+        index_path = self._table._get_fts_index_path()
+        # open the index
+        index = tantivy.Index.open(index_path)
+        # get the scores and doc ids
+        results = search_index(index, self._query, self._limit)
+        # a search without hits may come back as an empty tuple, which
+        # cannot be unpacked into (row_ids, scores)
+        row_ids, scores = results if results else ((), ())
+        # explicit type: an empty sequence would otherwise be inferred as a
+        # null-typed array; float64 is what inference gives for Python floats
+        scores = pa.array(scores, type=pa.float64())
+        output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
+        output_tbl = output_tbl.append_column("score", scores)
+        return output_tbl.to_pandas()
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -14,7 +14,9 @@
 from __future__ import annotations

 import os
+import shutil
 from functools import cached_property
+from typing import List, Union

 import lance
 import numpy as np
@@ -24,7 +26,8 @@ from lance import LanceDataset
 from lance.vector import vec_to_table

 from .common import DATA, VEC, VECTOR_COLUMN_NAME
-from .query import LanceQueryBuilder
+from .query import LanceFtsQueryBuilder, LanceQueryBuilder
+from .util import get_uri_scheme


 def _sanitize_data(data, schema):
@@ -130,6 +133,27 @@ class LanceTable:
        )
        self._reset_dataset()

+    def create_fts_index(self, field_names: Union[str, List[str]]):
+        """Build a full-text search index over one or more columns.
+
+        Warning - this API is highly experimental and is highly likely to
+        change in the future.
+
+        The index is created at the path given by
+        ``_get_fts_index_path()`` and then filled from this table.
+
+        Parameters
+        ----------
+        field_names: str or list of str
+            The name(s) of the field to index. A single string is treated
+            as a one-element list.
+        """
+        from .fts import create_index, populate_index
+
+        fields = [field_names] if isinstance(field_names, str) else field_names
+        fts_index = create_index(self._get_fts_index_path(), fields)
+        populate_index(fts_index, self, fields)
+
+    def _get_fts_index_path(self):
+        """Return the ``_indices/tantivy`` path under the dataset URI."""
+        path_parts = (self._dataset_uri, "_indices", "tantivy")
+        return os.path.join(*path_parts)
+
    @cached_property
    def _dataset(self) -> LanceDataset:
        return lance.dataset(self._dataset_uri, version=self._version)
@@ -158,7 +182,7 @@ class LanceTable:
        self._reset_dataset()
        return len(self)

-    def search(self, query: VEC) -> LanceQueryBuilder:
+    def search(self, query: Union[VEC, str]) -> LanceQueryBuilder:
        """Create a search query to find the nearest neighbors
        of the given query vector.

@@ -174,6 +198,10 @@ class LanceTable:
        and also the "score" column which is the distance between the query
        vector and the returned vector.
        """
+        if isinstance(query, str):
+            # fts
+            return LanceFtsQueryBuilder(self, query)
+
        if isinstance(query, list):
            query = np.array(query)
        if isinstance(query, np.ndarray):
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -45,6 +45,10 @@ dev = [
 docs = [
    "mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"
 ]
+fts = [
+    # tantivy 0.19.2
+    "tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985"
+]

 [build-system]
 requires = [
--- a/python/tests/test_fts.py
+++ b/python/tests/test_fts.py
@@ -0,0 +1,84 @@
+# Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import os
+import random
+
+import numpy as np
+import pandas as pd
+import pytest
+import tantivy
+
+import lancedb as ldb
+import lancedb.fts
+
+
+@pytest.fixture
+def table(tmp_path) -> ldb.table.LanceTable:
+    """Create a 100-row table with a vector column and two text columns.
+
+    ``text`` and ``text2`` hold the same four-word sentences. The noun
+    cycles through ``nouns`` by row index, so exactly 20 rows contain
+    "puppy"; the tests that ask for 10 "puppy" hits therefore always have
+    enough matches. The remaining words come from a seeded generator so
+    the text is the same on every run.
+    """
+    db = ldb.connect(tmp_path)
+    vectors = [np.random.randn(128) for _ in range(100)]
+
+    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
+    verbs = ("runs", "hits", "jumps", "drives", "barfs")
+    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
+    adj = ("adorable", "clueless", "dirty", "odd", "stupid")
+    # local, seeded generator: independent unseeded draws could leave fewer
+    # than 10 rows containing "puppy" and make the tests fail at random
+    rng = random.Random(0)
+    text = [
+        " ".join(
+            [
+                nouns[i % len(nouns)],
+                rng.choice(verbs),
+                rng.choice(adv),
+                rng.choice(adj),
+            ]
+        )
+        for i in range(100)
+    ]
+    table = db.create_table(
+        "test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
+    )
+    return table
+
+
+def test_create_index(tmp_path):
+    """create_index returns a tantivy index and creates its directory."""
+    index_dir = str(tmp_path / "index")
+    created = ldb.fts.create_index(index_dir, ["text"])
+    assert isinstance(created, tantivy.Index)
+    assert os.path.exists(index_dir)
+
+
+def test_populate_index(tmp_path, table):
+    """populate_index reports one indexed document per table row."""
+    index_dir = str(tmp_path / "index")
+    index = ldb.fts.create_index(index_dir, ["text"])
+    num_indexed = ldb.fts.populate_index(index, table, ["text"])
+    assert num_indexed == len(table)
+
+
+def test_search_index(tmp_path, table):
+    """search_index returns a (row_ids, scores) pair capped at ``limit``."""
+    index_dir = str(tmp_path / "index")
+    index = ldb.fts.create_index(index_dir, ["text"])
+    ldb.fts.populate_index(index, table, ["text"])
+    # make the documents committed above visible to this index handle
+    index.reload()
+    results = ldb.fts.search_index(index, query="puppy", limit=10)
+    assert len(results) == 2
+    row_ids, scores = results
+    assert len(row_ids) == 10
+    assert len(scores) == 10
+
+
+def test_create_index_from_table(tmp_path, table):
+    """A string query on an indexed table returns the selected column."""
+    table.create_fts_index("text")
+    query = table.search("puppy").limit(10).select(["text"])
+    df = query.to_df()
+    assert len(df) == 10
+    assert "text" in df.columns
+
+
+def test_create_index_multiple_columns(tmp_path, table):
+    """An index over two columns still returns both columns in the result."""
+    indexed_columns = ["text", "text2"]
+    table.create_fts_index(indexed_columns)
+    df = table.search("puppy").limit(10).to_df()
+    assert len(df) == 10
+    for column in ("text", "text2"):
+        assert column in df.columns