fix: prevent duplicate data in FTS index (#728)

This forces the user to replace the whole FTS directory when re-creating the index, prevent duplicate data from being created. Previously, the whole dataset was re-added to the existing index, duplicating existing rows in the index. This (in combination with lancedb/lance#1707) caused #726, since the duplicate data emitted duplicate indices for `take()` and an upstream issue caused those queries to fail. This solution isn't ideal, since it makes the FTS index temporarily unavailable while the index is built. In the future, we should have multiple FTS index directories, which would allow atomic commits of new indexes (as well as multiple indexes for different columns). Fixes #498. Fixes #726. --------- Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
2026-07-04 11:30:46 +00:00 · 2023-12-20 13:07:07 -08:00
parent 48a12e780c
commit a975cc0a94
2 changed files with 36 additions and 1 deletions
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -23,6 +23,7 @@ import lance
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
+import pyarrow.fs as pa_fs
 from lance import LanceDataset
 from lance.vector import vec_to_table

@@ -575,7 +576,9 @@ class LanceTable(Table):
        )
        self._reset_dataset()

-    def create_fts_index(self, field_names: Union[str, List[str]]):
+    def create_fts_index(
+        self, field_names: Union[str, List[str]], *, replace: bool = False
+    ):
        """Create a full-text search index on the table.

        Warning - this API is highly experimental and is highly likely to change
@@ -585,11 +588,25 @@ class LanceTable(Table):
        ----------
        field_names: str or list of str
            The name(s) of the field to index.
+        replace: bool, default False
+            If True, replace the existing index if it exists. Note that this is
+            not yet an atomic operation; the index will be temporarily
+            unavailable while the new index is being created.
        """
        from .fts import create_index, populate_index

        if isinstance(field_names, str):
            field_names = [field_names]
+
+        fs, path = fs_from_uri(self._get_fts_index_path())
+        index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
+        if index_exists:
+            if not replace:
+                raise ValueError(
+                    f"Index already exists. Use replace=True to overwrite."
+                )
+            fs.delete_dir(path)
+
        index = create_index(self._get_fts_index_path(), field_names)
        populate_index(index, self, field_names)

--- a/python/tests/test_fts.py
+++ b/python/tests/test_fts.py
@@ -83,6 +83,24 @@ def test_create_index_from_table(tmp_path, table):
    assert len(df) == 10
    assert "text" in df.columns

+    # Check whether it can be updated
+    table.add(
+        [
+            {
+                "vector": np.random.randn(128),
+                "text": "gorilla",
+                "text2": "gorilla",
+                "nested": {"text": "gorilla"},
+            }
+        ]
+    )
+
+    with pytest.raises(ValueError, match="already exists"):
+        table.create_fts_index("text")
+
+    table.create_fts_index("text", replace=True)
+    assert len(table.search("gorilla").limit(1).to_pandas()) == 1
+

 def test_create_index_multiple_columns(tmp_path, table):
    table.create_fts_index(["text", "text2"])