From a975cc0a94940e8eb78cfb3f93fef6320c183392 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 20 Dec 2023 13:07:07 -0800 Subject: [PATCH] fix: prevent duplicate data in FTS index (#728) This forces the user to replace the whole FTS directory when re-creating the index, prevent duplicate data from being created. Previously, the whole dataset was re-added to the existing index, duplicating existing rows in the index. This (in combination with lancedb/lance#1707) caused #726, since the duplicate data emitted duplicate indices for `take()` and an upstream issue caused those queries to fail. This solution isn't ideal, since it makes the FTS index temporarily unavailable while the index is built. In the future, we should have multiple FTS index directories, which would allow atomic commits of new indexes (as well as multiple indexes for different columns). Fixes #498. Fixes #726. --------- Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com> --- python/lancedb/table.py | 19 ++++++++++++++++++- python/tests/test_fts.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/python/lancedb/table.py b/python/lancedb/table.py index 0b8fa9c6..c9170e2d 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -23,6 +23,7 @@ import lance import numpy as np import pyarrow as pa import pyarrow.compute as pc +import pyarrow.fs as pa_fs from lance import LanceDataset from lance.vector import vec_to_table @@ -575,7 +576,9 @@ class LanceTable(Table): ) self._reset_dataset() - def create_fts_index(self, field_names: Union[str, List[str]]): + def create_fts_index( + self, field_names: Union[str, List[str]], *, replace: bool = False + ): """Create a full-text search index on the table. Warning - this API is highly experimental and is highly likely to change @@ -585,11 +588,25 @@ class LanceTable(Table): ---------- field_names: str or list of str The name(s) of the field to index. + replace: bool, default False + If True, replace the existing index if it exists. Note that this is + not yet an atomic operation; the index will be temporarily + unavailable while the new index is being created. """ from .fts import create_index, populate_index if isinstance(field_names, str): field_names = [field_names] + + fs, path = fs_from_uri(self._get_fts_index_path()) + index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound + if index_exists: + if not replace: + raise ValueError( + f"Index already exists. Use replace=True to overwrite." + ) + fs.delete_dir(path) + index = create_index(self._get_fts_index_path(), field_names) populate_index(index, self, field_names) diff --git a/python/tests/test_fts.py b/python/tests/test_fts.py index 301a4d23..2a61f3ca 100644 --- a/python/tests/test_fts.py +++ b/python/tests/test_fts.py @@ -83,6 +83,24 @@ def test_create_index_from_table(tmp_path, table): assert len(df) == 10 assert "text" in df.columns + # Check whether it can be updated + table.add( + [ + { + "vector": np.random.randn(128), + "text": "gorilla", + "text2": "gorilla", + "nested": {"text": "gorilla"}, + } + ] + ) + + with pytest.raises(ValueError, match="already exists"): + table.create_fts_index("text") + + table.create_fts_index("text", replace=True) + assert len(table.search("gorilla").limit(1).to_pandas()) == 1 + def test_create_index_multiple_columns(tmp_path, table): table.create_fts_index(["text", "text2"])