From a975cc0a94940e8eb78cfb3f93fef6320c183392 Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Wed, 20 Dec 2023 13:07:07 -0800
Subject: [PATCH] fix: prevent duplicate data in FTS index (#728)

This forces the user to replace the whole FTS directory when re-creating
the index, prevent duplicate data from being created. Previously, the
whole dataset was re-added to the existing index, duplicating existing
rows in the index.

This (in combination with lancedb/lance#1707) caused #726, since the
duplicate data emitted duplicate indices for `take()` and an upstream
issue caused those queries to fail.

This solution isn't ideal, since it makes the FTS index temporarily
unavailable while the index is built. In the future, we should have
multiple FTS index directories, which would allow atomic commits of new
indexes (as well as multiple indexes for different columns).

Fixes #498.
Fixes #726.

---------

Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
---
 python/lancedb/table.py  | 19 ++++++++++++++++++-
 python/tests/test_fts.py | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/python/lancedb/table.py b/python/lancedb/table.py
index 0b8fa9c6..c9170e2d 100644
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -23,6 +23,7 @@ import lance
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
+import pyarrow.fs as pa_fs
 from lance import LanceDataset
 from lance.vector import vec_to_table
 
@@ -575,7 +576,9 @@ class LanceTable(Table):
         )
         self._reset_dataset()
 
-    def create_fts_index(self, field_names: Union[str, List[str]]):
+    def create_fts_index(
+        self, field_names: Union[str, List[str]], *, replace: bool = False
+    ):
         """Create a full-text search index on the table.
 
         Warning - this API is highly experimental and is highly likely to change
@@ -585,11 +588,25 @@ class LanceTable(Table):
         ----------
         field_names: str or list of str
             The name(s) of the field to index.
+        replace: bool, default False
+            If True, replace the existing index if it exists. Note that this is
+            not yet an atomic operation; the index will be temporarily
+            unavailable while the new index is being created.
         """
         from .fts import create_index, populate_index
 
         if isinstance(field_names, str):
             field_names = [field_names]
+
+        fs, path = fs_from_uri(self._get_fts_index_path())
+        index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
+        if index_exists:
+            if not replace:
+                raise ValueError(
+                    f"Index already exists. Use replace=True to overwrite."
+                )
+            fs.delete_dir(path)
+
         index = create_index(self._get_fts_index_path(), field_names)
         populate_index(index, self, field_names)
 
diff --git a/python/tests/test_fts.py b/python/tests/test_fts.py
index 301a4d23..2a61f3ca 100644
--- a/python/tests/test_fts.py
+++ b/python/tests/test_fts.py
@@ -83,6 +83,24 @@ def test_create_index_from_table(tmp_path, table):
     assert len(df) == 10
     assert "text" in df.columns
 
+    # Check whether it can be updated
+    table.add(
+        [
+            {
+                "vector": np.random.randn(128),
+                "text": "gorilla",
+                "text2": "gorilla",
+                "nested": {"text": "gorilla"},
+            }
+        ]
+    )
+
+    with pytest.raises(ValueError, match="already exists"):
+        table.create_fts_index("text")
+
+    table.create_fts_index("text", replace=True)
+    assert len(table.search("gorilla").limit(1).to_pandas()) == 1
+
 
 def test_create_index_multiple_columns(tmp_path, table):
     table.create_fts_index(["text", "text2"])