mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 15:12:53 +00:00
This forces the user to replace the whole FTS directory when re-creating the index, preventing duplicate data from being created. Previously, the whole dataset was re-added to the existing index, duplicating existing rows in the index. This (in combination with lancedb/lance#1707) caused #726, since the duplicate data emitted duplicate indices for `take()` and an upstream issue caused those queries to fail. This solution isn't ideal, since it makes the FTS index temporarily unavailable while the index is built. In the future, we should have multiple FTS index directories, which would allow atomic commits of new indexes (as well as multiple indexes for different columns). Fixes #498. Fixes #726. --------- Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
124 lines
3.7 KiB
Python
124 lines
3.7 KiB
Python
# Copyright 2023 LanceDB Developers
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import os
|
|
import random
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
import tantivy
|
|
|
|
import lancedb as ldb
|
|
import lancedb.fts
|
|
|
|
|
|
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    """Create a 100-row table named ``"test"`` used by the FTS tests.

    Each row has a ``vector`` (128 random floats), identical ``text`` and
    ``text2`` sentences of the form "<noun> <verb> <adverb> <adjective>",
    and a ``nested`` dict holding the same sentence under the key ``"text"``.

    Nouns are assigned round-robin instead of being drawn at random, so each
    noun (including "puppy") appears in exactly 20 rows. Tests that search
    for "puppy" with ``limit(10)`` and assert 10 hits therefore cannot fail
    just because an unlucky random draw produced fewer than 10 matching rows.
    The remaining words and the vectors stay random; no assertion depends on
    them.
    """
    num_rows = 100
    db = ldb.connect(tmp_path)
    vectors = [np.random.randn(128) for _ in range(num_rows)]

    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")
    text = [
        " ".join(
            [
                # Deterministic noun: guarantees a fixed number of matches
                # per noun regardless of the random state.
                nouns[i % len(nouns)],
                random.choice(verbs),
                random.choice(adv),
                random.choice(adj),
            ]
        )
        for i in range(num_rows)
    ]
    table = db.create_table(
        "test",
        data=pd.DataFrame(
            {
                "vector": vectors,
                "text": text,
                "text2": text,
                "nested": [{"text": t} for t in text],
            }
        ),
    )
    return table
|
|
|
|
|
|
def test_create_index(tmp_path):
    """Creating an index returns a ``tantivy.Index`` and creates its path."""
    index_dir = str(tmp_path / "index")
    created = ldb.fts.create_index(index_dir, ["text"])
    assert isinstance(created, tantivy.Index)
    # The index location must exist on disk once creation has returned.
    assert os.path.exists(index_dir)
|
|
|
|
|
|
def test_populate_index(tmp_path, table):
    """``populate_index`` must return a value equal to ``len(table)``."""
    index_dir = str(tmp_path / "index")
    fts_index = ldb.fts.create_index(index_dir, ["text"])
    indexed_count = ldb.fts.populate_index(fts_index, table, ["text"])
    assert indexed_count == len(table)
|
|
|
|
|
|
def test_search_index(tmp_path, table):
    """Searching a populated index returns two sequences of ``limit`` items."""
    index_dir = str(tmp_path / "index")
    fts_index = ldb.fts.create_index(index_dir, ["text"])
    ldb.fts.populate_index(fts_index, table, ["text"])
    # Reload before searching; presumably this makes the freshly written
    # documents visible to the searcher -- confirm against tantivy docs.
    fts_index.reload()

    hits = ldb.fts.search_index(fts_index, query="puppy", limit=10)
    assert len(hits) == 2
    row_ids = hits[0]
    distances = hits[1]
    assert len(row_ids) == 10  # row_ids
    assert len(distances) == 10  # _distance
|
|
|
|
|
|
def test_create_index_from_table(tmp_path, table):
    """Build an FTS index via the table API and rebuild it after new data.

    Covers three things: a basic search on the fresh index, the
    ``ValueError`` raised when re-creating the index without ``replace``,
    and that a row added later is searchable once the index is rebuilt
    with ``replace=True``.
    """
    table.create_fts_index("text")
    puppy_hits = table.search("puppy").limit(10).select(["text"]).to_pandas()
    assert len(puppy_hits) == 10
    assert "text" in puppy_hits.columns

    # Check whether it can be updated
    gorilla_row = {
        "vector": np.random.randn(128),
        "text": "gorilla",
        "text2": "gorilla",
        "nested": {"text": "gorilla"},
    }
    table.add([gorilla_row])

    # Without replace=True an existing index must be rejected.
    with pytest.raises(ValueError, match="already exists"):
        table.create_fts_index("text")

    table.create_fts_index("text", replace=True)
    gorilla_hits = table.search("gorilla").limit(1).to_pandas()
    assert len(gorilla_hits) == 1
|
|
|
|
|
|
def test_create_index_multiple_columns(tmp_path, table):
    """An index over two columns is searchable and both columns come back."""
    table.create_fts_index(["text", "text2"])
    hits = table.search("puppy").limit(10).to_pandas()
    assert len(hits) == 10
    for column in ("text", "text2"):
        assert column in hits.columns
|
|
|
|
|
|
def test_empty_rs(tmp_path, table, mocker):
    """A search whose index lookup returns nothing yields an empty frame."""
    table.create_fts_index(["text", "text2"])
    # Force the index lookup to report no row ids and no scores.
    no_hits = ([], [])
    mocker.patch("lancedb.fts.search_index", return_value=no_hits)
    frame = table.search("puppy").limit(10).to_pandas()
    assert len(frame) == 0
|
|
|
|
|
|
def test_nested_schema(tmp_path, table):
    """An index on the dotted path ``nested.text`` is searchable."""
    table.create_fts_index("nested.text")
    query = table.search("puppy").limit(10)
    rows = query.to_list()
    assert len(rows) == 10
|