mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-03 20:30:42 +00:00
refactor(python): remove legacy tantivy FTS support (#3282)
This follows the Rust-side Tantivy removal by deleting the remaining Python Tantivy runtime, tests, and packaging references. It also turns the legacy Python-only Tantivy parameters into explicit errors and stops reading legacy `_indices/fts` directories so Python FTS is fully native-only.
This commit is contained in:
@@ -180,7 +180,7 @@ def test_fts_fuzzy_query():
|
||||
),
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
results = table.search(MatchQuery("foo", "text", fuzziness=1)).to_pandas()
|
||||
assert len(results) == 4
|
||||
@@ -230,7 +230,7 @@ def test_fts_boost_query():
|
||||
),
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("desc", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("desc", replace=True)
|
||||
|
||||
results = table.search(
|
||||
BoostQuery(
|
||||
@@ -265,7 +265,7 @@ def test_fts_boolean_query(tmp_path):
|
||||
],
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
# SHOULD
|
||||
results = table.search(
|
||||
@@ -319,9 +319,7 @@ def test_fts_native():
|
||||
],
|
||||
)
|
||||
|
||||
# passing `use_tantivy=False` to use lance FTS index
|
||||
# `use_tantivy=True` by default
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
table.search("puppy").limit(10).select(["text"]).to_list()
|
||||
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
|
||||
# ...
|
||||
@@ -332,7 +330,6 @@ def test_fts_native():
|
||||
# --8<-- [start:fts_config_folding]
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
language="French",
|
||||
stem=True,
|
||||
ascii_folding=True,
|
||||
@@ -346,7 +343,7 @@ def test_fts_native():
|
||||
table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
|
||||
# --8<-- [end:fts_postfiltering]
|
||||
# --8<-- [start:fts_with_position]
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
# --8<-- [end:fts_with_position]
|
||||
# --8<-- [start:fts_incremental_index]
|
||||
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
|
||||
|
||||
@@ -15,8 +15,7 @@ import pytest
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_basic(tmp_path, use_tantivy):
|
||||
def test_basic(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
assert db.uri == str(tmp_path)
|
||||
@@ -49,7 +48,7 @@ def test_basic(tmp_path, use_tantivy):
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "foo"
|
||||
|
||||
table.create_fts_index("item", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("item")
|
||||
rs = table.search("bar", query_type="fts").to_pandas()
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "bar"
|
||||
|
||||
@@ -36,9 +36,6 @@ import pytest
|
||||
import pytest_asyncio
|
||||
from utils import exception_output
|
||||
|
||||
pytest.importorskip("lancedb.fts")
|
||||
tantivy = pytest.importorskip("tantivy")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table(tmp_path) -> ldb.table.LanceTable:
|
||||
@@ -144,58 +141,53 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
return table
|
||||
|
||||
|
||||
def test_create_index(tmp_path):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert isinstance(index, tantivy.Index)
|
||||
assert os.path.exists(str(tmp_path / "index"))
|
||||
@pytest.mark.parametrize(
|
||||
("kwargs", "match"),
|
||||
[
|
||||
(
|
||||
{"use_tantivy": True},
|
||||
"Tantivy-based FTS has been removed",
|
||||
),
|
||||
(
|
||||
{"ordering_field_names": ["count"]},
|
||||
"ordering_field_names was only supported",
|
||||
),
|
||||
(
|
||||
{"writer_heap_size": 128},
|
||||
"writer_heap_size was only supported",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_reject_removed_tantivy_parameters(table, kwargs, match):
|
||||
with pytest.raises(ValueError, match=match):
|
||||
table.create_fts_index("text", **kwargs)
|
||||
|
||||
|
||||
def test_create_index_with_stemming(tmp_path, table):
|
||||
index = ldb.fts.create_index(
|
||||
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
|
||||
)
|
||||
assert isinstance(index, tantivy.Index)
|
||||
assert os.path.exists(str(tmp_path / "index"))
|
||||
def test_reject_legacy_tantivy_index(table):
|
||||
path, _, _ = table._get_fts_index_path()
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
# Check stemming by running tokenizer on non empty table
|
||||
table.create_fts_index("text", tokenizer_name="en_stem", use_tantivy=True)
|
||||
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||
table.search("puppy").limit(5).to_list()
|
||||
|
||||
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
@pytest.mark.parametrize("with_position", [True, False])
|
||||
def test_create_inverted_index(table, use_tantivy, with_position):
|
||||
if use_tantivy and not with_position:
|
||||
pytest.skip("we don't support building a tantivy index without position")
|
||||
def test_create_inverted_index(table, with_position):
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=use_tantivy,
|
||||
with_position=with_position,
|
||||
name="custom_fts_index",
|
||||
)
|
||||
if not use_tantivy:
|
||||
indices = table.list_indices()
|
||||
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
||||
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
||||
indices = table.list_indices()
|
||||
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
||||
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
||||
|
||||
|
||||
def test_populate_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
|
||||
|
||||
|
||||
def test_search_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
ldb.fts.populate_index(index, table, ["text"])
|
||||
index.reload()
|
||||
results = ldb.fts.search_index(index, query="puppy", limit=5)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 5 # row_ids
|
||||
assert len(results[1]) == 5 # _score
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_search_fts(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_search_fts(table):
|
||||
table.create_fts_index("text")
|
||||
results = table.search("puppy").select(["id", "text"]).limit(5).to_list()
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
@@ -204,53 +196,52 @@ def test_search_fts(table, use_tantivy):
|
||||
results = table.search("puppy").select(["id", "text"]).to_list()
|
||||
assert len(results) == 10
|
||||
|
||||
if not use_tantivy:
|
||||
# Test with a query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
# Test with a query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test boost query
|
||||
results = (
|
||||
table.search(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
# Test boost query
|
||||
results = (
|
||||
table.search(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test multi match query
|
||||
table.create_fts_index("text2", use_tantivy=use_tantivy)
|
||||
results = (
|
||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
# Test multi match query
|
||||
table.create_fts_index("text2")
|
||||
results = (
|
||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -318,13 +309,13 @@ async def test_fts_select_async(async_table):
|
||||
|
||||
|
||||
def test_search_fts_phrase_query(table):
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
try:
|
||||
phrase_results = table.search('"puppy runs"').limit(100).to_list()
|
||||
assert False
|
||||
except Exception:
|
||||
pass
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
results = table.search("puppy").limit(100).to_list()
|
||||
|
||||
# Test with quotation marks
|
||||
@@ -375,8 +366,8 @@ async def test_search_fts_phrase_query_async(async_table):
|
||||
|
||||
|
||||
def test_search_fts_specify_column(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text2", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
table.create_fts_index("text2")
|
||||
|
||||
results = table.search("puppy", fts_columns="text").limit(5).to_list()
|
||||
assert len(results) == 5
|
||||
@@ -470,42 +461,8 @@ async def test_search_fts_specify_column_async(async_table):
|
||||
pass
|
||||
|
||||
|
||||
def test_search_ordering_field_index_table(tmp_path, table):
|
||||
table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
|
||||
rows = (
|
||||
table.search("puppy", ordering_field_name="count")
|
||||
.limit(20)
|
||||
.select(["text", "count"])
|
||||
.to_list()
|
||||
)
|
||||
for r in rows:
|
||||
assert "puppy" in r["text"]
|
||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
||||
|
||||
|
||||
def test_search_ordering_field_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(
|
||||
str(tmp_path / "index"), ["text"], ordering_fields=["count"]
|
||||
)
|
||||
|
||||
ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
|
||||
index.reload()
|
||||
results = ldb.fts.search_index(
|
||||
index, query="puppy", limit=5, ordering_field="count"
|
||||
)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 5 # row_ids
|
||||
assert len(results[1]) == 5 # _distance
|
||||
rows = table.to_lance().take(results[0]).to_pylist()
|
||||
|
||||
for r in rows:
|
||||
assert "puppy" in r["text"]
|
||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_create_index_from_table(tmp_path, table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_create_index_from_table(tmp_path, table):
|
||||
table.create_fts_index("text")
|
||||
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
|
||||
assert len(df) <= 5
|
||||
assert "text" in df.columns
|
||||
@@ -525,36 +482,24 @@ def test_create_index_from_table(tmp_path, table, use_tantivy):
|
||||
)
|
||||
|
||||
with pytest.raises(Exception, match="already exists"):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text")
|
||||
|
||||
table.create_fts_index("text", replace=True, use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text", replace=True)
|
||||
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
|
||||
|
||||
|
||||
def test_create_index_multiple_columns(tmp_path, table):
|
||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
||||
df = table.search("puppy").limit(5).to_pandas()
|
||||
assert len(df) == 5
|
||||
assert "text" in df.columns
|
||||
assert "text2" in df.columns
|
||||
|
||||
|
||||
def test_empty_rs(tmp_path, table, mocker):
|
||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
||||
mocker.patch("lancedb.fts.search_index", return_value=([], []))
|
||||
df = table.search("puppy").limit(5).to_pandas()
|
||||
assert len(df) == 0
|
||||
with pytest.raises(ValueError, match="Native FTS indexes can only be created"):
|
||||
table.create_fts_index(["text", "text2"])
|
||||
|
||||
|
||||
def test_nested_schema(tmp_path, table):
|
||||
table.create_fts_index("nested.text", use_tantivy=True)
|
||||
rs = table.search("puppy").limit(5).to_list()
|
||||
assert len(rs) == 5
|
||||
with pytest.raises(ValueError, match="top-level fields"):
|
||||
table.create_fts_index("nested.text")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_search_index_with_filter(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_search_index_with_filter(table):
|
||||
table.create_fts_index("text")
|
||||
orig_import = __import__
|
||||
|
||||
def import_mock(name, *args):
|
||||
@@ -584,8 +529,7 @@ def test_search_index_with_filter(table, use_tantivy):
|
||||
assert r["_rowid"] is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_null_input(table, use_tantivy):
|
||||
def test_null_input(table):
|
||||
table.add(
|
||||
[
|
||||
{
|
||||
@@ -598,14 +542,13 @@ def test_null_input(table, use_tantivy):
|
||||
}
|
||||
]
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
def test_syntax(table):
|
||||
# https://github.com/lancedb/lancedb/issues/769
|
||||
table.create_fts_index("text", use_tantivy=True)
|
||||
with pytest.raises(ValueError, match="Syntax Error"):
|
||||
table.search("they could have been dogs OR").limit(10).to_list()
|
||||
table.create_fts_index("text")
|
||||
table.search("they could have been dogs OR").limit(10).to_list()
|
||||
|
||||
# these should work
|
||||
|
||||
@@ -616,6 +559,7 @@ def test_syntax(table):
|
||||
).to_list()
|
||||
|
||||
# phrase queries
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
|
||||
table.search('"they could have been dogs OR cats"').limit(10).to_list()
|
||||
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
|
||||
@@ -639,7 +583,7 @@ def test_language(mem_db: DBConnection):
|
||||
table = mem_db.create_table("test", data=data)
|
||||
|
||||
with pytest.raises(ValueError) as e:
|
||||
table.create_fts_index("text", use_tantivy=False, language="klingon")
|
||||
table.create_fts_index("text", language="klingon")
|
||||
|
||||
assert exception_output(e) == (
|
||||
"ValueError: LanceDB does not support the requested language: 'klingon'\n"
|
||||
@@ -650,7 +594,6 @@ def test_language(mem_db: DBConnection):
|
||||
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
language="French",
|
||||
stem=True,
|
||||
ascii_folding=True,
|
||||
@@ -690,7 +633,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
}
|
||||
)
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True)
|
||||
table.create_fts_index("text", with_position=True)
|
||||
|
||||
res = table.search("lance").limit(5).to_list()
|
||||
assert len(res) == 3
|
||||
@@ -702,7 +645,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
def test_fts_ngram(mem_db: DBConnection):
|
||||
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
|
||||
table.create_fts_index("text", base_tokenizer="ngram")
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
@@ -721,7 +664,6 @@ def test_fts_ngram(mem_db: DBConnection):
|
||||
# test setting min_ngram_length and prefix_only
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
base_tokenizer="ngram",
|
||||
replace=True,
|
||||
ngram_min_length=2,
|
||||
@@ -886,7 +828,7 @@ def test_fts_query_to_json():
|
||||
|
||||
|
||||
def test_fts_fast_search(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
|
||||
# Insert some unindexed data
|
||||
table.add(
|
||||
|
||||
@@ -28,7 +28,7 @@ def sync_table(tmpdir_factory) -> Table:
|
||||
}
|
||||
)
|
||||
table = db.create_table("test", data)
|
||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
return table
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ def table_with_id(tmpdir_factory) -> Table:
|
||||
}
|
||||
)
|
||||
table = db.create_table("test_with_id", data)
|
||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
return table
|
||||
|
||||
|
||||
|
||||
@@ -1385,7 +1385,7 @@ def test_query_timeout(tmp_path):
|
||||
}
|
||||
)
|
||||
table = db.create_table("test", data)
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
table.search().where("text = 'a'").to_list(timeout=timedelta(0))
|
||||
|
||||
@@ -26,11 +26,8 @@ from lancedb.rerankers import (
|
||||
)
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
# Tests rely on FTS index
|
||||
pytest.importorskip("lancedb.fts")
|
||||
|
||||
|
||||
def get_test_table(tmp_path, use_tantivy):
|
||||
def get_test_table(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
# Create a LanceDB table schema with a vector and a text column
|
||||
emb = EmbeddingFunctionRegistry.get_instance().get("test").create()
|
||||
@@ -98,7 +95,7 @@ def get_test_table(tmp_path, use_tantivy):
|
||||
)
|
||||
|
||||
# Create a fts index
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
return table, MyTable
|
||||
|
||||
@@ -208,8 +205,8 @@ def _run_test_reranker(reranker, table, query, query_vector, schema):
|
||||
assert len(result) == 20 and result == result_arrow
|
||||
|
||||
|
||||
def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
def _run_test_hybrid_reranker(reranker, tmp_path):
|
||||
table, schema = get_test_table(tmp_path)
|
||||
# The default reranker
|
||||
result1 = (
|
||||
table.search(
|
||||
@@ -285,8 +282,7 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_linear_combination(tmp_path, use_tantivy):
|
||||
def test_linear_combination(tmp_path):
|
||||
reranker = LinearCombinationReranker()
|
||||
|
||||
vector_results = pa.Table.from_pydict(
|
||||
@@ -313,22 +309,20 @@ def test_linear_combination(tmp_path, use_tantivy):
|
||||
assert "_score" not in combined_results.column_names
|
||||
assert "_relevance_score" in combined_results.column_names
|
||||
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_rrf_reranker(tmp_path, use_tantivy):
|
||||
def test_rrf_reranker(tmp_path):
|
||||
reranker = RRFReranker()
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_mrr_reranker(tmp_path, use_tantivy):
|
||||
def test_mrr_reranker(tmp_path):
|
||||
reranker = MRRReranker()
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||
|
||||
# Test multi-vector part
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
query = "single player experience"
|
||||
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
|
||||
rs2 = (
|
||||
@@ -363,7 +357,7 @@ def test_rrf_reranker_distance():
|
||||
table = db.create_table("test", data)
|
||||
|
||||
table.create_index(num_partitions=1, num_sub_vectors=2)
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
|
||||
reranker = RRFReranker(return_score="all")
|
||||
|
||||
@@ -422,35 +416,31 @@ def test_rrf_reranker_distance():
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cohere_reranker(tmp_path, use_tantivy):
|
||||
def test_cohere_reranker(tmp_path):
|
||||
pytest.importorskip("cohere")
|
||||
reranker = CohereReranker()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cross_encoder_reranker(tmp_path, use_tantivy):
|
||||
def test_cross_encoder_reranker(tmp_path):
|
||||
pytest.importorskip("sentence_transformers")
|
||||
reranker = CrossEncoderReranker()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_colbert_reranker(tmp_path, use_tantivy):
|
||||
def test_colbert_reranker(tmp_path):
|
||||
pytest.importorskip("rerankers")
|
||||
reranker = ColbertReranker()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_answerdotai_reranker(tmp_path, use_tantivy):
|
||||
def test_answerdotai_reranker(tmp_path):
|
||||
pytest.importorskip("rerankers")
|
||||
reranker = AnswerdotaiRerankers()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@@ -459,10 +449,9 @@ def test_answerdotai_reranker(tmp_path, use_tantivy):
|
||||
or os.environ.get("OPENAI_BASE_URL") is not None,
|
||||
reason="OPENAI_API_KEY not set",
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_openai_reranker(tmp_path, use_tantivy):
|
||||
def test_openai_reranker(tmp_path):
|
||||
pytest.importorskip("openai")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
reranker = OpenaiReranker()
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
@@ -470,10 +459,9 @@ def test_openai_reranker(tmp_path, use_tantivy):
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("JINA_API_KEY") is None, reason="JINA_API_KEY not set"
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_jina_reranker(tmp_path, use_tantivy):
|
||||
def test_jina_reranker(tmp_path):
|
||||
pytest.importorskip("jina")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
reranker = JinaReranker()
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
@@ -481,11 +469,10 @@ def test_jina_reranker(tmp_path, use_tantivy):
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_voyageai_reranker(tmp_path, use_tantivy):
|
||||
def test_voyageai_reranker(tmp_path):
|
||||
pytest.importorskip("voyageai")
|
||||
reranker = VoyageAIReranker(model_name="rerank-2.5")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@@ -504,7 +491,7 @@ def test_empty_result_reranker():
|
||||
|
||||
# Create empty table with schema
|
||||
empty_table = db.create_table("empty_table", schema=schema, mode="overwrite")
|
||||
empty_table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
empty_table.create_fts_index("text", replace=True)
|
||||
for reranker in [
|
||||
CrossEncoderReranker(),
|
||||
# ColbertReranker(),
|
||||
@@ -603,11 +590,10 @@ def test_empty_hybrid_result_reranker():
|
||||
assert "_rowid" in result.column_names
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
|
||||
def test_cross_encoder_reranker_return_all(tmp_path):
|
||||
pytest.importorskip("sentence_transformers")
|
||||
reranker = CrossEncoderReranker(return_score="all")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
query = "single player experience"
|
||||
result = (
|
||||
table.search(query, query_type="hybrid", vector_column_name="vector")
|
||||
|
||||
@@ -242,8 +242,8 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
|
||||
|
||||
# FTS indices should error since they are not supported yet.
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Full-text search is only supported on the local filesystem",
|
||||
ValueError,
|
||||
match="Tantivy-based FTS has been removed",
|
||||
):
|
||||
table.create_fts_index("x", use_tantivy=True)
|
||||
|
||||
|
||||
@@ -1948,7 +1948,6 @@ def setup_hybrid_search_table(db: DBConnection, embedding_func):
|
||||
|
||||
def test_hybrid_search(tmp_db: DBConnection):
|
||||
# This test uses an FTS index
|
||||
pytest.importorskip("lancedb.fts")
|
||||
pytest.importorskip("lance")
|
||||
|
||||
table, MyTable, emb = setup_hybrid_search_table(tmp_db, "test")
|
||||
@@ -2019,7 +2018,6 @@ def test_hybrid_search(tmp_db: DBConnection):
|
||||
|
||||
def test_hybrid_search_metric_type(tmp_db: DBConnection):
|
||||
# This test uses an FTS index
|
||||
pytest.importorskip("lancedb.fts")
|
||||
pytest.importorskip("lance")
|
||||
|
||||
# Need to use nonnorm as the embedding function so l2 and dot results
|
||||
|
||||
Reference in New Issue
Block a user