Files
lancedb/python/python/tests/test_fts.py
BubbleCal 84ded9d678 feat: support new FTS features in python SDK (#2411)
- AND operator
- phrase query slop param
- boolean query

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Added support for combining full-text search queries using AND/OR
operators, enabling more flexible query composition.
- Introduced new query types and parameters, including boolean queries,
operator selection, occurrence constraints, and phrase slop for advanced
search scenarios.
- Enhanced asynchronous search to accept rich full-text query objects
directly.

- **Bug Fixes**
- Improved handling and validation of full-text search queries in both
synchronous and asynchronous search operations.

- **Tests**
- Updated and expanded tests to cover new full-text query types and
their usage in search functions.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-06-06 14:33:46 +08:00

672 lines
20 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from unittest import mock
import lancedb as ldb
from lancedb.db import DBConnection
from lancedb.index import FTS
from lancedb.query import BoostQuery, MatchQuery, MultiMatchQuery, PhraseQuery
import numpy as np
import pyarrow as pa
import pandas as pd
import pytest
from utils import exception_output
pytest.importorskip("lancedb.fts")
tantivy = pytest.importorskip("tantivy")
def _make_fts_frame() -> pd.DataFrame:
    """Build the 100-row frame shared by the ``table`` and ``async_table`` fixtures.

    Columns:
        vector: 128 values per row drawn with ``np.random.randn``.
        id: alternates 0, 1, 0, 1, ...
        text / text2: four random words (noun, verb, adverb, adjective)
            joined by spaces; the two columns use different noun sets.
        nested: a ``{"text": ...}`` dict wrapping the ``text`` value.
        count: random integer between 1 and 10000 inclusive.

    NOTE(review): neither RNG is seeded, so the rows differ on every run and
    tests that assert exact hit counts depend on the random draw.
    """
    n_rows = 100
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adverbs = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adjectives = ("adorable", "clueless", "dirty", "odd", "stupid")

    def sentence(nouns):
        # Draw order is noun, verb, adverb, adjective.
        return " ".join(
            [
                random.choice(nouns),
                random.choice(verbs),
                random.choice(adverbs),
                random.choice(adjectives),
            ]
        )

    vectors = [np.random.randn(128) for _ in range(n_rows)]
    text = [sentence(("puppy", "car")) for _ in range(n_rows)]
    text2 = [sentence(("rabbit", "girl", "monkey")) for _ in range(n_rows)]
    count = [random.randint(1, 10000) for _ in range(n_rows)]
    return pd.DataFrame(
        {
            "vector": vectors,
            "id": [i % 2 for i in range(n_rows)],
            "text": text,
            "text2": text2,
            "nested": [{"text": t} for t in text],
            "count": count,
        }
    )


@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    """Synchronous table named ``test`` under ``tmp_path`` holding the FTS rows."""
    db = ldb.connect(tmp_path)
    return db.create_table("test", data=_make_fts_frame())


@pytest.fixture
async def async_table(tmp_path) -> ldb.table.AsyncTable:
    """Async counterpart of ``table``.

    The fixture is a coroutine function, which is why the tests in this
    module ``await`` the injected value before using it.
    """
    db = await ldb.connect_async(tmp_path)
    return await db.create_table("test", data=_make_fts_frame())
def test_create_index(tmp_path):
    """Creating an index returns a ``tantivy.Index`` and creates its path."""
    index_dir = str(tmp_path / "index")
    created = ldb.fts.create_index(index_dir, ["text"])
    assert isinstance(created, tantivy.Index)
    assert os.path.exists(index_dir)
def test_create_index_with_stemming(tmp_path, table):
    """The ``en_stem`` tokenizer is accepted standalone and through the table API."""
    index_dir = str(tmp_path / "index")
    stemmed = ldb.fts.create_index(index_dir, ["text"], tokenizer_name="en_stem")
    assert isinstance(stemmed, tantivy.Index)
    assert os.path.exists(index_dir)

    # Check stemming by running the tokenizer on a non-empty table.
    table.create_fts_index("text", tokenizer_name="en_stem", use_tantivy=True)
@pytest.mark.parametrize("use_tantivy", [True, False])
@pytest.mark.parametrize("with_position", [True, False])
def test_create_inverted_index(table, use_tantivy, with_position):
    """Build an FTS index for each backend / ``with_position`` combination."""
    positionless_tantivy = use_tantivy and not with_position
    if positionless_tantivy:
        pytest.skip("we don't support building a tantivy index without position")
    table.create_fts_index(
        "text",
        use_tantivy=use_tantivy,
        with_position=with_position,
    )
def test_populate_index(tmp_path, table):
    """``populate_index`` returns a count equal to the table length."""
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    indexed_rows = ldb.fts.populate_index(index, table, ["text"])
    assert indexed_rows == len(table)
def test_search_index(tmp_path, table):
    """Searching a populated index yields two parallel sequences of ``limit`` items."""
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    ldb.fts.populate_index(index, table, ["text"])
    index.reload()

    found = ldb.fts.search_index(index, query="puppy", limit=5)
    assert len(found) == 2
    row_ids, scores = found[0], found[1]
    assert len(row_ids) == 5
    assert len(scores) == 5
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_search_fts(table, use_tantivy):
    """Search ``text`` with plain strings and, on the native index, query objects.

    NOTE(review): the ``table`` fixture draws its words from an unseeded RNG,
    so the exact-count assertions below (in particular the AND query) rely on
    enough matching rows having been generated.
    """

    def top_five(query):
        # Every limited search in this test shares this projection and limit.
        return table.search(query).select(["id", "text"]).limit(5).to_list()

    table.create_fts_index("text", use_tantivy=use_tantivy)

    hits = top_five("puppy")
    assert len(hits) == 5
    assert len(hits[0]) == 3  # id, text, _score

    # Default limit of 10
    hits = table.search("puppy").select(["id", "text"]).to_list()
    assert len(hits) == 10

    if use_tantivy:
        # The query-object checks below only run when use_tantivy is False.
        return

    # Match query object instead of a plain string
    hits = top_five(MatchQuery("puppy", "text"))
    assert len(hits) == 5

    # Boost query
    boosted = BoostQuery(
        MatchQuery("puppy", "text"),
        MatchQuery("runs", "text"),
    )
    hits = top_five(boosted)
    assert len(hits) == 5

    # Multi match query; index the second column first
    table.create_fts_index("text2", use_tantivy=use_tantivy)
    hits = top_five(MultiMatchQuery("puppy", ["text", "text2"]))
    assert len(hits) == 5
    assert len(hits[0]) == 3  # id, text, _score

    # Boolean query: both terms must appear in every hit
    both_terms = MatchQuery("puppy", "text") & MatchQuery("runs", "text")
    hits = top_five(both_terms)
    assert len(hits) == 5
    assert len(hits[0]) == 3  # id, text, _score
    assert all("puppy" in hit["text"] and "runs" in hit["text"] for hit in hits)
@pytest.mark.asyncio
async def test_fts_select_async(async_table):
    """Async FTS with a column projection accepts strings and query objects."""
    tbl = await async_table
    for column in ("text", "text2"):
        await tbl.create_index(column, config=FTS())

    async def top_five(query):
        return (
            await tbl.query()
            .nearest_to_text(query)
            .select(["id", "text"])
            .limit(5)
            .to_list()
        )

    queries = (
        # plain string
        "puppy",
        # FullTextQuery object
        MatchQuery("puppy", "text"),
        # BoostQuery
        BoostQuery(
            MatchQuery("puppy", "text"),
            MatchQuery("runs", "text"),
        ),
        # MultiMatchQuery
        MultiMatchQuery("puppy", ["text", "text2"]),
    )
    for query in queries:
        hits = await top_five(query)
        assert len(hits) == 5
        assert len(hits[0]) == 3  # id, text, _score

    # The search() API is itself awaitable and returns a query builder.
    builder = await tbl.search(MatchQuery("puppy", "text"))
    hits = await builder.select(["id", "text"]).limit(5).to_list()
    assert len(hits) == 5
    assert len(hits[0]) == 3  # id, text, _score
def test_search_fts_phrase_query(table):
    """Phrase queries require positions and match fewer rows than a term query.

    First the index is built without positions and a quoted (phrase) query is
    expected to fail. Then the index is rebuilt with positions and the phrase
    must match a non-empty, strictly smaller set of rows than the single term.
    """
    table.create_fts_index("text", use_tantivy=False, with_position=False)
    # pytest.raises rather than try/`assert False`/`except Exception`: the
    # AssertionError raised by `assert False` is itself an Exception and would
    # be swallowed, so a search that unexpectedly succeeds would go unnoticed.
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
    results = table.search("puppy").limit(100).to_list()
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Same check with an explicit PhraseQuery object.
    phrase_results = (
        table.search(PhraseQuery("puppy runs", "text")).limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    """Async variant: phrase queries require an index built with positions.

    First the index is built without positions and a quoted (phrase) query is
    expected to fail. Then the index is rebuilt with positions and the phrase
    must match a non-empty, strictly smaller set of rows than the single term.
    """
    async_table = await async_table
    await async_table.create_index("text", config=FTS(with_position=False))
    # The query is wrapped in double quotes so that it is a phrase query, like
    # the positive checks below; the bare string "puppy runs" is not a phrase
    # query. pytest.raises is used because a try/`assert False`/
    # `except Exception` block swallows its own AssertionError.
    with pytest.raises(Exception):
        await (
            async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
        )

    await async_table.create_index("text", config=FTS(with_position=True))
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0

    # Same check with an explicit PhraseQuery object.
    phrase_results = (
        await async_table.query()
        .nearest_to_text(PhraseQuery("puppy runs", "text"))
        .limit(100)
        .to_list()
    )
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
def test_search_fts_specify_column(table):
    """With two FTS indices, ``fts_columns`` directs the search at one column.

    NOTE(review): in both ``try`` blocks ``assert False`` raises
    ``AssertionError``, which the ``except Exception`` clause also catches,
    so those blocks pass whether or not the search raises. Confirm whether
    these searches are still expected to fail before tightening them into
    ``pytest.raises``.
    """
    for column in ("text", "text2"):
        table.create_fts_index(column, use_tantivy=False)

    for term, column in (("puppy", "text"), ("rabbit", "text2")):
        hits = table.search(term, fts_columns=column).limit(5).to_list()
        assert len(hits) == 5

    try:
        # we can only specify one column for now
        both_columns = table.search("puppy", fts_columns=["text", "text2"])
        both_columns.limit(5).to_list()
        assert False
    except Exception:
        pass

    try:
        # have to specify a column because we have two fts indices
        no_column = table.search("puppy")
        no_column.limit(5).to_list()
        assert False
    except Exception:
        pass
@pytest.mark.asyncio
async def test_search_fts_async(async_table):
    """Async FTS honours ``limit`` and combines with pre- and post-filters."""
    tbl = await async_table
    await tbl.create_index("text", config=FTS())

    hits = await tbl.query().nearest_to_text("puppy").limit(5).to_list()
    assert len(hits) == 5

    # Rows that contain the term and pass the filter, capped at the limit of 10.
    matching_rows = await tbl.count_rows("count > 5000 and contains(text, 'puppy')")
    expected_count = min(matching_rows, 10)

    def filtered_search():
        return tbl.query().nearest_to_text("puppy").where("count > 5000").limit(10)

    prefiltered = await filtered_search().to_list()
    assert len(prefiltered) == expected_count

    postfiltered = await filtered_search().postfilter().to_list()
    assert len(postfiltered) <= expected_count
@pytest.mark.asyncio
async def test_search_fts_specify_column_async(async_table):
    """With two FTS indices, async search can be directed at one column.

    NOTE(review): in both ``try`` blocks ``assert False`` raises
    ``AssertionError``, which the ``except Exception`` clause also catches,
    so those blocks pass whether or not the search raises. The first block
    also passes ``columns="text2"`` (a single column, identical to the
    successful query just above it) although its comment talks about
    specifying more than one column. Confirm the intended behaviour before
    tightening these into ``pytest.raises``.
    """
    tbl = await async_table
    for column in ("text", "text2"):
        await tbl.create_index(column, config=FTS())

    for term, column in (("puppy", "text"), ("rabbit", "text2")):
        hits = (
            await tbl.query().nearest_to_text(term, columns=column).limit(5).to_list()
        )
        assert len(hits) == 5

    try:
        # we can only specify one column for now
        column_query = tbl.query().nearest_to_text("rabbit", columns="text2")
        await column_query.limit(5).to_list()
        assert False
    except Exception:
        pass

    try:
        # have to specify a column because we have two fts indices
        no_column_query = tbl.query().nearest_to_text("puppy")
        await no_column_query.limit(5).to_list()
        assert False
    except Exception:
        pass
def test_search_ordering_field_index_table(tmp_path, table):
    """Table search with ``ordering_field_name`` returns rows by descending count."""
    table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)

    query = table.search("puppy", ordering_field_name="count")
    rows = query.limit(20).select(["text", "count"]).to_list()

    assert all("puppy" in row["text"] for row in rows)
    by_count_desc = sorted(rows, key=lambda row: row["count"], reverse=True)
    assert by_count_desc == rows
def test_search_ordering_field_index(tmp_path, table):
    """Index-level search with ``ordering_field`` returns rows by descending count."""
    index = ldb.fts.create_index(
        str(tmp_path / "index"),
        ["text"],
        ordering_fields=["count"],
    )
    ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
    index.reload()

    found = ldb.fts.search_index(
        index,
        query="puppy",
        limit=5,
        ordering_field="count",
    )
    assert len(found) == 2
    row_ids = found[0]
    assert len(row_ids) == 5
    assert len(found[1]) == 5  # one value per returned row id

    rows = table.to_lance().take(row_ids).to_pylist()
    assert all("puppy" in row["text"] for row in rows)
    by_count_desc = sorted(rows, key=lambda row: row["count"], reverse=True)
    assert by_count_desc == rows
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_create_index_from_table(tmp_path, table, use_tantivy):
    """An index built from a table is searchable and can be replaced after an add."""
    table.create_fts_index("text", use_tantivy=use_tantivy)
    frame = table.search("puppy").limit(5).select(["text"]).to_pandas()
    assert len(frame) <= 5
    assert "text" in frame.columns

    # Check whether it can be updated
    gorilla_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": "gorilla",
        "text2": "gorilla",
        "nested": {"text": "gorilla"},
        "count": 10,
    }
    table.add([gorilla_row])

    # Without replace=True an existing index must be refused.
    with pytest.raises(Exception, match="already exists"):
        table.create_fts_index("text", use_tantivy=use_tantivy)

    table.create_fts_index("text", replace=True, use_tantivy=use_tantivy)
    gorilla_hits = table.search("gorilla").limit(1).to_pandas()
    assert len(gorilla_hits) == 1
def test_create_index_multiple_columns(tmp_path, table):
    """One tantivy index over two columns; results carry both columns."""
    table.create_fts_index(["text", "text2"], use_tantivy=True)
    frame = table.search("puppy").limit(5).to_pandas()
    assert len(frame) == 5
    for column in ("text", "text2"):
        assert column in frame.columns
def test_empty_rs(tmp_path, table, mocker):
    """An empty result from ``search_index`` produces an empty DataFrame."""
    table.create_fts_index(["text", "text2"], use_tantivy=True)
    # Force the low-level search to report no row ids and no scores.
    mocker.patch("lancedb.fts.search_index", return_value=([], []))
    frame = table.search("puppy").limit(5).to_pandas()
    assert len(frame) == 0
def test_nested_schema(tmp_path, table):
    """A nested field (``nested.text``) can be indexed and searched."""
    table.create_fts_index("nested.text", use_tantivy=True)
    hits = table.search("puppy").limit(5).to_list()
    assert len(hits) == 5
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_search_index_with_filter(table, use_tantivy):
    """A ``where`` filter on FTS gives the same rows with or without duckdb."""
    table.create_fts_index("text", use_tantivy=use_tantivy)

    real_import = __import__

    def import_mock(name, *args):
        # Make duckdb look uninstalled; delegate every other import.
        if name == "duckdb":
            raise ImportError
        return real_import(name, *args)

    # no duckdb
    with mock.patch("builtins.__import__", side_effect=import_mock):
        query = table.search("puppy").where("id=1").limit(10)
        # Apart from the added _score column the result schema is the table's.
        assert query.to_arrow().drop("_score").schema.equals(table.schema)

        without_duckdb = query.to_list()
        assert all(row["id"] == 1 for row in without_duckdb)

    # yes duckdb
    with_duckdb = table.search("puppy").where("id=1").limit(10).to_list()
    assert all(row["id"] == 1 for row in with_duckdb)

    assert without_duckdb == with_duckdb

    with_row_ids = (
        table.search("puppy").where("id=1").with_row_id(True).limit(10).to_list()
    )
    for row in with_row_ids:
        assert row["id"] == 1
        assert row["_rowid"] is not None
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_null_input(table, use_tantivy):
    """Index creation succeeds when a row has ``None`` in its text columns."""
    null_text_row = {
        "vector": np.random.randn(128),
        "id": 101,
        "text": None,
        "text2": None,
        "nested": {"text": None},
        "count": 7,
    }
    table.add([null_text_row])
    table.create_fts_index("text", use_tantivy=use_tantivy)
def test_syntax(table):
    """Malformed tantivy queries raise ``ValueError``; well-formed ones run.

    Regression test for https://github.com/lancedb/lancedb/issues/769
    """
    table.create_fts_index("text", use_tantivy=True)

    # A trailing operator with no right-hand operand is a syntax error.
    with pytest.raises(ValueError, match="Syntax Error"):
        table.search("they could have been dogs OR").limit(10).to_list()

    # these should work

    # terms queries
    table.search('"they could have been dogs" OR cats').limit(10).to_list()
    table.search("(they AND could) OR (have AND been AND dogs) OR cats").limit(
        10
    ).to_list()

    # phrase queries
    table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
    table.search('"they could have been dogs OR cats"').limit(10).to_list()
    table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
        10
    ).to_list()
    # This statement used to appear twice in a row with identical arguments;
    # a single call exercises the same input.
    table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
        10
    ).to_list()
def test_language(mem_db: DBConnection):
    """Native FTS rejects unknown languages and applies French text processing."""
    sentences = [
        "Il n'y a que trois routes qui traversent la ville.",
        "Je veux prendre la route vers l'est.",
        "Je te retrouve au café au bout de la route.",
    ]
    table = mem_db.create_table("test", data=[{"text": s} for s in sentences])

    with pytest.raises(ValueError) as e:
        table.create_fts_index("text", use_tantivy=False, language="klingon")

    expected_message = (
        "ValueError: LanceDB does not support the requested language: 'klingon'\n"
        "Supported languages: Arabic, Danish, Dutch, English, Finnish, French, "
        "German, Greek, Hungarian, Italian, Norwegian, Portuguese, Romanian, "
        "Russian, Spanish, Swedish, Tamil, Turkish"
    )
    assert exception_output(e) == expected_message

    table.create_fts_index(
        "text",
        use_tantivy=False,
        language="French",
        stem=True,
        ascii_folding=True,
        remove_stop_words=True,
    )

    expectations = (
        # Can get "routes" and "route" from the same root
        ("route", 3),
        # Can find "café", without needing to provide accent
        ("cafe", 1),
        # Stop words -> no results
        ("la", 0),
    )
    for term, expected_hits in expectations:
        hits = table.search(term, query_type="fts").limit(5).to_list()
        assert len(hits) == expected_hits
def test_fts_on_list(mem_db: DBConnection):
    """FTS works on a list-of-strings column, for term and phrase queries."""
    documents = [
        ["lance database", "the", "search"],
        ["lance database"],
        ["lance", "search"],
        ["database", "search"],
        ["unrelated", "doc"],
    ]
    vectors = [
        [1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0],
        [7.0, 8.0, 9.0],
        [10.0, 11.0, 12.0],
        [13.0, 14.0, 15.0],
    ]
    data = pa.table({"text": documents, "vector": vectors})

    table = mem_db.create_table("test", data=data)
    table.create_fts_index("text", use_tantivy=False, with_position=True)

    # "lance" occurs in the first three documents.
    term_hits = table.search("lance").limit(5).to_list()
    assert len(term_hits) == 3

    # Only the first two documents hold the full string "lance database".
    phrase_hits = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
    assert len(phrase_hits) == 2