fix: bugs for new FTS APIs (#2314)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced full-text search capabilities with support for phrase
queries, fuzzy matching, boosting, and multi-column matching.
- Search methods now accept full-text query objects directly, improving
query flexibility and precision.
- Python and JavaScript SDKs updated to handle full-text queries
seamlessly, including async search support.

- **Tests**
- Added comprehensive tests covering fuzzy search, phrase search, and
boosted queries to ensure robust full-text search functionality.

- **Documentation**
- Updated query class documentation to reflect new constructor options
and removal of deprecated methods for clarity and simplicity.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-04-15 11:51:35 +08:00
committed by GitHub
parent a6fa69ab89
commit 2248aa9508
15 changed files with 397 additions and 415 deletions

View File

@@ -266,8 +266,8 @@ class MultiMatchQuery(FullTextQuery):
Parameters
----------
query : str | list[Query]
If a string, the query string to match against.
query : str
The query string to match against.
columns : list[str]
The list of columns to match against.

View File

@@ -2141,6 +2141,8 @@ class LanceTable(Table):
and also the "_distance" column which is the distance between the query
vector and the returned vector.
"""
if isinstance(query, FullTextQuery):
query_type = "fts"
vector_column_name = infer_vector_column_name(
schema=self.schema,
query_type=query_type,
@@ -3223,8 +3225,10 @@ class AsyncTable:
async def get_embedding_func(
vector_column_name: Optional[str],
query_type: QueryType,
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]],
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple, FullTextQuery]],
) -> Tuple[str, EmbeddingFunctionConfig]:
if isinstance(query, FullTextQuery):
query_type = "fts"
schema = await self.schema()
vector_column_name = infer_vector_column_name(
schema=schema,

View File

@@ -253,9 +253,14 @@ def infer_vector_column_name(
query: Optional[Any], # inferred later in query builder
vector_column_name: Optional[str],
):
if (vector_column_name is None and query is not None and query_type != "fts") or (
vector_column_name is None and query_type == "hybrid"
):
if vector_column_name is not None:
return vector_column_name
if query_type == "fts":
# FTS queries do not require a vector column
return None
if query is not None or query_type == "hybrid":
try:
vector_column_name = inf_vector_column_query(schema)
except Exception as e:

View File

@@ -6,7 +6,9 @@ import lancedb
# --8<-- [end:import-lancedb]
# --8<-- [start:import-numpy]
from lancedb.query import BoostQuery, MatchQuery
import numpy as np
import pyarrow as pa
# --8<-- [end:import-numpy]
# --8<-- [start:import-datetime]
@@ -154,6 +156,84 @@ async def test_vector_search_async():
# --8<-- [end:search_result_async_as_list]
def test_fts_fuzzy_query():
    """Check that a fuzzy ``MatchQuery`` with ``fuzziness=1`` matches near-misses.

    Builds a single-column table of short words, indexes the ``text`` column
    with ``use_tantivy=False``, searches for ``"foo"`` with ``fuzziness=1``,
    and asserts that exactly four rows come back: the exact term plus the
    three words that differ from it by one deletion, substitution, or
    insertion.
    """
    corpus = pa.table(
        {
            "text": [
                "fa",
                "fo",  # spellchecker:disable-line
                "fob",
                "focus",
                "foo",
                "food",
                "foul",
            ]
        }
    )

    db = lancedb.connect("data/fuzzy-example")
    # mode="overwrite" lets the test be re-run against the same directory.
    table = db.create_table("my_table_fts_fuzzy", data=corpus, mode="overwrite")
    table.create_fts_index("text", use_tantivy=False, replace=True)

    fuzzy_foo = MatchQuery("foo", "text", fuzziness=1)
    results = table.search(fuzzy_foo).to_pandas()

    expected_hits = {
        "foo",
        "fo",  # 1 deletion  # spellchecker:disable-line
        "fob",  # 1 substitution
        "food",  # 1 insertion
    }
    assert len(results) == 4
    assert set(results["text"].to_list()) == expected_hits
def test_fts_boost_query():
    """Check that ``BoostQuery`` demotes rows that match the negative query.

    Creates a two-column table (``title``, ``desc``), indexes ``desc`` with
    ``use_tantivy=False``, and searches with a ``BoostQuery`` built from a
    positive and a negative ``MatchQuery`` on ``desc``. The test asserts that
    three rows are returned and that the description containing "overpriced"
    is ranked last.
    """
    titles = [
        "The Hidden Gems of Travel",
        "Exploring Nature's Wonders",
        "Cultural Treasures Unveiled",
        "The Nightlife Chronicles",
        "Scenic Escapes and Challenges",
    ]
    descriptions = [
        "A vibrant city with occasional traffic jams.",
        "Beautiful landscapes but overpriced tourist spots.",
        "Rich cultural heritage but humid summers.",
        "Bustling nightlife but noisy streets.",
        "Scenic views but limited public transport options.",
    ]

    db = lancedb.connect("data/boost-example")
    # mode="overwrite" lets the test be re-run against the same directory.
    table = db.create_table(
        "my_table_fts_boost",
        data=pa.table({"title": titles, "desc": descriptions}),
        mode="overwrite",
    )
    table.create_fts_index("desc", use_tantivy=False, replace=True)

    positive = MatchQuery("beautiful, cultural, nightlife", "desc")
    negative = MatchQuery("bad traffic jams, overpriced", "desc")
    results = table.search(BoostQuery(positive, negative)).to_pandas()

    # Only rows matching the positive query are returned, and it has 3 hits.
    assert len(results) == 3

    # The row containing "overpriced" also matches the negative query, so it
    # is boosted downwards and ends up in the last position.
    ranked_descriptions = results["desc"].to_list()
    assert (
        ranked_descriptions[2]
        == "Beautiful landscapes but overpriced tourist spots."
    )
def test_fts_native():
# --8<-- [start:basic_fts]
uri = "data/sample-lancedb"