fix: bugs for new FTS APIs (#2314)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Enhanced full-text search capabilities with support for phrase
    queries, fuzzy matching, boosting, and multi-column matching.
  - Search methods now accept full-text query objects directly, improving
    query flexibility and precision.
  - Python and JavaScript SDKs updated to handle full-text queries
    seamlessly, including async search support.

- **Tests**
  - Added comprehensive tests covering fuzzy search, phrase search, and
    boosted queries to ensure robust full-text search functionality.

- **Documentation**
  - Updated query class documentation to reflect new constructor options
    and removal of deprecated methods for clarity and simplicity.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-04-15 11:51:35 +08:00
committed by GitHub
parent a6fa69ab89
commit 2248aa9508
15 changed files with 397 additions and 415 deletions

View File

@@ -6,7 +6,9 @@ import lancedb
# --8<-- [end:import-lancedb]
# --8<-- [start:import-numpy]
from lancedb.query import BoostQuery, MatchQuery
import numpy as np
import pyarrow as pa
# --8<-- [end:import-numpy]
# --8<-- [start:import-datetime]
@@ -154,6 +156,84 @@ async def test_vector_search_async():
# --8<-- [end:search_result_async_as_list]
def test_fts_fuzzy_query():
    """Fuzzy ``MatchQuery`` for "foo" with ``fuzziness=1`` returns four terms.

    Builds a single-column table of short words, indexes the ``text``
    column for full-text search, and checks that the fuzzy query returns
    exactly the exact match plus the three words annotated below as being
    one edit away from it.
    """
    conn = lancedb.connect("data/fuzzy-example")
    words = pa.table(
        {
            "text": [
                "fa",
                "fo",  # spellchecker:disable-line
                "fob",
                "focus",
                "foo",
                "food",
                "foul",
            ]
        }
    )
    tbl = conn.create_table("my_table_fts_fuzzy", data=words, mode="overwrite")
    # NOTE(review): use_tantivy=False presumably selects the non-Tantivy
    # FTS index implementation -- confirm against the LanceDB docs.
    tbl.create_fts_index("text", use_tantivy=False, replace=True)

    fuzzy_query = MatchQuery("foo", "text", fuzziness=1)
    hits = tbl.search(fuzzy_query).to_pandas()

    expected_terms = {
        "foo",
        "fo",  # 1 deletion  # spellchecker:disable-line
        "fob",  # 1 substitution
        "food",  # 1 insertion
    }
    assert len(hits) == 4
    assert set(hits["text"].to_list()) == expected_terms
def test_fts_boost_query():
    """``BoostQuery`` ranks the row matching the negative query last.

    Creates a table of travel titles and descriptions, builds a full-text
    index on ``desc``, and searches with a positive and a negative
    ``MatchQuery`` combined in a ``BoostQuery``. The test expects three
    hits, with the description containing "overpriced" in the final
    position.
    """
    conn = lancedb.connect("data/boost-example")
    titles = [
        "The Hidden Gems of Travel",
        "Exploring Nature's Wonders",
        "Cultural Treasures Unveiled",
        "The Nightlife Chronicles",
        "Scenic Escapes and Challenges",
    ]
    descriptions = [
        "A vibrant city with occasional traffic jams.",
        "Beautiful landscapes but overpriced tourist spots.",
        "Rich cultural heritage but humid summers.",
        "Bustling nightlife but noisy streets.",
        "Scenic views but limited public transport options.",
    ]
    tbl = conn.create_table(
        "my_table_fts_boost",
        data=pa.table({"title": titles, "desc": descriptions}),
        mode="overwrite",
    )
    tbl.create_fts_index("desc", use_tantivy=False, replace=True)

    positive = MatchQuery("beautiful, cultural, nightlife", "desc")
    negative = MatchQuery("bad traffic jams, overpriced", "desc")
    hits = tbl.search(BoostQuery(positive, negative)).to_pandas()

    # Only rows matched by the positive query are returned, and it has
    # three hits in this data set.
    assert len(hits) == 3

    # The row containing "overpriced" is negatively boosted, so it is
    # expected to come last among the three hits.
    ranked_descriptions = hits["desc"].to_list()
    assert (
        ranked_descriptions[2]
        == "Beautiful landscapes but overpriced tourist spots."
    )
def test_fts_native():
# --8<-- [start:basic_fts]
uri = "data/sample-lancedb"