fix: bugs for new FTS APIs (#2314)

## Summary by CodeRabbit

- **New Features**
  - Enhanced full-text search capabilities with support for phrase queries, fuzzy matching, boosting, and multi-column matching.
  - Search methods now accept full-text query objects directly, improving query flexibility and precision.
  - Python and JavaScript SDKs updated to handle full-text queries seamlessly, including async search support.
- **Tests**
  - Added comprehensive tests covering fuzzy search, phrase search, and boosted queries to ensure robust full-text search functionality.
- **Documentation**
  - Updated query class documentation to reflect new constructor options and removal of deprecated methods for clarity and simplicity.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2026-06-01 19:30:45 +00:00 · 2025-04-15 11:51:35 +08:00
parent a6fa69ab89
commit 2248aa9508
15 changed files with 397 additions and 415 deletions
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -266,8 +266,8 @@ class MultiMatchQuery(FullTextQuery):

        Parameters
        ----------
-        query : str | list[Query]
-            If a string, the query string to match against.
+        query : str
+            The query string to match against.

        columns : list[str]
            The list of columns to match against.
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -2141,6 +2141,8 @@ class LanceTable(Table):
            and also the "_distance" column which is the distance between the query
            vector and the returned vector.
        """
+        if isinstance(query, FullTextQuery):
+            query_type = "fts"
        vector_column_name = infer_vector_column_name(
            schema=self.schema,
            query_type=query_type,
@@ -3223,8 +3225,10 @@ class AsyncTable:
        async def get_embedding_func(
            vector_column_name: Optional[str],
            query_type: QueryType,
-            query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]],
+            query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple, FullTextQuery]],
        ) -> Tuple[str, EmbeddingFunctionConfig]:
+            if isinstance(query, FullTextQuery):
+                query_type = "fts"
            schema = await self.schema()
            vector_column_name = infer_vector_column_name(
                schema=schema,
--- a/python/python/lancedb/util.py
+++ b/python/python/lancedb/util.py
@@ -253,9 +253,14 @@ def infer_vector_column_name(
    query: Optional[Any],  # inferred later in query builder
    vector_column_name: Optional[str],
 ):
-    if (vector_column_name is None and query is not None and query_type != "fts") or (
-        vector_column_name is None and query_type == "hybrid"
-    ):
+    if vector_column_name is not None:
+        return vector_column_name
+
+    if query_type == "fts":
+        # FTS queries do not require a vector column
+        return None
+
+    if query is not None or query_type == "hybrid":
        try:
            vector_column_name = inf_vector_column_query(schema)
        except Exception as e:
--- a/python/python/tests/docs/test_search.py
+++ b/python/python/tests/docs/test_search.py
@@ -6,7 +6,9 @@ import lancedb

 # --8<-- [end:import-lancedb]
 # --8<-- [start:import-numpy]
+from lancedb.query import BoostQuery, MatchQuery
 import numpy as np
+import pyarrow as pa

 # --8<-- [end:import-numpy]
 # --8<-- [start:import-datetime]
@@ -154,6 +156,84 @@ async def test_vector_search_async():
    # --8<-- [end:search_result_async_as_list]


+def test_fts_fuzzy_query():
+    uri = "data/fuzzy-example"
+    db = lancedb.connect(uri)
+
+    table = db.create_table(
+        "my_table_fts_fuzzy",
+        data=pa.table(
+            {
+                "text": [
+                    "fa",
+                    "fo",  # spellchecker:disable-line
+                    "fob",
+                    "focus",
+                    "foo",
+                    "food",
+                    "foul",
+                ]
+            }
+        ),
+        mode="overwrite",
+    )
+    table.create_fts_index("text", use_tantivy=False, replace=True)
+
+    results = table.search(MatchQuery("foo", "text", fuzziness=1)).to_pandas()
+    assert len(results) == 4
+    assert set(results["text"].to_list()) == {
+        "foo",
+        "fo",  # 1 deletion # spellchecker:disable-line
+        "fob",  # 1 substitution
+        "food",  # 1 insertion
+    }
+
+
+def test_fts_boost_query():
+    uri = "data/boost-example"
+    db = lancedb.connect(uri)
+
+    table = db.create_table(
+        "my_table_fts_boost",
+        data=pa.table(
+            {
+                "title": [
+                    "The Hidden Gems of Travel",
+                    "Exploring Nature's Wonders",
+                    "Cultural Treasures Unveiled",
+                    "The Nightlife Chronicles",
+                    "Scenic Escapes and Challenges",
+                ],
+                "desc": [
+                    "A vibrant city with occasional traffic jams.",
+                    "Beautiful landscapes but overpriced tourist spots.",
+                    "Rich cultural heritage but humid summers.",
+                    "Bustling nightlife but noisy streets.",
+                    "Scenic views but limited public transport options.",
+                ],
+            }
+        ),
+        mode="overwrite",
+    )
+    table.create_fts_index("desc", use_tantivy=False, replace=True)
+
+    results = table.search(
+        BoostQuery(
+            MatchQuery("beautiful, cultural, nightlife", "desc"),
+            MatchQuery("bad traffic jams, overpriced", "desc"),
+        ),
+    ).to_pandas()
+
+    # we will hit 3 results because the positive query has 3 hits
+    assert len(results) == 3
+    # the one containing "overpriced" will be negatively boosted,
+    # so it will be the last one
+    assert (
+        results["desc"].to_list()[2]
+        == "Beautiful landscapes but overpriced tourist spots."
+    )
+
+
 def test_fts_native():
    # --8<-- [start:basic_fts]
    uri = "data/sample-lancedb"