feat: support new FTS features in Python SDK (#2411)

- `AND` operator for match and multi-match queries
- `slop` parameter for phrase queries
- boolean queries (combine queries with `&` / `|`)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Added support for combining full-text search queries using AND/OR
    operators, enabling more flexible query composition.
  - Introduced new query types and parameters, including boolean queries,
    operator selection, occurrence constraints, and phrase slop for advanced
    search scenarios.
  - Enhanced asynchronous search to accept rich full-text query objects
    directly.

- **Bug Fixes**
  - Improved handling and validation of full-text search queries in both
    synchronous and asynchronous search operations.

- **Tests**
  - Updated and expanded tests to cover new full-text query types and
    their usage in search functions.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-06-06 14:33:46 +08:00
committed by GitHub
parent 65696d9713
commit 84ded9d678
6 changed files with 364 additions and 321 deletions

View File

@@ -165,17 +165,14 @@ class HybridQuery:
def get_with_row_id(self) -> bool: ...
def to_query_request(self) -> PyQueryRequest: ...
class PyFullTextSearchQuery:
columns: Optional[List[str]]
query: str
limit: Optional[int]
wand_factor: Optional[float]
class FullTextQuery:
    """Placeholder stub for a full-text query object; declares no members."""

    ...
class PyQueryRequest:
limit: Optional[int]
offset: Optional[int]
filter: Optional[Union[str, bytes]]
full_text_search: Optional[PyFullTextSearchQuery]
full_text_search: Optional[FullTextQuery]
select: Optional[Union[str, List[str]]]
fast_search: Optional[bool]
with_row_id: Optional[bool]

View File

@@ -4,7 +4,6 @@
from __future__ import annotations
from abc import ABC, abstractmethod
import abc
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from datetime import timedelta
@@ -88,15 +87,27 @@ def ensure_vector_query(
return val
class FullTextQueryType(Enum):
class FullTextQueryType(str, Enum):
    """
    Kinds of full-text search query.

    The ``str`` mixin makes every member an instance of ``str`` that compares
    equal to its lowercase string value.
    """

    MATCH = "match"
    MATCH_PHRASE = "match_phrase"
    BOOST = "boost"
    MULTI_MATCH = "multi_match"
    BOOLEAN = "boolean"
class FullTextQuery(abc.ABC, pydantic.BaseModel):
@abc.abstractmethod
class FullTextOperator(str, Enum):
    """
    Operator used to combine the terms of a single query string.

    Members are ``str`` instances equal to their upper-case names.
    """

    # Every term in the query must match.
    AND = "AND"
    # At least one term in the query must match.
    OR = "OR"
class Occur(str, Enum):
    """
    Occurrence requirement attached to each clause of a boolean query.

    Members are ``str`` instances equal to their upper-case names.
    """

    # Used for both operands when two queries are combined with ``&``.
    MUST = "MUST"
    # Used for both operands when two queries are combined with ``|``.
    SHOULD = "SHOULD"
@pydantic.dataclasses.dataclass
class FullTextQuery(ABC):
@abstractmethod
def query_type(self) -> FullTextQueryType:
"""
Get the query type of the query.
@@ -106,193 +117,174 @@ class FullTextQuery(abc.ABC, pydantic.BaseModel):
str
The type of the query.
"""
pass
@abc.abstractmethod
def to_dict(self) -> dict:
def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
"""
Convert the query to a dictionary.
Returns
-------
dict
The query as a dictionary.
"""
class MatchQuery(FullTextQuery):
query: str
column: str
boost: float = 1.0
fuzziness: int = 0
max_expansions: int = 50
def __init__(
self,
query: str,
column: str,
*,
boost: float = 1.0,
fuzziness: int = 0,
max_expansions: int = 50,
):
"""
Match query for full-text search.
Combine two queries with a logical AND operation.
Parameters
----------
query : str
The query string to match against.
column : str
The name of the column to match against.
boost : float, default 1.0
The boost factor for the query.
The score of each matching document is multiplied by this value.
fuzziness : int, optional
The maximum edit distance for each term in the match query.
Defaults to 0 (exact match).
If None, fuzziness is applied automatically by the rules:
- 0 for terms with length <= 2
- 1 for terms with length <= 5
- 2 for terms with length > 5
max_expansions : int, optional
The maximum number of terms to consider for fuzzy matching.
Defaults to 50.
other : FullTextQuery
The other query to combine with.
Returns
-------
FullTextQuery
A new query that combines both queries with AND.
"""
super().__init__(
query=query,
column=column,
boost=boost,
fuzziness=fuzziness,
max_expansions=max_expansions,
)
return BooleanQuery([(Occur.MUST, self), (Occur.MUST, other)])
def __or__(self, other: "FullTextQuery") -> "FullTextQuery":
    """
    Combine two queries with a logical OR operation.

    Invoked by the ``|`` operator, e.g. ``query_a | query_b``.

    Parameters
    ----------
    other : FullTextQuery
        The other query to combine with.

    Returns
    -------
    FullTextQuery
        A new `BooleanQuery` holding both operands as `Occur.SHOULD`
        clauses, with this query first.
    """
    return BooleanQuery([(Occur.SHOULD, self), (Occur.SHOULD, other)])
@pydantic.dataclasses.dataclass
class MatchQuery(FullTextQuery):
    """
    Match query for full-text search.

    Parameters
    ----------
    query : str
        The query string to match against.
    column : str
        The name of the column to match against.
    boost : float, default 1.0
        The boost factor for the query.
        The score of each matching document is multiplied by this value.
    fuzziness : int, optional
        The maximum edit distance for each term in the match query.
        Defaults to 0 (exact match).
        If None, fuzziness is applied automatically by the rules:

        - 0 for terms with length <= 2
        - 1 for terms with length <= 5
        - 2 for terms with length > 5
    max_expansions : int, optional
        The maximum number of terms to consider for fuzzy matching.
        Defaults to 50.
    operator : FullTextOperator, default OR
        The operator to use for combining the query results.
        Can be either `AND` or `OR`.
        If `AND`, all terms in the query must match.
        If `OR`, at least one term in the query must match.
    """

    query: str
    column: str
    boost: float = pydantic.Field(1.0, kw_only=True)
    # Typed Optional so that the documented ``fuzziness=None`` (automatic
    # fuzziness) passes validation instead of being rejected as a non-int.
    # NOTE(review): confirm the native query layer accepts None here.
    fuzziness: Optional[int] = pydantic.Field(0, kw_only=True)
    max_expansions: int = pydantic.Field(50, kw_only=True)
    operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)

    def query_type(self) -> FullTextQueryType:
        """Return `FullTextQueryType.MATCH`."""
        return FullTextQueryType.MATCH
def to_dict(self) -> dict:
return {
"match": {
self.column: {
"query": self.query,
"boost": self.boost,
"fuzziness": self.fuzziness,
"max_expansions": self.max_expansions,
}
}
}
@pydantic.dataclasses.dataclass
class PhraseQuery(FullTextQuery):
"""
Phrase query for full-text search.
Parameters
----------
query : str
The query string to match against.
column : str
The name of the column to match against.
"""
query: str
column: str
def __init__(self, query: str, column: str):
"""
Phrase query for full-text search.
Parameters
----------
query : str
The query string to match against.
column : str
The name of the column to match against.
"""
super().__init__(query=query, column=column)
slop: int = pydantic.Field(0, kw_only=True)
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.MATCH_PHRASE
def to_dict(self) -> dict:
return {
"match_phrase": {
self.column: self.query,
}
}
@pydantic.dataclasses.dataclass
class BoostQuery(FullTextQuery):
"""
Boost query for full-text search.
Parameters
----------
positive : FullTextQuery
The positive query object.
negative : FullTextQuery
The negative query object.
negative_boost : float, default 0.5
The boost factor for the negative query.
"""
positive: FullTextQuery
negative: FullTextQuery
negative_boost: float = 0.5
def __init__(
self,
positive: FullTextQuery,
negative: FullTextQuery,
*,
negative_boost: float = 0.5,
):
"""
Boost query for full-text search.
Parameters
----------
positive : dict
The positive query object.
negative : dict
The negative query object.
negative_boost : float
The boost factor for the negative query.
"""
super().__init__(
positive=positive, negative=negative, negative_boost=negative_boost
)
negative_boost: float = pydantic.Field(0.5, kw_only=True)
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.BOOST
def to_dict(self) -> dict:
return {
"boost": {
"positive": self.positive.to_dict(),
"negative": self.negative.to_dict(),
"negative_boost": self.negative_boost,
}
}
@pydantic.dataclasses.dataclass
class MultiMatchQuery(FullTextQuery):
"""
Multi-match query for full-text search.
Parameters
----------
query : str
The query string to match against.
columns : list[str]
The list of columns to match against.
boosts : list[float], optional
The list of boost factors for each column. If not provided,
all columns will have the same boost factor.
operator : FullTextOperator, default OR
The operator to use for combining the query results.
Can be either `AND` or `OR`.
It would be applied to all columns individually.
For example, if the operator is `AND`,
then the query "hello world" is equal to
`match("hello AND world", column1) OR match("hello AND world", column2)`.
"""
query: str
columns: list[str]
boosts: list[float]
def __init__(
self,
query: str,
columns: list[str],
*,
boosts: Optional[list[float]] = None,
):
"""
Multi-match query for full-text search.
Parameters
----------
query : str
The query string to match against.
columns : list[str]
The list of columns to match against.
boosts : list[float], optional
The list of boost factors for each column. If not provided,
all columns will have the same boost factor.
"""
if boosts is None:
boosts = [1.0] * len(columns)
super().__init__(query=query, columns=columns, boosts=boosts)
boosts: Optional[list[float]] = pydantic.Field(None, kw_only=True)
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
def query_type(self) -> FullTextQueryType:
return FullTextQueryType.MULTI_MATCH
def to_dict(self) -> dict:
return {
"multi_match": {
"query": self.query,
"columns": self.columns,
"boost": self.boosts,
}
}
@pydantic.dataclasses.dataclass
class BooleanQuery(FullTextQuery):
    """
    Boolean query for full-text search.

    Parameters
    ----------
    queries : list[tuple[Occur, FullTextQuery]]
        The list of queries with their occurrence requirements.
        Each entry pairs an `Occur` value with the sub-query it applies to.
    """

    queries: list[tuple[Occur, FullTextQuery]]

    def query_type(self) -> FullTextQueryType:
        """Return `FullTextQueryType.BOOLEAN`."""
        return FullTextQueryType.BOOLEAN
class FullTextSearchQuery(pydantic.BaseModel):
@@ -493,10 +485,8 @@ class Query(pydantic.BaseModel):
query.postfilter = req.postfilter
if req.full_text_search is not None:
query.full_text_query = FullTextSearchQuery(
columns=req.full_text_search.columns,
query=req.full_text_search.query,
limit=req.full_text_search.limit,
wand_factor=req.full_text_search.wand_factor,
columns=None,
query=req.full_text_search,
)
return query
@@ -2513,7 +2503,7 @@ class AsyncQuery(AsyncQueryBase):
self._inner.nearest_to_text({"query": query, "columns": columns})
)
# FullTextQuery object
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))
class AsyncFTSQuery(AsyncQueryBase):
@@ -2835,7 +2825,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
self._inner.nearest_to_text({"query": query, "columns": columns})
)
# FullTextQuery object
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query}))
async def to_batches(
self,

View File

@@ -215,6 +215,19 @@ def test_search_fts(table, use_tantivy):
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
# Test boolean query
results = (
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
for r in results:
assert "puppy" in r["text"]
assert "runs" in r["text"]
@pytest.mark.asyncio
async def test_fts_select_async(async_table):

View File

@@ -25,6 +25,8 @@ from lancedb.query import (
AsyncQueryBase,
AsyncVectorQuery,
LanceVectorQueryBuilder,
MatchQuery,
PhraseQuery,
Query,
FullTextSearchQuery,
)
@@ -1065,18 +1067,27 @@ async def test_query_serialization_async(table_async: AsyncTable):
)
# FTS queries
q = (await table_async.search("foo")).limit(10).to_query_object()
match_query = MatchQuery("foo", "text")
q = (await table_async.search(match_query)).limit(10).to_query_object()
check_set_props(
q,
limit=10,
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
with_row_id=False,
)
q = (await table_async.search("foo", query_type="fts")).to_query_object()
q = (await table_async.search(match_query)).to_query_object()
check_set_props(
q,
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
with_row_id=False,
)
phrase_query = PhraseQuery("foo", "text", slop=1)
q = (await table_async.search(phrase_query)).to_query_object()
check_set_props(
q,
full_text_query=FullTextSearchQuery(columns=None, query=phrase_query),
with_row_id=False,
)