mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 22:59:57 +00:00
feat: support new FTS features in python SDK (#2411)
- AND operator
- phrase query slop param
- boolean query

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **New Features**
  - Added support for combining full-text search queries using AND/OR operators, enabling more flexible query composition.
  - Introduced new query types and parameters, including boolean queries, operator selection, occurrence constraints, and phrase slop for advanced search scenarios.
  - Enhanced asynchronous search to accept rich full-text query objects directly.
- **Bug Fixes**
  - Improved handling and validation of full-text search queries in both synchronous and asynchronous search operations.
- **Tests**
  - Updated and expanded tests to cover new full-text query types and their usage in search functions.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
@@ -165,17 +165,14 @@ class HybridQuery:
|
||||
def get_with_row_id(self) -> bool: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
|
||||
class PyFullTextSearchQuery:
|
||||
columns: Optional[List[str]]
|
||||
query: str
|
||||
limit: Optional[int]
|
||||
wand_factor: Optional[float]
|
||||
# Placeholder declaration so the annotations in this hunk (for example
# `PyQueryRequest.full_text_search`) can refer to a full-text query object
# by name; it declares no members.
class FullTextQuery:
    pass
|
||||
|
||||
class PyQueryRequest:
|
||||
limit: Optional[int]
|
||||
offset: Optional[int]
|
||||
filter: Optional[Union[str, bytes]]
|
||||
full_text_search: Optional[PyFullTextSearchQuery]
|
||||
full_text_search: Optional[FullTextQuery]
|
||||
select: Optional[Union[str, List[str]]]
|
||||
fast_search: Optional[bool]
|
||||
with_row_id: Optional[bool]
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import abc
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from enum import Enum
|
||||
from datetime import timedelta
|
||||
@@ -88,15 +87,27 @@ def ensure_vector_query(
|
||||
return val
|
||||
|
||||
|
||||
class FullTextQueryType(Enum):
|
||||
class FullTextQueryType(str, Enum):
    """
    Discriminator naming the kind of a full-text query.

    Each concrete query class returns one of these members from its
    ``query_type()`` method. Because of the ``str`` mixin, every member
    is also a plain string and compares equal to its value
    (e.g. ``FullTextQueryType.MATCH == "match"``).
    """

    MATCH = "match"
    MATCH_PHRASE = "match_phrase"
    BOOST = "boost"
    MULTI_MATCH = "multi_match"
    BOOLEAN = "boolean"
|
||||
|
||||
|
||||
class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
||||
@abc.abstractmethod
|
||||
class FullTextOperator(str, Enum):
    """
    Operator used to combine the terms of a full-text query.

    ``AND`` requires all terms in the query to match; ``OR`` requires at
    least one term to match. The ``str`` mixin makes each member compare
    equal to its string value.
    """

    AND = "AND"
    OR = "OR"
|
||||
|
||||
|
||||
class Occur(str, Enum):
    """
    Occurrence requirement attached to a clause of a boolean query.

    ``FullTextQuery.__and__`` tags both of its operands with ``MUST`` and
    ``FullTextQuery.__or__`` tags both with ``SHOULD`` when building a
    ``BooleanQuery``. The ``str`` mixin makes each member compare equal
    to its string value.
    """

    MUST = "MUST"
    SHOULD = "SHOULD"
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class FullTextQuery(ABC):
|
||||
@abstractmethod
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
"""
|
||||
Get the query type of the query.
|
||||
@@ -106,193 +117,174 @@ class FullTextQuery(abc.ABC, pydantic.BaseModel):
|
||||
str
|
||||
The type of the query.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def to_dict(self) -> dict:
|
||||
def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
|
||||
"""
|
||||
Convert the query to a dictionary.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
The query as a dictionary.
|
||||
"""
|
||||
|
||||
|
||||
class MatchQuery(FullTextQuery):
|
||||
query: str
|
||||
column: str
|
||||
boost: float = 1.0
|
||||
fuzziness: int = 0
|
||||
max_expansions: int = 50
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query: str,
|
||||
column: str,
|
||||
*,
|
||||
boost: float = 1.0,
|
||||
fuzziness: int = 0,
|
||||
max_expansions: int = 50,
|
||||
):
|
||||
"""
|
||||
Match query for full-text search.
|
||||
Combine two queries with a logical AND operation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
boost : float, default 1.0
|
||||
The boost factor for the query.
|
||||
The score of each matching document is multiplied by this value.
|
||||
fuzziness : int, optional
|
||||
The maximum edit distance for each term in the match query.
|
||||
Defaults to 0 (exact match).
|
||||
If None, fuzziness is applied automatically by the rules:
|
||||
- 0 for terms with length <= 2
|
||||
- 1 for terms with length <= 5
|
||||
- 2 for terms with length > 5
|
||||
max_expansions : int, optional
|
||||
The maximum number of terms to consider for fuzzy matching.
|
||||
Defaults to 50.
|
||||
other : FullTextQuery
|
||||
The other query to combine with.
|
||||
|
||||
Returns
|
||||
-------
|
||||
FullTextQuery
|
||||
A new query that combines both queries with AND.
|
||||
"""
|
||||
super().__init__(
|
||||
query=query,
|
||||
column=column,
|
||||
boost=boost,
|
||||
fuzziness=fuzziness,
|
||||
max_expansions=max_expansions,
|
||||
)
|
||||
return BooleanQuery([(Occur.MUST, self), (Occur.MUST, other)])
|
||||
|
||||
def __or__(self, other: "FullTextQuery") -> "FullTextQuery":
|
||||
"""
|
||||
Combine two queries with a logical OR operation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
other : FullTextQuery
|
||||
The other query to combine with.
|
||||
|
||||
Returns
|
||||
-------
|
||||
FullTextQuery
|
||||
A new query that combines both queries with OR.
|
||||
"""
|
||||
return BooleanQuery([(Occur.SHOULD, self), (Occur.SHOULD, other)])
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
class MatchQuery(FullTextQuery):
    """
    Match query for full-text search.

    Parameters
    ----------
    query : str
        The query string to match against.
    column : str
        The name of the column to match against.
    boost : float, default 1.0
        The boost factor for the query.
        The score of each matching document is multiplied by this value.
    fuzziness : int, optional
        The maximum edit distance for each term in the match query.
        Defaults to 0 (exact match).
        If None, fuzziness is applied automatically by the rules:
            - 0 for terms with length <= 2
            - 1 for terms with length <= 5
            - 2 for terms with length > 5
    max_expansions : int, optional
        The maximum number of terms to consider for fuzzy matching.
        Defaults to 50.
    operator : FullTextOperator, default OR
        The operator to use for combining the query results.
        Can be either `AND` or `OR`.
        If `AND`, all terms in the query must match.
        If `OR`, at least one term in the query must match.
    """

    # `query` and `column` are the only positional fields; every option
    # below is declared keyword-only.
    query: str
    column: str
    boost: float = pydantic.Field(1.0, kw_only=True)
    # NOTE(review): the docstring describes `fuzziness=None` (automatic
    # fuzziness), but the field is annotated `int`, so None is not an
    # accepted value as declared. Confirm whether `Optional[int]` was
    # intended before relying on the None behaviour.
    fuzziness: int = pydantic.Field(0, kw_only=True)
    max_expansions: int = pydantic.Field(50, kw_only=True)
    operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.MATCH``."""
        return FullTextQueryType.MATCH
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"match": {
|
||||
self.column: {
|
||||
"query": self.query,
|
||||
"boost": self.boost,
|
||||
"fuzziness": self.fuzziness,
|
||||
"max_expansions": self.max_expansions,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class PhraseQuery(FullTextQuery):
|
||||
"""
|
||||
Phrase query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
"""
|
||||
|
||||
query: str
|
||||
column: str
|
||||
|
||||
def __init__(self, query: str, column: str):
|
||||
"""
|
||||
Phrase query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
column : str
|
||||
The name of the column to match against.
|
||||
"""
|
||||
super().__init__(query=query, column=column)
|
||||
slop: int = pydantic.Field(0, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MATCH_PHRASE
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"match_phrase": {
|
||||
self.column: self.query,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class BoostQuery(FullTextQuery):
|
||||
"""
|
||||
Boost query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
positive : dict
|
||||
The positive query object.
|
||||
negative : dict
|
||||
The negative query object.
|
||||
negative_boost : float, default 0.5
|
||||
The boost factor for the negative query.
|
||||
"""
|
||||
|
||||
positive: FullTextQuery
|
||||
negative: FullTextQuery
|
||||
negative_boost: float = 0.5
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
positive: FullTextQuery,
|
||||
negative: FullTextQuery,
|
||||
*,
|
||||
negative_boost: float = 0.5,
|
||||
):
|
||||
"""
|
||||
Boost query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
positive : dict
|
||||
The positive query object.
|
||||
negative : dict
|
||||
The negative query object.
|
||||
negative_boost : float
|
||||
The boost factor for the negative query.
|
||||
"""
|
||||
super().__init__(
|
||||
positive=positive, negative=negative, negative_boost=negative_boost
|
||||
)
|
||||
negative_boost: float = pydantic.Field(0.5, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.BOOST
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"boost": {
|
||||
"positive": self.positive.to_dict(),
|
||||
"negative": self.negative.to_dict(),
|
||||
"negative_boost": self.negative_boost,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
class MultiMatchQuery(FullTextQuery):
|
||||
"""
|
||||
Multi-match query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str | list[Query]
|
||||
If a string, the query string to match against.
|
||||
columns : list[str]
|
||||
The list of columns to match against.
|
||||
boosts : list[float], optional
|
||||
The list of boost factors for each column. If not provided,
|
||||
all columns will have the same boost factor.
|
||||
operator : FullTextOperator, default OR
|
||||
The operator to use for combining the query results.
|
||||
Can be either `AND` or `OR`.
|
||||
It would be applied to all columns individually.
|
||||
For example, if the operator is `AND`,
|
||||
then the query "hello world" is equal to
|
||||
`match("hello AND world", column1) OR match("hello AND world", column2)`.
|
||||
"""
|
||||
|
||||
query: str
|
||||
columns: list[str]
|
||||
boosts: list[float]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query: str,
|
||||
columns: list[str],
|
||||
*,
|
||||
boosts: Optional[list[float]] = None,
|
||||
):
|
||||
"""
|
||||
Multi-match query for full-text search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
The query string to match against.
|
||||
|
||||
columns : list[str]
|
||||
The list of columns to match against.
|
||||
|
||||
boosts : list[float], optional
|
||||
The list of boost factors for each column. If not provided,
|
||||
all columns will have the same boost factor.
|
||||
"""
|
||||
if boosts is None:
|
||||
boosts = [1.0] * len(columns)
|
||||
super().__init__(query=query, columns=columns, boosts=boosts)
|
||||
boosts: Optional[list[float]] = pydantic.Field(None, kw_only=True)
|
||||
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MULTI_MATCH
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"multi_match": {
|
||||
"query": self.query,
|
||||
"columns": self.columns,
|
||||
"boost": self.boosts,
|
||||
}
|
||||
}
|
||||
|
||||
@pydantic.dataclasses.dataclass
class BooleanQuery(FullTextQuery):
    """
    Boolean query for full-text search.

    Parameters
    ----------
    queries : list[tuple(Occur, FullTextQuery)]
        The list of queries with their occurrence requirements.
    """

    # Each entry pairs a sub-query with its `Occur` requirement. The `&`
    # and `|` operators on FullTextQuery build two-entry lists of this
    # shape (MUST/MUST and SHOULD/SHOULD respectively).
    queries: list[tuple[Occur, FullTextQuery]]

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.BOOLEAN``."""
        return FullTextQueryType.BOOLEAN
|
||||
|
||||
|
||||
class FullTextSearchQuery(pydantic.BaseModel):
|
||||
@@ -493,10 +485,8 @@ class Query(pydantic.BaseModel):
|
||||
query.postfilter = req.postfilter
|
||||
if req.full_text_search is not None:
|
||||
query.full_text_query = FullTextSearchQuery(
|
||||
columns=req.full_text_search.columns,
|
||||
query=req.full_text_search.query,
|
||||
limit=req.full_text_search.limit,
|
||||
wand_factor=req.full_text_search.wand_factor,
|
||||
columns=None,
|
||||
query=req.full_text_search,
|
||||
)
|
||||
return query
|
||||
|
||||
@@ -2513,7 +2503,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
# FullTextQuery object
|
||||
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||
return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))
|
||||
|
||||
|
||||
class AsyncFTSQuery(AsyncQueryBase):
|
||||
@@ -2835,7 +2825,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
# FullTextQuery object
|
||||
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query.to_dict()}))
|
||||
return AsyncHybridQuery(self._inner.nearest_to_text({"query": query}))
|
||||
|
||||
async def to_batches(
|
||||
self,
|
||||
|
||||
@@ -215,6 +215,19 @@ def test_search_fts(table, use_tantivy):
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fts_select_async(async_table):
|
||||
|
||||
@@ -25,6 +25,8 @@ from lancedb.query import (
|
||||
AsyncQueryBase,
|
||||
AsyncVectorQuery,
|
||||
LanceVectorQueryBuilder,
|
||||
MatchQuery,
|
||||
PhraseQuery,
|
||||
Query,
|
||||
FullTextSearchQuery,
|
||||
)
|
||||
@@ -1065,18 +1067,27 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
)
|
||||
|
||||
# FTS queries
|
||||
q = (await table_async.search("foo")).limit(10).to_query_object()
|
||||
match_query = MatchQuery("foo", "text")
|
||||
q = (await table_async.search(match_query)).limit(10).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
limit=10,
|
||||
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
|
||||
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
|
||||
with_row_id=False,
|
||||
)
|
||||
|
||||
q = (await table_async.search("foo", query_type="fts")).to_query_object()
|
||||
q = (await table_async.search(match_query)).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
full_text_query=FullTextSearchQuery(columns=[], query="foo"),
|
||||
full_text_query=FullTextSearchQuery(columns=None, query=match_query),
|
||||
with_row_id=False,
|
||||
)
|
||||
|
||||
phrase_query = PhraseQuery("foo", "text", slop=1)
|
||||
q = (await table_async.search(phrase_query)).to_query_object()
|
||||
check_set_props(
|
||||
q,
|
||||
full_text_query=FullTextSearchQuery(columns=None, query=phrase_query),
|
||||
with_row_id=False,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user