mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 07:09:57 +00:00
feat: support prefix matching and must_not clause (#2441)
This commit is contained in:
@@ -101,8 +101,9 @@ class FullTextOperator(str, Enum):
|
||||
|
||||
|
||||
class Occur(str, Enum):
|
||||
MUST = "MUST"
|
||||
SHOULD = "SHOULD"
|
||||
MUST = "MUST"
|
||||
MUST_NOT = "MUST_NOT"
|
||||
|
||||
|
||||
@pydantic.dataclasses.dataclass
|
||||
@@ -181,6 +182,9 @@ class MatchQuery(FullTextQuery):
|
||||
Can be either `AND` or `OR`.
|
||||
If `AND`, all terms in the query must match.
|
||||
If `OR`, at least one term in the query must match.
|
||||
prefix_length : int, optional
|
||||
The number of leading characters that must match exactly (i.e. are left unchanged) during fuzzy matching.
|
||||
This is useful to achieve prefix matching.
|
||||
"""
|
||||
|
||||
query: str
|
||||
@@ -189,6 +193,7 @@ class MatchQuery(FullTextQuery):
|
||||
fuzziness: int = pydantic.Field(0, kw_only=True)
|
||||
max_expansions: int = pydantic.Field(50, kw_only=True)
|
||||
operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
|
||||
prefix_length: int = pydantic.Field(0, kw_only=True)
|
||||
|
||||
def query_type(self) -> FullTextQueryType:
|
||||
return FullTextQueryType.MATCH
|
||||
|
||||
@@ -6,7 +6,7 @@ import lancedb
|
||||
|
||||
# --8<-- [end:import-lancedb]
|
||||
# --8<-- [start:import-numpy]
|
||||
from lancedb.query import BoostQuery, MatchQuery
|
||||
from lancedb.query import BooleanQuery, BoostQuery, MatchQuery, Occur
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -191,6 +191,15 @@ def test_fts_fuzzy_query():
|
||||
"food", # 1 insertion
|
||||
}
|
||||
|
||||
results = table.search(
|
||||
MatchQuery("foo", "text", fuzziness=1, prefix_length=3)
|
||||
).to_pandas()
|
||||
assert len(results) == 2
|
||||
assert set(results["text"].to_list()) == {
|
||||
"foo",
|
||||
"food",
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||
@@ -240,6 +249,60 @@ def test_fts_boost_query():
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||
)
|
||||
def test_fts_boolean_query(tmp_path):
|
||||
uri = tmp_path / "boolean-example"
|
||||
db = lancedb.connect(uri)
|
||||
table = db.create_table(
|
||||
"my_table_fts_boolean",
|
||||
data=[
|
||||
{"text": "The cat and dog are playing"},
|
||||
{"text": "The cat is sleeping"},
|
||||
{"text": "The dog is barking"},
|
||||
{"text": "The dog chases the cat"},
|
||||
],
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
|
||||
# SHOULD
|
||||
results = table.search(
|
||||
MatchQuery("cat", "text") | MatchQuery("dog", "text")
|
||||
).to_pandas()
|
||||
assert len(results) == 4
|
||||
assert set(results["text"].to_list()) == {
|
||||
"The cat and dog are playing",
|
||||
"The cat is sleeping",
|
||||
"The dog is barking",
|
||||
"The dog chases the cat",
|
||||
}
|
||||
# MUST
|
||||
results = table.search(
|
||||
MatchQuery("cat", "text") & MatchQuery("dog", "text")
|
||||
).to_pandas()
|
||||
assert len(results) == 2
|
||||
assert set(results["text"].to_list()) == {
|
||||
"The cat and dog are playing",
|
||||
"The dog chases the cat",
|
||||
}
|
||||
|
||||
# MUST NOT
|
||||
results = table.search(
|
||||
BooleanQuery(
|
||||
[
|
||||
(Occur.MUST, MatchQuery("cat", "text")),
|
||||
(Occur.MUST_NOT, MatchQuery("dog", "text")),
|
||||
]
|
||||
)
|
||||
).to_pandas()
|
||||
assert len(results) == 1
|
||||
assert set(results["text"].to_list()) == {
|
||||
"The cat is sleeping",
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user