mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-04 02:42:57 +00:00
feat(python): add phrase query option for fts (#798)
Addresses #797. Problem: tantivy does not expose an option to explicitly request a phrase query. Proposed solution: 1. Add a `.phrase_query()` option. 2. Under the hood, LanceDB takes care of wrapping the input in quotes and replacing nested double quotes with single quotes. I've also filed an upstream issue; if they support phrase queries natively, then we can get rid of our manual custom processing here.
This commit is contained in:
committed by
Andrew Miracle
parent
fcfb4587bb
commit
ff81c0d698
@@ -468,6 +468,24 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
def __init__(self, table: "lancedb.table.Table", query: str):
    """Create a full-text search builder for ``query`` over ``table``.

    Phrase-query mode starts out disabled.
    """
    super().__init__(table)
    # Off by default: the query text is used exactly as the caller wrote it.
    self._phrase_query = False
    self._query = query
|
||||
|
||||
def phrase_query(self, phrase_query: bool = True) -> "LanceFtsQueryBuilder":
    """Set whether to use phrase query.

    Parameters
    ----------
    phrase_query: bool, default True
        If True, then the query will be wrapped in double quotes and
        any double quotes inside it replaced by single quotes.

    Returns
    -------
    LanceFtsQueryBuilder
        The LanceFtsQueryBuilder object, so calls can be chained.
    """
    # Only the flag is stored here; the query text itself is left untouched
    # by this method.
    self._phrase_query = phrase_query
    return self
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
try:
|
||||
@@ -490,7 +508,11 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
# open the index
|
||||
index = tantivy.Index.open(index_path)
|
||||
# get the scores and doc ids
|
||||
row_ids, scores = search_index(index, self._query, self._limit)
|
||||
query = self._query
|
||||
if self._phrase_query:
|
||||
query = query.replace('"', "'")
|
||||
query = f'"{query}"'
|
||||
row_ids, scores = search_index(index, query, self._limit)
|
||||
if len(row_ids) == 0:
|
||||
empty_schema = pa.schema([pa.field("score", pa.float32())])
|
||||
return pa.Table.from_pylist([], schema=empty_schema)
|
||||
|
||||
@@ -169,13 +169,16 @@ def test_syntax(table):
|
||||
table.create_fts_index("text")
|
||||
with pytest.raises(ValueError, match="Syntax Error"):
|
||||
table.search("they could have been dogs OR cats").limit(10).to_list()
|
||||
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
|
||||
# this should work
|
||||
table.search('"they could have been dogs OR cats"').limit(10).to_list()
|
||||
# this should work too
|
||||
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
|
||||
10
|
||||
).to_list()
|
||||
with pytest.raises(ValueError, match="Syntax Error"):
|
||||
table.search('''"the cats OR dogs were not really "pets" at all"''').limit(
|
||||
10
|
||||
).to_list()
|
||||
table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
|
||||
10
|
||||
).to_list()
|
||||
table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
|
||||
10
|
||||
).to_list()
|
||||
|
||||
Reference in New Issue
Block a user