feat(python): add phrase query option for fts (#798)

addresses #797 

Problem: tantivy does not expose option to explicitly

Proposed solution here: 

1. Add a `.phrase_query()` option
2. Under the hood, LanceDB takes care of wrapping the input in quotes
and replace nested double quotes with single quotes

I've also filed an upstream issue, if they support phrase queries
natively then we can get rid of our manual custom processing here.
This commit is contained in:
Chang She
2024-01-09 19:41:31 -08:00
committed by Weston Pace
parent f17d16f935
commit 881dfa022b
2 changed files with 30 additions and 5 deletions

View File

@@ -468,6 +468,24 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
def __init__(self, table: "lancedb.table.Table", query: str):
super().__init__(table)
self._query = query
self._phrase_query = False
def phrase_query(self, phrase_query: bool = True) -> LanceFtsQueryBuilder:
"""Set whether to use phrase query.
Parameters
----------
phrase_query: bool, default True
If True, then the query will be wrapped in quotes and
double quotes replaced by single quotes.
Returns
-------
LanceFtsQueryBuilder
The LanceFtsQueryBuilder object.
"""
self._phrase_query = phrase_query
return self
def to_arrow(self) -> pa.Table:
try:
@@ -490,7 +508,11 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
# open the index
index = tantivy.Index.open(index_path)
# get the scores and doc ids
row_ids, scores = search_index(index, self._query, self._limit)
query = self._query
if self._phrase_query:
query = query.replace('"', "'")
query = f'"{query}"'
row_ids, scores = search_index(index, query, self._limit)
if len(row_ids) == 0:
empty_schema = pa.schema([pa.field("score", pa.float32())])
return pa.Table.from_pylist([], schema=empty_schema)

View File

@@ -169,13 +169,16 @@ def test_syntax(table):
table.create_fts_index("text")
with pytest.raises(ValueError, match="Syntax Error"):
table.search("they could have been dogs OR cats").limit(10).to_list()
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
# this should work
table.search('"they could have been dogs OR cats"').limit(10).to_list()
# this should work too
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
10
).to_list()
with pytest.raises(ValueError, match="Syntax Error"):
table.search('''"the cats OR dogs were not really "pets" at all"''').limit(
10
).to_list()
table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
10
).to_list()
table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit(
10
).to_list()