From 881dfa022b40a1aa3e930c5183b685e23d377c93 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Tue, 9 Jan 2024 19:41:31 -0800 Subject: [PATCH] feat(python): add phrase query option for fts (#798) addresses #797 Problem: tantivy does not expose an option to explicitly request a phrase query. Proposed solution here: 1. Add a `.phrase_query()` option 2. Under the hood, LanceDB takes care of wrapping the input in quotes and replacing nested double quotes with single quotes. I've also filed an upstream issue; if they support phrase queries natively, then we can get rid of our manual custom processing here. --- python/lancedb/query.py | 24 +++++++++++++++++++++++- python/tests/test_fts.py | 11 +++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/python/lancedb/query.py b/python/lancedb/query.py index dcc72c58..532566da 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -468,6 +468,24 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): def __init__(self, table: "lancedb.table.Table", query: str): super().__init__(table) self._query = query + self._phrase_query = False + + def phrase_query(self, phrase_query: bool = True) -> LanceFtsQueryBuilder: + """Set whether to use phrase query. + + Parameters + ---------- + phrase_query: bool, default True + If True, then the query will be wrapped in quotes and + double quotes replaced by single quotes. + + Returns + ------- + LanceFtsQueryBuilder + The LanceFtsQueryBuilder object. 
+ """ + self._phrase_query = phrase_query + return self def to_arrow(self) -> pa.Table: try: @@ -490,7 +508,11 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): # open the index index = tantivy.Index.open(index_path) # get the scores and doc ids - row_ids, scores = search_index(index, self._query, self._limit) + query = self._query + if self._phrase_query: + query = query.replace('"', "'") + query = f'"{query}"' + row_ids, scores = search_index(index, query, self._limit) if len(row_ids) == 0: empty_schema = pa.schema([pa.field("score", pa.float32())]) return pa.Table.from_pylist([], schema=empty_schema) diff --git a/python/tests/test_fts.py b/python/tests/test_fts.py index f65dc4ca..a62b1b2e 100644 --- a/python/tests/test_fts.py +++ b/python/tests/test_fts.py @@ -169,13 +169,16 @@ def test_syntax(table): table.create_fts_index("text") with pytest.raises(ValueError, match="Syntax Error"): table.search("they could have been dogs OR cats").limit(10).to_list() + table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list() # this should work table.search('"they could have been dogs OR cats"').limit(10).to_list() # this should work too table.search('''"the cats OR dogs were not really 'pets' at all"''').limit( 10 ).to_list() - with pytest.raises(ValueError, match="Syntax Error"): - table.search('''"the cats OR dogs were not really "pets" at all"''').limit( - 10 - ).to_list() + table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit( + 10 + ).to_list() + table.search('the cats OR dogs were not really "pets" at all').phrase_query().limit( + 10 + ).to_list()