From cb70ff8cee445669182c93edf8dbc5d6ca57e4fc Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 19 Jun 2025 10:38:34 +0800 Subject: [PATCH] feat!: switch default FTS to native lance FTS (#2428) This switches the default FTS to native lance FTS for the Python sync table API; the other APIs have already switched to the native implementation. ## Summary by CodeRabbit - **New Features** - The default behavior for creating a full-text search index now uses the new implementation rather than the legacy one. - **Bug Fixes** - Improved handling and error messages for phrase queries in full-text search. --------- Signed-off-by: BubbleCal --- python/python/lancedb/query.py | 11 +++++++---- python/python/lancedb/table.py | 6 +++--- python/python/tests/test_s3.py | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 338a7f16..20a6bdc0 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -1451,10 +1451,13 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): query = self._query if self._phrase_query: - raise NotImplementedError( - "Phrase query is not yet supported in Lance FTS. " - "Use tantivy-based index instead for now." 
- ) + if isinstance(query, str): + if not query.startswith('"') or not query.endswith('"'): + query = f'"{query}"' + elif isinstance(query, FullTextQuery) and not isinstance( + query, PhraseQuery + ): + raise TypeError("Please use PhraseQuery for phrase queries.") query = self.to_query_object() results = self._table._execute_query(query, timeout=timeout) results = results.read_all() diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 5bc78f9f..6a964104 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -827,7 +827,7 @@ class Table(ABC): ordering_field_names: Optional[Union[str, List[str]]] = None, replace: bool = False, writer_heap_size: Optional[int] = 1024 * 1024 * 1024, - use_tantivy: bool = True, + use_tantivy: bool = False, tokenizer_name: Optional[str] = None, with_position: bool = False, # tokenizer configs: @@ -864,7 +864,7 @@ class Table(ABC): The tokenizer to use for the index. Can be "raw", "default" or the 2 letter language code followed by "_stem". So for english it would be "en_stem". For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html - use_tantivy: bool, default True + use_tantivy: bool, default False If True, use the legacy full-text search implementation based on tantivy. If False, use the new full-text search implementation based on lance-index. 
with_position: bool, default False @@ -1970,7 +1970,7 @@ class LanceTable(Table): ordering_field_names: Optional[Union[str, List[str]]] = None, replace: bool = False, writer_heap_size: Optional[int] = 1024 * 1024 * 1024, - use_tantivy: bool = True, + use_tantivy: bool = False, tokenizer_name: Optional[str] = None, with_position: bool = False, # tokenizer configs: diff --git a/python/python/tests/test_s3.py b/python/python/tests/test_s3.py index 3bee14cd..3b62e116 100644 --- a/python/python/tests/test_s3.py +++ b/python/python/tests/test_s3.py @@ -245,7 +245,7 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch): NotImplementedError, match="Full-text search is only supported on the local filesystem", ): - table.create_fts_index("x") + table.create_fts_index("x", use_tantivy=True) # make sure list tables still works assert db.table_names() == ["test_ddb_sync"]