feat!: switch default FTS to native lance FTS (#2428)

This switches the default FTS to the native Lance FTS for the Python sync table
API; the other APIs have already switched to the native implementation.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Creating a full-text search index now uses the new lance-index-based
implementation by default rather than the legacy tantivy-based one.
- **Bug Fixes**
- Improved handling and error messages for phrase queries in full-text
search.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-06-19 10:38:34 +08:00
committed by GitHub
parent cbb5a841b1
commit cb70ff8cee
3 changed files with 11 additions and 8 deletions

View File

@@ -1451,10 +1451,13 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
query = self._query
if self._phrase_query:
raise NotImplementedError(
"Phrase query is not yet supported in Lance FTS. "
"Use tantivy-based index instead for now."
)
if isinstance(query, str):
if not query.startswith('"') or not query.endswith('"'):
query = f'"{query}"'
elif isinstance(query, FullTextQuery) and not isinstance(
query, PhraseQuery
):
raise TypeError("Please use PhraseQuery for phrase queries.")
query = self.to_query_object()
results = self._table._execute_query(query, timeout=timeout)
results = results.read_all()

View File

@@ -827,7 +827,7 @@ class Table(ABC):
ordering_field_names: Optional[Union[str, List[str]]] = None,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
use_tantivy: bool = True,
use_tantivy: bool = False,
tokenizer_name: Optional[str] = None,
with_position: bool = False,
# tokenizer configs:
@@ -864,7 +864,7 @@ class Table(ABC):
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
language code followed by "_stem". So for english it would be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
use_tantivy: bool, default True
use_tantivy: bool, default False
If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index.
with_position: bool, default False
@@ -1970,7 +1970,7 @@ class LanceTable(Table):
ordering_field_names: Optional[Union[str, List[str]]] = None,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
use_tantivy: bool = True,
use_tantivy: bool = False,
tokenizer_name: Optional[str] = None,
with_position: bool = False,
# tokenizer configs:

View File

@@ -245,7 +245,7 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
NotImplementedError,
match="Full-text search is only supported on the local filesystem",
):
table.create_fts_index("x")
table.create_fts_index("x", use_tantivy=True)
# make sure list tables still works
assert db.table_names() == ["test_ddb_sync"]