feat!: upgrade lance to v0.28.0 (#2404)

This introduces breaking changes to the Rust API for creating an FTS index, and changes the default FTS index parameters.

Signed-off-by: BubbleCal <bubble-cal@outlook.com>

## Summary by CodeRabbit

- **New Features**
  - Updated default settings for full-text search (FTS) index creation: stemming, stop word removal, and ASCII folding are now enabled by default, while token position storage is disabled by default.
- **Refactor**
  - Simplified and streamlined the configuration and handling of FTS index parameters for improved maintainability and consistency across interfaces.
  - Enhanced serialization and request construction for FTS index parameters to reduce manual handling and improve code clarity.
  - Improved test coverage by explicitly enabling positional indexing in FTS tests to support phrase queries.
- **Chores**
  - Upgraded all internal dependencies related to FTS indexing to the latest version for enhanced compatibility and performance.
  - Updated package versions for Node.js, Python, and Rust components to the latest beta releases.
  - Improved CI workflows by adding Rust toolchain setup with formatting and linting tools.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Will Jones <willjones127@gmail.com>
2025-12-23 05:19:58 +00:00 · 2025-05-30 06:19:24 +08:00
parent d0bc671cac
commit 5c7f63388d
21 changed files with 484 additions and 479 deletions
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.22.2-beta.0"
+version = "0.23.0-beta.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
--- a/python/python/lancedb/index.py
+++ b/python/python/lancedb/index.py
@@ -102,7 +102,7 @@ class FTS:

    Attributes
    ----------
-    with_position : bool, default True
+    with_position : bool, default False
        Whether to store the position of the token in the document. Setting this
        to False can reduce the size of the index and improve indexing speed,
        but it will disable support for phrase queries.
@@ -118,25 +118,25 @@ class FTS:
        ignored.
    lower_case : bool, default True
        Whether to convert the token to lower case. This makes queries case-insensitive.
-    stem : bool, default False
+    stem : bool, default True
        Whether to stem the token. Stemming reduces words to their root form.
        For example, in English "running" and "runs" would both be reduced to "run".
-    remove_stop_words : bool, default False
+    remove_stop_words : bool, default True
        Whether to remove stop words. Stop words are common words that are often
        removed from text before indexing. For example, in English "the" and "and".
-    ascii_folding : bool, default False
+    ascii_folding : bool, default True
        Whether to fold ASCII characters. This converts accented characters to
        their ASCII equivalent. For example, "café" would be converted to "cafe".
    """

-    with_position: bool = True
+    with_position: bool = False
    base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
    language: str = "English"
    max_token_length: Optional[int] = 40
    lower_case: bool = True
-    stem: bool = False
-    remove_stop_words: bool = False
-    ascii_folding: bool = False
+    stem: bool = True
+    remove_stop_words: bool = True
+    ascii_folding: bool = True


@dataclass
--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -149,15 +149,15 @@ class RemoteTable(Table):
        *,
        replace: bool = False,
        wait_timeout: timedelta = None,
-        with_position: bool = True,
+        with_position: bool = False,
        # tokenizer configs:
        base_tokenizer: str = "simple",
        language: str = "English",
        max_token_length: Optional[int] = 40,
        lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
    ):
        config = FTS(
            with_position=with_position,
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -829,15 +829,15 @@ class Table(ABC):
        writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
        use_tantivy: bool = True,
        tokenizer_name: Optional[str] = None,
-        with_position: bool = True,
+        with_position: bool = False,
        # tokenizer configs:
        base_tokenizer: BaseTokenizerType = "simple",
        language: str = "English",
        max_token_length: Optional[int] = 40,
        lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
        wait_timeout: Optional[timedelta] = None,
    ):
        """Create a full-text search index on the table.
@@ -867,7 +867,7 @@ class Table(ABC):
        use_tantivy: bool, default True
            If True, use the legacy full-text search implementation based on tantivy.
            If False, use the new full-text search implementation based on lance-index.
-        with_position: bool, default True
+        with_position: bool, default False
            Only available with use_tantivy=False
            If False, do not store the positions of the terms in the text.
            This can reduce the size of the index and improve indexing speed.
@@ -885,13 +885,13 @@ class Table(ABC):
        lower_case : bool, default True
            Whether to convert the token to lower case. This makes queries
            case-insensitive.
-        stem : bool, default False
+        stem : bool, default True
            Whether to stem the token. Stemming reduces words to their root form.
            For example, in English "running" and "runs" would both be reduced to "run".
-        remove_stop_words : bool, default False
+        remove_stop_words : bool, default True
            Whether to remove stop words. Stop words are common words that are often
            removed from text before indexing. For example, in English "the" and "and".
-        ascii_folding : bool, default False
+        ascii_folding : bool, default True
            Whether to fold ASCII characters. This converts accented characters to
            their ASCII equivalent. For example, "café" would be converted to "cafe".
        wait_timeout: timedelta, optional
@@ -1972,15 +1972,15 @@ class LanceTable(Table):
        writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
        use_tantivy: bool = True,
        tokenizer_name: Optional[str] = None,
-        with_position: bool = True,
+        with_position: bool = False,
        # tokenizer configs:
        base_tokenizer: BaseTokenizerType = "simple",
        language: str = "English",
        max_token_length: Optional[int] = 40,
        lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
    ):
        if not use_tantivy:
            if not isinstance(field_names, str):
@@ -1990,6 +1990,7 @@ class LanceTable(Table):
                tokenizer_configs = {
                    "base_tokenizer": base_tokenizer,
                    "language": language,
+                    "with_position": with_position,
                    "max_token_length": max_token_length,
                    "lower_case": lower_case,
                    "stem": stem,
@@ -2000,7 +2001,6 @@ class LanceTable(Table):
                tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)

            config = FTS(
-                with_position=with_position,
                **tokenizer_configs,
            )

--- a/python/python/tests/docs/test_search.py
+++ b/python/python/tests/docs/test_search.py
@@ -156,6 +156,9 @@ async def test_vector_search_async():
    # --8<-- [end:search_result_async_as_list]


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_fuzzy_query():
    uri = "data/fuzzy-example"
    db = lancedb.connect(uri)
@@ -189,6 +192,9 @@ def test_fts_fuzzy_query():
    }


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_boost_query():
    uri = "data/boost-example"
    db = lancedb.connect(uri)
@@ -234,6 +240,9 @@ def test_fts_boost_query():
    )


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_native():
    # --8<-- [start:basic_fts]
    uri = "data/sample-lancedb"
@@ -282,6 +291,9 @@ def test_fts_native():
    # --8<-- [end:fts_incremental_index]


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
@pytest.mark.asyncio
 async def test_fts_native_async():
    # --8<-- [start:basic_fts_async]
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -287,7 +287,7 @@ def test_search_fts_phrase_query(table):
        assert False
    except Exception:
        pass
-    table.create_fts_index("text", use_tantivy=False, replace=True)
+    table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
    results = table.search("puppy").limit(100).to_list()
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
@@ -312,7 +312,7 @@ async def test_search_fts_phrase_query_async(async_table):
        assert False
    except Exception:
        pass
-    await async_table.create_index("text", config=FTS())
+    await async_table.create_index("text", config=FTS(with_position=True))
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
@@ -649,7 +649,7 @@ def test_fts_on_list(mem_db: DBConnection):
        }
    )
    table = mem_db.create_table("test", data=data)
-    table.create_fts_index("text", use_tantivy=False)
+    table.create_fts_index("text", use_tantivy=False, with_position=True)

    res = table.search("lance").limit(5).to_list()
    assert len(res) == 3
--- a/python/src/index.rs
+++ b/python/src/index.rs
@@ -3,7 +3,7 @@

 use lancedb::index::vector::IvfFlatIndexBuilder;
 use lancedb::index::{
-    scalar::{BTreeIndexBuilder, FtsIndexBuilder, TokenizerConfig},
+    scalar::{BTreeIndexBuilder, FtsIndexBuilder},
    vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
    Index as LanceDbIndex,
 };
@@ -38,19 +38,17 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
            "LabelList" => Ok(LanceDbIndex::LabelList(Default::default())),
            "FTS" => {
                let params = source.extract::<FtsParams>()?;
-                let inner_opts = TokenizerConfig::default()
+                let inner_opts = FtsIndexBuilder::default()
                    .base_tokenizer(params.base_tokenizer)
                    .language(&params.language)
                    .map_err(|_| PyValueError::new_err(format!("LanceDB does not support the requested language: '{}'", params.language)))?
+                    .with_position(params.with_position)
                    .lower_case(params.lower_case)
                    .max_token_length(params.max_token_length)
                    .remove_stop_words(params.remove_stop_words)
                    .stem(params.stem)
                    .ascii_folding(params.ascii_folding);
-                let mut opts = FtsIndexBuilder::default()
-                    .with_position(params.with_position);
-                opts.tokenizer_configs = inner_opts;
-                Ok(LanceDbIndex::FTS(opts))
+                Ok(LanceDbIndex::FTS(inner_opts))
            },
            "IvfFlat" => {
                let params = source.extract::<IvfFlatParams>()?;