feat!: upgrade lance to v0.28.0 (#2404)

This introduces some breaking changes to the Rust API for creating FTS indexes, and the default index parameters have changed.

Signed-off-by: BubbleCal <bubble-cal@outlook.com>

## Summary by CodeRabbit

- **New Features**
  - Updated default settings for full-text search (FTS) index creation: stemming, stop word removal, and ASCII folding are now enabled by default, while token position storage is disabled by default.
- **Refactor**
  - Simplified and streamlined the configuration and handling of FTS index parameters for improved maintainability and consistency across interfaces.
  - Enhanced serialization and request construction for FTS index parameters to reduce manual handling and improve code clarity.
  - Improved test coverage by explicitly enabling positional indexing in FTS tests to support phrase queries.
- **Chores**
  - Upgraded all internal dependencies related to FTS indexing to the latest version for enhanced compatibility and performance.
  - Updated package versions for Node.js, Python, and Rust components to the latest beta releases.
  - Improved CI workflows by adding Rust toolchain setup with formatting and linting tools.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Will Jones <willjones127@gmail.com>
2026-07-08 05:20:41 +00:00 · 2025-05-30 06:19:24 +08:00
parent d0bc671cac
commit 5c7f63388d
21 changed files with 484 additions and 479 deletions
--- a/.github/workflows/java.yml
+++ b/.github/workflows/java.yml
@@ -35,6 +35,9 @@ jobs:
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: java/core/lancedb-jni
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          components: rustfmt
      - name: Run cargo fmt
        run: cargo fmt --check
        working-directory: ./java/core/lancedb-jni
@@ -68,6 +71,9 @@ jobs:
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: java/core/lancedb-jni
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          components: rustfmt
      - name: Run cargo fmt
        run: cargo fmt --check
        working-directory: ./java/core/lancedb-jni
@@ -110,4 +116,3 @@ jobs:
          -Djdk.reflect.useDirectMethodHandle=false \
          -Dio.netty.tryReflectionSetAccessible=true"
          JAVA_HOME=$JAVA_17 mvn clean test
-  
--- a/.github/workflows/nodejs.yml
+++ b/.github/workflows/nodejs.yml
@@ -47,6 +47,9 @@ jobs:
      run: |
        sudo apt update
        sudo apt install -y protobuf-compiler libssl-dev
+    - uses: actions-rust-lang/setup-rust-toolchain@v1
+      with:
+        components: rustfmt, clippy
    - name: Lint
      run: |
        cargo fmt --all -- --check
--- a/.github/workflows/run_tests/action.yml
+++ b/.github/workflows/run_tests/action.yml
@@ -24,8 +24,8 @@ runs:
    - name: pytest (with integration)
      shell: bash
      if: ${{ inputs.integration == 'true' }}
-      run: pytest -m "not slow" -x -v --durations=30 python/python/tests
+      run: pytest -m "not slow" -vv --durations=30 python/python/tests
    - name: pytest (no integration tests)
      shell: bash
      if: ${{ inputs.integration != 'true' }}
-      run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/python/tests
+      run: pytest -m "not slow and not s3_test" -vv --durations=30 python/python/tests
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,14 +21,14 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"

 [workspace.dependencies]
-lance = { "version" = "=0.27.3", "features" = ["dynamodb"], tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-io = { version = "=0.27.3", tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-index = { version = "=0.27.3", tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-linalg = { version = "=0.27.3", tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-table = { version = "=0.27.3", tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-testing = { version = "=0.27.3", tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-datafusion = { version = "=0.27.3", tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-encoding = { version = "=0.27.3", tag = "v0.27.3-beta.2", git="https://github.com/lancedb/lance.git" }
+lance = { "version" = "=0.28.1", "features" = ["dynamodb"], tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
+lance-io = { version = "=0.28.1", tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
+lance-index = { version = "=0.28.1", tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
+lance-linalg = { version = "=0.28.1", tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
+lance-table = { version = "=0.28.1", tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
+lance-testing = { version = "=0.28.1", tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
+lance-datafusion = { version = "=0.28.1", tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
+lance-encoding = { version = "=0.28.1", tag = "v0.28.1-beta.1", git="https://github.com/lancedb/lance.git" }
 # Note that this one does not include pyarrow
 arrow = { version = "54.1", optional = false }
 arrow-array = "54.1"
--- a/nodejs/Cargo.toml
+++ b/nodejs/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.19.2-beta.0"
+version = "0.20.0-beta.0"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
@@ -30,6 +30,7 @@ log.workspace = true

 # Workaround for build failure until we can fix it.
 aws-lc-sys = "=0.28.0"
+aws-lc-rs = "=1.13.0"

 [build-dependencies]
 napi-build = "2.1"
--- a/nodejs/test/table.test.ts
+++ b/nodejs/test/table.test.ts
@@ -1506,7 +1506,9 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      ];
      const table = await db.createTable("test", data);
      await table.createIndex("text", {
-        config: Index.fts(),
+        config: Index.fts({
+          withPosition: true,
+        }),
      });

      const results = await table.search("lance").toArray();
@@ -1559,7 +1561,9 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      ];
      const table = await db.createTable("test", data);
      await table.createIndex("text", {
-        config: Index.fts(),
+        config: Index.fts({
+          withPosition: true,
+        }),
      });

      const results = await table.search("world").toArray();
--- a/nodejs/package.json
+++ b/nodejs/package.json
@@ -11,7 +11,7 @@
    "ann"
  ],
  "private": false,
-  "version": "0.19.2-beta.0",
+  "version": "0.20.0-beta.0",
  "main": "dist/index.js",
  "exports": {
    ".": "./dist/index.js",
--- a/nodejs/src/index.rs
+++ b/nodejs/src/index.rs
@@ -125,32 +125,30 @@ impl Index {
        ascii_folding: Option<bool>,
    ) -> Self {
        let mut opts = FtsIndexBuilder::default();
-        let mut tokenizer_configs = opts.tokenizer_configs.clone();
        if let Some(with_position) = with_position {
            opts = opts.with_position(with_position);
        }
        if let Some(base_tokenizer) = base_tokenizer {
-            tokenizer_configs = tokenizer_configs.base_tokenizer(base_tokenizer);
+            opts = opts.base_tokenizer(base_tokenizer);
        }
        if let Some(language) = language {
-            tokenizer_configs = tokenizer_configs.language(&language).unwrap();
+            opts = opts.language(&language).unwrap();
        }
        if let Some(max_token_length) = max_token_length {
-            tokenizer_configs = tokenizer_configs.max_token_length(Some(max_token_length as usize));
+            opts = opts.max_token_length(Some(max_token_length as usize));
        }
        if let Some(lower_case) = lower_case {
-            tokenizer_configs = tokenizer_configs.lower_case(lower_case);
+            opts = opts.lower_case(lower_case);
        }
        if let Some(stem) = stem {
-            tokenizer_configs = tokenizer_configs.stem(stem);
+            opts = opts.stem(stem);
        }
        if let Some(remove_stop_words) = remove_stop_words {
-            tokenizer_configs = tokenizer_configs.remove_stop_words(remove_stop_words);
+            opts = opts.remove_stop_words(remove_stop_words);
        }
        if let Some(ascii_folding) = ascii_folding {
-            tokenizer_configs = tokenizer_configs.ascii_folding(ascii_folding);
+            opts = opts.ascii_folding(ascii_folding);
        }
-        opts.tokenizer_configs = tokenizer_configs;

        Self {
            inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.22.2-beta.0"
+version = "0.23.0-beta.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
--- a/python/python/lancedb/index.py
+++ b/python/python/lancedb/index.py
@@ -102,7 +102,7 @@ class FTS:

    Attributes
    ----------
-    with_position : bool, default True
+    with_position : bool, default False
        Whether to store the position of the token in the document. Setting this
        to False can reduce the size of the index and improve indexing speed,
        but it will disable support for phrase queries.
@@ -118,25 +118,25 @@ class FTS:
        ignored.
    lower_case : bool, default True
        Whether to convert the token to lower case. This makes queries case-insensitive.
-    stem : bool, default False
+    stem : bool, default True
        Whether to stem the token. Stemming reduces words to their root form.
        For example, in English "running" and "runs" would both be reduced to "run".
-    remove_stop_words : bool, default False
+    remove_stop_words : bool, default True
        Whether to remove stop words. Stop words are common words that are often
        removed from text before indexing. For example, in English "the" and "and".
-    ascii_folding : bool, default False
+    ascii_folding : bool, default True
        Whether to fold ASCII characters. This converts accented characters to
        their ASCII equivalent. For example, "café" would be converted to "cafe".
    """

-    with_position: bool = True
+    with_position: bool = False
    base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
    language: str = "English"
    max_token_length: Optional[int] = 40
    lower_case: bool = True
-    stem: bool = False
-    remove_stop_words: bool = False
-    ascii_folding: bool = False
+    stem: bool = True
+    remove_stop_words: bool = True
+    ascii_folding: bool = True


@dataclass
--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -149,15 +149,15 @@ class RemoteTable(Table):
        *,
        replace: bool = False,
        wait_timeout: timedelta = None,
-        with_position: bool = True,
+        with_position: bool = False,
        # tokenizer configs:
        base_tokenizer: str = "simple",
        language: str = "English",
        max_token_length: Optional[int] = 40,
        lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
    ):
        config = FTS(
            with_position=with_position,
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -829,15 +829,15 @@ class Table(ABC):
        writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
        use_tantivy: bool = True,
        tokenizer_name: Optional[str] = None,
-        with_position: bool = True,
+        with_position: bool = False,
        # tokenizer configs:
        base_tokenizer: BaseTokenizerType = "simple",
        language: str = "English",
        max_token_length: Optional[int] = 40,
        lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
        wait_timeout: Optional[timedelta] = None,
    ):
        """Create a full-text search index on the table.
@@ -867,7 +867,7 @@ class Table(ABC):
        use_tantivy: bool, default True
            If True, use the legacy full-text search implementation based on tantivy.
            If False, use the new full-text search implementation based on lance-index.
-        with_position: bool, default True
+        with_position: bool, default False
            Only available with use_tantivy=False
            If False, do not store the positions of the terms in the text.
            This can reduce the size of the index and improve indexing speed.
@@ -885,13 +885,13 @@ class Table(ABC):
        lower_case : bool, default True
            Whether to convert the token to lower case. This makes queries
            case-insensitive.
-        stem : bool, default False
+        stem : bool, default True
            Whether to stem the token. Stemming reduces words to their root form.
            For example, in English "running" and "runs" would both be reduced to "run".
-        remove_stop_words : bool, default False
+        remove_stop_words : bool, default True
            Whether to remove stop words. Stop words are common words that are often
            removed from text before indexing. For example, in English "the" and "and".
-        ascii_folding : bool, default False
+        ascii_folding : bool, default True
            Whether to fold ASCII characters. This converts accented characters to
            their ASCII equivalent. For example, "café" would be converted to "cafe".
        wait_timeout: timedelta, optional
@@ -1972,15 +1972,15 @@ class LanceTable(Table):
        writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
        use_tantivy: bool = True,
        tokenizer_name: Optional[str] = None,
-        with_position: bool = True,
+        with_position: bool = False,
        # tokenizer configs:
        base_tokenizer: BaseTokenizerType = "simple",
        language: str = "English",
        max_token_length: Optional[int] = 40,
        lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
    ):
        if not use_tantivy:
            if not isinstance(field_names, str):
@@ -1990,6 +1990,7 @@ class LanceTable(Table):
                tokenizer_configs = {
                    "base_tokenizer": base_tokenizer,
                    "language": language,
+                    "with_position": with_position,
                    "max_token_length": max_token_length,
                    "lower_case": lower_case,
                    "stem": stem,
@@ -2000,7 +2001,6 @@ class LanceTable(Table):
                tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)

            config = FTS(
-                with_position=with_position,
                **tokenizer_configs,
            )

--- a/python/python/tests/docs/test_search.py
+++ b/python/python/tests/docs/test_search.py
@@ -156,6 +156,9 @@ async def test_vector_search_async():
    # --8<-- [end:search_result_async_as_list]


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_fuzzy_query():
    uri = "data/fuzzy-example"
    db = lancedb.connect(uri)
@@ -189,6 +192,9 @@ def test_fts_fuzzy_query():
    }


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_boost_query():
    uri = "data/boost-example"
    db = lancedb.connect(uri)
@@ -234,6 +240,9 @@ def test_fts_boost_query():
    )


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_native():
    # --8<-- [start:basic_fts]
    uri = "data/sample-lancedb"
@@ -282,6 +291,9 @@ def test_fts_native():
    # --8<-- [end:fts_incremental_index]


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
@pytest.mark.asyncio
 async def test_fts_native_async():
    # --8<-- [start:basic_fts_async]
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -287,7 +287,7 @@ def test_search_fts_phrase_query(table):
        assert False
    except Exception:
        pass
-    table.create_fts_index("text", use_tantivy=False, replace=True)
+    table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
    results = table.search("puppy").limit(100).to_list()
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    assert len(results) > len(phrase_results)
@@ -312,7 +312,7 @@ async def test_search_fts_phrase_query_async(async_table):
        assert False
    except Exception:
        pass
-    await async_table.create_index("text", config=FTS())
+    await async_table.create_index("text", config=FTS(with_position=True))
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
@@ -649,7 +649,7 @@ def test_fts_on_list(mem_db: DBConnection):
        }
    )
    table = mem_db.create_table("test", data=data)
-    table.create_fts_index("text", use_tantivy=False)
+    table.create_fts_index("text", use_tantivy=False, with_position=True)

    res = table.search("lance").limit(5).to_list()
    assert len(res) == 3
--- a/python/src/index.rs
+++ b/python/src/index.rs
@@ -3,7 +3,7 @@

 use lancedb::index::vector::IvfFlatIndexBuilder;
 use lancedb::index::{
-    scalar::{BTreeIndexBuilder, FtsIndexBuilder, TokenizerConfig},
+    scalar::{BTreeIndexBuilder, FtsIndexBuilder},
    vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
    Index as LanceDbIndex,
 };
@@ -38,19 +38,17 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
            "LabelList" => Ok(LanceDbIndex::LabelList(Default::default())),
            "FTS" => {
                let params = source.extract::<FtsParams>()?;
-                let inner_opts = TokenizerConfig::default()
+                let inner_opts = FtsIndexBuilder::default()
                    .base_tokenizer(params.base_tokenizer)
                    .language(&params.language)
                    .map_err(|_| PyValueError::new_err(format!("LanceDB does not support the requested language: '{}'", params.language)))?
+                    .with_position(params.with_position)
                    .lower_case(params.lower_case)
                    .max_token_length(params.max_token_length)
                    .remove_stop_words(params.remove_stop_words)
                    .stem(params.stem)
                    .ascii_folding(params.ascii_folding);
-                let mut opts = FtsIndexBuilder::default()
-                    .with_position(params.with_position);
-                opts.tokenizer_configs = inner_opts;
-                Ok(LanceDbIndex::FTS(opts))
+                Ok(LanceDbIndex::FTS(inner_opts))
            },
            "IvfFlat" => {
                let params = source.extract::<IvfFlatParams>()?;
--- a/rust/lancedb/Cargo.toml
+++ b/rust/lancedb/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.19.2-beta.0"
+version = "0.20.0-beta.0"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
--- a/rust/lancedb/src/index/scalar.rs
+++ b/rust/lancedb/src/index/scalar.rs
@@ -51,35 +51,7 @@ pub struct BitmapIndexBuilder {}
 #[derive(Debug, Clone, Default)]
 pub struct LabelListIndexBuilder {}

-/// Builder for a full text search index
-///
-/// A full text search index is an index on a string column that allows for full text search
-#[derive(Debug, Clone)]
-pub struct FtsIndexBuilder {
-    /// Whether to store the position of the tokens
-    /// This is used for phrase queries
-    pub with_position: bool,
-
-    pub tokenizer_configs: TokenizerConfig,
-}
-
-impl Default for FtsIndexBuilder {
-    fn default() -> Self {
-        Self {
-            with_position: true,
-            tokenizer_configs: TokenizerConfig::default(),
-        }
-    }
-}
-
-impl FtsIndexBuilder {
-    /// Set the with_position flag
-    pub fn with_position(mut self, with_position: bool) -> Self {
-        self.with_position = with_position;
-        self
-    }
-}
-
 pub use lance_index::scalar::inverted::query::*;
-pub use lance_index::scalar::inverted::TokenizerConfig;
 pub use lance_index::scalar::FullTextSearchQuery;
+pub use lance_index::scalar::InvertedIndexParams as FtsIndexBuilder;
+pub use lance_index::scalar::InvertedIndexParams;
--- a/rust/lancedb/src/io/object_store.rs
+++ b/rust/lancedb/src/io/object_store.rs
@@ -197,16 +197,8 @@ mod test {

    #[tokio::test]
    async fn test_e2e() {
-        let dir1 = tempfile::tempdir()
-            .unwrap()
-            .into_path()
-            .canonicalize()
-            .unwrap();
-        let dir2 = tempfile::tempdir()
-            .unwrap()
-            .into_path()
-            .canonicalize()
-            .unwrap();
+        let dir1 = tempfile::tempdir().unwrap().keep().canonicalize().unwrap();
+        let dir2 = tempfile::tempdir().unwrap().keep().canonicalize().unwrap();

        let secondary_store = LocalFileSystem::new_with_prefix(dir2.to_str().unwrap()).unwrap();
        let object_store_wrapper = Arc::new(MirroringObjectStoreWrapper {
--- a/rust/lancedb/src/remote/table.rs
+++ b/rust/lancedb/src/remote/table.rs
@@ -995,16 +995,12 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
            Index::Bitmap(_) => ("BITMAP", None),
            Index::LabelList(_) => ("LABEL_LIST", None),
            Index::FTS(fts) => {
-                let with_position = fts.with_position;
-                let configs = serde_json::to_value(fts.tokenizer_configs).map_err(|e| {
-                    Error::InvalidInput {
-                        message: format!("failed to serialize FTS index params {:?}", e),
-                    }
+                let params = serde_json::to_value(&fts).map_err(|e| Error::InvalidInput {
+                    message: format!("failed to serialize FTS index params {:?}", e),
                })?;
-                for (key, value) in configs.as_object().unwrap() {
+                for (key, value) in params.as_object().unwrap() {
                    body[key] = value.clone();
                }
-                body["with_position"] = serde_json::Value::Bool(with_position);
                ("FTS", None)
            }
            Index::Auto => {
@@ -2460,14 +2456,10 @@ mod tests {
                    expected_body["metric_type"] = distance_type.to_lowercase().into();
                }
                if let Index::FTS(fts) = &params {
-                    expected_body["with_position"] = fts.with_position.into();
-                    expected_body["base_tokenizer"] = "simple".into();
-                    expected_body["language"] = "English".into();
-                    expected_body["max_token_length"] = 40.into();
-                    expected_body["lower_case"] = true.into();
-                    expected_body["stem"] = false.into();
-                    expected_body["remove_stop_words"] = false.into();
-                    expected_body["ascii_folding"] = false.into();
+                    let params = serde_json::to_value(fts).unwrap();
+                    for (key, value) in params.as_object().unwrap() {
+                        expected_body[key] = value.clone();
+                    }
                }

                assert_eq!(body, expected_body);
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -1977,16 +1977,12 @@ impl NativeTable {
        }

        let mut dataset = self.dataset.get_mut().await?;
-        let fts_params = lance_index::scalar::InvertedIndexParams {
-            with_position: fts_opts.with_position,
-            tokenizer_config: fts_opts.tokenizer_configs,
-        };
        dataset
            .create_index(
                &[field.name()],
                IndexType::Inverted,
                None,
-                &fts_params,
+                &fts_opts,
                replace,
            )
            .await?;