feat!: upgrade lance to 0.19.1 (#1762)

BREAKING CHANGE: the default tokenizer no longer performs stemming or
stop-word removal. Users who want those behaviors must now enable the
corresponding options explicitly.

- upgrade lance to 0.19.1
- update the FTS docs
- update the FTS API

Upstream change notes:
https://github.com/lancedb/lance/releases/tag/v0.19.1

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
BubbleCal
2024-10-30 00:03:52 +08:00
committed by GitHub
parent b9802a0d23
commit 32fdcf97db
16 changed files with 459 additions and 166 deletions

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.18.3-beta.2",
"pylance==0.19.1",
"requests>=2.31.0",
"tqdm>=4.27.0",
"pydantic>=1.10",

View File

@@ -7,6 +7,27 @@ from ._lancedb import (
IndexConfig,
)
# Maps 2-letter language codes (as used in legacy "xx_stem" tokenizer names)
# to the language names accepted by the underlying tokenizer.
# NOTE(review): most codes follow ISO 639-1, but "du" (Dutch, ISO "nl") and
# "gr" (Greek, ISO "el") do not — presumably chosen to match upstream
# lance/tantivy naming; confirm before changing.
lang_mapping = {
"ar": "Arabic",
"da": "Danish",
"du": "Dutch",
"en": "English",
"fi": "Finnish",
"fr": "French",
"de": "German",
"gr": "Greek",
"hu": "Hungarian",
"it": "Italian",
"no": "Norwegian",
"pt": "Portuguese",
"ro": "Romanian",
"ru": "Russian",
"es": "Spanish",
"sv": "Swedish",
"ta": "Tamil",
"tr": "Turkish",
}
class BTree:
"""Describes a btree index configuration
@@ -78,7 +99,17 @@ class FTS:
For example, it works with `title`, `description`, `content`, etc.
"""
def __init__(self, with_position: bool = True):
def __init__(
self,
with_position: bool = True,
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
self._inner = LanceDbIndex.fts(with_position=with_position)

View File

@@ -55,6 +55,7 @@ from .util import (
safe_import_polars,
value_to_sql,
)
from .index import lang_mapping
if TYPE_CHECKING:
import PIL
@@ -497,10 +498,18 @@ class Table(ABC):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
tokenizer_name: Optional[str] = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
"""Create a full-text search index on the table.
@@ -526,7 +535,6 @@ class Table(ABC):
The tokenizer to use for the index. Can be "raw", "default", "whitespace",
or the 2-letter language code followed by "_stem" — so for English it would
be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
only available with use_tantivy=True for now
use_tantivy: bool, default True
If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index.
@@ -1341,14 +1349,33 @@ class LanceTable(Table):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
tokenizer_name: Optional[str] = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
if not use_tantivy:
if not isinstance(field_names, str):
raise ValueError("field_names must be a string when use_tantivy=False")
tokenizer_configs = {
"base_tokenizer": base_tokenizer,
"language": language,
"max_token_length": max_token_length,
"lower_case": lower_case,
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
}
if tokenizer_name is not None:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
# delete the existing legacy index if it exists
if replace:
path, fs, exist = self._get_fts_index_path()
@@ -1359,6 +1386,7 @@ class LanceTable(Table):
index_type="INVERTED",
replace=replace,
with_position=with_position,
**tokenizer_configs,
)
return
@@ -1381,6 +1409,8 @@ class LanceTable(Table):
"Full-text search is only supported on the local filesystem"
)
if tokenizer_name is None:
tokenizer_name = "default"
index = create_index(
path,
field_names,
@@ -1395,6 +1425,56 @@ class LanceTable(Table):
writer_heap_size=writer_heap_size,
)
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
    """Translate a legacy tokenizer name into explicit tokenizer settings.

    Parameters
    ----------
    tokenizer_name : str
        One of the presets "default", "raw", "whitespace", or a stemming
        tokenizer of the form ``"<2-letter language code>_stem"``
        (e.g. ``"en_stem"``).

    Returns
    -------
    dict
        Keyword arguments for the FTS index builder: ``base_tokenizer``,
        ``language``, ``max_token_length``, ``lower_case``, ``stem``,
        ``remove_stop_words`` and ``ascii_folding``.

    Raises
    ------
    ValueError
        If ``tokenizer_name`` is neither a known preset nor a valid
        ``"xx_stem"`` name, or if the language code is not recognized.
    """

    # Single source of truth for the config shape; each preset overrides
    # only the keys on which it differs (removes three near-duplicate
    # dict literals from the original implementation).
    def _configs(
        base_tokenizer="simple",
        language="English",
        max_token_length=40,
        lower_case=True,
        stem=False,
    ) -> dict:
        return {
            "base_tokenizer": base_tokenizer,
            "language": language,
            "max_token_length": max_token_length,
            "lower_case": lower_case,
            "stem": stem,
            "remove_stop_words": False,
            "ascii_folding": False,
        }

    if tokenizer_name == "default":
        return _configs()
    if tokenizer_name in ("raw", "whitespace"):
        # These presets tokenize verbatim: no length limit, no lowercasing.
        return _configs(
            base_tokenizer=tokenizer_name, max_token_length=None, lower_case=False
        )
    # Otherwise the name must follow the "xx_stem" stemming pattern.
    if not tokenizer_name.endswith("_stem") or len(tokenizer_name) != 7:
        raise ValueError(f"Invalid tokenizer name {tokenizer_name}")
    lang = tokenizer_name[:2]
    if lang not in lang_mapping:
        raise ValueError(f"Invalid language code {lang}")
    return _configs(language=lang_mapping[lang], stem=True)
def add(
self,
data: DATA,

View File

@@ -106,12 +106,41 @@ impl Index {
})
}
#[allow(clippy::too_many_arguments)]
#[staticmethod]
pub fn fts(with_position: Option<bool>) -> Self {
pub fn fts(
with_position: Option<bool>,
base_tokenizer: Option<String>,
language: Option<String>,
max_token_length: Option<usize>,
lower_case: Option<bool>,
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
opts = opts.with_position(with_position);
}
if let Some(base_tokenizer) = base_tokenizer {
opts.tokenizer_configs = opts.tokenizer_configs.base_tokenizer(base_tokenizer);
}
if let Some(language) = language {
opts.tokenizer_configs = opts.tokenizer_configs.language(&language).unwrap();
}
opts.tokenizer_configs = opts.tokenizer_configs.max_token_length(max_token_length);
if let Some(lower_case) = lower_case {
opts.tokenizer_configs = opts.tokenizer_configs.lower_case(lower_case);
}
if let Some(stem) = stem {
opts.tokenizer_configs = opts.tokenizer_configs.stem(stem);
}
if let Some(remove_stop_words) = remove_stop_words {
opts.tokenizer_configs = opts.tokenizer_configs.remove_stop_words(remove_stop_words);
}
if let Some(ascii_folding) = ascii_folding {
opts.tokenizer_configs = opts.tokenizer_configs.ascii_folding(ascii_folding);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
}