feat: enable stemming (#1356)

Added the ability to specify `tokenizer_name` when creating a full-text search index using tantivy. This enables the use of language-specific stemming. Also updated the [guide on full text search](https://lancedb.github.io/lancedb/fts/) with a short section on choosing a tokenizer. Fixes #1315
2026-01-13 23:32:57 +00:00 · 2024-06-20 23:23:55 +02:00
parent f41eb899dc
commit 0fe844034d
4 changed files with 35 additions and 2 deletions
--- a/docs/src/fts.md
+++ b/docs/src/fts.md
@@ -54,6 +54,16 @@ This returns the result as a list of dictionaries as follows.
 !!! note
    LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.

+## Tokenization
+By default, the text is tokenized by splitting on punctuation and whitespace, and then removing tokens that are longer than 40 characters. For language-specific tokenization, pass the `tokenizer_name` argument with the two-letter language code followed by "_stem". For example, for English it would be "en_stem".
+
+```python
+table.create_fts_index("text", tokenizer_name="en_stem")
+```
+
+See the list of currently supported [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html).
+
+
 ## Index multiple columns

 If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
@@ -139,6 +149,7 @@ is treated as a phrase query.
 In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
 double quotes replaced by single quotes.

+
 ## Configurations

 By default, LanceDB configures a 1GB heap size limit for creating the index. You can
--- a/python/python/lancedb/fts.py
+++ b/python/python/lancedb/fts.py
@@ -29,7 +29,10 @@ from .table import LanceTable


 def create_index(
-    index_path: str, text_fields: List[str], ordering_fields: List[str] = None
+    index_path: str,
+    text_fields: List[str],
+    ordering_fields: List[str] = None,
+    tokenizer_name: str = "default",
 ) -> tantivy.Index:
    """
    Create a new Index (not populated)
@@ -42,6 +45,8 @@ def create_index(
        List of text fields to index
    ordering_fields: List[str]
        List of unsigned type fields to order by at search time
+    tokenizer_name : str, default "default"
+        The tokenizer to use

    Returns
    -------
@@ -56,7 +61,7 @@ def create_index(
    schema_builder.add_integer_field("doc_id", stored=True)
    # data fields
    for name in text_fields:
-        schema_builder.add_text_field(name, stored=True)
+        schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
    if ordering_fields:
        for name in ordering_fields:
            schema_builder.add_unsigned_field(name, fast=True)
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -1171,6 +1171,7 @@ class LanceTable(Table):
        *,
        replace: bool = False,
        writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
+        tokenizer_name: str = "default",
    ):
        """Create a full-text search index on the table.

@@ -1189,6 +1190,10 @@ class LanceTable(Table):
        ordering_field_names:
            A list of unsigned type fields to index to optionally order
            results on at search time
+        tokenizer_name: str, default "default"
+            The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
+            language code followed by "_stem". For example, English is "en_stem".
+            For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
        """
        from .fts import create_index, populate_index

@@ -1214,6 +1219,7 @@ class LanceTable(Table):
            self._get_fts_index_path(),
            field_names,
            ordering_fields=ordering_field_names,
+            tokenizer_name=tokenizer_name,
        )
        populate_index(
            index,
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
    assert os.path.exists(str(tmp_path / "index"))


+def test_create_index_with_stemming(tmp_path, table):
+    index = ldb.fts.create_index(
+        str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
+    )
+    assert isinstance(index, tantivy.Index)
+    assert os.path.exists(str(tmp_path / "index"))
+
+    # Smoke test: build an FTS index with the stemming tokenizer on a non-empty table
+    table.create_fts_index("text", tokenizer_name="en_stem")
+
+
 def test_populate_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert ldb.fts.populate_index(index, table, ["text"]) == len(table)