feat: enable stemming (#1356)

Added the ability to specify `tokenizer_name` when creating a full-text search index using tantivy. This enables the use of language-specific stemming. Also updated the [guide on full text search](https://lancedb.github.io/lancedb/fts/) with a short section on choosing a tokenizer. Fixes #1315
2026-07-03 11:00:40 +00:00 · 2024-06-20 23:23:55 +02:00
parent f41eb899dc
commit 0fe844034d
4 changed files with 35 additions and 2 deletions
--- a/python/python/lancedb/fts.py
+++ b/python/python/lancedb/fts.py
@@ -29,7 +29,10 @@ from .table import LanceTable


 def create_index(
-    index_path: str, text_fields: List[str], ordering_fields: List[str] = None
+    index_path: str,
+    text_fields: List[str],
+    ordering_fields: List[str] = None,
+    tokenizer_name: str = "default",
 ) -> tantivy.Index:
    """
    Create a new Index (not populated)
@@ -42,6 +45,8 @@ def create_index(
        List of text fields to index
    ordering_fields: List[str]
        List of unsigned type fields to order by at search time
+    tokenizer_name : str, default "default"
+        The tokenizer to use

    Returns
    -------
@@ -56,7 +61,7 @@ def create_index(
    schema_builder.add_integer_field("doc_id", stored=True)
    # data fields
    for name in text_fields:
-        schema_builder.add_text_field(name, stored=True)
+        schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
    if ordering_fields:
        for name in ordering_fields:
            schema_builder.add_unsigned_field(name, fast=True)
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -1171,6 +1171,7 @@ class LanceTable(Table):
        *,
        replace: bool = False,
        writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
+        tokenizer_name: str = "default",
    ):
        """Create a full-text search index on the table.

@@ -1189,6 +1190,10 @@ class LanceTable(Table):
        ordering_field_names:
            A list of unsigned type fields to index to optionally order
            results on at search time
+        tokenizer_name: str, default "default"
+            The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
+            language code followed by "_stem". So for english it would be "en_stem".
+            For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
        """
        from .fts import create_index, populate_index

@@ -1214,6 +1219,7 @@ class LanceTable(Table):
            self._get_fts_index_path(),
            field_names,
            ordering_fields=ordering_field_names,
+            tokenizer_name=tokenizer_name,
        )
        populate_index(
            index,
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
    assert os.path.exists(str(tmp_path / "index"))


+def test_create_index_with_stemming(tmp_path, table):
+    index = ldb.fts.create_index(
+        str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
+    )
+    assert isinstance(index, tantivy.Index)
+    assert os.path.exists(str(tmp_path / "index"))
+
+    # Check stemming by running tokenizer on non empty table
+    table.create_fts_index("text", tokenizer_name="en_stem")
+
+
 def test_populate_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert ldb.fts.populate_index(index, table, ["text"]) == len(table)