From 0fe844034de074063809caf53cb87d2638b2afae Mon Sep 17 00:00:00 2001
From: josca42 <94163701+josca42@users.noreply.github.com>
Date: Thu, 20 Jun 2024 23:23:55 +0200
Subject: [PATCH] feat: enable stemming (#1356)

Added the ability to specify tokenizer_name when creating a full-text
search index using tantivy. This enables the use of language-specific
stemming.

Also updated the [guide on full text
search](https://lancedb.github.io/lancedb/fts/) with a short section on
choosing a tokenizer.

Fixes #1315
---
 docs/src/fts.md                 | 11 +++++++++++
 python/python/lancedb/fts.py    |  9 +++++++--
 python/python/lancedb/table.py  |  6 ++++++
 python/python/tests/test_fts.py | 11 +++++++++++
 4 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/docs/src/fts.md b/docs/src/fts.md
index a3659cef..2330e837 100644
--- a/docs/src/fts.md
+++ b/docs/src/fts.md
@@ -54,6 +54,16 @@ This returns the result as a list of dictionaries as follows.
 !!! note
     LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
 
+## Tokenization
+By default, the text is tokenized by splitting on punctuation and whitespace, and then removing tokens that are longer than 40 characters. For language-specific tokenization, pass the `tokenizer_name` argument with the two-letter language code followed by "_stem". For example, for English it would be "en_stem".
+
+```python
+table.create_fts_index("text", tokenizer_name="en_stem")
+```
+
+See the list of currently supported [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html).
+
+
 ## Index multiple columns
 
 If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
@@ -139,6 +149,7 @@ is treated as a phrase query.
 In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
 double quotes replaced by single quotes.
 
+
 ## Configurations
 
 By default, LanceDB configures a 1GB heap size limit for creating the index. You can
diff --git a/python/python/lancedb/fts.py b/python/python/lancedb/fts.py
index 970f6f2d..1b7adde5 100644
--- a/python/python/lancedb/fts.py
+++ b/python/python/lancedb/fts.py
@@ -29,7 +29,10 @@ from .table import LanceTable
 
 
 def create_index(
-    index_path: str, text_fields: List[str], ordering_fields: List[str] = None
+    index_path: str,
+    text_fields: List[str],
+    ordering_fields: List[str] = None,
+    tokenizer_name: str = "default",
 ) -> tantivy.Index:
     """
     Create a new Index (not populated)
@@ -42,6 +45,8 @@ def create_index(
         List of text fields to index
     ordering_fields: List[str]
         List of unsigned type fields to order by at search time
+    tokenizer_name : str, default "default"
+        The tokenizer to use
 
     Returns
     -------
@@ -56,7 +61,7 @@ def create_index(
     schema_builder.add_integer_field("doc_id", stored=True)
     # data fields
     for name in text_fields:
-        schema_builder.add_text_field(name, stored=True)
+        schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
     if ordering_fields:
         for name in ordering_fields:
             schema_builder.add_unsigned_field(name, fast=True)
diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index d9ab1add..ca8fd881 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -1171,6 +1171,7 @@ class LanceTable(Table):
         *,
         replace: bool = False,
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
+        tokenizer_name: str = "default",
     ):
         """Create a full-text search index on the table.
 
@@ -1189,6 +1190,10 @@ class LanceTable(Table):
         ordering_field_names:
             A list of unsigned type fields to index to optionally order
             results on at search time
+        tokenizer_name: str, default "default"
+            The tokenizer to use for the index. Can be "raw", "default", or a
+            two-letter language code followed by "_stem", e.g. "en_stem" for English.
+            For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
         """
         from .fts import create_index, populate_index
 
@@ -1214,6 +1219,7 @@ class LanceTable(Table):
             self._get_fts_index_path(),
             field_names,
             ordering_fields=ordering_field_names,
+            tokenizer_name=tokenizer_name,
         )
         populate_index(
             index,
diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py
index a70fcf53..77f07388 100644
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
     assert os.path.exists(str(tmp_path / "index"))
 
 
+def test_create_index_with_stemming(tmp_path, table):
+    index = ldb.fts.create_index(
+        str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
+    )
+    assert isinstance(index, tantivy.Index)
+    assert os.path.exists(str(tmp_path / "index"))
+
+    # Check stemming by running the tokenizer on a non-empty table
+    table.create_fts_index("text", tokenizer_name="en_stem")
+
+
 def test_populate_index(tmp_path, table):
     index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
     assert ldb.fts.populate_index(index, table, ["text"]) == len(table)