feat: enable stemming (#1356)

Added the ability to specify `tokenizer_name` when creating a full-text search index using tantivy. This enables the use of language-specific stemming. Also updated the [guide on full-text search](https://lancedb.github.io/lancedb/fts/) with a short section on choosing a tokenizer. Fixes #1315.
2025-12-26 14:49:57 +00:00 · 2024-06-20 23:23:55 +02:00
parent f41eb899dc
commit 0fe844034d
4 changed files with 35 additions and 2 deletions
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
    assert os.path.exists(str(tmp_path / "index"))


+def test_create_index_with_stemming(tmp_path, table):
+    index = ldb.fts.create_index(
+        str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
+    )
+    assert isinstance(index, tantivy.Index)
+    assert os.path.exists(str(tmp_path / "index"))
+
+    # Smoke-test the "en_stem" tokenizer by building an FTS index on the `table` fixture (presumably non-empty; no stemmed output is asserted)
+    table.create_fts_index("text", tokenizer_name="en_stem")
+
+
 def test_populate_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert ldb.fts.populate_index(index, table, ["text"]) == len(table)