feat: enable stemming (#1356)

Added the ability to specify `tokenizer_name` when creating a full-text
search index using tantivy. This enables the use of language-specific
stemming.

Also updated the [guide on full-text
search](https://lancedb.github.io/lancedb/fts/) with a short section on
choosing a tokenizer.

Fixes #1315
This commit is contained in:
josca42
2024-06-20 23:23:55 +02:00
committed by GitHub
parent f41eb899dc
commit 0fe844034d
4 changed files with 35 additions and 2 deletions

View File

@@ -29,7 +29,10 @@ from .table import LanceTable
def create_index(
index_path: str, text_fields: List[str], ordering_fields: List[str] = None
index_path: str,
text_fields: List[str],
ordering_fields: List[str] = None,
tokenizer_name: str = "default",
) -> tantivy.Index:
"""
Create a new Index (not populated)
@@ -42,6 +45,8 @@ def create_index(
List of text fields to index
ordering_fields: List[str]
List of unsigned type fields to order by at search time
tokenizer_name : str, default "default"
The tokenizer to use
Returns
-------
@@ -56,7 +61,7 @@ def create_index(
schema_builder.add_integer_field("doc_id", stored=True)
# data fields
for name in text_fields:
schema_builder.add_text_field(name, stored=True)
schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
if ordering_fields:
for name in ordering_fields:
schema_builder.add_unsigned_field(name, fast=True)

View File

@@ -1171,6 +1171,7 @@ class LanceTable(Table):
*,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
):
"""Create a full-text search index on the table.
@@ -1189,6 +1190,10 @@ class LanceTable(Table):
ordering_field_names:
A list of unsigned type fields to index to optionally order
results on at search time
tokenizer_name: str, default "default"
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
language code followed by "_stem". So for english it would be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
"""
from .fts import create_index, populate_index
@@ -1214,6 +1219,7 @@ class LanceTable(Table):
self._get_fts_index_path(),
field_names,
ordering_fields=ordering_field_names,
tokenizer_name=tokenizer_name,
)
populate_index(
index,

View File

@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
assert os.path.exists(str(tmp_path / "index"))
def test_create_index_with_stemming(tmp_path, table):
    """Build a full-text index using the English stemming tokenizer ("en_stem")."""
    index_dir = str(tmp_path / "index")
    stemmed_index = ldb.fts.create_index(
        index_dir, ["text"], tokenizer_name="en_stem"
    )
    assert isinstance(stemmed_index, tantivy.Index)
    assert os.path.exists(index_dir)

    # Also go through the table-level API so the tokenizer is run over the
    # rows of the `table` fixture (expected to be non-empty), not just an
    # empty, freshly created index.
    table.create_fts_index("text", tokenizer_name="en_stem")
def test_populate_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)