feat: enable stemming (#1356)

Added the ability to specify tokenizer_name, when creating a full text
search index using tantivy. This enables the use of language specific
stemming.

Also updated the [guide on full text
search](https://lancedb.github.io/lancedb/fts/) with a short section on
choosing tokenizer.

Fixes #1315
This commit is contained in:
josca42
2024-06-20 23:23:55 +02:00
committed by GitHub
parent f41eb899dc
commit 0fe844034d
4 changed files with 35 additions and 2 deletions

View File

@@ -54,6 +54,16 @@ This returns the result as a list of dictionaries as follows.
!!! note
LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
## Tokenization
By default the text is tokenized by splitting on punctuation and whitespaces and then removing tokens that are longer than 40 chars. For more language specific tokenization then provide the argument tokenizer_name with the 2 letter language code followed by "_stem". So for english it would be "en_stem".
```python
table.create_fts_index("text", tokenizer_name="en_stem")
```
The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
## Index multiple columns
If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
@@ -139,6 +149,7 @@ is treated as a phrase query.
In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
double quotes replaced by single quotes.
## Configurations
By default, LanceDB configures a 1GB heap size limit for creating the index. You can

View File

@@ -29,7 +29,10 @@ from .table import LanceTable
def create_index(
index_path: str, text_fields: List[str], ordering_fields: List[str] = None
index_path: str,
text_fields: List[str],
ordering_fields: List[str] = None,
tokenizer_name: str = "default",
) -> tantivy.Index:
"""
Create a new Index (not populated)
@@ -42,6 +45,8 @@ def create_index(
List of text fields to index
ordering_fields: List[str]
List of unsigned type fields to order by at search time
tokenizer_name : str, default "default"
The tokenizer to use
Returns
-------
@@ -56,7 +61,7 @@ def create_index(
schema_builder.add_integer_field("doc_id", stored=True)
# data fields
for name in text_fields:
schema_builder.add_text_field(name, stored=True)
schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
if ordering_fields:
for name in ordering_fields:
schema_builder.add_unsigned_field(name, fast=True)

View File

@@ -1171,6 +1171,7 @@ class LanceTable(Table):
*,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
):
"""Create a full-text search index on the table.
@@ -1189,6 +1190,10 @@ class LanceTable(Table):
ordering_field_names:
A list of unsigned type fields to index to optionally order
results on at search time
tokenizer_name: str, default "default"
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
language code followed by "_stem". So for english it would be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
"""
from .fts import create_index, populate_index
@@ -1214,6 +1219,7 @@ class LanceTable(Table):
self._get_fts_index_path(),
field_names,
ordering_fields=ordering_field_names,
tokenizer_name=tokenizer_name,
)
populate_index(
index,

View File

@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
assert os.path.exists(str(tmp_path / "index"))
def test_create_index_with_stemming(tmp_path, table):
index = ldb.fts.create_index(
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
)
assert isinstance(index, tantivy.Index)
assert os.path.exists(str(tmp_path / "index"))
# Check stemming by running tokenizer on non empty table
table.create_fts_index("text", tokenizer_name="en_stem")
def test_populate_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)