From 0fe844034de074063809caf53cb87d2638b2afae Mon Sep 17 00:00:00 2001 From: josca42 <94163701+josca42@users.noreply.github.com> Date: Thu, 20 Jun 2024 23:23:55 +0200 Subject: [PATCH] feat: enable stemming (#1356) Added the ability to specify tokenizer_name, when creating a full text search index using tantivy. This enables the use of language specific stemming. Also updated the [guide on full text search](https://lancedb.github.io/lancedb/fts/) with a short section on choosing tokenizer. Fixes #1315 --- docs/src/fts.md | 11 +++++++++++ python/python/lancedb/fts.py | 9 +++++++-- python/python/lancedb/table.py | 6 ++++++ python/python/tests/test_fts.py | 11 +++++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/docs/src/fts.md b/docs/src/fts.md index a3659cef..2330e837 100644 --- a/docs/src/fts.md +++ b/docs/src/fts.md @@ -54,6 +54,16 @@ This returns the result as a list of dictionaries as follows. !!! note LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead. +## Tokenization +By default, the text is tokenized by splitting on punctuation and whitespace and then removing tokens that are longer than 40 characters. For language-specific tokenization, provide the argument `tokenizer_name` with the 2-letter language code followed by "_stem". For example, for English it would be "en_stem". + +```python +table.create_fts_index("text", tokenizer_name="en_stem") +``` + +The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported. + + ## Index multiple columns If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`: @@ -139,6 +149,7 @@ is treated as a phrase query. 
In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested double quotes replaced by single quotes. + ## Configurations By default, LanceDB configures a 1GB heap size limit for creating the index. You can diff --git a/python/python/lancedb/fts.py b/python/python/lancedb/fts.py index 970f6f2d..1b7adde5 100644 --- a/python/python/lancedb/fts.py +++ b/python/python/lancedb/fts.py @@ -29,7 +29,10 @@ from .table import LanceTable def create_index( - index_path: str, text_fields: List[str], ordering_fields: List[str] = None + index_path: str, + text_fields: List[str], + ordering_fields: List[str] = None, + tokenizer_name: str = "default", ) -> tantivy.Index: """ Create a new Index (not populated) @@ -42,6 +45,8 @@ def create_index( List of text fields to index ordering_fields: List[str] List of unsigned type fields to order by at search time + tokenizer_name : str, default "default" + The tokenizer to use Returns ------- @@ -56,7 +61,7 @@ def create_index( schema_builder.add_integer_field("doc_id", stored=True) # data fields for name in text_fields: - schema_builder.add_text_field(name, stored=True) + schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name) if ordering_fields: for name in ordering_fields: schema_builder.add_unsigned_field(name, fast=True) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index d9ab1add..ca8fd881 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -1171,6 +1171,7 @@ class LanceTable(Table): *, replace: bool = False, writer_heap_size: Optional[int] = 1024 * 1024 * 1024, + tokenizer_name: str = "default", ): """Create a full-text search index on the table. @@ -1189,6 +1190,10 @@ class LanceTable(Table): ordering_field_names: A list of unsigned type fields to index to optionally order results on at search time + tokenizer_name: str, default "default" + The tokenizer to use for the index. 
Can be "raw", "default" or the 2-letter + language code followed by "_stem". So for English it would be "en_stem". + For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html """ from .fts import create_index, populate_index @@ -1214,6 +1219,7 @@ class LanceTable(Table): self._get_fts_index_path(), field_names, ordering_fields=ordering_field_names, + tokenizer_name=tokenizer_name, ) populate_index( index, diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index a70fcf53..77f07388 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -66,6 +66,17 @@ def test_create_index(tmp_path): assert os.path.exists(str(tmp_path / "index")) +def test_create_index_with_stemming(tmp_path, table): + index = ldb.fts.create_index( + str(tmp_path / "index"), ["text"], tokenizer_name="en_stem" + ) + assert isinstance(index, tantivy.Index) + assert os.path.exists(str(tmp_path / "index")) + + # Check stemming by running tokenizer on non-empty table + table.create_fts_index("text", tokenizer_name="en_stem") + + def test_populate_index(tmp_path, table): index = ldb.fts.create_index(str(tmp_path / "index"), ["text"]) assert ldb.fts.populate_index(index, table, ["text"]) == len(table)