feat: support ngram tokenizer (#2507)

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-07-15 16:36:08 +08:00
committed by GitHub
parent 4c999fb651
commit 03b62599d7
9 changed files with 173 additions and 3 deletions

View File

@@ -137,6 +137,9 @@ class FTS:
stem: bool = True
remove_stop_words: bool = True
ascii_folding: bool = True
ngram_min_length: int = 3
ngram_max_length: int = 3
prefix_only: bool = False
@dataclass

View File

@@ -158,6 +158,9 @@ class RemoteTable(Table):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
):
config = FTS(
with_position=with_position,
@@ -168,6 +171,9 @@ class RemoteTable(Table):
stem=stem,
remove_stop_words=remove_stop_words,
ascii_folding=ascii_folding,
ngram_min_length=ngram_min_length,
ngram_max_length=ngram_max_length,
prefix_only=prefix_only,
)
LOOP.run(
self._table.create_index(

View File

@@ -838,6 +838,9 @@ class Table(ABC):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
wait_timeout: Optional[timedelta] = None,
):
"""Create a full-text search index on the table.
@@ -877,6 +880,7 @@ class Table(ABC):
- "simple": Splits text by whitespace and punctuation.
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
- "ngram": N-Gram tokenizer.
language : str, default "English"
The language to use for tokenization.
max_token_length : int, default 40
@@ -894,6 +898,12 @@ class Table(ABC):
ascii_folding : bool, default True
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
ngram_min_length: int, default 3
The minimum length of an n-gram.
ngram_max_length: int, default 3
The maximum length of an n-gram.
prefix_only: bool, default False
Whether to only index the prefix of the token for ngram tokenizer.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
"""
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
):
if not use_tantivy:
if not isinstance(field_names, str):
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
"ngram_min_length": ngram_min_length,
"ngram_max_length": ngram_max_length,
"prefix_only": prefix_only,
}
else:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
elif tokenizer_name == "raw":
return {
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
elif tokenizer_name == "whitespace":
return {
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
# or it's with language stemming with pattern like "en_stem"
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
"stem": True,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
def add(

View File

@@ -25,4 +25,4 @@ IndexType = Literal[
]
# Tokenizer literals
BaseTokenizerType = Literal["simple", "raw", "whitespace"]
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]

View File

@@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):
res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
assert len(res) == 2
def test_fts_ngram(mem_db: DBConnection):
    """Check full-text search over an index built with the "ngram" tokenizer.

    The non-tantivy index is built twice on the same table: first with the
    default n-gram settings, then replaced with ``ngram_min_length=2`` and
    ``prefix_only=True``. The same three queries run against each build.
    """
    corpus = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
    table = mem_db.create_table("test", data=corpus)
    lance_rows = {"lance database", "lance is cool"}

    def fts_hits(term):
        # Run a full-text query and materialize at most 10 matching rows.
        return table.search(term, query_type="fts").limit(10).to_list()

    def assert_both_lance_rows(hits):
        # Exactly the two rows containing "lance", with no duplicates.
        assert len(hits) == 2
        assert {row["text"] for row in hits} == lance_rows

    # --- default n-gram settings ---
    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")

    assert_both_lance_rows(fts_hits("lan"))
    assert_both_lance_rows(fts_hits("nce"))  # spellchecker:disable-line

    # the default ngram_min_length is 3, so "la" should not match
    assert len(fts_hits("la")) == 0

    # --- test setting ngram_min_length and prefix_only ---
    table.create_fts_index(
        "text",
        use_tantivy=False,
        base_tokenizer="ngram",
        replace=True,
        ngram_min_length=2,
        prefix_only=True,
    )

    assert_both_lance_rows(fts_hits("lan"))

    # with prefix_only=True a mid-token fragment is expected to find nothing
    assert len(fts_hits("nce")) == 0  # spellchecker:disable-line

    # ngram_min_length=2 now lets the two-character prefix match
    assert_both_lance_rows(fts_hits("la"))

View File

@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
.max_token_length(params.max_token_length)
.remove_stop_words(params.remove_stop_words)
.stem(params.stem)
.ascii_folding(params.ascii_folding);
.ascii_folding(params.ascii_folding)
.ngram_min_length(params.ngram_min_length)
.ngram_max_length(params.ngram_max_length)
.ngram_prefix_only(params.prefix_only);
Ok(LanceDbIndex::FTS(inner_opts))
},
"IvfFlat" => {
@@ -130,6 +133,9 @@ struct FtsParams {
stem: bool,
remove_stop_words: bool,
ascii_folding: bool,
ngram_min_length: u32,
ngram_max_length: u32,
prefix_only: bool,
}
#[derive(FromPyObject)]