mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 06:39:57 +00:00
feat: support ngram tokenizer (#2507)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
@@ -137,6 +137,9 @@ class FTS:
|
||||
stem: bool = True
|
||||
remove_stop_words: bool = True
|
||||
ascii_folding: bool = True
|
||||
ngram_min_length: int = 3
|
||||
ngram_max_length: int = 3
|
||||
prefix_only: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -158,6 +158,9 @@ class RemoteTable(Table):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
):
|
||||
config = FTS(
|
||||
with_position=with_position,
|
||||
@@ -168,6 +171,9 @@ class RemoteTable(Table):
|
||||
stem=stem,
|
||||
remove_stop_words=remove_stop_words,
|
||||
ascii_folding=ascii_folding,
|
||||
ngram_min_length=ngram_min_length,
|
||||
ngram_max_length=ngram_max_length,
|
||||
prefix_only=prefix_only,
|
||||
)
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
|
||||
@@ -838,6 +838,9 @@ class Table(ABC):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
):
|
||||
"""Create a full-text search index on the table.
|
||||
@@ -877,6 +880,7 @@ class Table(ABC):
|
||||
- "simple": Splits text by whitespace and punctuation.
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-Gram tokenizer.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
max_token_length : int, default 40
|
||||
@@ -894,6 +898,12 @@ class Table(ABC):
|
||||
ascii_folding : bool, default True
|
||||
Whether to apply ASCII folding. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
ngram_min_length : int, default 3
|
||||
The minimum length of an n-gram.
|
||||
ngram_max_length : int, default 3
|
||||
The maximum length of an n-gram.
|
||||
prefix_only : bool, default False
|
||||
Whether to index only the prefix n-grams of each token when using the ngram tokenizer.
|
||||
wait_timeout: timedelta, optional
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
"""
|
||||
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
):
|
||||
if not use_tantivy:
|
||||
if not isinstance(field_names, str):
|
||||
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
"ngram_min_length": ngram_min_length,
|
||||
"ngram_max_length": ngram_max_length,
|
||||
"prefix_only": prefix_only,
|
||||
}
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
elif tokenizer_name == "raw":
|
||||
return {
|
||||
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
elif tokenizer_name == "whitespace":
|
||||
return {
|
||||
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
|
||||
# or it's with language stemming with pattern like "en_stem"
|
||||
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
|
||||
"stem": True,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
|
||||
def add(
|
||||
|
||||
@@ -25,4 +25,4 @@ IndexType = Literal[
|
||||
]
|
||||
|
||||
# Tokenizer literals
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace"]
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
|
||||
@@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
|
||||
res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
|
||||
assert len(res) == 2
|
||||
|
||||
|
||||
def test_fts_ngram(mem_db: DBConnection):
|
||||
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
results = (
|
||||
table.search("nce", query_type="fts").limit(10).to_list()
|
||||
) # spellchecker:disable-line
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
# the default ngram_min_length is 3, so "la" should not match
|
||||
results = table.search("la", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 0
|
||||
|
||||
# test setting ngram_min_length and prefix_only
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
base_tokenizer="ngram",
|
||||
replace=True,
|
||||
ngram_min_length=2,
|
||||
prefix_only=True,
|
||||
)
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
results = (
|
||||
table.search("nce", query_type="fts").limit(10).to_list()
|
||||
) # spellchecker:disable-line
|
||||
assert len(results) == 0
|
||||
|
||||
results = table.search("la", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
.max_token_length(params.max_token_length)
|
||||
.remove_stop_words(params.remove_stop_words)
|
||||
.stem(params.stem)
|
||||
.ascii_folding(params.ascii_folding);
|
||||
.ascii_folding(params.ascii_folding)
|
||||
.ngram_min_length(params.ngram_min_length)
|
||||
.ngram_max_length(params.ngram_max_length)
|
||||
.ngram_prefix_only(params.prefix_only);
|
||||
Ok(LanceDbIndex::FTS(inner_opts))
|
||||
},
|
||||
"IvfFlat" => {
|
||||
@@ -130,6 +133,9 @@ struct FtsParams {
|
||||
stem: bool,
|
||||
remove_stop_words: bool,
|
||||
ascii_folding: bool,
|
||||
ngram_min_length: u32,
|
||||
ngram_max_length: u32,
|
||||
prefix_only: bool,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
|
||||
Reference in New Issue
Block a user