From 03b62599d7cc40b567336599389f32cce1bab078 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 15 Jul 2025 16:36:08 +0800 Subject: [PATCH] feat: support ngram tokenizer (#2507) Signed-off-by: BubbleCal --- nodejs/__test__/table.test.ts | 54 +++++++++++++++++++++++++++ nodejs/lancedb/indices.ts | 20 +++++++++- nodejs/src/index.rs | 12 ++++++ python/python/lancedb/index.py | 3 ++ python/python/lancedb/remote/table.py | 6 +++ python/python/lancedb/table.py | 28 ++++++++++++++ python/python/lancedb/types.py | 2 +- python/python/tests/test_fts.py | 43 +++++++++++++++++++++ python/src/index.rs | 8 +++- 9 files changed, 173 insertions(+), 3 deletions(-) diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 4995f757..9d49f243 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])( expect(mustNotResults.length).toBe(1); }); + test("full text search ngram", async () => { + const db = await connect(tmpDir.name); + const data = [ + { text: "hello world", vector: [0.1, 0.2, 0.3] }, + { text: "lance database", vector: [0.4, 0.5, 0.6] }, + { text: "lance is cool", vector: [0.7, 0.8, 0.9] }, + ]; + const table = await db.createTable("test", data); + await table.createIndex("text", { + config: Index.fts({ baseTokenizer: "ngram" }), + }); + + const results = await table.search("lan").toArray(); + expect(results.length).toBe(2); + const resultSet = new Set(results.map((r) => r.text)); + expect(resultSet.has("lance database")).toBe(true); + expect(resultSet.has("lance is cool")).toBe(true); + + const results2 = await table.search("nce").toArray(); // spellchecker:disable-line + expect(results2.length).toBe(2); + const resultSet2 = new Set(results2.map((r) => r.text)); + expect(resultSet2.has("lance database")).toBe(true); + expect(resultSet2.has("lance is cool")).toBe(true); + + // the default ngramMinLength is 3, so "la" should not match + const results3 
= await table.search("la").toArray(); + expect(results3.length).toBe(0); + + // test setting ngramMinLength and prefixOnly + await table.createIndex("text", { + config: Index.fts({ + baseTokenizer: "ngram", + ngramMinLength: 2, + prefixOnly: true, + }), + replace: true, + }); + + const results4 = await table.search("lan").toArray(); + expect(results4.length).toBe(2); + const resultSet4 = new Set(results4.map((r) => r.text)); + expect(resultSet4.has("lance database")).toBe(true); + expect(resultSet4.has("lance is cool")).toBe(true); + + const results5 = await table.search("nce").toArray(); // spellchecker:disable-line + expect(results5.length).toBe(0); + + const results6 = await table.search("la").toArray(); + expect(results6.length).toBe(2); + const resultSet6 = new Set(results6.map((r) => r.text)); + expect(resultSet6.has("lance database")).toBe(true); + expect(resultSet6.has("lance is cool")).toBe(true); + }); + test.each([ [0.4, 0.5, 0.599], // number[] Float32Array.of(0.4, 0.5, 0.599), // Float32Array diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts index 2aa14ff5..a03e732b 100644 --- a/nodejs/lancedb/indices.ts +++ b/nodejs/lancedb/indices.ts @@ -439,7 +439,7 @@ export interface FtsOptions { * * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token. 
*/ - baseTokenizer?: "simple" | "whitespace" | "raw"; + baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram"; /** * language for stemming and stop words @@ -472,6 +472,21 @@ export interface FtsOptions { * whether to remove punctuation */ asciiFolding?: boolean; + + /** + * ngram min length + */ + ngramMinLength?: number; + + /** + * ngram max length + */ + ngramMaxLength?: number; + + /** + * whether to only index the prefix of the token for ngram tokenizer + */ + prefixOnly?: boolean; } export class Index { @@ -608,6 +623,9 @@ export class Index { options?.stem, options?.removeStopWords, options?.asciiFolding, + options?.ngramMinLength, + options?.ngramMaxLength, + options?.prefixOnly, ), ); } diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs index 8179d2d7..37b775f6 100644 --- a/nodejs/src/index.rs +++ b/nodejs/src/index.rs @@ -123,6 +123,9 @@ impl Index { stem: Option, remove_stop_words: Option, ascii_folding: Option, + ngram_min_length: Option, + ngram_max_length: Option, + prefix_only: Option, ) -> Self { let mut opts = FtsIndexBuilder::default(); if let Some(with_position) = with_position { @@ -149,6 +152,15 @@ impl Index { if let Some(ascii_folding) = ascii_folding { opts = opts.ascii_folding(ascii_folding); } + if let Some(ngram_min_length) = ngram_min_length { + opts = opts.ngram_min_length(ngram_min_length); + } + if let Some(ngram_max_length) = ngram_max_length { + opts = opts.ngram_max_length(ngram_max_length); + } + if let Some(prefix_only) = prefix_only { + opts = opts.ngram_prefix_only(prefix_only); + } Self { inner: Mutex::new(Some(LanceDbIndex::FTS(opts))), diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index b91e5582..dbcb6013 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -137,6 +137,9 @@ class FTS: stem: bool = True remove_stop_words: bool = True ascii_folding: bool = True + ngram_min_length: int = 3 + ngram_max_length: int = 3 + prefix_only: bool = False @dataclass 
diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 3c23d6e5..a07bc00d 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -158,6 +158,9 @@ class RemoteTable(Table): stem: bool = True, remove_stop_words: bool = True, ascii_folding: bool = True, + ngram_min_length: int = 3, + ngram_max_length: int = 3, + prefix_only: bool = False, ): config = FTS( with_position=with_position, @@ -168,6 +171,9 @@ class RemoteTable(Table): stem=stem, remove_stop_words=remove_stop_words, ascii_folding=ascii_folding, + ngram_min_length=ngram_min_length, + ngram_max_length=ngram_max_length, + prefix_only=prefix_only, ) LOOP.run( self._table.create_index( diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 6a964104..e409f5ed 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -838,6 +838,9 @@ class Table(ABC): stem: bool = True, remove_stop_words: bool = True, ascii_folding: bool = True, + ngram_min_length: int = 3, + ngram_max_length: int = 3, + prefix_only: bool = False, wait_timeout: Optional[timedelta] = None, ): """Create a full-text search index on the table. @@ -877,6 +880,7 @@ class Table(ABC): - "simple": Splits text by whitespace and punctuation. - "whitespace": Split text by whitespace, but not punctuation. - "raw": No tokenization. The entire text is treated as a single token. + - "ngram": N-Gram tokenizer. language : str, default "English" The language to use for tokenization. max_token_length : int, default 40 @@ -894,6 +898,12 @@ class Table(ABC): ascii_folding : bool, default True Whether to fold ASCII characters. This converts accented characters to their ASCII equivalent. For example, "café" would be converted to "cafe". + ngram_min_length: int, default 3 + The minimum length of an n-gram. + ngram_max_length: int, default 3 + The maximum length of an n-gram. 
+ prefix_only: bool, default False + Whether to only index the prefix of the token for ngram tokenizer. wait_timeout: timedelta, optional The timeout to wait if indexing is asynchronous. """ @@ -1981,6 +1991,9 @@ class LanceTable(Table): stem: bool = True, remove_stop_words: bool = True, ascii_folding: bool = True, + ngram_min_length: int = 3, + ngram_max_length: int = 3, + prefix_only: bool = False, ): if not use_tantivy: if not isinstance(field_names, str): @@ -1996,6 +2009,9 @@ class LanceTable(Table): "stem": stem, "remove_stop_words": remove_stop_words, "ascii_folding": ascii_folding, + "ngram_min_length": ngram_min_length, + "ngram_max_length": ngram_max_length, + "prefix_only": prefix_only, } else: tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name) @@ -2065,6 +2081,9 @@ class LanceTable(Table): "stem": False, "remove_stop_words": False, "ascii_folding": False, + "ngram_min_length": 3, + "ngram_max_length": 3, + "prefix_only": False, } elif tokenizer_name == "raw": return { @@ -2075,6 +2094,9 @@ class LanceTable(Table): "stem": False, "remove_stop_words": False, "ascii_folding": False, + "ngram_min_length": 3, + "ngram_max_length": 3, + "prefix_only": False, } elif tokenizer_name == "whitespace": return { @@ -2085,6 +2107,9 @@ class LanceTable(Table): "stem": False, "remove_stop_words": False, "ascii_folding": False, + "ngram_min_length": 3, + "ngram_max_length": 3, + "prefix_only": False, } # or it's with language stemming with pattern like "en_stem" @@ -2103,6 +2128,9 @@ class LanceTable(Table): "stem": True, "remove_stop_words": False, "ascii_folding": False, + "ngram_min_length": 3, + "ngram_max_length": 3, + "prefix_only": False, } def add( diff --git a/python/python/lancedb/types.py b/python/python/lancedb/types.py index 456c5364..61df76c4 100644 --- a/python/python/lancedb/types.py +++ b/python/python/lancedb/types.py @@ -25,4 +25,4 @@ IndexType = Literal[ ] # Tokenizer literals -BaseTokenizerType = Literal["simple", "raw", "whitespace"] 
+BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"] diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index 2171d4d2..f3e25b02 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection): res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list() assert len(res) == 2 + + +def test_fts_ngram(mem_db: DBConnection): + data = pa.table({"text": ["hello world", "lance database", "lance is cool"]}) + table = mem_db.create_table("test", data=data) + table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram") + + results = table.search("lan", query_type="fts").limit(10).to_list() + assert len(results) == 2 + assert set(r["text"] for r in results) == {"lance database", "lance is cool"} + + results = ( + table.search("nce", query_type="fts").limit(10).to_list() + ) # spellchecker:disable-line + assert len(results) == 2 + assert set(r["text"] for r in results) == {"lance database", "lance is cool"} + + # the default ngram_min_length is 3, so "la" should not match + results = table.search("la", query_type="fts").limit(10).to_list() + assert len(results) == 0 + + # test setting ngram_min_length and prefix_only + table.create_fts_index( + "text", + use_tantivy=False, + base_tokenizer="ngram", + replace=True, + ngram_min_length=2, + prefix_only=True, + ) + + results = table.search("lan", query_type="fts").limit(10).to_list() + assert len(results) == 2 + assert set(r["text"] for r in results) == {"lance database", "lance is cool"} + + results = ( + table.search("nce", query_type="fts").limit(10).to_list() + ) # spellchecker:disable-line + assert len(results) == 0 + + results = table.search("la", query_type="fts").limit(10).to_list() + assert len(results) == 2 + assert set(r["text"] for r in results) == {"lance database", "lance is cool"} diff --git a/python/src/index.rs b/python/src/index.rs index 8b19efee..0381c21c 
100644 --- a/python/src/index.rs +++ b/python/src/index.rs @@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option>) -> PyResult { @@ -130,6 +133,9 @@ struct FtsParams { stem: bool, remove_stop_words: bool, ascii_folding: bool, + ngram_min_length: u32, + ngram_max_length: u32, + prefix_only: bool, } #[derive(FromPyObject)]