From 03b62599d7cc40b567336599389f32cce1bab078 Mon Sep 17 00:00:00 2001
From: BubbleCal <bubble-cal@outlook.com>
Date: Tue, 15 Jul 2025 16:36:08 +0800
Subject: [PATCH] feat: support ngram tokenizer (#2507)

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
---
 nodejs/__test__/table.test.ts         | 54 +++++++++++++++++++++++++++
 nodejs/lancedb/indices.ts             | 20 +++++++++-
 nodejs/src/index.rs                   | 12 ++++++
 python/python/lancedb/index.py        |  3 ++
 python/python/lancedb/remote/table.py |  6 +++
 python/python/lancedb/table.py        | 28 ++++++++++++++
 python/python/lancedb/types.py        |  2 +-
 python/python/tests/test_fts.py       | 43 +++++++++++++++++++++
 python/src/index.rs                   |  8 +++-
 9 files changed, 173 insertions(+), 3 deletions(-)

diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts
index 4995f757..9d49f243 100644
--- a/nodejs/__test__/table.test.ts
+++ b/nodejs/__test__/table.test.ts
@@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
       expect(mustNotResults.length).toBe(1);
     });
 
+    test("full text search ngram", async () => {
+      const db = await connect(tmpDir.name);
+      const data = [
+        { text: "hello world", vector: [0.1, 0.2, 0.3] },
+        { text: "lance database", vector: [0.4, 0.5, 0.6] },
+        { text: "lance is cool", vector: [0.7, 0.8, 0.9] },
+      ];
+      const table = await db.createTable("test", data);
+      await table.createIndex("text", {
+        config: Index.fts({ baseTokenizer: "ngram" }),
+      });
+
+      const results = await table.search("lan").toArray();
+      expect(results.length).toBe(2);
+      const resultSet = new Set(results.map((r) => r.text));
+      expect(resultSet.has("lance database")).toBe(true);
+      expect(resultSet.has("lance is cool")).toBe(true);
+
+      const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
+      expect(results2.length).toBe(2);
+      const resultSet2 = new Set(results2.map((r) => r.text));
+      expect(resultSet2.has("lance database")).toBe(true);
+      expect(resultSet2.has("lance is cool")).toBe(true);
+
+      // the default ngramMinLength is 3, so "la" should not match
+      const results3 = await table.search("la").toArray();
+      expect(results3.length).toBe(0);
+
+      // test setting ngramMinLength and prefixOnly
+      await table.createIndex("text", {
+        config: Index.fts({
+          baseTokenizer: "ngram",
+          ngramMinLength: 2,
+          prefixOnly: true,
+        }),
+        replace: true,
+      });
+
+      const results4 = await table.search("lan").toArray();
+      expect(results4.length).toBe(2);
+      const resultSet4 = new Set(results4.map((r) => r.text));
+      expect(resultSet4.has("lance database")).toBe(true);
+      expect(resultSet4.has("lance is cool")).toBe(true);
+
+      const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
+      expect(results5.length).toBe(0);
+
+      const results6 = await table.search("la").toArray();
+      expect(results6.length).toBe(2);
+      const resultSet6 = new Set(results6.map((r) => r.text));
+      expect(resultSet6.has("lance database")).toBe(true);
+      expect(resultSet6.has("lance is cool")).toBe(true);
+    });
+
     test.each([
       [0.4, 0.5, 0.599], // number[]
       Float32Array.of(0.4, 0.5, 0.599), // Float32Array
diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts
index 2aa14ff5..a03e732b 100644
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -439,7 +439,7 @@ export interface FtsOptions {
    *
    * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
    */
-  baseTokenizer?: "simple" | "whitespace" | "raw";
+  baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
 
   /**
    * language for stemming and stop words
@@ -472,6 +472,21 @@ export interface FtsOptions {
    * whether to remove punctuation
    */
   asciiFolding?: boolean;
+
+  /**
+   * minimum length of an n-gram, used by the "ngram" base tokenizer
+   */
+  ngramMinLength?: number;
+
+  /**
+   * maximum length of an n-gram, used by the "ngram" base tokenizer
+   */
+  ngramMaxLength?: number;
+
+  /**
+   * whether to only index the prefix of the token for ngram tokenizer
+   */
+  prefixOnly?: boolean;
 }
 
 export class Index {
@@ -608,6 +623,9 @@ export class Index {
         options?.stem,
         options?.removeStopWords,
         options?.asciiFolding,
+        options?.ngramMinLength,
+        options?.ngramMaxLength,
+        options?.prefixOnly,
       ),
     );
   }
diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs
index 8179d2d7..37b775f6 100644
--- a/nodejs/src/index.rs
+++ b/nodejs/src/index.rs
@@ -123,6 +123,9 @@ impl Index {
         stem: Option<bool>,
         remove_stop_words: Option<bool>,
         ascii_folding: Option<bool>,
+        ngram_min_length: Option<u32>,
+        ngram_max_length: Option<u32>,
+        prefix_only: Option<bool>,
     ) -> Self {
         let mut opts = FtsIndexBuilder::default();
         if let Some(with_position) = with_position {
@@ -149,6 +152,15 @@ impl Index {
         if let Some(ascii_folding) = ascii_folding {
             opts = opts.ascii_folding(ascii_folding);
         }
+        if let Some(ngram_min_length) = ngram_min_length {
+            opts = opts.ngram_min_length(ngram_min_length);
+        }
+        if let Some(ngram_max_length) = ngram_max_length {
+            opts = opts.ngram_max_length(ngram_max_length);
+        }
+        if let Some(prefix_only) = prefix_only {
+            opts = opts.ngram_prefix_only(prefix_only);
+        }
 
         Self {
             inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py
index b91e5582..dbcb6013 100644
--- a/python/python/lancedb/index.py
+++ b/python/python/lancedb/index.py
@@ -137,6 +137,9 @@ class FTS:
     stem: bool = True
     remove_stop_words: bool = True
     ascii_folding: bool = True
+    ngram_min_length: int = 3
+    ngram_max_length: int = 3
+    prefix_only: bool = False
 
 
 @dataclass
diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py
index 3c23d6e5..a07bc00d 100644
--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -158,6 +158,9 @@ class RemoteTable(Table):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
     ):
         config = FTS(
             with_position=with_position,
@@ -168,6 +171,9 @@ class RemoteTable(Table):
             stem=stem,
             remove_stop_words=remove_stop_words,
             ascii_folding=ascii_folding,
+            ngram_min_length=ngram_min_length,
+            ngram_max_length=ngram_max_length,
+            prefix_only=prefix_only,
         )
         LOOP.run(
             self._table.create_index(
diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index 6a964104..e409f5ed 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -838,6 +838,9 @@ class Table(ABC):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
         wait_timeout: Optional[timedelta] = None,
     ):
         """Create a full-text search index on the table.
@@ -877,6 +880,7 @@ class Table(ABC):
             - "simple": Splits text by whitespace and punctuation.
             - "whitespace": Split text by whitespace, but not punctuation.
             - "raw": No tokenization. The entire text is treated as a single token.
+            - "ngram": Splits text into n-grams, so substrings such as "lan" can match.
         language : str, default "English"
             The language to use for tokenization.
         max_token_length : int, default 40
@@ -894,6 +898,12 @@ class Table(ABC):
         ascii_folding : bool, default True
             Whether to fold ASCII characters. This converts accented characters to
             their ASCII equivalent. For example, "café" would be converted to "cafe".
+        ngram_min_length : int, default 3
+            The minimum length of an n-gram (used by the "ngram" tokenizer).
+        ngram_max_length : int, default 3
+            The maximum length of an n-gram (used by the "ngram" tokenizer).
+        prefix_only : bool, default False
+            Whether to only index the prefix of the token for ngram tokenizer.
         wait_timeout: timedelta, optional
             The timeout to wait if indexing is asynchronous.
         """
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
     ):
         if not use_tantivy:
             if not isinstance(field_names, str):
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
                     "stem": stem,
                     "remove_stop_words": remove_stop_words,
                     "ascii_folding": ascii_folding,
+                    "ngram_min_length": ngram_min_length,
+                    "ngram_max_length": ngram_max_length,
+                    "prefix_only": prefix_only,
                 }
             else:
                 tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
         elif tokenizer_name == "raw":
             return {
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
         elif tokenizer_name == "whitespace":
             return {
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
 
         # or it's with language stemming with pattern like "en_stem"
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
             "stem": True,
             "remove_stop_words": False,
             "ascii_folding": False,
+            "ngram_min_length": 3,
+            "ngram_max_length": 3,
+            "prefix_only": False,
         }
 
     def add(
diff --git a/python/python/lancedb/types.py b/python/python/lancedb/types.py
index 456c5364..61df76c4 100644
--- a/python/python/lancedb/types.py
+++ b/python/python/lancedb/types.py
@@ -25,4 +25,4 @@ IndexType = Literal[
 ]
 
 # Tokenizer literals
-BaseTokenizerType = Literal["simple", "raw", "whitespace"]
+BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py
index 2171d4d2..f3e25b02 100644
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):
 
     res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
     assert len(res) == 2
+
+
+def test_fts_ngram(mem_db: DBConnection):
+    data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
+    table = mem_db.create_table("test", data=data)
+    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
+
+    results = table.search("lan", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    results = (
+        table.search("nce", query_type="fts").limit(10).to_list()
+    )  # spellchecker:disable-line
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    # the default ngram_min_length is 3, so "la" should not match
+    results = table.search("la", query_type="fts").limit(10).to_list()
+    assert len(results) == 0
+
+    # test setting ngram_min_length and prefix_only
+    table.create_fts_index(
+        "text",
+        use_tantivy=False,
+        base_tokenizer="ngram",
+        replace=True,
+        ngram_min_length=2,
+        prefix_only=True,
+    )
+
+    results = table.search("lan", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    results = (
+        table.search("nce", query_type="fts").limit(10).to_list()
+    )  # spellchecker:disable-line
+    assert len(results) == 0
+
+    results = table.search("la", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
diff --git a/python/src/index.rs b/python/src/index.rs
index 8b19efee..0381c21c 100644
--- a/python/src/index.rs
+++ b/python/src/index.rs
@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
                     .max_token_length(params.max_token_length)
                     .remove_stop_words(params.remove_stop_words)
                     .stem(params.stem)
-                    .ascii_folding(params.ascii_folding);
+                    .ascii_folding(params.ascii_folding)
+                    .ngram_min_length(params.ngram_min_length)
+                    .ngram_max_length(params.ngram_max_length)
+                    .ngram_prefix_only(params.prefix_only);
                 Ok(LanceDbIndex::FTS(inner_opts))
             },
             "IvfFlat" => {
@@ -130,6 +133,9 @@ struct FtsParams {
     stem: bool,
     remove_stop_words: bool,
     ascii_folding: bool,
+    ngram_min_length: u32,
+    ngram_max_length: u32,
+    prefix_only: bool,
 }
 
 #[derive(FromPyObject)]