feat: support ngram tokenizer (#2507)

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Author: BubbleCal
Date: 2025-07-15 16:36:08 +08:00
Committed by: GitHub
Parent: 4c999fb651
Commit: 03b62599d7
9 changed files with 173 additions and 3 deletions

@@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(mustNotResults.length).toBe(1);
});
test("full text search ngram", async () => {
const db = await connect(tmpDir.name);
const data = [
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
];
const table = await db.createTable("test", data);
await table.createIndex("text", {
config: Index.fts({ baseTokenizer: "ngram" }),
});
const results = await table.search("lan").toArray();
expect(results.length).toBe(2);
const resultSet = new Set(results.map((r) => r.text));
expect(resultSet.has("lance database")).toBe(true);
expect(resultSet.has("lance is cool")).toBe(true);
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results2.length).toBe(2);
const resultSet2 = new Set(results2.map((r) => r.text));
expect(resultSet2.has("lance database")).toBe(true);
expect(resultSet2.has("lance is cool")).toBe(true);
// the default min_ngram_length is 3, so "la" should not match
const results3 = await table.search("la").toArray();
expect(results3.length).toBe(0);
// test setting min_ngram_length and prefix_only
await table.createIndex("text", {
config: Index.fts({
baseTokenizer: "ngram",
ngramMinLength: 2,
prefixOnly: true,
}),
replace: true,
});
const results4 = await table.search("lan").toArray();
expect(results4.length).toBe(2);
const resultSet4 = new Set(results4.map((r) => r.text));
expect(resultSet4.has("lance database")).toBe(true);
expect(resultSet4.has("lance is cool")).toBe(true);
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results5.length).toBe(0);
const results6 = await table.search("la").toArray();
expect(results6.length).toBe(2);
const resultSet6 = new Set(results6.map((r) => r.text));
expect(resultSet6.has("lance database")).toBe(true);
expect(resultSet6.has("lance is cool")).toBe(true);
});
test.each([
[0.4, 0.5, 0.599], // number[]
Float32Array.of(0.4, 0.5, 0.599), // Float32Array

@@ -439,7 +439,7 @@ export interface FtsOptions {
*
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
*/
baseTokenizer?: "simple" | "whitespace" | "raw";
baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
/**
* language for stemming and stop words
@@ -472,6 +472,21 @@ export interface FtsOptions {
* whether to remove punctuation
*/
asciiFolding?: boolean;
/**
* minimum length of an n-gram (ngram tokenizer only)
*/
ngramMinLength?: number;
/**
* maximum length of an n-gram (ngram tokenizer only)
*/
ngramMaxLength?: number;
/**
* whether to index only the prefix of each token (ngram tokenizer only)
*/
prefixOnly?: boolean;
}
export class Index {
@@ -608,6 +623,9 @@ export class Index {
options?.stem,
options?.removeStopWords,
options?.asciiFolding,
options?.ngramMinLength,
options?.ngramMaxLength,
options?.prefixOnly,
),
);
}

@@ -123,6 +123,9 @@ impl Index {
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
ngram_min_length: Option<u32>,
ngram_max_length: Option<u32>,
prefix_only: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
@@ -149,6 +152,15 @@ impl Index {
if let Some(ascii_folding) = ascii_folding {
opts = opts.ascii_folding(ascii_folding);
}
if let Some(ngram_min_length) = ngram_min_length {
opts = opts.ngram_min_length(ngram_min_length);
}
if let Some(ngram_max_length) = ngram_max_length {
opts = opts.ngram_max_length(ngram_max_length);
}
if let Some(prefix_only) = prefix_only {
opts = opts.ngram_prefix_only(prefix_only);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),

@@ -137,6 +137,9 @@ class FTS:
stem: bool = True
remove_stop_words: bool = True
ascii_folding: bool = True
ngram_min_length: int = 3
ngram_max_length: int = 3
prefix_only: bool = False
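
To make these three knobs concrete, here is a toy sketch of the extraction semantics (an illustration only; the real tokenizer is tantivy's NgramTokenizer, not this function):

    # Toy illustration of character n-gram extraction; not the actual tokenizer.
    def char_ngrams(token: str, min_len: int = 3, max_len: int = 3,
                    prefix_only: bool = False):
        # With prefix_only, only grams anchored at the start of the token are kept.
        starts = [0] if prefix_only else range(len(token))
        grams = []
        for i in starts:
            for n in range(min_len, max_len + 1):
                if i + n <= len(token):
                    grams.append(token[i : i + n])
        return grams

    char_ngrams("lance")                               # ['lan', 'anc', 'nce']
    char_ngrams("lance", min_len=2, prefix_only=True)  # ['la', 'lan']

With the defaults (min and max length both 3), a two-character query like "la" produces no gram at all and matches nothing, which is exactly what the tests in this change verify.
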
@dataclass

@@ -158,6 +158,9 @@ class RemoteTable(Table):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
):
config = FTS(
with_position=with_position,
@@ -168,6 +171,9 @@ class RemoteTable(Table):
stem=stem,
remove_stop_words=remove_stop_words,
ascii_folding=ascii_folding,
ngram_min_length=ngram_min_length,
ngram_max_length=ngram_max_length,
prefix_only=prefix_only,
)
LOOP.run(
self._table.create_index(

@@ -838,6 +838,9 @@ class Table(ABC):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
wait_timeout: Optional[timedelta] = None,
):
"""Create a full-text search index on the table.
@@ -877,6 +880,7 @@ class Table(ABC):
- "simple": Splits text by whitespace and punctuation.
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
- "ngram": N-Gram tokenizer.
language : str, default "English"
The language to use for tokenization.
max_token_length : int, default 40
@@ -894,6 +898,12 @@ class Table(ABC):
ascii_folding : bool, default True
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
ngram_min_length: int, default 3
The minimum length of an n-gram.
ngram_max_length: int, default 3
The maximum length of an n-gram.
prefix_only: bool, default False
Whether to index only the prefix of each token. Only used with the ngram tokenizer.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
"""
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
):
if not use_tantivy:
if not isinstance(field_names, str):
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
"ngram_min_length": ngram_min_length,
"ngram_max_length": ngram_max_length,
"prefix_only": prefix_only,
}
else:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
elif tokenizer_name == "raw":
return {
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
elif tokenizer_name == "whitespace":
return {
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
# otherwise the tokenizer name follows a language-stemming pattern like "en_stem"
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
"stem": True,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
def add(

@@ -25,4 +25,4 @@ IndexType = Literal[
]
# Tokenizer literals
BaseTokenizerType = Literal["simple", "raw", "whitespace"]
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]

@@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):
res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
assert len(res) == 2
def test_fts_ngram(mem_db: DBConnection):
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
table = mem_db.create_table("test", data=data)
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
results = table.search("lan", query_type="fts").limit(10).to_list()
assert len(results) == 2
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
results = (
table.search("nce", query_type="fts").limit(10).to_list()
) # spellchecker:disable-line
assert len(results) == 2
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
# the default min_ngram_length is 3, so "la" should not match
results = table.search("la", query_type="fts").limit(10).to_list()
assert len(results) == 0
# test setting min_ngram_length and prefix_only
table.create_fts_index(
"text",
use_tantivy=False,
base_tokenizer="ngram",
replace=True,
ngram_min_length=2,
prefix_only=True,
)
results = table.search("lan", query_type="fts").limit(10).to_list()
assert len(results) == 2
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
results = (
table.search("nce", query_type="fts").limit(10).to_list()
) # spellchecker:disable-line
assert len(results) == 0
results = table.search("la", query_type="fts").limit(10).to_list()
assert len(results) == 2
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}

@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
.max_token_length(params.max_token_length)
.remove_stop_words(params.remove_stop_words)
.stem(params.stem)
.ascii_folding(params.ascii_folding);
.ascii_folding(params.ascii_folding)
.ngram_min_length(params.ngram_min_length)
.ngram_max_length(params.ngram_max_length)
.ngram_prefix_only(params.prefix_only);
Ok(LanceDbIndex::FTS(inner_opts))
},
"IvfFlat" => {
@@ -130,6 +133,9 @@ struct FtsParams {
stem: bool,
remove_stop_words: bool,
ascii_folding: bool,
ngram_min_length: u32,
ngram_max_length: u32,
prefix_only: bool,
}
#[derive(FromPyObject)]