feat(node): support FTS options in nodejs (#1934)

Closes #1790 --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-12-26 22:59:57 +00:00 · 2024-12-13 00:19:04 +08:00
parent 10f919a0a9
commit c3ebac1a92
3 changed files with 113 additions and 2 deletions
--- a/nodejs/test/table.test.ts
+++ b/nodejs/test/table.test.ts
@@ -1058,6 +1058,26 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      expect(results[0].text).toBe(data[0].text);
    });

+    test("full text search without lowercase", async () => {
+      const db = await connect(tmpDir.name);
+      const data = [
+        { text: "hello world", vector: [0.1, 0.2, 0.3] },
+        { text: "Hello World", vector: [0.4, 0.5, 0.6] },
+      ];
+      const table = await db.createTable("test", data);
+      await table.createIndex("text", {
+        config: Index.fts({ withPosition: false }),
+      });
+      const results = await table.search("hello").toArray();
+      expect(results.length).toBe(2);
+
+      await table.createIndex("text", {
+        config: Index.fts({ withPosition: false, lowercase: false }),
+      });
+      const results2 = await table.search("hello").toArray();
+      expect(results2.length).toBe(1);
+    });
+
    test("full text search phrase query", async () => {
      const db = await connect(tmpDir.name);
      const data = [
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -349,6 +349,52 @@ export interface FtsOptions {
   * which will make the index smaller and faster to build, but will not support phrase queries.
   */
  withPosition?: boolean;
+
+  /**
+   * The tokenizer to use when building the index.
+   * The default is "simple".
+   *
+   * The following tokenizers are available:
+   *
+   * "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as delimiters.
+   *
+   * "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
+   *
+   * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
+   */
+  baseTokenizer?: "simple" | "whitespace" | "raw";
+
+  /**
+   * language for stemming and stop words
+   * this is only used when `stem` or `removeStopWords` is true
+   */
+  language?: string;
+
+  /**
+   * maximum token length
+   * tokens longer than this length will be ignored
+   */
+  maxTokenLength?: number;
+
+  /**
+   * whether to lowercase tokens
+   */
+  lowercase?: boolean;
+
+  /**
+   * whether to stem tokens
+   */
+  stem?: boolean;
+
+  /**
+   * whether to remove stop words
+   */
+  removeStopWords?: boolean;
+
+  /**
+   * whether to fold non-ASCII characters to their ASCII equivalents (e.g. "é" becomes "e")
+   */
+  asciiFolding?: boolean;
 }

 export class Index {
@@ -450,7 +496,18 @@ export class Index {
   * For now, the full text search index only supports English, and doesn't support phrase search.
   */
  static fts(options?: Partial<FtsOptions>) {
-    return new Index(LanceDbIndex.fts(options?.withPosition));
+    return new Index(
+      LanceDbIndex.fts(
+        options?.withPosition,
+        options?.baseTokenizer,
+        options?.language,
+        options?.maxTokenLength,
+        options?.lowercase,
+        options?.stem,
+        options?.removeStopWords,
+        options?.asciiFolding,
+      ),
+    );
  }

  /**
--- a/nodejs/src/index.rs
+++ b/nodejs/src/index.rs
@@ -96,11 +96,45 @@ impl Index {
    }

    #[napi(factory)]
-    pub fn fts(with_position: Option<bool>) -> Self {
+    #[allow(clippy::too_many_arguments)]
+    pub fn fts(
+        with_position: Option<bool>,
+        base_tokenizer: Option<String>,
+        language: Option<String>,
+        max_token_length: Option<u32>,
+        lower_case: Option<bool>,
+        stem: Option<bool>,
+        remove_stop_words: Option<bool>,
+        ascii_folding: Option<bool>,
+    ) -> Self {
        let mut opts = FtsIndexBuilder::default();
+        let mut tokenizer_configs = opts.tokenizer_configs.clone();
        if let Some(with_position) = with_position {
            opts = opts.with_position(with_position);
        }
+        if let Some(base_tokenizer) = base_tokenizer {
+            tokenizer_configs = tokenizer_configs.base_tokenizer(base_tokenizer);
+        }
+        if let Some(language) = language {
+            tokenizer_configs = tokenizer_configs.language(&language).unwrap();
+        }
+        if let Some(max_token_length) = max_token_length {
+            tokenizer_configs = tokenizer_configs.max_token_length(Some(max_token_length as usize));
+        }
+        if let Some(lower_case) = lower_case {
+            tokenizer_configs = tokenizer_configs.lower_case(lower_case);
+        }
+        if let Some(stem) = stem {
+            tokenizer_configs = tokenizer_configs.stem(stem);
+        }
+        if let Some(remove_stop_words) = remove_stop_words {
+            tokenizer_configs = tokenizer_configs.remove_stop_words(remove_stop_words);
+        }
+        if let Some(ascii_folding) = ascii_folding {
+            tokenizer_configs = tokenizer_configs.ascii_folding(ascii_folding);
+        }
+        opts.tokenizer_configs = tokenizer_configs;
+
        Self {
            inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
        }