feat(node): support FTS options in nodejs (#1934)

Closes #1790 --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2026-05-24 15:30:38 +00:00 · 2024-12-13 00:19:04 +08:00
parent 10f919a0a9
commit c3ebac1a92
3 changed files with 113 additions and 2 deletions
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -349,6 +349,52 @@ export interface FtsOptions {
   * which will make the index smaller and faster to build, but will not support phrase queries.
   */
  withPosition?: boolean;
+
+  /**
+   * The tokenizer to use when building the index.
+   * The default is "simple".
+   *
+   * The following tokenizers are available:
+   *
+   * "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as delimiters.
+   *
+   * "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
+   *
+   * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
+   */
+  baseTokenizer?: "simple" | "whitespace" | "raw";
+
+  /**
+   * Language used for stemming and stop-word removal.
+   * This is only used when `stem` or `removeStopWords` is true.
+   */
+  language?: string;
+
+  /**
+   * maximum token length
+   * tokens longer than this length will be ignored
+   */
+  maxTokenLength?: number;
+
+  /**
+   * whether to lowercase tokens
+   */
+  lowercase?: boolean;
+
+  /**
+   * whether to stem tokens
+   */
+  stem?: boolean;
+
+  /**
+   * whether to remove stop words
+   */
+  removeStopWords?: boolean;
+
+  /**
+   * whether to fold non-ASCII characters to their ASCII equivalent (e.g. "café" becomes "cafe")
+   */
+  asciiFolding?: boolean;
 }

 export class Index {
@@ -450,7 +496,18 @@ export class Index {
   * For now, the full text search index only supports English, and doesn't support phrase search.
   */
  static fts(options?: Partial<FtsOptions>) {
-    return new Index(LanceDbIndex.fts(options?.withPosition));
+    return new Index(
+      LanceDbIndex.fts(
+        options?.withPosition,
+        options?.baseTokenizer,
+        options?.language,
+        options?.maxTokenLength,
+        options?.lowercase,
+        options?.stem,
+        options?.removeStopWords,
+        options?.asciiFolding,
+      ),
+    );
  }

  /**