diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts
index 44368827..9fd2986a 100644
--- a/nodejs/__test__/table.test.ts
+++ b/nodejs/__test__/table.test.ts
@@ -1058,6 +1058,26 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
       expect(results[0].text).toBe(data[0].text);
     });
 
+    test("full text search without lowercase", async () => {
+      const db = await connect(tmpDir.name);
+      const data = [
+        { text: "hello world", vector: [0.1, 0.2, 0.3] },
+        { text: "Hello World", vector: [0.4, 0.5, 0.6] },
+      ];
+      const table = await db.createTable("test", data);
+      await table.createIndex("text", {
+        config: Index.fts({ withPosition: false }),
+      });
+      const results = await table.search("hello").toArray();
+      expect(results.length).toBe(2);
+
+      await table.createIndex("text", {
+        config: Index.fts({ withPosition: false, lowercase: false }),
+      });
+      const results2 = await table.search("hello").toArray();
+      expect(results2.length).toBe(1);
+    });
+
     test("full text search phrase query", async () => {
       const db = await connect(tmpDir.name);
       const data = [
diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts
index 9001b732..503ee9ee 100644
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -349,6 +349,52 @@ export interface FtsOptions {
    * which will make the index smaller and faster to build, but will not support phrase queries.
    */
   withPosition?: boolean;
+
+  /**
+   * The tokenizer to use when building the index.
+   * The default is "simple".
+   *
+   * The following tokenizers are available:
+   *
+   * "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as delimiters.
+   *
+   * "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
+   *
+   * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
+   */
+  baseTokenizer?: "simple" | "whitespace" | "raw";
+
+  /**
+   * Language used for stemming and stop-word removal.
+   * This is only used when `stem` or `removeStopWords` is true.
+   */
+  language?: string;
+
+  /**
+   * Maximum token length.
+   * Tokens longer than this length will be ignored.
+   */
+  maxTokenLength?: number;
+
+  /**
+   * Whether to lowercase tokens.
+   */
+  lowercase?: boolean;
+
+  /**
+   * Whether to stem tokens.
+   */
+  stem?: boolean;
+
+  /**
+   * Whether to remove stop words.
+   */
+  removeStopWords?: boolean;
+
+  /**
+   * Whether to fold non-ASCII characters into their ASCII equivalents (e.g. "é" becomes "e").
+   */
+  asciiFolding?: boolean;
 }
 
 export class Index {
@@ -450,7 +496,18 @@ export class Index {
    * For now, the full text search index only supports English, and doesn't support phrase search.
    */
   static fts(options?: Partial<FtsOptions>) {
-    return new Index(LanceDbIndex.fts(options?.withPosition));
+    return new Index(
+      LanceDbIndex.fts(
+        options?.withPosition,
+        options?.baseTokenizer,
+        options?.language,
+        options?.maxTokenLength,
+        options?.lowercase,
+        options?.stem,
+        options?.removeStopWords,
+        options?.asciiFolding,
+      ),
+    );
   }
 
   /**
diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs
index 99bfdb45..547358d1 100644
--- a/nodejs/src/index.rs
+++ b/nodejs/src/index.rs
@@ -96,11 +96,48 @@ impl Index {
     }
 
     #[napi(factory)]
-    pub fn fts(with_position: Option<bool>) -> Self {
+    #[allow(clippy::too_many_arguments)]
+    pub fn fts(
+        with_position: Option<bool>,
+        base_tokenizer: Option<String>,
+        language: Option<String>,
+        max_token_length: Option<u32>,
+        lower_case: Option<bool>,
+        stem: Option<bool>,
+        remove_stop_words: Option<bool>,
+        ascii_folding: Option<bool>,
+    ) -> napi::Result<Self> {
         let mut opts = FtsIndexBuilder::default();
+        let mut tokenizer_configs = opts.tokenizer_configs.clone();
         if let Some(with_position) = with_position {
             opts = opts.with_position(with_position);
         }
+        if let Some(base_tokenizer) = base_tokenizer {
+            tokenizer_configs = tokenizer_configs.base_tokenizer(base_tokenizer);
+        }
+        if let Some(language) = language {
+            // Report an unsupported language to the JS caller as an error instead of panicking.
+            tokenizer_configs = tokenizer_configs.language(&language).map_err(|e| {
+                napi::Error::from_reason(format!("invalid language '{}': {}", language, e))
+            })?;
+        }
+        if let Some(max_token_length) = max_token_length {
+            tokenizer_configs = tokenizer_configs.max_token_length(Some(max_token_length as usize));
+        }
+        if let Some(lower_case) = lower_case {
+            tokenizer_configs = tokenizer_configs.lower_case(lower_case);
+        }
+        if let Some(stem) = stem {
+            tokenizer_configs = tokenizer_configs.stem(stem);
+        }
+        if let Some(remove_stop_words) = remove_stop_words {
+            tokenizer_configs = tokenizer_configs.remove_stop_words(remove_stop_words);
+        }
+        if let Some(ascii_folding) = ascii_folding {
+            tokenizer_configs = tokenizer_configs.ascii_folding(ascii_folding);
+        }
+        opts.tokenizer_configs = tokenizer_configs;
+
-        Self {
-            inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
-        }
+        Ok(Self {
+            inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
+        })