feat(node): support FTS options in nodejs (#1934)

Closes #1790

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2024-12-13 00:19:04 +08:00
committed by GitHub
parent 10f919a0a9
commit c3ebac1a92
3 changed files with 113 additions and 2 deletions

View File

@@ -349,6 +349,52 @@ export interface FtsOptions {
* which will make the index smaller and faster to build, but will not support phrase queries.
*/
withPosition?: boolean;
/**
* The tokenizer to use when building the index.
* The default is "simple".
*
* The following tokenizers are available:
*
* "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as a delimiter.
*
* "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
*
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
*/
baseTokenizer?: "simple" | "whitespace" | "raw";
/**
* language for stemming and stop words
* this is only used when `stem` or `removeStopWords` is true
*/
language?: string;
/**
* maximum token length
* tokens longer than this length will be ignored
*/
maxTokenLength?: number;
/**
* whether to lowercase tokens
*/
lowercase?: boolean;
/**
* whether to stem tokens
*/
stem?: boolean;
/**
* whether to remove stop words
*/
removeStopWords?: boolean;
/**
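* whether to apply ASCII folding, i.e. convert non-ASCII characters
* (such as accented letters) to their closest ASCII equivalents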
*/
asciiFolding?: boolean;
}
export class Index {
@@ -450,7 +496,18 @@ export class Index {
* The language used for stemming and stop words is selected with `language`;
* phrase queries are not supported when the index is built without positions (see `withPosition`).
*/
static fts(options?: Partial<FtsOptions>) {
// NOTE(review): the single-argument call on the next line appears to be the
// removed (pre-change) side of this diff hunk; the multi-argument call after
// it is its replacement — confirm against the actual file.
return new Index(LanceDbIndex.fts(options?.withPosition));
// Forward each FTS option positionally to `LanceDbIndex.fts`. Optional
// chaining yields `undefined` for every argument when `options` is omitted.
return new Index(
LanceDbIndex.fts(
options?.withPosition,
options?.baseTokenizer,
options?.language,
options?.maxTokenLength,
options?.lowercase,
options?.stem,
options?.removeStopWords,
options?.asciiFolding,
),
);
}
/**