feat(node): support FTS options in nodejs (#1934)

Closes #1790

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2024-12-13 00:19:04 +08:00
committed by GitHub
parent 10f919a0a9
commit c3ebac1a92
3 changed files with 113 additions and 2 deletions

View File

@@ -1058,6 +1058,26 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(results[0].text).toBe(data[0].text);
});
// Verifies that the `lowercase` FTS option controls whether matching is case-sensitive.
test("full text search without lowercase", async () => {
const db = await connect(tmpDir.name);
// Two rows whose text differs only in letter case.
const data = [
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
{ text: "Hello World", vector: [0.4, 0.5, 0.6] },
];
const table = await db.createTable("test", data);
// First index: `lowercase` is left unset; the lowercase query is expected to match both rows.
await table.createIndex("text", {
config: Index.fts({ withPosition: false }),
});
const results = await table.search("hello").toArray();
expect(results.length).toBe(2);
// Second index on the same column with `lowercase: false`; only one row is expected to match.
// NOTE(review): this relies on the second createIndex call replacing the first index — confirm.
await table.createIndex("text", {
config: Index.fts({ withPosition: false, lowercase: false }),
});
const results2 = await table.search("hello").toArray();
expect(results2.length).toBe(1);
});
test("full text search phrase query", async () => {
const db = await connect(tmpDir.name);
const data = [

View File

@@ -349,6 +349,52 @@ export interface FtsOptions {
* which will make the index smaller and faster to build, but will not support phrase queries.
*/
withPosition?: boolean;
/**
* The tokenizer to use when building the index.
* The default is "simple".
*
* The following tokenizers are available:
*
* "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as a delimiter.
*
* "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
*
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
*/
baseTokenizer?: "simple" | "whitespace" | "raw";
/**
* language for stemming and stop words
* this is only used when `stem` or `removeStopWords` is true
*/
language?: string;
/**
* maximum token length
* tokens longer than this length will be ignored
*/
maxTokenLength?: number;
/**
* whether to lowercase tokens
*/
lowercase?: boolean;
/**
* whether to stem tokens
*/
stem?: boolean;
/**
* whether to remove stop words
*/
removeStopWords?: boolean;
/**
* whether to apply ASCII folding, i.e. convert non-ASCII characters (such as accented letters) to their closest ASCII equivalents
*/
asciiFolding?: boolean;
}
export class Index {
@@ -450,7 +496,18 @@ export class Index {
* Tokenization (base tokenizer, language, stemming, stop words, ...) is configurable through the options; phrase search requires an index built with positions (`withPosition`).
*/
static fts(options?: Partial<FtsOptions>) {
return new Index(LanceDbIndex.fts(options?.withPosition));
// Options are forwarded positionally, so the order below must stay in sync with the
// parameter order of the native `fts` factory
// (with_position, base_tokenizer, language, max_token_length, lower_case, stem,
// remove_stop_words, ascii_folding). Options that are not set are passed as `undefined`.
return new Index(
LanceDbIndex.fts(
options?.withPosition,
options?.baseTokenizer,
options?.language,
options?.maxTokenLength,
options?.lowercase,
options?.stem,
options?.removeStopWords,
options?.asciiFolding,
),
);
}
/**

View File

@@ -96,11 +96,45 @@ impl Index {
}
#[napi(factory)]
pub fn fts(with_position: Option<bool>) -> Self {
#[allow(clippy::too_many_arguments)]
pub fn fts(
with_position: Option<bool>,
base_tokenizer: Option<String>,
language: Option<String>,
max_token_length: Option<u32>,
lower_case: Option<bool>,
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
let mut tokenizer_configs = opts.tokenizer_configs.clone();
if let Some(with_position) = with_position {
opts = opts.with_position(with_position);
}
if let Some(base_tokenizer) = base_tokenizer {
tokenizer_configs = tokenizer_configs.base_tokenizer(base_tokenizer);
}
if let Some(language) = language {
tokenizer_configs = tokenizer_configs.language(&language).unwrap();
}
if let Some(max_token_length) = max_token_length {
tokenizer_configs = tokenizer_configs.max_token_length(Some(max_token_length as usize));
}
if let Some(lower_case) = lower_case {
tokenizer_configs = tokenizer_configs.lower_case(lower_case);
}
if let Some(stem) = stem {
tokenizer_configs = tokenizer_configs.stem(stem);
}
if let Some(remove_stop_words) = remove_stop_words {
tokenizer_configs = tokenizer_configs.remove_stop_words(remove_stop_words);
}
if let Some(ascii_folding) = ascii_folding {
tokenizer_configs = tokenizer_configs.ascii_folding(ascii_folding);
}
opts.tokenizer_configs = tokenizer_configs;
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
}