feat(node): support FTS options in nodejs (#1934)
Closes #1790

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
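A minimal sketch of how the new options are meant to be used from the Node.js SDK, before the diffs themselves. The `@lancedb/lancedb` import path, database path, table name, and sample data are assumptions for illustration; `connect`, `createTable`, `createIndex`, `Index.fts`, and `search` are the same entry points the updated tests below exercise.

```ts
import { connect, Index } from "@lancedb/lancedb"; // assumed package entry point

async function main() {
  const db = await connect("/tmp/lancedb-fts-demo"); // hypothetical path
  const table = await db.createTable("docs", [
    { text: "Hello, World!", vector: [0.1, 0.2, 0.3] },
    { text: "hello world", vector: [0.4, 0.5, 0.6] },
  ]);

  // New in this change: tokenizer options on Index.fts().
  await table.createIndex("text", {
    config: Index.fts({
      withPosition: false, // smaller index; phrase queries unsupported
      baseTokenizer: "simple",
      lowercase: true, // "Hello" and "hello" map to the same token
      stem: false,
      removeStopWords: false,
    }),
  });

  const hits = await table.search("hello").toArray();
  console.log(hits.length); // both rows should match when lowercase is enabled
}

main();
```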
@@ -1058,6 +1058,26 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
     expect(results[0].text).toBe(data[0].text);
   });

+  test("full text search without lowercase", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [
+      { text: "hello world", vector: [0.1, 0.2, 0.3] },
+      { text: "Hello World", vector: [0.4, 0.5, 0.6] },
+    ];
+    const table = await db.createTable("test", data);
+    await table.createIndex("text", {
+      config: Index.fts({ withPosition: false }),
+    });
+    const results = await table.search("hello").toArray();
+    expect(results.length).toBe(2);
+
+    await table.createIndex("text", {
+      config: Index.fts({ withPosition: false, lowercase: false }),
+    });
+    const results2 = await table.search("hello").toArray();
+    expect(results2.length).toBe(1);
+  });
+
   test("full text search phrase query", async () => {
     const db = await connect(tmpDir.name);
     const data = [
@@ -349,6 +349,52 @@ export interface FtsOptions {
    * which will make the index smaller and faster to build, but will not support phrase queries.
    */
  withPosition?: boolean;
+
+  /**
+   * The tokenizer to use when building the index.
+   * The default is "simple".
+   *
+   * The following tokenizers are available:
+   *
+   * "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as a delimiter.
+   *
+   * "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
+   *
+   * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
+   */
+  baseTokenizer?: "simple" | "whitespace" | "raw";
+
+  /**
+   * The language to use for stemming and stop-word removal.
+   * Only used when `stem` or `removeStopWords` is true.
+   */
+  language?: string;
+
+  /**
+   * The maximum token length.
+   * Tokens longer than this are ignored.
+   */
+  maxTokenLength?: number;
+
+  /**
+   * Whether to lowercase tokens.
+   */
+  lowercase?: boolean;
+
+  /**
+   * Whether to stem tokens.
+   */
+  stem?: boolean;
+
+  /**
+   * Whether to remove stop words.
+   */
+  removeStopWords?: boolean;
+
+  /**
+   * Whether to fold non-ASCII characters into their closest ASCII equivalents (e.g. "é" -> "e").
+   */
+  asciiFolding?: boolean;
 }

 export class Index {
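To make the `baseTokenizer` choices concrete, here is an illustrative sketch (continuing from the hypothetical `table` in the earlier example, with made-up option values and expected splits inferred from the doc comments above, not executed output). The remaining options act as token filters applied after this split.

```ts
// Input text: "Hello, World!"
//
//   "simple"     -> split on whitespace and punctuation:  ["Hello", "World"]
//   "whitespace" -> split on whitespace only:              ["Hello,", "World!"]
//   "raw"        -> no splitting, whole field as one token: ["Hello, World!"]
//
// lowercase / stem / removeStopWords / asciiFolding then transform or drop
// the tokens produced by the base tokenizer.

await table.createIndex("text", {
  config: Index.fts({ baseTokenizer: "whitespace", maxTokenLength: 40 }),
});
```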
@@ -450,7 +496,18 @@ export class Index {
    * For now, the full text search index only supports English, and doesn't support phrase search.
    */
   static fts(options?: Partial<FtsOptions>) {
-    return new Index(LanceDbIndex.fts(options?.withPosition));
+    return new Index(
+      LanceDbIndex.fts(
+        options?.withPosition,
+        options?.baseTokenizer,
+        options?.language,
+        options?.maxTokenLength,
+        options?.lowercase,
+        options?.stem,
+        options?.removeStopWords,
+        options?.asciiFolding,
+      ),
+    );
   }

   /**
@@ -96,11 +96,45 @@ impl Index {
     }

     #[napi(factory)]
-    pub fn fts(with_position: Option<bool>) -> Self {
+    #[allow(clippy::too_many_arguments)]
+    pub fn fts(
+        with_position: Option<bool>,
+        base_tokenizer: Option<String>,
+        language: Option<String>,
+        max_token_length: Option<u32>,
+        lower_case: Option<bool>,
+        stem: Option<bool>,
+        remove_stop_words: Option<bool>,
+        ascii_folding: Option<bool>,
+    ) -> Self {
         let mut opts = FtsIndexBuilder::default();
+        let mut tokenizer_configs = opts.tokenizer_configs.clone();
         if let Some(with_position) = with_position {
             opts = opts.with_position(with_position);
         }
+        if let Some(base_tokenizer) = base_tokenizer {
+            tokenizer_configs = tokenizer_configs.base_tokenizer(base_tokenizer);
+        }
+        if let Some(language) = language {
+            tokenizer_configs = tokenizer_configs.language(&language).unwrap();
+        }
+        if let Some(max_token_length) = max_token_length {
+            tokenizer_configs = tokenizer_configs.max_token_length(Some(max_token_length as usize));
+        }
+        if let Some(lower_case) = lower_case {
+            tokenizer_configs = tokenizer_configs.lower_case(lower_case);
+        }
+        if let Some(stem) = stem {
+            tokenizer_configs = tokenizer_configs.stem(stem);
+        }
+        if let Some(remove_stop_words) = remove_stop_words {
+            tokenizer_configs = tokenizer_configs.remove_stop_words(remove_stop_words);
+        }
+        if let Some(ascii_folding) = ascii_folding {
+            tokenizer_configs = tokenizer_configs.ascii_folding(ascii_folding);
+        }
+        opts.tokenizer_configs = tokenizer_configs;
+
         Self {
             inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
         }
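Design note on the mapping above: every option crosses the napi boundary as an `Option`, and only values the caller actually supplied are applied to the `FtsIndexBuilder` and its tokenizer configuration, so any field omitted from `FtsOptions` keeps the builder's defaults. On the TypeScript side, `Index.fts` simply forwards `options?.field` positionally for each field, and an omitted field arrives in Rust as `None`.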