feat: support ngram tokenizer (#2507)

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-07-15 16:36:08 +08:00
committed by GitHub
parent 4c999fb651
commit 03b62599d7
9 changed files with 173 additions and 3 deletions

View File

@@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(mustNotResults.length).toBe(1);
});
test("full text search ngram", async () => {
const db = await connect(tmpDir.name);
const data = [
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
];
const table = await db.createTable("test", data);
await table.createIndex("text", {
config: Index.fts({ baseTokenizer: "ngram" }),
});
const results = await table.search("lan").toArray();
expect(results.length).toBe(2);
const resultSet = new Set(results.map((r) => r.text));
expect(resultSet.has("lance database")).toBe(true);
expect(resultSet.has("lance is cool")).toBe(true);
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results2.length).toBe(2);
const resultSet2 = new Set(results2.map((r) => r.text));
expect(resultSet2.has("lance database")).toBe(true);
expect(resultSet2.has("lance is cool")).toBe(true);
// the default min_ngram_length is 3, so "la" should not match
const results3 = await table.search("la").toArray();
expect(results3.length).toBe(0);
// test setting min_ngram_length and prefix_only
await table.createIndex("text", {
config: Index.fts({
baseTokenizer: "ngram",
ngramMinLength: 2,
prefixOnly: true,
}),
replace: true,
});
const results4 = await table.search("lan").toArray();
expect(results4.length).toBe(2);
const resultSet4 = new Set(results4.map((r) => r.text));
expect(resultSet4.has("lance database")).toBe(true);
expect(resultSet4.has("lance is cool")).toBe(true);
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results5.length).toBe(0);
const results6 = await table.search("la").toArray();
expect(results6.length).toBe(2);
const resultSet6 = new Set(results6.map((r) => r.text));
expect(resultSet6.has("lance database")).toBe(true);
expect(resultSet6.has("lance is cool")).toBe(true);
});
test.each([
[0.4, 0.5, 0.599], // number[]
Float32Array.of(0.4, 0.5, 0.599), // Float32Array

View File

@@ -439,7 +439,7 @@ export interface FtsOptions {
*
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
*/
baseTokenizer?: "simple" | "whitespace" | "raw";
baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
/**
* language for stemming and stop words
@@ -472,6 +472,21 @@ export interface FtsOptions {
* whether to remove punctuation
*/
asciiFolding?: boolean;
/**
* ngram min length
*/
ngramMinLength?: number;
/**
* ngram max length
*/
ngramMaxLength?: number;
/**
* whether to only index the prefix of the token for ngram tokenizer
*/
prefixOnly?: boolean;
}
export class Index {
@@ -608,6 +623,9 @@ export class Index {
options?.stem,
options?.removeStopWords,
options?.asciiFolding,
options?.ngramMinLength,
options?.ngramMaxLength,
options?.prefixOnly,
),
);
}

View File

@@ -123,6 +123,9 @@ impl Index {
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
ngram_min_length: Option<u32>,
ngram_max_length: Option<u32>,
prefix_only: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
@@ -149,6 +152,15 @@ impl Index {
if let Some(ascii_folding) = ascii_folding {
opts = opts.ascii_folding(ascii_folding);
}
if let Some(ngram_min_length) = ngram_min_length {
opts = opts.ngram_min_length(ngram_min_length);
}
if let Some(ngram_max_length) = ngram_max_length {
opts = opts.ngram_max_length(ngram_max_length);
}
if let Some(prefix_only) = prefix_only {
opts = opts.ngram_prefix_only(prefix_only);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),