mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-04 10:52:56 +00:00
feat!: migrate FTS from tantivy to lance-index (#1483)
Lance now supports FTS, so add it to the LanceDB Python, TypeScript and Rust SDKs. For Python, we still use the tantivy-based FTS by default because the Lance FTS index currently lacks some of tantivy's features. For Python: - Support creating a Lance-based FTS index - Support specifying columns for full text search (only available for the Lance-based FTS index) For TypeScript: - Change the search method so that it can accept both string and vector - Support full text search For Rust - Support full text search The others: - Update the FTS doc BREAKING CHANGE: - for Python, this renames the attached score column of FTS from "score" to "_score"; this could be a breaking change for users that rely on the scores --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
@@ -785,11 +785,26 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
|
||||
expect(table.search("hello").toArray()).rejects.toThrow(
|
||||
expect(table.search("hello", "vector").toArray()).rejects.toThrow(
|
||||
"No embedding functions are defined in the table",
|
||||
);
|
||||
});
|
||||
|
||||
// Creates an FTS index on "text" and checks that searching "hello" with no
// query type given returns the "hello world" row first.
test("full text search if no embedding function provided", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
||||
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts(),
|
||||
});
|
||||
|
||||
const results = await table.search("hello").toArray();
|
||||
expect(results[0].text).toBe(data[0].text);
|
||||
});
|
||||
|
||||
test.each([
|
||||
[0.4, 0.5, 0.599], // number[]
|
||||
Float32Array.of(0.4, 0.5, 0.599), // Float32Array
|
||||
|
||||
52
nodejs/examples/full_text_search.ts
Normal file
52
nodejs/examples/full_text_search.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
// Copyright 2024 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import * as lancedb from "@lancedb/lancedb";
|
||||
|
||||
// Example: build a table with a text column, index that column for full text
// search, then query it with a plain string.
const db = await lancedb.connect("data/sample-lancedb");
|
||||
|
||||
const words = [
|
||||
"apple",
|
||||
"banana",
|
||||
"cherry",
|
||||
"date",
|
||||
"elderberry",
|
||||
"fig",
|
||||
"grape",
|
||||
];
|
||||
|
||||
// 10,000 rows; each row's `doc` cycles through the words above.
const data = Array.from({ length: 10_000 }, (_, i) => ({
|
||||
vector: Array(1536).fill(i),
|
||||
id: i,
|
||||
item: `item ${i}`,
|
||||
strId: `${i}`,
|
||||
doc: words[i % words.length],
|
||||
}));
|
||||
|
||||
const tbl = await db.createTable("myVectors", data, { mode: "overwrite" });
|
||||
|
||||
// Create a full text search index on the `doc` column.
await tbl.createIndex("doc", {
|
||||
config: lancedb.Index.fts(),
|
||||
});
|
||||
|
||||
// --8<-- [start:full_text_search]
|
||||
let result = await tbl
|
||||
.search("apple")
|
||||
.select(["id", "doc"])
|
||||
.limit(10)
|
||||
.toArray();
|
||||
console.log(result);
|
||||
// --8<-- [end:full_text_search]
|
||||
|
||||
// Signal that the full text search example ran to completion.
console.log("full text search: done");
|
||||
42
nodejs/examples/package-lock.json
generated
42
nodejs/examples/package-lock.json
generated
@@ -10,7 +10,11 @@
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@lancedb/lancedb": "file:../",
|
||||
"@xenova/transformers": "^2.17.2"
|
||||
"@xenova/transformers": "^2.17.2",
|
||||
"tsc": "^2.0.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"typescript": "^5.5.4"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"typescript": "^5.0.0"
|
||||
@@ -18,7 +22,7 @@
|
||||
},
|
||||
"..": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.7.1",
|
||||
"version": "0.8.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -43,26 +47,30 @@
|
||||
"@types/axios": "^0.14.0",
|
||||
"@types/jest": "^29.1.2",
|
||||
"@types/tmp": "^0.2.6",
|
||||
"apache-arrow-old": "npm:apache-arrow@13.0.0",
|
||||
"apache-arrow-13": "npm:apache-arrow@13.0.0",
|
||||
"apache-arrow-14": "npm:apache-arrow@14.0.0",
|
||||
"apache-arrow-15": "npm:apache-arrow@15.0.0",
|
||||
"apache-arrow-16": "npm:apache-arrow@16.0.0",
|
||||
"apache-arrow-17": "npm:apache-arrow@17.0.0",
|
||||
"eslint": "^8.57.0",
|
||||
"jest": "^29.7.0",
|
||||
"shx": "^0.3.4",
|
||||
"tmp": "^0.2.3",
|
||||
"ts-jest": "^29.1.2",
|
||||
"typedoc": "^0.25.7",
|
||||
"typedoc-plugin-markdown": "^3.17.1",
|
||||
"typescript": "^5.3.3",
|
||||
"typedoc": "^0.26.4",
|
||||
"typedoc-plugin-markdown": "^4.2.1",
|
||||
"typescript": "^5.5.4",
|
||||
"typescript-eslint": "^7.1.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@xenova/transformers": "^2.17.2",
|
||||
"@xenova/transformers": ">=2.17 < 3",
|
||||
"openai": "^4.29.2"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"apache-arrow": "^15.0.0"
|
||||
"apache-arrow": ">=13.0.0 <=17.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@huggingface/jinja": {
|
||||
@@ -785,6 +793,15 @@
|
||||
"b4a": "^1.6.4"
|
||||
}
|
||||
},
|
||||
"node_modules/tsc": {
|
||||
"version": "2.0.4",
|
||||
"resolved": "https://registry.npmjs.org/tsc/-/tsc-2.0.4.tgz",
|
||||
"integrity": "sha512-fzoSieZI5KKJVBYGvwbVZs/J5za84f2lSTLPYf6AGiIf43tZ3GNrI1QzTLcjtyDDP4aLxd46RTZq1nQxe7+k5Q==",
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"tsc": "bin/tsc"
|
||||
}
|
||||
},
|
||||
"node_modules/tunnel-agent": {
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
|
||||
@@ -797,10 +814,11 @@
|
||||
}
|
||||
},
|
||||
"node_modules/typescript": {
|
||||
"version": "5.5.2",
|
||||
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz",
|
||||
"integrity": "sha512-NcRtPEOsPFFWjobJEtfihkLCZCXZt/os3zf8nTxjVH3RvTSxjrCamJpbExGvYOF+tFHc3pA65qpdwPbzjohhew==",
|
||||
"peer": true,
|
||||
"version": "5.5.4",
|
||||
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
|
||||
"integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
|
||||
@@ -13,7 +13,16 @@
|
||||
"@lancedb/lancedb": "file:../",
|
||||
"@xenova/transformers": "^2.17.2"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"typescript": "^5.0.0"
|
||||
"devDependencies": {
|
||||
"typescript": "^5.5.4"
|
||||
},
|
||||
"compilerOptions": {
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Node",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"forceConsistentCasingInFileNames": true
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@ const _results2 = await tbl
|
||||
.distanceType("cosine")
|
||||
.limit(10)
|
||||
.toArray();
|
||||
console.log(_results2);
|
||||
// --8<-- [end:search2]
|
||||
|
||||
console.log("search: done");
|
||||
|
||||
@@ -37,6 +37,13 @@ interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
|
||||
export class EmbeddingFunctionRegistry {
|
||||
#functions = new Map<string, EmbeddingFunctionConstructor>();
|
||||
|
||||
/**
|
||||
* Get the number of registered functions
|
||||
* @returns the size of the registry's internal function map
*/
|
||||
length() {
|
||||
return this.#functions.size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Register an embedding function
|
||||
* @param name The name of the function
|
||||
|
||||
@@ -175,6 +175,22 @@ export class Index {
|
||||
static btree() {
|
||||
return new Index(LanceDbIndex.btree());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a full text search index
|
||||
*
|
||||
* A full text search index is an index on a string column, so that you can conduct full
|
||||
* text searches on the column.
|
||||
*
|
||||
* The results of a full text search are ordered by relevance measured by BM25.
|
||||
*
|
||||
* You can combine filters with full text search.
|
||||
*
|
||||
* For now, the full text search index only supports English, and doesn't support phrase search.
|
||||
*
* @returns a new `Index` wrapping the native `LanceDbIndex.fts()` configuration
*/
|
||||
static fts() {
|
||||
return new Index(LanceDbIndex.fts());
|
||||
}
|
||||
}
|
||||
|
||||
export interface IndexOptions {
|
||||
|
||||
@@ -88,6 +88,19 @@ export interface QueryExecutionOptions {
|
||||
maxBatchLength?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options that control the behavior of a full text search
|
||||
*/
|
||||
export interface FullTextSearchOptions {
|
||||
/**
|
||||
* The columns to search
|
||||
*
|
||||
* If not specified, all indexed columns will be searched.
|
||||
* For now, only one column can be searched.
|
||||
*
* A single column name may be given as a plain string instead of an array.
*/
|
||||
columns?: string | string[];
|
||||
}
|
||||
|
||||
/** Common methods supported by all query types */
|
||||
export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
implements AsyncIterable<RecordBatch>
|
||||
@@ -134,6 +147,25 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
return this.where(predicate);
|
||||
}
|
||||
|
||||
/**
 * Add a full text search for `query` to this query.
 *
 * `options.columns` may be a single column name or an array of names; a
 * single name is wrapped in a one-element array. When no columns are given,
 * `null` is passed to the native query.
 *
 * @param query - the text to search for
 * @param options - optional full text search settings
 * @returns this same query object
 */
fullTextSearch(
|
||||
query: string,
|
||||
options?: Partial<FullTextSearchOptions>,
|
||||
): this {
|
||||
let columns = null;
|
||||
if (options) {
|
||||
if (typeof options.columns === "string") {
|
||||
columns = [options.columns];
|
||||
} else if (Array.isArray(options.columns)) {
|
||||
columns = options.columns;
|
||||
}
|
||||
}
|
||||
|
||||
this.doCall((inner: NativeQueryType) =>
|
||||
inner.fullTextSearch(query, columns),
|
||||
);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return only the specified columns.
|
||||
*
|
||||
|
||||
@@ -270,22 +270,23 @@ export abstract class Table {
|
||||
* @returns {Query} A builder that can be used to parameterize the query
|
||||
*/
|
||||
abstract query(): Query;
|
||||
|
||||
/**
|
||||
* Create a search query to find the nearest neighbors
|
||||
* of the given query vector
|
||||
* @param {string} query - the query. This will be converted to a vector using the table's provided embedding function
|
||||
* @note If no embedding functions are defined in the table, this will error when collecting the results.
|
||||
* of the given query
|
||||
* @param {string | IntoVector} query - the query, a vector or string
|
||||
* @param {string} queryType - the type of the query, "vector", "fts", or "auto"
|
||||
* @param {string | string[]} ftsColumns - the columns to search in for full text search
|
||||
* for now, only one column can be searched at a time.
|
||||
*
|
||||
* This is just a convenience method for calling `.query().nearestTo(await myEmbeddingFunction(query))`
|
||||
* when "auto" is used, if the query is a string and an embedding function is defined, it will be treated as a vector query
|
||||
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
|
||||
*/
|
||||
abstract search(query: string): VectorQuery;
|
||||
/**
|
||||
* Create a search query to find the nearest neighbors
|
||||
* of the given query vector
|
||||
* @param {IntoVector} query - the query vector
|
||||
* This is just a convenience method for calling `.query().nearestTo(query)`
|
||||
*/
|
||||
abstract search(query: IntoVector): VectorQuery;
|
||||
abstract search(
|
||||
query: string | IntoVector,
|
||||
queryType?: string,
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query;
|
||||
/**
|
||||
* Search the table with a given query vector.
|
||||
*
|
||||
@@ -581,27 +582,50 @@ export class LocalTable extends Table {
|
||||
query(): Query {
|
||||
return new Query(this.inner);
|
||||
}
|
||||
search(query: string | IntoVector): VectorQuery {
|
||||
if (typeof query !== "string") {
|
||||
return this.vectorSearch(query);
|
||||
} else {
|
||||
const queryPromise = this.getEmbeddingFunctions().then(
|
||||
async (functions) => {
|
||||
// TODO: Support multiple embedding functions
|
||||
const embeddingFunc: EmbeddingFunctionConfig | undefined = functions
|
||||
.values()
|
||||
.next().value;
|
||||
if (!embeddingFunc) {
|
||||
return Promise.reject(
|
||||
new Error("No embedding functions are defined in the table"),
|
||||
);
|
||||
}
|
||||
return await embeddingFunc.function.computeQueryEmbeddings(query);
|
||||
},
|
||||
);
|
||||
|
||||
return this.query().nearestTo(queryPromise);
|
||||
/**
 * Search the table with either a vector or a string query.
 *
 * - Non-string query: returns `this.vectorSearch(query)`; throws if
 *   `queryType` is "fts".
 * - String query with `queryType` "fts": returns a full text search query
 *   over `ftsColumns`.
 * - String query with `queryType` "auto" and an empty embedding registry:
 *   also returns a full text search query.
 * - Otherwise: the string is embedded with the first embedding function
 *   returned by `getEmbeddingFunctions()` and passed to `nearestTo`. If there
 *   is none, the embedding promise rejects with
 *   "No embedding functions are defined in the table".
 *
 * @param query - the query, a vector or a string
 * @param queryType - "auto" by default; "fts" is the only other value this
 *   method checks for
 * @param ftsColumns - column(s) forwarded to the full text search
 * @returns a `VectorQuery` for vector search, or a `Query` for full text search
 */
search(
|
||||
query: string | IntoVector,
|
||||
queryType: string = "auto",
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query {
|
||||
if (typeof query !== "string") {
|
||||
if (queryType === "fts") {
|
||||
throw new Error("Cannot perform full text search on a vector query");
|
||||
}
|
||||
return this.vectorSearch(query);
|
||||
}
|
||||
|
||||
// If the query is a string, we need to determine if it is a vector query or a full text search query
|
||||
if (queryType === "fts") {
|
||||
return this.query().fullTextSearch(query, {
|
||||
columns: ftsColumns,
|
||||
});
|
||||
}
|
||||
|
||||
// The query type is auto or vector
|
||||
// fall back to full text search if no embedding functions are defined and the query is a string
|
||||
// NOTE(review): this tests getRegistry().length(), while the lookup below uses
// this.getEmbeddingFunctions(); the two may disagree — confirm.
if (queryType === "auto" && getRegistry().length() === 0) {
|
||||
return this.query().fullTextSearch(query, {
|
||||
columns: ftsColumns,
|
||||
});
|
||||
}
|
||||
|
||||
const queryPromise = this.getEmbeddingFunctions().then(
|
||||
async (functions) => {
|
||||
// TODO: Support multiple embedding functions
|
||||
const embeddingFunc: EmbeddingFunctionConfig | undefined = functions
|
||||
.values()
|
||||
.next().value;
|
||||
if (!embeddingFunc) {
|
||||
return Promise.reject(
|
||||
new Error("No embedding functions are defined in the table"),
|
||||
);
|
||||
}
|
||||
return await embeddingFunc.function.computeQueryEmbeddings(query);
|
||||
},
|
||||
);
|
||||
|
||||
return this.query().nearestTo(queryPromise);
|
||||
}
|
||||
|
||||
vectorSearch(vector: IntoVector): VectorQuery {
|
||||
|
||||
9
nodejs/package-lock.json
generated
9
nodejs/package-lock.json
generated
@@ -43,7 +43,7 @@
|
||||
"ts-jest": "^29.1.2",
|
||||
"typedoc": "^0.26.4",
|
||||
"typedoc-plugin-markdown": "^4.2.1",
|
||||
"typescript": "^5.3.3",
|
||||
"typescript": "^5.5.4",
|
||||
"typescript-eslint": "^7.1.0"
|
||||
},
|
||||
"engines": {
|
||||
@@ -9292,10 +9292,11 @@
|
||||
}
|
||||
},
|
||||
"node_modules/typescript": {
|
||||
"version": "5.3.3",
|
||||
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.3.3.tgz",
|
||||
"integrity": "sha512-pXWcraxM0uxAS+tN0AG/BF2TyqmHO014Z070UsJ+pFvYuRSq8KH8DmWpnbXe0pEPDHXZV3FcAbJkijJ5oNEnWw==",
|
||||
"version": "5.5.4",
|
||||
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
|
||||
"integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
|
||||
@@ -53,7 +53,7 @@
|
||||
"ts-jest": "^29.1.2",
|
||||
"typedoc": "^0.26.4",
|
||||
"typedoc-plugin-markdown": "^4.2.1",
|
||||
"typescript": "^5.3.3",
|
||||
"typescript": "^5.5.4",
|
||||
"typescript-eslint": "^7.1.0"
|
||||
},
|
||||
"ava": {
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
use lancedb::index::scalar::BTreeIndexBuilder;
|
||||
use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder};
|
||||
use lancedb::index::vector::IvfPqIndexBuilder;
|
||||
use lancedb::index::Index as LanceDbIndex;
|
||||
use napi_derive::napi;
|
||||
@@ -76,4 +76,11 @@ impl Index {
|
||||
inner: Mutex::new(Some(LanceDbIndex::BTree(BTreeIndexBuilder::default()))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a full text search index using the default `FtsIndexBuilder` settings.
#[napi(factory)]
|
||||
pub fn fts() -> Self {
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(FtsIndexBuilder::default()))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use lancedb::index::scalar::FullTextSearchQuery;
|
||||
use lancedb::query::ExecutableQuery;
|
||||
use lancedb::query::Query as LanceDbQuery;
|
||||
use lancedb::query::QueryBase;
|
||||
@@ -42,6 +43,12 @@ impl Query {
|
||||
self.inner = self.inner.clone().only_if(predicate);
|
||||
}
|
||||
|
||||
/// Apply a full text search to this query.
///
/// Builds a `FullTextSearchQuery` from `query`, forwards `columns` to its
/// `columns` setter, and replaces `self.inner` with the updated query.
#[napi]
|
||||
pub fn full_text_search(&mut self, query: String, columns: Option<Vec<String>>) {
|
||||
let query = FullTextSearchQuery::new(query).columns(columns);
|
||||
self.inner = self.inner.clone().full_text_search(query);
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn select(&mut self, columns: Vec<(String, String)>) {
|
||||
self.inner = self.inner.clone().select(Select::dynamic(&columns));
|
||||
@@ -138,6 +145,12 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().only_if(predicate);
|
||||
}
|
||||
|
||||
/// Apply a full text search to this vector query.
///
/// Builds a `FullTextSearchQuery` from `query`, forwards `columns` to its
/// `columns` setter, and replaces `self.inner` with the updated query.
#[napi]
|
||||
pub fn full_text_search(&mut self, query: String, columns: Option<Vec<String>>) {
|
||||
let query = FullTextSearchQuery::new(query).columns(columns);
|
||||
self.inner = self.inner.clone().full_text_search(query);
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn select(&mut self, columns: Vec<(String, String)>) {
|
||||
self.inner = self.inner.clone().select(Select::dynamic(&columns));
|
||||
|
||||
@@ -9,7 +9,8 @@
|
||||
"allowJs": true,
|
||||
"resolveJsonModule": true,
|
||||
"emitDecoratorMetadata": true,
|
||||
"experimentalDecorators": true
|
||||
"experimentalDecorators": true,
|
||||
"moduleResolution": "Node"
|
||||
},
|
||||
"exclude": ["./dist/*"],
|
||||
"typedocOptions": {
|
||||
|
||||
Reference in New Issue
Block a user