feat!: migrate FTS from tantivy to lance-index (#1483)

Lance now supports FTS, so add it into lancedb Python, TypeScript and
Rust SDKs.

For Python, we still use tantivy based FTS by default because the lance
FTS index now misses some features of tantivy.

For Python:
- Support to create lance based FTS index
- Support to specify columns for full text search (only available for
lance based FTS index)

For TypeScript:
- Change the search method so that it can accept both string and vector
- Support full text search

For Rust
- Support full text search

The others:
- Update the FTS doc

BREAKING CHANGE: 
- for Python, this renames the attached score column of FTS from "score"
to "_score", this could be a breaking change for users that rely the
scores

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2024-08-08 15:33:15 +08:00
committed by GitHub
parent 4db554eea5
commit f9d5fa88a1
34 changed files with 713 additions and 145 deletions

View File

@@ -0,0 +1,52 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import * as lancedb from "@lancedb/lancedb";
const db = await lancedb.connect("data/sample-lancedb");
const words = [
"apple",
"banana",
"cherry",
"date",
"elderberry",
"fig",
"grape",
];
const data = Array.from({ length: 10_000 }, (_, i) => ({
vector: Array(1536).fill(i),
id: i,
item: `item ${i}`,
strId: `${i}`,
doc: words[i % words.length],
}));
const tbl = await db.createTable("myVectors", data, { mode: "overwrite" });
await tbl.createIndex("doc", {
config: lancedb.Index.fts(),
});
// --8<-- [start:full_text_search]
let result = await tbl
.search("apple")
.select(["id", "doc"])
.limit(10)
.toArray();
console.log(result);
// --8<-- [end:full_text_search]
console.log("SQL search: done");

View File

@@ -10,7 +10,11 @@
"license": "Apache-2.0",
"dependencies": {
"@lancedb/lancedb": "file:../",
"@xenova/transformers": "^2.17.2"
"@xenova/transformers": "^2.17.2",
"tsc": "^2.0.4"
},
"devDependencies": {
"typescript": "^5.5.4"
},
"peerDependencies": {
"typescript": "^5.0.0"
@@ -18,7 +22,7 @@
},
"..": {
"name": "@lancedb/lancedb",
"version": "0.7.1",
"version": "0.8.0",
"cpu": [
"x64",
"arm64"
@@ -43,26 +47,30 @@
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2",
"@types/tmp": "^0.2.6",
"apache-arrow-old": "npm:apache-arrow@13.0.0",
"apache-arrow-13": "npm:apache-arrow@13.0.0",
"apache-arrow-14": "npm:apache-arrow@14.0.0",
"apache-arrow-15": "npm:apache-arrow@15.0.0",
"apache-arrow-16": "npm:apache-arrow@16.0.0",
"apache-arrow-17": "npm:apache-arrow@17.0.0",
"eslint": "^8.57.0",
"jest": "^29.7.0",
"shx": "^0.3.4",
"tmp": "^0.2.3",
"ts-jest": "^29.1.2",
"typedoc": "^0.25.7",
"typedoc-plugin-markdown": "^3.17.1",
"typescript": "^5.3.3",
"typedoc": "^0.26.4",
"typedoc-plugin-markdown": "^4.2.1",
"typescript": "^5.5.4",
"typescript-eslint": "^7.1.0"
},
"engines": {
"node": ">= 18"
},
"optionalDependencies": {
"@xenova/transformers": "^2.17.2",
"@xenova/transformers": ">=2.17 < 3",
"openai": "^4.29.2"
},
"peerDependencies": {
"apache-arrow": "^15.0.0"
"apache-arrow": ">=13.0.0 <=17.0.0"
}
},
"node_modules/@huggingface/jinja": {
@@ -785,6 +793,15 @@
"b4a": "^1.6.4"
}
},
"node_modules/tsc": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/tsc/-/tsc-2.0.4.tgz",
"integrity": "sha512-fzoSieZI5KKJVBYGvwbVZs/J5za84f2lSTLPYf6AGiIf43tZ3GNrI1QzTLcjtyDDP4aLxd46RTZq1nQxe7+k5Q==",
"license": "MIT",
"bin": {
"tsc": "bin/tsc"
}
},
"node_modules/tunnel-agent": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
@@ -797,10 +814,11 @@
}
},
"node_modules/typescript": {
"version": "5.5.2",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz",
"integrity": "sha512-NcRtPEOsPFFWjobJEtfihkLCZCXZt/os3zf8nTxjVH3RvTSxjrCamJpbExGvYOF+tFHc3pA65qpdwPbzjohhew==",
"peer": true,
"version": "5.5.4",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
"integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
"dev": true,
"license": "Apache-2.0",
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"

View File

@@ -13,7 +13,16 @@
"@lancedb/lancedb": "file:../",
"@xenova/transformers": "^2.17.2"
},
"peerDependencies": {
"typescript": "^5.0.0"
"devDependencies": {
"typescript": "^5.5.4"
},
"compilerOptions": {
"target": "ESNext",
"module": "ESNext",
"moduleResolution": "Node",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true
}
}

View File

@@ -32,6 +32,7 @@ const _results2 = await tbl
.distanceType("cosine")
.limit(10)
.toArray();
console.log(_results2);
// --8<-- [end:search2]
console.log("search: done");