Mirror of https://github.com/lancedb/lancedb.git, synced 2025-12-23 21:39:57 +00:00.
Compare commits: python-v0.… to python-v0.… (31 commits)
Commits in this comparison (31):

- ce24457531
- 087fe6343d
- ab8cbe62dd
- f076bb41f4
- 902fb83d54
- 779118339f
- 03b62599d7
- 4c999fb651
- 6d23d32ab5
- 704cec34e1
- a300a238db
- a41ff1df0a
- 77b005d849
- 167fccc427
- 2bffbcefa5
- 905552f993
- e4898c9313
- cab36d94b2
- b64252d4fd
- 6fc006072c
- d4bb59b542
- 6b2dd6de51
- dbccd9e4f1
- b12ebfed4c
- 1dadb2aefa
- eb9784d7f2
- ba755626cc
- 7760799cb8
- 4beb2d2877
- a00b8595d1
- 9c8314b4fd
```diff
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.20.1-beta.2"
+current_version = "0.21.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
```
Cargo.lock (generated, 412 changed lines): file diff suppressed because it is too large.
Cargo.toml (34 changed lines):
```diff
@@ -21,14 +21,16 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"
 
 [workspace.dependencies]
-lance = { "version" = "=0.30.0", "features" = ["dynamodb"] }
-lance-io = "=0.30.0"
-lance-index = "=0.30.0"
-lance-linalg = "=0.30.0"
-lance-table = "=0.30.0"
-lance-testing = "=0.30.0"
-lance-datafusion = "=0.30.0"
-lance-encoding = "=0.30.0"
+lance = { "version" = "=0.31.2", "features" = [
+    "dynamodb",
+], "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-io = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-index = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-linalg = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-table = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-testing = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-datafusion = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
+lance-encoding = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
 # Note that this one does not include pyarrow
 arrow = { version = "55.1", optional = false }
 arrow-array = "55.1"
@@ -39,20 +41,20 @@ arrow-schema = "55.1"
 arrow-arith = "55.1"
 arrow-cast = "55.1"
 async-trait = "0"
-datafusion = { version = "47.0", default-features = false }
-datafusion-catalog = "47.0"
-datafusion-common = { version = "47.0", default-features = false }
-datafusion-execution = "47.0"
-datafusion-expr = "47.0"
-datafusion-physical-plan = "47.0"
+datafusion = { version = "48.0", default-features = false }
+datafusion-catalog = "48.0"
+datafusion-common = { version = "48.0", default-features = false }
+datafusion-execution = "48.0"
+datafusion-expr = "48.0"
+datafusion-physical-plan = "48.0"
 env_logger = "0.11"
-half = { "version" = "=2.5.0", default-features = false, features = [
+half = { "version" = "2.6.0", default-features = false, features = [
     "num-traits",
 ] }
 futures = "0"
 log = "0.4"
 moka = { version = "0.12", features = ["future"] }
-object_store = "0.11.0"
+object_store = "0.12.0"
 pin-project = "1.0.7"
 snafu = "0.8"
 url = "2"
```
```diff
@@ -47,10 +47,10 @@ def extract_features(line: str) -> list:
     """
     import re
 
-    match = re.search(r'"features"\s*=\s*\[(.*?)\]', line)
+    match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
     if match:
         features_str = match.group(1)
-        return [f.strip('"') for f in features_str.split(",")]
+        return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
     return []
 
 
@@ -63,10 +63,24 @@ def update_cargo_toml(line_updater):
         lines = f.readlines()
 
     new_lines = []
+    lance_line = ""
+    is_parsing_lance_line = False
     for line in lines:
         if line.startswith("lance"):
             # Update the line using the provided function
-            new_lines.append(line_updater(line))
+            if line.strip().endswith("}"):
+                new_lines.append(line_updater(line))
+            else:
+                lance_line = line
+                is_parsing_lance_line = True
+        elif is_parsing_lance_line:
+            lance_line += line
+            if line.strip().endswith("}"):
+                new_lines.append(line_updater(lance_line))
+                lance_line = ""
+                is_parsing_lance_line = False
+            else:
+                print("doesn't end with }:", line)
         else:
             # Keep the line unchanged
             new_lines.append(line)
```
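The script hunk above is motivated by the Cargo.toml change earlier in this comparison: the git-pinned `lance` dependency now spans several lines, so a purely line-oriented updater never sees a complete entry. A minimal, self-contained sketch of the same buffering idea (the sample input and variable names are illustrative, not the repository's actual script):

```python
import re

def extract_features(line: str) -> list:
    # Same regex shape as the updated script: tolerate newlines inside the
    # features array (re.DOTALL) and drop empty fragments left by a
    # trailing comma.
    match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
    if match:
        return [f.strip().strip('"') for f in match.group(1).split(",") if f.strip()]
    return []

sample = '''lance = { "version" = "=0.31.2", "features" = [
    "dynamodb",
], "tag" = "v0.31.2-beta.3" }
'''

# Buffer lines until the entry's closing "}" before handing the whole
# logical TOML entry to the parser, mirroring the is_parsing_lance_line
# flag in the diff above.
entries, entry = [], ""
for line in sample.splitlines(keepends=True):
    entry += line
    if line.strip().endswith("}"):
        entries.append(entry)
        entry = ""

print(extract_features(entries[0]))  # ['dynamodb']
```

Buffering until the closing `}` keeps the regex-based feature extraction working on one logical entry at a time even when the dependency table is formatted across multiple lines.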
```diff
@@ -428,7 +428,7 @@
     "\n",
     "**Why?** \n",
     "Embedding the UFO dataset and ingesting it into LanceDB takes **~2 hours on a T4 GPU**. To save time: \n",
-    "- **Use the pre-prepared table with index created ** (provided below) to proceed directly to step7: search. \n",
+    "- **Use the pre-prepared table with index created** (provided below) to proceed directly to **Step 7**: search. \n",
     "- **Step 5a** contains the full ingestion code for reference (run it only if necessary). \n",
     "- **Step 6** contains the details on creating the index on the multivector column"
    ]
```
```diff
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.20.1-beta.2</version>
+    <version>0.21.1-final.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
```
```diff
@@ -6,7 +6,7 @@
 
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.20.1-beta.2</version>
+  <version>0.21.1-final.0</version>
   <packaging>pom</packaging>
 
   <name>LanceDB Parent</name>
```
node/package-lock.json (generated, 49 changed lines):
```diff
@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.20.1-beta.2",
+      "version": "0.21.1",
       "cpu": [
         "x64",
         "arm64"
@@ -52,11 +52,11 @@
       "uuid": "^9.0.0"
     },
     "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.20.1-beta.2",
-      "@lancedb/vectordb-darwin-x64": "0.20.1-beta.2",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.20.1-beta.2",
-      "@lancedb/vectordb-linux-x64-gnu": "0.20.1-beta.2",
-      "@lancedb/vectordb-win32-x64-msvc": "0.20.1-beta.2"
+      "@lancedb/vectordb-darwin-arm64": "0.21.1",
+      "@lancedb/vectordb-darwin-x64": "0.21.1",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.21.1",
+      "@lancedb/vectordb-linux-x64-gnu": "0.21.1",
+      "@lancedb/vectordb-win32-x64-msvc": "0.21.1"
     },
     "peerDependencies": {
       "@apache-arrow/ts": "^14.0.2",
@@ -327,60 +327,65 @@
       }
     },
     "node_modules/@lancedb/vectordb-darwin-arm64": {
-      "version": "0.20.1-beta.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.20.1-beta.2.tgz",
-      "integrity": "sha512-mqi0yI+ZwBTydaDy1FRHAUZwrWS28u6tbHTe1s4uSrmERbVI6PfmoPR+NZWWAp6ZhlseSdl/+yeI4imk11rQSw==",
+      "version": "0.21.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.1.tgz",
+      "integrity": "sha512-eXeOKgK5s7MSKDzA7Hl4/9E2X8tWWMNV7UJiFdwxrUcop86tM5ePBi8tApRnaQ3wBXrs99XTVBJ7+j+2gzilVA==",
       "cpu": [
         "arm64"
       ],
+      "license": "Apache-2.0",
       "optional": true,
       "os": [
         "darwin"
       ]
     },
     "node_modules/@lancedb/vectordb-darwin-x64": {
-      "version": "0.20.1-beta.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.20.1-beta.2.tgz",
-      "integrity": "sha512-m8EYYA8JZIeNsJqQsBDUMu6r31/u7FzpjonJ4Y+CjapVl6UdvI65KUkeL2dYrFao++RuIoaiqcm3e7gRgFZpXQ==",
+      "version": "0.21.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.1.tgz",
+      "integrity": "sha512-vLoPWfg7OPw5vazLH5/YD/yQkZiTiPniuQgsH+xTodRfLf926lny53G7LQ6nFXNKIzX/jYKtg7AfMU8IcDLSEQ==",
       "cpu": [
         "x64"
       ],
+      "license": "Apache-2.0",
       "optional": true,
       "os": [
         "darwin"
       ]
     },
     "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.20.1-beta.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.20.1-beta.2.tgz",
-      "integrity": "sha512-3Og2+bk4GlWmMO1Yg2HBfeb5zrOMLaIHD7bEqQ4+6yw4IckAaV+ke05H0tyyqmOVrOQ0LpvtXgD7pPztjm9r9A==",
+      "version": "0.21.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.1.tgz",
+      "integrity": "sha512-IMAxtXj5aHCv9peziN77IxQpkYFj83KvI8zQCHzbMMXv7BspkhAd0PaUViqHqtTf2TUHjYQ66a7clZrEn+xQuQ==",
       "cpu": [
         "arm64"
       ],
+      "license": "Apache-2.0",
       "optional": true,
       "os": [
         "linux"
       ]
     },
     "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.20.1-beta.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.20.1-beta.2.tgz",
-      "integrity": "sha512-mwTQyA/FBoU/FkPuvCNBZG3y83gBN+iYoejehBH2HBkLUIcmlsDgSRZ1OQ+f9ijj12EMBCA11tBUPA9zhHzyrw==",
+      "version": "0.21.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.1.tgz",
+      "integrity": "sha512-9oPOxBsYGngIhtC/oC+fQ9V0w9mgFuj2Wyler8f5UYQdiAutsTNyOUA+XjtcROjVZrZ5oUeIrvOQSte9BbpRTg==",
       "cpu": [
         "x64"
       ],
+      "license": "Apache-2.0",
       "optional": true,
       "os": [
         "linux"
       ]
     },
     "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.20.1-beta.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.20.1-beta.2.tgz",
-      "integrity": "sha512-VkjNpqhK3l3uHLLPmox+HrmKPMaZgV+qsGQWx0nfseGnSOEmXAWZWQFe0APVCQ9y0xTypQB0oH7eSOPZv2t4WQ==",
+      "version": "0.21.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.1.tgz",
+      "integrity": "sha512-XqDXFLfdjNpDZ5jaqLerdx+sDU4YLuPK3VF4TowwcOlWDrUtI/L1lAyCaKxcyz1qE3VGuZvhNU89N5ioEICb4Q==",
       "cpu": [
         "x64"
       ],
+      "license": "Apache-2.0",
       "optional": true,
       "os": [
         "win32"
```
```diff
@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "description": " Serverless, low-latency vector database for AI applications",
   "private": false,
   "main": "dist/index.js",
@@ -89,10 +89,10 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.20.1-beta.2",
-    "@lancedb/vectordb-darwin-arm64": "0.20.1-beta.2",
-    "@lancedb/vectordb-linux-x64-gnu": "0.20.1-beta.2",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.20.1-beta.2",
-    "@lancedb/vectordb-win32-x64-msvc": "0.20.1-beta.2"
+    "@lancedb/vectordb-darwin-x64": "0.21.1",
+    "@lancedb/vectordb-darwin-arm64": "0.21.1",
+    "@lancedb/vectordb-linux-x64-gnu": "0.21.1",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.21.1",
+    "@lancedb/vectordb-win32-x64-msvc": "0.21.1"
   }
 }
```
```diff
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.20.1-beta.2"
+version = "0.21.1"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
```
```diff
@@ -368,9 +368,9 @@ describe("merge insert", () => {
       { a: 4, b: "z" },
     ];
 
-    expect(
-      JSON.parse(JSON.stringify((await table.toArrow()).toArray())),
-    ).toEqual(expected);
+    const result = (await table.toArrow()).toArray().sort((a, b) => a.a - b.a);
+
+    expect(result.map((row) => ({ ...row }))).toEqual(expected);
   });
   test("conditional update", async () => {
     const newData = [
@@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
       expect(mustNotResults.length).toBe(1);
     });
 
+    test("full text search ngram", async () => {
+      const db = await connect(tmpDir.name);
+      const data = [
+        { text: "hello world", vector: [0.1, 0.2, 0.3] },
+        { text: "lance database", vector: [0.4, 0.5, 0.6] },
+        { text: "lance is cool", vector: [0.7, 0.8, 0.9] },
+      ];
+      const table = await db.createTable("test", data);
+      await table.createIndex("text", {
+        config: Index.fts({ baseTokenizer: "ngram" }),
+      });
+
+      const results = await table.search("lan").toArray();
+      expect(results.length).toBe(2);
+      const resultSet = new Set(results.map((r) => r.text));
+      expect(resultSet.has("lance database")).toBe(true);
+      expect(resultSet.has("lance is cool")).toBe(true);
+
+      const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
+      expect(results2.length).toBe(2);
+      const resultSet2 = new Set(results2.map((r) => r.text));
+      expect(resultSet2.has("lance database")).toBe(true);
+      expect(resultSet2.has("lance is cool")).toBe(true);
+
+      // the default min_ngram_length is 3, so "la" should not match
+      const results3 = await table.search("la").toArray();
+      expect(results3.length).toBe(0);
+
+      // test setting min_ngram_length and prefix_only
+      await table.createIndex("text", {
+        config: Index.fts({
+          baseTokenizer: "ngram",
+          ngramMinLength: 2,
+          prefixOnly: true,
+        }),
+        replace: true,
+      });
+
+      const results4 = await table.search("lan").toArray();
+      expect(results4.length).toBe(2);
+      const resultSet4 = new Set(results4.map((r) => r.text));
+      expect(resultSet4.has("lance database")).toBe(true);
+      expect(resultSet4.has("lance is cool")).toBe(true);
+
+      const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
+      expect(results5.length).toBe(0);
+
+      const results6 = await table.search("la").toArray();
+      expect(results6.length).toBe(2);
+      const resultSet6 = new Set(results6.map((r) => r.text));
+      expect(resultSet6.has("lance database")).toBe(true);
+      expect(resultSet6.has("lance is cool")).toBe(true);
+    });
+
     test.each([
       [0.4, 0.5, 0.599], // number[]
       Float32Array.of(0.4, 0.5, 0.599), // Float32Array
```
```diff
@@ -439,7 +439,7 @@ export interface FtsOptions {
    *
    * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
    */
-  baseTokenizer?: "simple" | "whitespace" | "raw";
+  baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
 
   /**
    * language for stemming and stop words
@@ -472,6 +472,21 @@ export interface FtsOptions {
    * whether to remove punctuation
    */
   asciiFolding?: boolean;
+
+  /**
+   * ngram min length
+   */
+  ngramMinLength?: number;
+
+  /**
+   * ngram max length
+   */
+  ngramMaxLength?: number;
+
+  /**
+   * whether to only index the prefix of the token for ngram tokenizer
+   */
+  prefixOnly?: boolean;
 }
 
 export class Index {
@@ -608,6 +623,9 @@ export class Index {
         options?.stem,
         options?.removeStopWords,
         options?.asciiFolding,
+        options?.ngramMinLength,
+        options?.ngramMaxLength,
+        options?.prefixOnly,
       ),
     );
   }
```
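For reference, the new `ngram` tokenizer options surfaced here in TypeScript have Python counterparts in the `create_fts_index` changes later in this comparison. A hedged Python sketch of that usage (the database path and sample data are illustrative):

```python
import lancedb

db = lancedb.connect("/tmp/ngram-demo")
table = db.create_table(
    "docs",
    [{"text": "lance database"}, {"text": "lance is cool"}, {"text": "hello world"}],
    mode="overwrite",
)

# Build a native (non-Tantivy) FTS index with the ngram base tokenizer;
# parameter names match the Python diffs below.
table.create_fts_index(
    "text",
    use_tantivy=False,
    base_tokenizer="ngram",
    ngram_min_length=3,  # shortest indexed gram (default 3)
    ngram_max_length=3,  # longest indexed gram (default 3)
    prefix_only=False,   # True would index only grams at token starts
)

# "lan" is a 3-gram of "lance", so both lance rows should match.
hits = table.search("lan", query_type="fts").limit(10).to_list()
```

With `prefix_only=True`, only grams anchored at the start of each token are indexed, so an interior gram like "nce" would stop matching, as the tests in this comparison demonstrate.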
```diff
@@ -75,10 +75,10 @@ export interface OptimizeOptions {
    * // Delete all versions older than 1 day
    * const olderThan = new Date();
    * olderThan.setDate(olderThan.getDate() - 1));
-   * tbl.cleanupOlderVersions(olderThan);
+   * tbl.optimize({cleanupOlderThan: olderThan});
    *
    * // Delete all versions except the current version
-   * tbl.cleanupOlderVersions(new Date());
+   * tbl.optimize({cleanupOlderThan: new Date()});
    */
   cleanupOlderThan: Date;
   deleteUnverified: boolean;
```
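The corrected JSDoc above routes version cleanup through `optimize`. A hedged sketch of the analogous Python call (this assumes the synchronous `Table.optimize(cleanup_older_than=...)` signature; the table name and data are illustrative):

```python
from datetime import timedelta

import lancedb

table = lancedb.connect("/tmp/optimize-demo").create_table(
    "t", [{"id": 1}], mode="overwrite"
)

# Delete all versions older than one day as part of optimize
# (assumed signature: Table.optimize(cleanup_older_than=...)).
table.optimize(cleanup_older_than=timedelta(days=1))

# Delete all versions except the current one.
table.optimize(cleanup_older_than=timedelta(0))
```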
```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",
```

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",
```

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",
```

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",
```

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",
```

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",
```

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": [
     "win32"
   ],
```

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",
```
nodejs/package-lock.json (generated, 4 changed lines):
```diff
@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.20.1-beta.2",
+      "version": "0.21.1",
       "cpu": [
         "x64",
         "arm64"
```
```diff
@@ -11,7 +11,7 @@
     "ann"
   ],
   "private": false,
-  "version": "0.20.1-beta.2",
+  "version": "0.21.1",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
```
```diff
@@ -123,6 +123,9 @@ impl Index {
         stem: Option<bool>,
         remove_stop_words: Option<bool>,
         ascii_folding: Option<bool>,
+        ngram_min_length: Option<u32>,
+        ngram_max_length: Option<u32>,
+        prefix_only: Option<bool>,
     ) -> Self {
         let mut opts = FtsIndexBuilder::default();
         if let Some(with_position) = with_position {
@@ -149,6 +152,15 @@ impl Index {
         if let Some(ascii_folding) = ascii_folding {
             opts = opts.ascii_folding(ascii_folding);
         }
+        if let Some(ngram_min_length) = ngram_min_length {
+            opts = opts.ngram_min_length(ngram_min_length);
+        }
+        if let Some(ngram_max_length) = ngram_max_length {
+            opts = opts.ngram_max_length(ngram_max_length);
+        }
+        if let Some(prefix_only) = prefix_only {
+            opts = opts.ngram_prefix_only(prefix_only);
+        }
 
         Self {
             inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
```
```diff
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.24.0"
+current_version = "0.24.2-beta.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
```
```diff
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.24.0"
+version = "0.24.2-beta.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
```
```diff
@@ -85,7 +85,7 @@ embeddings = [
     "boto3>=1.28.57",
     "awscli>=1.29.57",
     "botocore>=1.31.57",
-    "ollama",
+    "ollama>=0.3.0",
     "ibm-watsonx-ai>=1.1.2",
 ]
 azure = ["adlfs>=2024.2.0"]
```
```diff
@@ -2,14 +2,15 @@
 # SPDX-FileCopyrightText: Copyright The LanceDB Authors
 
 from functools import cached_property
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Sequence, Union
 
+import numpy as np
+
 from ..util import attempt_import_or_raise
 from .base import TextEmbeddingFunction
 from .registry import register
 
 if TYPE_CHECKING:
-    import numpy as np
     import ollama
 
 
@@ -28,23 +29,21 @@ class OllamaEmbeddings(TextEmbeddingFunction):
     keep_alive: Optional[Union[float, str]] = None
     ollama_client_kwargs: Optional[dict] = {}
 
-    def ndims(self):
+    def ndims(self) -> int:
         return len(self.generate_embeddings(["foo"])[0])
 
-    def _compute_embedding(self, text) -> Union["np.array", None]:
-        return (
-            self._ollama_client.embeddings(
-                model=self.name,
-                prompt=text,
-                options=self.options,
-                keep_alive=self.keep_alive,
-            )["embedding"]
-            or None
-        )
+    def _compute_embedding(self, text: Sequence[str]) -> Sequence[Sequence[float]]:
+        response = self._ollama_client.embed(
+            model=self.name,
+            input=text,
+            options=self.options,
+            keep_alive=self.keep_alive,
+        )
+        return response.embeddings
 
     def generate_embeddings(
-        self, texts: Union[List[str], "np.ndarray"]
-    ) -> list[Union["np.array", None]]:
+        self, texts: Union[List[str], np.ndarray]
+    ) -> list[Union[np.array, None]]:
         """
         Get the embeddings for the given texts
 
@@ -54,8 +53,8 @@ class OllamaEmbeddings(TextEmbeddingFunction):
         The texts to embed
         """
         # TODO retry, rate limit, token limit
-        embeddings = [self._compute_embedding(text) for text in texts]
-        return embeddings
+        embeddings = self._compute_embedding(texts)
+        return list(embeddings)
 
     @cached_property
     def _ollama_client(self) -> "ollama.Client":
```
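The hunk above replaces one `client.embeddings(prompt=...)` request per text with a single batched `client.embed(input=[...])` request. A hedged sketch of the two client calls side by side (requires `ollama>=0.3.0`, matching the pyproject bump above; it assumes a local Ollama server, and the model name is illustrative and must already be pulled):

```python
import ollama

client = ollama.Client()

# Old path: one request per text; the response is indexed by "embedding".
single = client.embeddings(model="nomic-embed-text", prompt="hello world")["embedding"]

# New path: one request for the whole batch; the response exposes .embeddings.
batch = client.embed(
    model="nomic-embed-text",
    input=["hello world", "lance database"],
).embeddings
assert len(batch) == 2
```

Batching reduces the ingestion round trips from one per row to one per batch, which matters for bulk embedding workloads.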
```diff
@@ -137,6 +137,9 @@ class FTS:
     stem: bool = True
     remove_stop_words: bool = True
     ascii_folding: bool = True
+    ngram_min_length: int = 3
+    ngram_max_length: int = 3
+    prefix_only: bool = False
 
 
 @dataclass
```
```diff
@@ -1374,6 +1374,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         if query_string is not None and not isinstance(query_string, str):
             raise ValueError("Reranking currently only supports string queries")
         self._str_query = query_string if query_string is not None else self._str_query
+        if reranker.score == "all":
+            self.with_row_id(True)
         return self
 
     def bypass_vector_index(self) -> LanceVectorQueryBuilder:
@@ -1569,6 +1571,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
         The LanceQueryBuilder object.
         """
         self._reranker = reranker
+        if reranker.score == "all":
+            self.with_row_id(True)
         return self
 
 
@@ -1845,6 +1849,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
 
         self._norm = normalize
         self._reranker = reranker
+        if reranker.score == "all":
+            self.with_row_id(True)
 
         return self
 
@@ -3049,8 +3055,14 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
         SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
           KNNVectorDistance: metric=l2
             LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
+        <BLANKLINE>
         FTS Search Plan:
-        LanceScan: uri=..., projection=[vector, text], row_id=false, row_addr=false, ordered=true
+        ProjectionExec: expr=[vector@2 as vector, text@3 as text, _score@1 as _score]
+          Take: columns="_rowid, _score, (vector), (text)"
+            CoalesceBatchesExec: target_batch_size=1024
+              GlobalLimitExec: skip=0, fetch=10
+                MatchQuery: query=hello
+        <BLANKLINE>
 
         Parameters
         ----------
```
```diff
@@ -18,7 +18,7 @@ from lancedb._lancedb import (
     UpdateResult,
 )
 from lancedb.embeddings.base import EmbeddingFunctionConfig
-from lancedb.index import FTS, BTree, Bitmap, HnswPq, HnswSq, IvfFlat, IvfPq, LabelList
+from lancedb.index import FTS, BTree, Bitmap, HnswSq, IvfFlat, IvfPq, LabelList
 from lancedb.remote.db import LOOP
 import pyarrow as pa
 
@@ -89,7 +89,7 @@ class RemoteTable(Table):
 
     def to_pandas(self):
        """to_pandas() is not yet supported on LanceDB cloud."""
-        return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
+        raise NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
 
     def checkout(self, version: Union[int, str]):
         return LOOP.run(self._table.checkout(version))
@@ -158,6 +158,9 @@ class RemoteTable(Table):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
     ):
         config = FTS(
             with_position=with_position,
@@ -168,6 +171,9 @@ class RemoteTable(Table):
             stem=stem,
             remove_stop_words=remove_stop_words,
             ascii_folding=ascii_folding,
+            ngram_min_length=ngram_min_length,
+            ngram_max_length=ngram_max_length,
+            prefix_only=prefix_only,
         )
         LOOP.run(
             self._table.create_index(
@@ -186,6 +192,8 @@ class RemoteTable(Table):
         accelerator: Optional[str] = None,
         index_type="vector",
         wait_timeout: Optional[timedelta] = None,
+        *,
+        num_bits: int = 8,
     ):
         """Create an index on the table.
 
         Currently, the only parameters that matter are
@@ -220,11 +228,6 @@ class RemoteTable(Table):
         >>> table.create_index("l2", "vector") # doctest: +SKIP
         """
 
-        if num_partitions is not None:
-            logging.warning(
-                "num_partitions is not supported on LanceDB cloud."
-                "This parameter will be tuned automatically."
-            )
         if num_sub_vectors is not None:
             logging.warning(
                 "num_sub_vectors is not supported on LanceDB cloud."
@@ -244,13 +247,21 @@ class RemoteTable(Table):
 
         index_type = index_type.upper()
         if index_type == "VECTOR" or index_type == "IVF_PQ":
-            config = IvfPq(distance_type=metric)
+            config = IvfPq(
+                distance_type=metric,
+                num_partitions=num_partitions,
+                num_sub_vectors=num_sub_vectors,
+                num_bits=num_bits,
+            )
         elif index_type == "IVF_HNSW_PQ":
-            config = HnswPq(distance_type=metric)
+            raise ValueError(
+                "IVF_HNSW_PQ is not supported on LanceDB cloud."
+                "Please use IVF_HNSW_SQ instead."
+            )
         elif index_type == "IVF_HNSW_SQ":
-            config = HnswSq(distance_type=metric)
+            config = HnswSq(distance_type=metric, num_partitions=num_partitions)
         elif index_type == "IVF_FLAT":
-            config = IvfFlat(distance_type=metric)
+            config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
         else:
             raise ValueError(
                 f"Unknown vector index type: {index_type}. Valid options are"
```
```diff
@@ -74,9 +74,7 @@ class AnswerdotaiRerankers(Reranker):
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
         elif self.score == "all":
-            raise NotImplementedError(
-                "Answerdotai Reranker does not support score='all' yet"
-            )
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
         combined_results = combined_results.sort_by(
             [("_relevance_score", "descending")]
         )
```
```diff
@@ -232,6 +232,39 @@ class Reranker(ABC):
 
         return deduped_table
 
+    def _merge_and_keep_scores(self, vector_results: pa.Table, fts_results: pa.Table):
+        """
+        Merge the results from the vector and FTS search and keep the scores.
+        This op is slower than just keeping relevance score but can be useful
+        for debugging.
+        """
+        # add nulls to fts results for _distance
+        if "_distance" not in fts_results.column_names:
+            fts_results = fts_results.append_column(
+                "_distance",
+                pa.array([None] * len(fts_results), type=pa.float32()),
+            )
+        # add nulls to vector results for _score
+        if "_score" not in vector_results.column_names:
+            vector_results = vector_results.append_column(
+                "_score",
+                pa.array([None] * len(vector_results), type=pa.float32()),
+            )
+
+        # combine them and fill the scores
+        vector_results_dict = {row["_rowid"]: row for row in vector_results.to_pylist()}
+        fts_results_dict = {row["_rowid"]: row for row in fts_results.to_pylist()}
+
+        # merge them into vector_results
+        for key, value in fts_results_dict.items():
+            if key in vector_results_dict:
+                vector_results_dict[key]["_score"] = value["_score"]
+            else:
+                vector_results_dict[key] = value
+
+        combined = pa.Table.from_pylist(list(vector_results_dict.values()))
+        return combined
+
     def _keep_relevance_score(self, combined_results: pa.Table):
         if self.score == "relevance":
             if "_score" in combined_results.column_names:
```
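With `_merge_and_keep_scores` in place, `return_score="all"` becomes usable on the rerankers below instead of raising `NotImplementedError`. A hedged usage sketch (grounded in the `CrossEncoderReranker` test at the end of this comparison; `table` is assumed to be an open LanceDB table with a vector column plus an FTS index on "text", and sentence-transformers installed):

```python
from lancedb.rerankers import CrossEncoderReranker

# Keep every score column instead of only the reranker's relevance score.
reranker = CrossEncoderReranker(return_score="all")

results = (
    table.search("lance", query_type="hybrid")
    .rerank(reranker)
    .to_arrow()
)
# Merged rows keep the vector `_distance` and FTS `_score` columns
# (null-filled where a row appeared in only one result set), alongside
# the cross-encoder's `_relevance_score`.
```

This is slower than the default `relevance` mode, as the new docstring notes, but it is useful when debugging how each search leg contributed to a hybrid result.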
```diff
@@ -92,14 +92,14 @@ class CohereReranker(Reranker):
         vector_results: pa.Table,
         fts_results: pa.Table,
     ):
-        combined_results = self.merge_results(vector_results, fts_results)
+        if self.score == "all":
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
+        else:
+            combined_results = self.merge_results(vector_results, fts_results)
         combined_results = self._rerank(combined_results, query)
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
-        elif self.score == "all":
-            raise NotImplementedError(
-                "return_score='all' not implemented for cohere reranker"
-            )
         return combined_results
 
     def rerank_vector(self, query: str, vector_results: pa.Table):
```
```diff
@@ -81,15 +81,15 @@ class CrossEncoderReranker(Reranker):
         vector_results: pa.Table,
         fts_results: pa.Table,
     ):
-        combined_results = self.merge_results(vector_results, fts_results)
+        if self.score == "all":
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
+        else:
+            combined_results = self.merge_results(vector_results, fts_results)
         combined_results = self._rerank(combined_results, query)
         # sort the results by _score
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
-        elif self.score == "all":
-            raise NotImplementedError(
-                "return_score='all' not implemented for CrossEncoderReranker"
-            )
         combined_results = combined_results.sort_by(
             [("_relevance_score", "descending")]
         )
```
```diff
@@ -97,14 +97,14 @@ class JinaReranker(Reranker):
         vector_results: pa.Table,
         fts_results: pa.Table,
     ):
-        combined_results = self.merge_results(vector_results, fts_results)
+        if self.score == "all":
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
+        else:
+            combined_results = self.merge_results(vector_results, fts_results)
         combined_results = self._rerank(combined_results, query)
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
-        elif self.score == "all":
-            raise NotImplementedError(
-                "return_score='all' not implemented for JinaReranker"
-            )
         return combined_results
 
     def rerank_vector(self, query: str, vector_results: pa.Table):
```
```diff
@@ -88,14 +88,13 @@ class OpenaiReranker(Reranker):
         vector_results: pa.Table,
         fts_results: pa.Table,
     ):
-        combined_results = self.merge_results(vector_results, fts_results)
+        if self.score == "all":
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
+        else:
+            combined_results = self.merge_results(vector_results, fts_results)
         combined_results = self._rerank(combined_results, query)
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
-        elif self.score == "all":
-            raise NotImplementedError(
-                "OpenAI Reranker does not support score='all' yet"
-            )
 
         combined_results = combined_results.sort_by(
             [("_relevance_score", "descending")]
```
```diff
@@ -94,14 +94,14 @@ class VoyageAIReranker(Reranker):
         vector_results: pa.Table,
         fts_results: pa.Table,
     ):
-        combined_results = self.merge_results(vector_results, fts_results)
+        if self.score == "all":
+            combined_results = self._merge_and_keep_scores(vector_results, fts_results)
+        else:
+            combined_results = self.merge_results(vector_results, fts_results)
         combined_results = self._rerank(combined_results, query)
         if self.score == "relevance":
             combined_results = self._keep_relevance_score(combined_results)
-        elif self.score == "all":
-            raise NotImplementedError(
-                "return_score='all' not implemented for voyageai reranker"
-            )
         return combined_results
 
     def rerank_vector(self, query: str, vector_results: pa.Table):
```
```diff
@@ -838,6 +838,9 @@ class Table(ABC):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
         wait_timeout: Optional[timedelta] = None,
     ):
         """Create a full-text search index on the table.
@@ -877,6 +880,7 @@ class Table(ABC):
             - "simple": Splits text by whitespace and punctuation.
             - "whitespace": Split text by whitespace, but not punctuation.
             - "raw": No tokenization. The entire text is treated as a single token.
+            - "ngram": N-Gram tokenizer.
         language : str, default "English"
             The language to use for tokenization.
         max_token_length : int, default 40
@@ -894,6 +898,12 @@ class Table(ABC):
         ascii_folding : bool, default True
             Whether to fold ASCII characters. This converts accented characters to
             their ASCII equivalent. For example, "café" would be converted to "cafe".
+        ngram_min_length: int, default 3
+            The minimum length of an n-gram.
+        ngram_max_length: int, default 3
+            The maximum length of an n-gram.
+        prefix_only: bool, default False
+            Whether to only index the prefix of the token for ngram tokenizer.
         wait_timeout: timedelta, optional
             The timeout to wait if indexing is asynchronous.
         """
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
         stem: bool = True,
         remove_stop_words: bool = True,
         ascii_folding: bool = True,
+        ngram_min_length: int = 3,
+        ngram_max_length: int = 3,
+        prefix_only: bool = False,
     ):
         if not use_tantivy:
             if not isinstance(field_names, str):
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
                 "stem": stem,
                 "remove_stop_words": remove_stop_words,
                 "ascii_folding": ascii_folding,
+                "ngram_min_length": ngram_min_length,
+                "ngram_max_length": ngram_max_length,
+                "prefix_only": prefix_only,
             }
         else:
             tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
         elif tokenizer_name == "raw":
             return {
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
         elif tokenizer_name == "whitespace":
             return {
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
                 "stem": False,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
 
         # or it's with language stemming with pattern like "en_stem"
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
                 "stem": True,
                 "remove_stop_words": False,
                 "ascii_folding": False,
+                "ngram_min_length": 3,
+                "ngram_max_length": 3,
+                "prefix_only": False,
             }
 
     def add(
```
```diff
@@ -25,4 +25,4 @@ IndexType = Literal[
 ]
 
 # Tokenizer literals
-BaseTokenizerType = Literal["simple", "raw", "whitespace"]
+BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
```
```diff
@@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):
 
     res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
     assert len(res) == 2
+
+
+def test_fts_ngram(mem_db: DBConnection):
+    data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
+    table = mem_db.create_table("test", data=data)
+    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
+
+    results = table.search("lan", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    results = (
+        table.search("nce", query_type="fts").limit(10).to_list()
+    )  # spellchecker:disable-line
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    # the default min_ngram_length is 3, so "la" should not match
+    results = table.search("la", query_type="fts").limit(10).to_list()
+    assert len(results) == 0
+
+    # test setting min_ngram_length and prefix_only
+    table.create_fts_index(
+        "text",
+        use_tantivy=False,
+        base_tokenizer="ngram",
+        replace=True,
+        ngram_min_length=2,
+        prefix_only=True,
+    )
+
+    results = table.search("lan", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+    results = (
+        table.search("nce", query_type="fts").limit(10).to_list()
+    )  # spellchecker:disable-line
+    assert len(results) == 0
+
+    results = table.search("la", query_type="fts").limit(10).to_list()
+    assert len(results) == 2
+    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
```
```diff
@@ -272,7 +272,9 @@ async def test_distance_range_with_new_rows_async():
     # append more rows so that execution plan would be mixed with ANN & Flat KNN
     new_data = pa.table(
         {
-            "vector": pa.FixedShapeTensorArray.from_numpy_ndarray(np.random.rand(4, 2)),
+            "vector": pa.FixedShapeTensorArray.from_numpy_ndarray(
+                np.random.rand(4, 2) + 1
+            ),
         }
     )
     await table.add(new_data)
```
@@ -775,6 +777,82 @@ async def test_explain_plan_async(table_async: AsyncTable):
     assert "KNN" in plan
 
 
+@pytest.mark.asyncio
+async def test_explain_plan_fts(table_async: AsyncTable):
+    """Test explain plan for FTS queries"""
+    # Create FTS index
+    from lancedb.index import FTS
+
+    await table_async.create_index("text", config=FTS())
+
+    # Test pure FTS query
+    query = await table_async.search("dog", query_type="fts", fts_columns="text")
+    plan = await query.explain_plan()
+    # Should show FTS details (issue #2465 is now fixed)
+    assert "MatchQuery: query=dog" in plan
+    assert "GlobalLimitExec" in plan  # Default limit
+
+    # Test FTS query with limit
+    query_with_limit = await table_async.search(
+        "dog", query_type="fts", fts_columns="text"
+    )
+    plan_with_limit = await query_with_limit.limit(1).explain_plan()
+    assert "MatchQuery: query=dog" in plan_with_limit
+    assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
+
+    # Test FTS query with offset and limit
+    query_with_offset = await table_async.search(
+        "dog", query_type="fts", fts_columns="text"
+    )
+    plan_with_offset = await query_with_offset.offset(1).limit(1).explain_plan()
+    assert "MatchQuery: query=dog" in plan_with_offset
+    assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
+
+
+@pytest.mark.asyncio
+async def test_explain_plan_vector_with_limit_offset(table_async: AsyncTable):
+    """Test explain plan for vector queries with limit and offset"""
+    # Test vector query with limit
+    plan_with_limit = await (
+        table_async.query().nearest_to(pa.array([1, 2])).limit(1).explain_plan()
+    )
+    assert "KNN" in plan_with_limit
+    assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
+
+    # Test vector query with offset and limit
+    plan_with_offset = await (
+        table_async.query()
+        .nearest_to(pa.array([1, 2]))
+        .offset(1)
+        .limit(1)
+        .explain_plan()
+    )
+    assert "KNN" in plan_with_offset
+    assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
+
+
+@pytest.mark.asyncio
+async def test_explain_plan_with_filters(table_async: AsyncTable):
+    """Test explain plan for queries with filters"""
+    # Test vector query with filter
+    plan_with_filter = await (
+        table_async.query().nearest_to(pa.array([1, 2])).where("id = 1").explain_plan()
+    )
+    assert "KNN" in plan_with_filter
+    assert "FilterExec" in plan_with_filter
+
+    # Test FTS query with filter
+    from lancedb.index import FTS
+
+    await table_async.create_index("text", config=FTS())
+    query_fts_filter = await table_async.search(
+        "dog", query_type="fts", fts_columns="text"
+    )
+    plan_fts_filter = await query_fts_filter.where("id = 1").explain_plan()
+    assert "MatchQuery: query=dog" in plan_fts_filter
+    assert "FilterExec: id@" in plan_fts_filter  # Should show filter details
+
+
 @pytest.mark.asyncio
 async def test_query_camelcase_async(tmp_path):
     db = await lancedb.connect_async(tmp_path)
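With this fix, explain_plan for FTS queries reports the match and limit operators instead of an opaque scan. A short sketch of inspecting a plan outside the test harness, assuming an async table tbl that already has an FTS index on its text column (both hypothetical):

query = await tbl.search("dog", query_type="fts", fts_columns="text")
plan = await query.offset(1).limit(1).explain_plan()
# The rendered plan should contain fragments like
# "MatchQuery: query=dog" and "GlobalLimitExec: skip=1, fetch=1".
print(plan)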
@@ -210,6 +210,25 @@ async def test_retry_error():
     assert cause.status_code == 429
 
 
+def test_table_unimplemented_functions():
+    def handler(request):
+        if request.path == "/v1/table/test/create/?mode=create":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            request.wfile.write(b"{}")
+        else:
+            request.send_response(404)
+            request.end_headers()
+
+    with mock_lancedb_connection(handler) as db:
+        table = db.create_table("test", [{"id": 1}])
+        with pytest.raises(NotImplementedError):
+            table.to_arrow()
+        with pytest.raises(NotImplementedError):
+            table.to_pandas()
+
+
 def test_table_add_in_threadpool():
     def handler(request):
         if request.path == "/v1/table/test/insert/":
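The new test pins down that remote tables raise NotImplementedError for whole-table exports instead of failing obscurely. A hedged caller-side sketch, assuming table is a remote table handle (hypothetical):

# Remote (LanceDB Cloud) tables do not implement full exports.
try:
    df = table.to_pandas()
except NotImplementedError:
    df = None  # fall back to fetching a bounded subset with a query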
@@ -499,3 +499,19 @@ def test_empty_result_reranker():
         .rerank(reranker)
         .to_arrow()
     )
+
+
+@pytest.mark.parametrize("use_tantivy", [True, False])
+def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
+    pytest.importorskip("sentence_transformers")
+    reranker = CrossEncoderReranker(return_score="all")
+    table, schema = get_test_table(tmp_path, use_tantivy)
+    query = "single player experience"
+    result = (
+        table.search(query, query_type="hybrid", vector_column_name="vector")
+        .rerank(reranker=reranker)
+        .to_arrow()
+    )
+    assert "_relevance_score" in result.column_names
+    assert "_score" in result.column_names
+    assert "_distance" in result.column_names
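With return_score="all", the cross-encoder keeps the original FTS score and vector distance columns next to the fused relevance score. A minimal sketch, assuming a table tbl that has both a vector column and an FTS index (hypothetical):

from lancedb.rerankers import CrossEncoderReranker

reranker = CrossEncoderReranker(return_score="all")
result = (
    tbl.search(
        "single player experience",
        query_type="hybrid",
        vector_column_name="vector",
    )
    .rerank(reranker=reranker)
    .to_arrow()
)
# "_relevance_score" comes from the cross-encoder; "_score" (FTS) and
# "_distance" (vector) are retained because return_score="all".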
@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
             .max_token_length(params.max_token_length)
             .remove_stop_words(params.remove_stop_words)
             .stem(params.stem)
-            .ascii_folding(params.ascii_folding);
+            .ascii_folding(params.ascii_folding)
+            .ngram_min_length(params.ngram_min_length)
+            .ngram_max_length(params.ngram_max_length)
+            .ngram_prefix_only(params.prefix_only);
             Ok(LanceDbIndex::FTS(inner_opts))
         },
         "IvfFlat" => {
@@ -130,6 +133,9 @@ struct FtsParams {
     stem: bool,
     remove_stop_words: bool,
     ascii_folding: bool,
+    ngram_min_length: u32,
+    ngram_max_length: u32,
+    prefix_only: bool,
 }
 
 #[derive(FromPyObject)]
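The binding now reads ngram_min_length, ngram_max_length, and prefix_only off the Python index config, so the async API can express the same tokenizer options as the sync one. A sketch under the assumption that the FTS config class accepts keyword arguments matching the FtsParams fields above (tbl is hypothetical):

from lancedb.index import FTS

config = FTS(
    base_tokenizer="ngram",  # assumed kwarg names, mirroring FtsParams
    ngram_min_length=2,
    ngram_max_length=3,
    prefix_only=True,
)
await tbl.create_index("text", config=config)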
@@ -52,7 +52,7 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
                 let operator = ob.getattr("operator")?.extract::<String>()?;
                 let prefix_length = ob.getattr("prefix_length")?.extract()?;
 
-                Ok(PyLanceDB(
+                Ok(Self(
                     MatchQuery::new(query)
                         .with_column(Some(column))
                         .with_boost(boost)
@@ -70,7 +70,7 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
                 let column = ob.getattr("column")?.extract()?;
                 let slop = ob.getattr("slop")?.extract()?;
 
-                Ok(PyLanceDB(
+                Ok(Self(
                     PhraseQuery::new(query)
                         .with_column(Some(column))
                         .with_slop(slop)
@@ -78,10 +78,10 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
                 ))
             }
             "BoostQuery" => {
-                let positive: PyLanceDB<FtsQuery> = ob.getattr("positive")?.extract()?;
-                let negative: PyLanceDB<FtsQuery> = ob.getattr("negative")?.extract()?;
+                let positive: Self = ob.getattr("positive")?.extract()?;
+                let negative: Self = ob.getattr("negative")?.extract()?;
                 let negative_boost = ob.getattr("negative_boost")?.extract()?;
-                Ok(PyLanceDB(
+                Ok(Self(
                     BoostQuery::new(positive.0, negative.0, negative_boost).into(),
                 ))
             }
@@ -103,18 +103,17 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
                 let op = Operator::try_from(operator.as_str())
                     .map_err(|e| PyValueError::new_err(format!("Invalid operator: {}", e)))?;
 
-                Ok(PyLanceDB(q.with_operator(op).into()))
+                Ok(Self(q.with_operator(op).into()))
             }
             "BooleanQuery" => {
-                let queries: Vec<(String, PyLanceDB<FtsQuery>)> =
-                    ob.getattr("queries")?.extract()?;
+                let queries: Vec<(String, Self)> = ob.getattr("queries")?.extract()?;
                 let mut sub_queries = Vec::with_capacity(queries.len());
                 for (occur, q) in queries {
                     let occur = Occur::try_from(occur.as_str())
                         .map_err(|e| PyValueError::new_err(e.to_string()))?;
                     sub_queries.push((occur, q.0));
                 }
-                Ok(PyLanceDB(BooleanQuery::new(sub_queries).into()))
+                Ok(Self(BooleanQuery::new(sub_queries).into()))
             }
             name => Err(PyValueError::new_err(format!(
                 "Unsupported FTS query type: {}",
@@ -155,8 +154,8 @@ impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
                     .call((query.terms, query.column.unwrap()), Some(&kwargs))
             }
             FtsQuery::Boost(query) => {
-                let positive = PyLanceDB(query.positive.as_ref().clone()).into_pyobject(py)?;
-                let negative = PyLanceDB(query.negative.as_ref().clone()).into_pyobject(py)?;
+                let positive = Self(query.positive.as_ref().clone()).into_pyobject(py)?;
+                let negative = Self(query.negative.as_ref().clone()).into_pyobject(py)?;
                 let kwargs = PyDict::new(py);
                 kwargs.set_item("negative_boost", query.negative_boost)?;
                 namespace
@@ -182,13 +181,13 @@ impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
                     query.should.len() + query.must.len() + query.must_not.len(),
                 );
                 for q in query.should {
-                    queries.push((Occur::Should.into(), PyLanceDB(q).into_pyobject(py)?));
+                    queries.push((Occur::Should.into(), Self(q).into_pyobject(py)?));
                 }
                 for q in query.must {
-                    queries.push((Occur::Must.into(), PyLanceDB(q).into_pyobject(py)?));
+                    queries.push((Occur::Must.into(), Self(q).into_pyobject(py)?));
                 }
                 for q in query.must_not {
-                    queries.push((Occur::MustNot.into(), PyLanceDB(q).into_pyobject(py)?));
+                    queries.push((Occur::MustNot.into(), Self(q).into_pyobject(py)?));
                 }
 
                 namespace
@@ -563,7 +562,10 @@ impl FTSQuery {
     }
 
     pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
-        let inner = self_.inner.clone();
+        let inner = self_
+            .inner
+            .clone()
+            .full_text_search(self_.fts_query.clone());
         future_into_py(self_.py(), async move {
             inner
                 .explain_plan(verbose)
@@ -573,7 +575,10 @@ impl FTSQuery {
     }
 
     pub fn analyze_plan(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
-        let inner = self_.inner.clone();
+        let inner = self_
+            .inner
+            .clone()
+            .full_text_search(self_.fts_query.clone());
         future_into_py(self_.py(), async move {
             inner
                 .analyze_plan()
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.20.1-beta.2"
+version = "0.21.1"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.20.1-beta.2"
+version = "0.21.1"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -105,7 +105,7 @@ impl ListingCatalog {
     }
 
     async fn open_path(path: &str) -> Result<Self> {
-        let (object_store, base_path) = ObjectStore::from_uri(path).await.unwrap();
+        let (object_store, base_path) = ObjectStore::from_uri(path).await?;
         if object_store.is_local() {
             Self::try_create_dir(path).context(CreateDirSnafu { path })?;
         }
@@ -8,7 +8,7 @@ use std::path::Path;
 use std::{collections::HashMap, sync::Arc};
 
 use lance::dataset::{ReadParams, WriteMode};
-use lance::io::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore};
+use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore};
 use lance_datafusion::utils::StreamingWriteSource;
 use lance_encoding::version::LanceFileVersion;
 use lance_table::io::commit::commit_handler_from_url;
@@ -217,6 +217,9 @@ pub struct ListingDatabase {
 
     // Options for tables created by this connection
     new_table_config: NewTableConfig,
+
+    // Session for object stores and caching
+    session: Arc<lance::session::Session>,
 }
 
 impl std::fmt::Display for ListingDatabase {
@@ -313,13 +316,17 @@ impl ListingDatabase {
 
                 let plain_uri = url.to_string();
 
-                let registry = Arc::new(ObjectStoreRegistry::default());
+                let session = Arc::new(lance::session::Session::default());
                 let os_params = ObjectStoreParams {
                     storage_options: Some(options.storage_options.clone()),
                     ..Default::default()
                 };
-                let (object_store, base_path) =
-                    ObjectStore::from_uri_and_params(registry, &plain_uri, &os_params).await?;
+                let (object_store, base_path) = ObjectStore::from_uri_and_params(
+                    session.store_registry(),
+                    &plain_uri,
+                    &os_params,
+                )
+                .await?;
                 if object_store.is_local() {
                     Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
                 }
@@ -342,6 +349,7 @@ impl ListingDatabase {
                     read_consistency_interval: request.read_consistency_interval,
                     storage_options: options.storage_options,
                     new_table_config: options.new_table_config,
+                    session,
                 })
             }
             Err(_) => {
@@ -360,7 +368,13 @@ impl ListingDatabase {
         read_consistency_interval: Option<std::time::Duration>,
         new_table_config: NewTableConfig,
     ) -> Result<Self> {
-        let (object_store, base_path) = ObjectStore::from_uri(path).await?;
+        let session = Arc::new(lance::session::Session::default());
+        let (object_store, base_path) = ObjectStore::from_uri_and_params(
+            session.store_registry(),
+            path,
+            &ObjectStoreParams::default(),
+        )
+        .await?;
         if object_store.is_local() {
             Self::try_create_dir(path).context(CreateDirSnafu { path })?;
         }
@@ -374,6 +388,7 @@ impl ListingDatabase {
             read_consistency_interval,
             storage_options: HashMap::new(),
             new_table_config,
+            session,
         })
     }
 
@@ -441,6 +456,128 @@ impl ListingDatabase {
         }
         Ok(())
     }
+
+    /// Inherit storage options from the connection into the target map
+    fn inherit_storage_options(&self, target: &mut HashMap<String, String>) {
+        for (key, value) in self.storage_options.iter() {
+            if !target.contains_key(key) {
+                target.insert(key.clone(), value.clone());
+            }
+        }
+    }
+
+    /// Extract storage option overrides from the request
+    fn extract_storage_overrides(
+        &self,
+        request: &CreateTableRequest,
+    ) -> Result<(Option<LanceFileVersion>, Option<bool>)> {
+        let storage_options = request
+            .write_options
+            .lance_write_params
+            .as_ref()
+            .and_then(|p| p.store_params.as_ref())
+            .and_then(|sp| sp.storage_options.as_ref());
+
+        let storage_version_override = storage_options
+            .and_then(|opts| opts.get(OPT_NEW_TABLE_STORAGE_VERSION))
+            .map(|s| s.parse::<LanceFileVersion>())
+            .transpose()?;
+
+        let v2_manifest_override = storage_options
+            .and_then(|opts| opts.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS))
+            .map(|s| s.parse::<bool>())
+            .transpose()
+            .map_err(|_| Error::InvalidInput {
+                message: "enable_v2_manifest_paths must be a boolean".to_string(),
+            })?;
+
+        Ok((storage_version_override, v2_manifest_override))
+    }
+
+    /// Prepare write parameters for table creation
+    fn prepare_write_params(
+        &self,
+        request: &CreateTableRequest,
+        storage_version_override: Option<LanceFileVersion>,
+        v2_manifest_override: Option<bool>,
+    ) -> lance::dataset::WriteParams {
+        let mut write_params = request
+            .write_options
+            .lance_write_params
+            .clone()
+            .unwrap_or_default();
+
+        // Only modify the storage options if we actually have something to
+        // inherit. There is a difference between storage_options=None and
+        // storage_options=Some({}). Using storage_options=None will cause the
+        // connection's session store registry to be used. Supplying Some({})
+        // will cause a new connection to be created, and that connection will
+        // be dropped from the cache when python GCs the table object, which
+        // confounds reuse across tables.
+        if !self.storage_options.is_empty() {
+            let storage_options = write_params
+                .store_params
+                .get_or_insert_with(Default::default)
+                .storage_options
+                .get_or_insert_with(Default::default);
+            self.inherit_storage_options(storage_options);
+        }
+
+        write_params.data_storage_version = self
+            .new_table_config
+            .data_storage_version
+            .or(storage_version_override);
+
+        if let Some(enable_v2_manifest_paths) = self
+            .new_table_config
+            .enable_v2_manifest_paths
+            .or(v2_manifest_override)
+        {
+            write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
+        }
+
+        if matches!(&request.mode, CreateTableMode::Overwrite) {
+            write_params.mode = WriteMode::Overwrite;
+        }
+
+        write_params.session = Some(self.session.clone());
+
+        write_params
+    }
+
+    /// Handle the case where table already exists based on the create mode
+    async fn handle_table_exists(
+        &self,
+        table_name: &str,
+        mode: CreateTableMode,
+        data_schema: &arrow_schema::Schema,
+    ) -> Result<Arc<dyn BaseTable>> {
+        match mode {
+            CreateTableMode::Create => Err(Error::TableAlreadyExists {
+                name: table_name.to_string(),
+            }),
+            CreateTableMode::ExistOk(callback) => {
+                let req = OpenTableRequest {
+                    name: table_name.to_string(),
+                    index_cache_size: None,
+                    lance_read_params: None,
+                };
+                let req = (callback)(req);
+                let table = self.open_table(req).await?;
+
+                let table_schema = table.schema().await?;
+
+                if table_schema.as_ref() != data_schema {
+                    return Err(Error::Schema {
+                        message: "Provided schema does not match existing table schema".to_string(),
+                    });
+                }
+
+                Ok(table)
+            }
+            CreateTableMode::Overwrite => unreachable!(),
+        }
+    }
 }
 
 #[async_trait::async_trait]
@@ -475,50 +612,14 @@ impl Database for ListingDatabase {
         Ok(f)
     }
 
-    async fn create_table(&self, mut request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
+    async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
         let table_uri = self.table_uri(&request.name)?;
-        // Inherit storage options from the connection
-        let storage_options = request
-            .write_options
-            .lance_write_params
-            .get_or_insert_with(Default::default)
-            .store_params
-            .get_or_insert_with(Default::default)
-            .storage_options
-            .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
-        }
 
-        let storage_options = storage_options.clone();
+        let (storage_version_override, v2_manifest_override) =
+            self.extract_storage_overrides(&request)?;
 
-        let mut write_params = request.write_options.lance_write_params.unwrap_or_default();
-        if let Some(storage_version) = &self.new_table_config.data_storage_version {
-            write_params.data_storage_version = Some(*storage_version);
-        } else {
-            // Allow the user to override the storage version via storage options (backwards compatibility)
-            if let Some(data_storage_version) = storage_options.get(OPT_NEW_TABLE_STORAGE_VERSION) {
-                write_params.data_storage_version = Some(data_storage_version.parse()?);
-            }
-        }
-        if let Some(enable_v2_manifest_paths) = self.new_table_config.enable_v2_manifest_paths {
-            write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
-        } else {
-            // Allow the user to override the storage version via storage options (backwards compatibility)
-            if let Some(enable_v2_manifest_paths) = storage_options
-                .get(OPT_NEW_TABLE_V2_MANIFEST_PATHS)
-                .map(|s| s.parse::<bool>().unwrap())
-            {
-                write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
-            }
-        }
-
-        if matches!(&request.mode, CreateTableMode::Overwrite) {
-            write_params.mode = WriteMode::Overwrite;
-        }
+        let write_params =
+            self.prepare_write_params(&request, storage_version_override, v2_manifest_override);
 
         let data_schema = request.data.arrow_schema();
 
@@ -533,30 +634,10 @@ impl Database for ListingDatabase {
             .await
         {
             Ok(table) => Ok(Arc::new(table)),
-            Err(Error::TableAlreadyExists { name }) => match request.mode {
-                CreateTableMode::Create => Err(Error::TableAlreadyExists { name }),
-                CreateTableMode::ExistOk(callback) => {
-                    let req = OpenTableRequest {
-                        name: request.name.clone(),
-                        index_cache_size: None,
-                        lance_read_params: None,
-                    };
-                    let req = (callback)(req);
-                    let table = self.open_table(req).await?;
-
-                    let table_schema = table.schema().await?;
-
-                    if table_schema != data_schema {
-                        return Err(Error::Schema {
-                            message: "Provided schema does not match existing table schema"
-                                .to_string(),
-                        });
-                    }
-
-                    Ok(table)
-                }
-                CreateTableMode::Overwrite => unreachable!(),
-            },
+            Err(Error::TableAlreadyExists { .. }) => {
+                self.handle_table_exists(&request.name, request.mode, &data_schema)
+                    .await
+            }
             Err(err) => Err(err),
         }
     }
@@ -564,7 +645,14 @@ impl Database for ListingDatabase {
     async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
         let table_uri = self.table_uri(&request.name)?;
 
-        // Inherit storage options from the connection
+        // Only modify the storage options if we actually have something to
+        // inherit. There is a difference between storage_options=None and
+        // storage_options=Some({}). Using storage_options=None will cause the
+        // connection's session store registry to be used. Supplying Some({})
+        // will cause a new connection to be created, and that connection will
+        // be dropped from the cache when python GCs the table object, which
+        // confounds reuse across tables.
+        if !self.storage_options.is_empty() {
             let storage_options = request
                 .lance_read_params
                 .get_or_insert_with(Default::default)
@@ -572,10 +660,7 @@ impl Database for ListingDatabase {
                 .get_or_insert_with(Default::default)
                 .storage_options
                 .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
+            self.inherit_storage_options(storage_options);
         }
 
         // Some ReadParams are exposed in the OpenTableBuilder, but we also
@@ -584,13 +669,14 @@ impl Database for ListingDatabase {
         // If we have a user provided ReadParams use that
         // If we don't then start with the default ReadParams and customize it with
        // the options from the OpenTableBuilder
-        let read_params = request.lance_read_params.unwrap_or_else(|| {
+        let mut read_params = request.lance_read_params.unwrap_or_else(|| {
             let mut default_params = ReadParams::default();
             if let Some(index_cache_size) = request.index_cache_size {
                 default_params.index_cache_size = index_cache_size as usize;
             }
             default_params
         });
+        read_params.session(self.session.clone());
 
         let native_table = Arc::new(
             NativeTable::open_with_params(
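The comment repeated in create_table and open_table is the crux of this change: a connection now owns a single lance Session, and tables that do not pass their own storage options share its object-store registry and caches instead of building a fresh connection per table. A Python-level sketch of the intended pattern, assuming connect forwards storage_options the same way (bucket and option values are hypothetical):

import lancedb

# Options set once on the connection...
db = lancedb.connect(
    "s3://example-bucket/db",
    storage_options={"aws_server_side_encryption": "aws:kms"},
)

# ...are inherited by tables opened through it; since neither call
# passes storage_options, both tables reuse the connection's session.
events = db.open_table("events")
users = db.open_table("users")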
@@ -107,7 +107,7 @@ impl ObjectStore for MirroringObjectStore {
         self.primary.delete(location).await
     }
 
-    fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result<ObjectMeta>> {
+    fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
         self.primary.list(prefix)
     }
 
@@ -119,7 +119,7 @@ impl ObjectStore for IoTrackingStore {
         let result = self.target.get(location).await;
         if let Ok(result) = &result {
             let num_bytes = result.range.end - result.range.start;
-            self.record_read(num_bytes as u64);
+            self.record_read(num_bytes);
         }
         result
     }
@@ -128,12 +128,12 @@ impl ObjectStore for IoTrackingStore {
         let result = self.target.get_opts(location, options).await;
         if let Ok(result) = &result {
             let num_bytes = result.range.end - result.range.start;
-            self.record_read(num_bytes as u64);
+            self.record_read(num_bytes);
         }
         result
     }
 
-    async fn get_range(&self, location: &Path, range: std::ops::Range<usize>) -> OSResult<Bytes> {
+    async fn get_range(&self, location: &Path, range: std::ops::Range<u64>) -> OSResult<Bytes> {
         let result = self.target.get_range(location, range).await;
         if let Ok(result) = &result {
             self.record_read(result.len() as u64);
@@ -144,7 +144,7 @@ impl ObjectStore for IoTrackingStore {
     async fn get_ranges(
         &self,
         location: &Path,
-        ranges: &[std::ops::Range<usize>],
+        ranges: &[std::ops::Range<u64>],
     ) -> OSResult<Vec<Bytes>> {
         let result = self.target.get_ranges(location, ranges).await;
         if let Ok(result) = &result {
@@ -170,7 +170,7 @@ impl ObjectStore for IoTrackingStore {
         self.target.delete_stream(locations)
     }
 
-    fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, OSResult<ObjectMeta>> {
+    fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, OSResult<ObjectMeta>> {
         self.record_read(0);
         self.target.list(prefix)
     }
@@ -179,7 +179,7 @@ impl ObjectStore for IoTrackingStore {
         &self,
         prefix: Option<&Path>,
         offset: &Path,
-    ) -> BoxStream<'_, OSResult<ObjectMeta>> {
+    ) -> BoxStream<'static, OSResult<ObjectMeta>> {
         self.record_read(0);
         self.target.list_with_offset(prefix, offset)
     }
@@ -57,6 +57,8 @@ use crate::{
 };
 
 const REQUEST_TIMEOUT_HEADER: HeaderName = HeaderName::from_static("x-request-timeout-ms");
+const METRIC_TYPE_KEY: &str = "metric_type";
+const INDEX_TYPE_KEY: &str = "index_type";
 
 pub struct RemoteTags<'a, S: HttpSend = Sender> {
     inner: &'a RemoteTable<S>,
@@ -997,23 +999,53 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
             "column": column
         });
 
-        let (index_type, distance_type) = match index.index {
+        match index.index {
             // TODO: Should we pass the actual index parameters? SaaS does not
             // yet support them.
-            Index::IvfFlat(index) => ("IVF_FLAT", Some(index.distance_type)),
-            Index::IvfPq(index) => ("IVF_PQ", Some(index.distance_type)),
-            Index::IvfHnswSq(index) => ("IVF_HNSW_SQ", Some(index.distance_type)),
-            Index::BTree(_) => ("BTREE", None),
-            Index::Bitmap(_) => ("BITMAP", None),
-            Index::LabelList(_) => ("LABEL_LIST", None),
+            Index::IvfFlat(index) => {
+                body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_FLAT".to_string());
+                body[METRIC_TYPE_KEY] =
+                    serde_json::Value::String(index.distance_type.to_string().to_lowercase());
+                if let Some(num_partitions) = index.num_partitions {
+                    body["num_partitions"] = serde_json::Value::Number(num_partitions.into());
+                }
+            }
+            Index::IvfPq(index) => {
+                body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_PQ".to_string());
+                body[METRIC_TYPE_KEY] =
+                    serde_json::Value::String(index.distance_type.to_string().to_lowercase());
+                if let Some(num_partitions) = index.num_partitions {
+                    body["num_partitions"] = serde_json::Value::Number(num_partitions.into());
+                }
+                if let Some(num_bits) = index.num_bits {
+                    body["num_bits"] = serde_json::Value::Number(num_bits.into());
+                }
+            }
+            Index::IvfHnswSq(index) => {
+                body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_HNSW_SQ".to_string());
+                body[METRIC_TYPE_KEY] =
+                    serde_json::Value::String(index.distance_type.to_string().to_lowercase());
+                if let Some(num_partitions) = index.num_partitions {
+                    body["num_partitions"] = serde_json::Value::Number(num_partitions.into());
+                }
+            }
+            Index::BTree(_) => {
+                body[INDEX_TYPE_KEY] = serde_json::Value::String("BTREE".to_string());
+            }
+            Index::Bitmap(_) => {
+                body[INDEX_TYPE_KEY] = serde_json::Value::String("BITMAP".to_string());
+            }
+            Index::LabelList(_) => {
+                body[INDEX_TYPE_KEY] = serde_json::Value::String("LABEL_LIST".to_string());
+            }
             Index::FTS(fts) => {
+                body[INDEX_TYPE_KEY] = serde_json::Value::String("FTS".to_string());
                 let params = serde_json::to_value(&fts).map_err(|e| Error::InvalidInput {
                     message: format!("failed to serialize FTS index params {:?}", e),
                 })?;
                 for (key, value) in params.as_object().unwrap() {
                     body[key] = value.clone();
                 }
-                ("FTS", None)
             }
             Index::Auto => {
                 let schema = self.schema().await?;
@@ -1023,9 +1055,11 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
                     message: format!("Column {} not found in schema", column),
                 })?;
                 if supported_vector_data_type(field.data_type()) {
-                    ("IVF_PQ", Some(DistanceType::L2))
+                    body[INDEX_TYPE_KEY] = serde_json::Value::String("IVF_PQ".to_string());
+                    body[METRIC_TYPE_KEY] =
+                        serde_json::Value::String(DistanceType::L2.to_string().to_lowercase());
                 } else if supported_btree_data_type(field.data_type()) {
-                    ("BTREE", None)
+                    body[INDEX_TYPE_KEY] = serde_json::Value::String("BTREE".to_string());
                 } else {
                     return Err(Error::NotSupported {
                         message: format!(
@@ -1042,12 +1076,6 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
                 })
             }
         };
-        body["index_type"] = serde_json::Value::String(index_type.into());
-        if let Some(distance_type) = distance_type {
-            // Phalanx expects this to be lowercase right now.
-            body["metric_type"] =
-                serde_json::Value::String(distance_type.to_string().to_lowercase());
-        }
 
         let request = request.json(&body);
 
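The create_index request body now carries per-index tuning parameters, num_partitions for the IVF variants and num_bits for PQ, rather than only index_type and metric_type. A hedged sketch of what a client can now request, assuming the usual create_index keyword arguments on a remote table tbl (hypothetical):

# Serialized into the request body as num_partitions / num_bits,
# alongside index_type="IVF_PQ" and metric_type="cosine".
tbl.create_index(
    metric="cosine",
    vector_column_name="vector",
    num_partitions=128,
    num_bits=4,
)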
@@ -1429,11 +1457,12 @@ mod tests {
     use chrono::{DateTime, Utc};
     use futures::{future::BoxFuture, StreamExt, TryFutureExt};
     use lance_index::scalar::inverted::query::MatchQuery;
-    use lance_index::scalar::FullTextSearchQuery;
+    use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams};
     use reqwest::Body;
     use rstest::rstest;
+    use serde_json::json;
 
-    use crate::index::vector::IvfFlatIndexBuilder;
+    use crate::index::vector::{IvfFlatIndexBuilder, IvfHnswSqIndexBuilder};
     use crate::remote::db::DEFAULT_SERVER_VERSION;
     use crate::remote::JSON_CONTENT_TYPE;
     use crate::{
@@ -2433,29 +2462,79 @@ mod tests {
         let cases = [
             (
                 "IVF_FLAT",
-                Some("hamming"),
+                json!({
+                    "metric_type": "hamming",
+                }),
                 Index::IvfFlat(IvfFlatIndexBuilder::default().distance_type(DistanceType::Hamming)),
             ),
-            ("IVF_PQ", Some("l2"), Index::IvfPq(Default::default())),
+            (
+                "IVF_FLAT",
+                json!({
+                    "metric_type": "hamming",
+                    "num_partitions": 128,
+                }),
+                Index::IvfFlat(
+                    IvfFlatIndexBuilder::default()
+                        .distance_type(DistanceType::Hamming)
+                        .num_partitions(128),
+                ),
+            ),
             (
                 "IVF_PQ",
-                Some("cosine"),
-                Index::IvfPq(IvfPqIndexBuilder::default().distance_type(DistanceType::Cosine)),
+                json!({
+                    "metric_type": "l2",
+                }),
+                Index::IvfPq(Default::default()),
+            ),
+            (
+                "IVF_PQ",
+                json!({
+                    "metric_type": "cosine",
+                    "num_partitions": 128,
+                    "num_bits": 4,
+                }),
+                Index::IvfPq(
+                    IvfPqIndexBuilder::default()
+                        .distance_type(DistanceType::Cosine)
+                        .num_partitions(128)
+                        .num_bits(4),
+                ),
             ),
             (
                 "IVF_HNSW_SQ",
-                Some("l2"),
+                json!({
+                    "metric_type": "l2",
+                }),
                 Index::IvfHnswSq(Default::default()),
             ),
+            (
+                "IVF_HNSW_SQ",
+                json!({
+                    "metric_type": "l2",
+                    "num_partitions": 128,
+                }),
+                Index::IvfHnswSq(
+                    IvfHnswSqIndexBuilder::default()
+                        .distance_type(DistanceType::L2)
+                        .num_partitions(128),
+                ),
+            ),
             // HNSW_PQ isn't yet supported on SaaS
-            ("BTREE", None, Index::BTree(Default::default())),
-            ("BITMAP", None, Index::Bitmap(Default::default())),
-            ("LABEL_LIST", None, Index::LabelList(Default::default())),
-            ("FTS", None, Index::FTS(Default::default())),
+            ("BTREE", json!({}), Index::BTree(Default::default())),
+            ("BITMAP", json!({}), Index::Bitmap(Default::default())),
+            (
+                "LABEL_LIST",
+                json!({}),
+                Index::LabelList(Default::default()),
+            ),
+            (
+                "FTS",
+                serde_json::to_value(InvertedIndexParams::default()).unwrap(),
+                Index::FTS(Default::default()),
+            ),
         ];
 
-        for (index_type, distance_type, index) in cases {
-            let params = index.clone();
+        for (index_type, expected_body, index) in cases {
             let table = Table::new_with_handler("my_table", move |request| {
                 assert_eq!(request.method(), "POST");
                 assert_eq!(request.url().path(), "/v1/table/my_table/create_index/");
@@ -2465,19 +2544,9 @@ mod tests {
                 );
                 let body = request.body().unwrap().as_bytes().unwrap();
                 let body: serde_json::Value = serde_json::from_slice(body).unwrap();
-                let mut expected_body = serde_json::json!({
-                    "column": "a",
-                    "index_type": index_type,
-                });
-                if let Some(distance_type) = distance_type {
-                    expected_body["metric_type"] = distance_type.to_lowercase().into();
-                }
-                if let Index::FTS(fts) = &params {
-                    let params = serde_json::to_value(fts).unwrap();
-                    for (key, value) in params.as_object().unwrap() {
-                        expected_body[key] = value.clone();
-                    }
-                }
+                let mut expected_body = expected_body.clone();
+                expected_body["column"] = "a".into();
+                expected_body[INDEX_TYPE_KEY] = index_type.into();
 
                 assert_eq!(body, expected_body);
 
@@ -392,9 +392,18 @@ pub mod tests {
             } else {
                 expected_line.trim()
             };
-            assert_eq!(&actual_trimmed[..expected_trimmed.len()], expected_trimmed);
+            assert_eq!(
+                &actual_trimmed[..expected_trimmed.len()],
+                expected_trimmed,
+                "\nactual:\n{physical_plan}\nexpected:\n{expected}"
+            );
         }
-        assert_eq!(lines_checked, expected.lines().count());
+        assert_eq!(
+            lines_checked,
+            expected.lines().count(),
+            "\nlines_checked:\n{lines_checked}\nexpected:\n{}",
+            expected.lines().count()
+        );
     }
 }
 
@@ -477,9 +486,9 @@ pub mod tests {
         TestFixture::check_plan(
             plan,
             "MetadataEraserExec
-  RepartitionExec:...
   CoalesceBatchesExec:...
     FilterExec: i@0 >= 5
+      RepartitionExec:...
       ProjectionExec:...
         LanceScan:...",
         )
@@ -129,8 +129,10 @@ impl DatasetRef {
                 dataset: ref mut ds,
                 ..
             } => {
+                if dataset.manifest().version > ds.manifest().version {
                     *ds = dataset;
                 }
+            }
             _ => unreachable!("Dataset should be in latest mode at this point"),
         }
     }
@@ -281,6 +281,46 @@ async fn test_encryption() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn test_table_storage_options_override() -> Result<()> {
+    // Test that table-level storage options override connection-level options
+    let bucket = S3Bucket::new("test-override").await;
+    let key1 = KMSKey::new().await;
+    let key2 = KMSKey::new().await;
+
+    let uri = format!("s3://{}", bucket.0);
+
+    // Create connection with key1 encryption
+    let db = lancedb::connect(&uri)
+        .storage_options(CONFIG.iter().cloned())
+        .storage_option("aws_server_side_encryption", "aws:kms")
+        .storage_option("aws_sse_kms_key_id", &key1.0)
+        .execute()
+        .await?;
+
+    // Create table overriding with key2 encryption
+    let data = test_data();
+    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
+    let _table = db
+        .create_table("test_override", data)
+        .storage_option("aws_sse_kms_key_id", &key2.0)
+        .execute()
+        .await?;
+
+    // Verify objects are encrypted with key2, not key1
+    validate_objects_encrypted(&bucket.0, "test_override", &key2.0).await;
+
+    // Also test that a table created without override uses connection settings
+    let data = test_data();
+    let data = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
+    let _table2 = db.create_table("test_inherit", data).execute().await?;
+
+    // Verify this table uses key1 from connection
+    validate_objects_encrypted(&bucket.0, "test_inherit", &key1.0).await;
+
+    Ok(())
+}
+
 struct DynamoDBCommitTable(String);
 
 impl DynamoDBCommitTable {