Mirror of https://github.com/lancedb/lancedb.git — synced 2025-12-27 07:09:57 +00:00

Compare commits — 18 commits: python-v0.… … python-v0.…
| SHA1 |
|---|
| 1a66df2627 |
| 44670076c1 |
| 92f0b16e46 |
| 1620ba3508 |
| 3ae90dde80 |
| 4f07fea6df |
| 3d7d82cf86 |
| edc4e40a7b |
| ca3806a02f |
| 35cff12e31 |
| c6c20cb2bd |
| 26080ee4c1 |
| ef3a2b5357 |
| c42a201389 |
| 24e42ccd4d |
| 8a50944061 |
| 40e066bc7c |
| b3ad105fa0 |
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.19.0-beta.6"
+current_version = "0.19.0-beta.8"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
.github/workflows/python.yml (vendored) — 4 changes

@@ -136,9 +136,9 @@ jobs:
       - uses: ./.github/workflows/run_tests
         with:
           integration: true
-      - name: Test without pylance
+      - name: Test without pylance or pandas
         run: |
-          pip uninstall -y pylance
+          pip uninstall -y pylance pandas
           pytest -vv python/tests/test_table.py
       # Make sure wheels are not included in the Rust cache
       - name: Delete wheels
Cargo.lock (generated) — 61 changes

@@ -2721,8 +2721,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"

 [[package]]
 name = "fsst"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "rand 0.8.5",
 ]

@@ -3711,8 +3711,8 @@ dependencies = [

 [[package]]
 name = "lance"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow",
  "arrow-arith",

@@ -3738,6 +3738,7 @@ dependencies = [
  "deepsize",
  "futures",
  "half",
+ "humantime",
  "itertools 0.13.0",
  "lance-arrow",
  "lance-core",

@@ -3771,8 +3772,8 @@ dependencies = [

 [[package]]
 name = "lance-arrow"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow-array",
  "arrow-buffer",

@@ -3789,8 +3790,8 @@ dependencies = [

 [[package]]
 name = "lance-core"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow-array",
  "arrow-buffer",

@@ -3826,8 +3827,8 @@ dependencies = [

 [[package]]
 name = "lance-datafusion"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow",
  "arrow-array",

@@ -3854,8 +3855,8 @@ dependencies = [

 [[package]]
 name = "lance-datagen"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow",
  "arrow-array",

@@ -3870,8 +3871,8 @@ dependencies = [

 [[package]]
 name = "lance-encoding"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrayref",
  "arrow",

@@ -3910,8 +3911,8 @@ dependencies = [

 [[package]]
 name = "lance-file"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow-arith",
  "arrow-array",

@@ -3945,8 +3946,8 @@ dependencies = [

 [[package]]
 name = "lance-index"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow",
  "arrow-array",

@@ -3999,8 +4000,8 @@ dependencies = [

 [[package]]
 name = "lance-io"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow",
  "arrow-arith",

@@ -4038,8 +4039,8 @@ dependencies = [

 [[package]]
 name = "lance-linalg"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow-array",
  "arrow-ord",

@@ -4062,8 +4063,8 @@ dependencies = [

 [[package]]
 name = "lance-table"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow",
  "arrow-array",

@@ -4102,8 +4103,8 @@ dependencies = [

 [[package]]
 name = "lance-testing"
-version = "0.25.3"
-source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
+version = "0.26.0"
+source = "git+https://github.com/lancedb/lance?tag=v0.26.0-beta.1#8e46047e2dcb171bec28e28b507a9b7858348773"
 dependencies = [
  "arrow-array",
  "arrow-schema",

@@ -4114,7 +4115,7 @@ dependencies = [

 [[package]]
 name = "lancedb"
-version = "0.19.0-beta.6"
+version = "0.19.0-beta.8"
 dependencies = [
  "arrow",
  "arrow-array",

@@ -4201,7 +4202,7 @@ dependencies = [

 [[package]]
 name = "lancedb-node"
-version = "0.19.0-beta.6"
+version = "0.19.0-beta.8"
 dependencies = [
  "arrow-array",
  "arrow-ipc",

@@ -4226,7 +4227,7 @@ dependencies = [

 [[package]]
 name = "lancedb-nodejs"
-version = "0.19.0-beta.6"
+version = "0.19.0-beta.8"
 dependencies = [
  "arrow-array",
  "arrow-ipc",

@@ -4244,7 +4245,7 @@ dependencies = [

 [[package]]
 name = "lancedb-python"
-version = "0.22.0-beta.6"
+version = "0.22.0-beta.8"
 dependencies = [
  "arrow",
  "env_logger",
Cargo.toml — 18 changes

@@ -21,16 +21,16 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"

 [workspace.dependencies]
-lance = { "version" = "=0.25.3", "features" = [
+lance = { "version" = "=0.26.0", "features" = [
   "dynamodb",
-], tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+], tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
-lance-io = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+lance-io = { version = "=0.26.0", tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
-lance-index = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+lance-index = { version = "=0.26.0", tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
-lance-linalg = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+lance-linalg = { version = "=0.26.0", tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
-lance-table = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+lance-table = { version = "=0.26.0", tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
-lance-testing = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+lance-testing = { version = "=0.26.0", tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
-lance-datafusion = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+lance-datafusion = { version = "=0.26.0", tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
-lance-encoding = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
+lance-encoding = { version = "=0.26.0", tag = "v0.26.0-beta.1", git = "https://github.com/lancedb/lance" }
 # Note that this one does not include pyarrow
 arrow = { version = "54.1", optional = false }
 arrow-array = "54.1"
@@ -2,7 +2,7 @@

 LanceDB docs are deployed to https://lancedb.github.io/lancedb/.

-Docs is built and deployed automatically by [Github Actions](.github/workflows/docs.yml)
+Docs is built and deployed automatically by [Github Actions](../.github/workflows/docs.yml)
 whenever a commit is pushed to the `main` branch. So it is possible for the docs to show
 unreleased features.
@@ -342,7 +342,7 @@ For **read and write access**, LanceDB will need a policy such as:
       "Action": [
         "s3:PutObject",
         "s3:GetObject",
-        "s3:DeleteObject",
+        "s3:DeleteObject"
       ],
       "Resource": "arn:aws:s3:::<bucket>/<prefix>/*"
     },
@@ -374,7 +374,7 @@ For **read-only access**, LanceDB will need a policy such as:
     {
       "Effect": "Allow",
       "Action": [
-        "s3:GetObject",
+        "s3:GetObject"
       ],
       "Resource": "arn:aws:s3:::<bucket>/<prefix>/*"
     },
@@ -765,7 +765,10 @@ This can be used to update zero to all rows depending on how many rows match the
     ];
     const tbl = await db.createTable("my_table", data)

-    await tbl.update({vector: [10, 10]}, { where: "x = 2"})
+    await tbl.update({
+      values: { vector: [10, 10] },
+      where: "x = 2"
+    });
     ```

=== "vectordb (deprecated)"

@@ -784,7 +787,10 @@ This can be used to update zero to all rows depending on how many rows match the
     ];
     const tbl = await db.createTable("my_table", data)

-    await tbl.update({ where: "x = 2", values: {vector: [10, 10]} })
+    await tbl.update({
+      where: "x = 2",
+      values: { vector: [10, 10] }
+    });
     ```

#### Updating using a sql query
@@ -454,6 +454,28 @@ Modeled after ``VACUUM`` in PostgreSQL.

 ***

+### prewarmIndex()
+
+```ts
+abstract prewarmIndex(name): Promise<void>
+```
+
+Prewarm an index in the table.
+
+#### Parameters
+
+* **name**: `string`
+    The name of the index.
+
+This will load the index into memory. This may reduce the cold-start time for
+future queries. If the index does not fit in the cache then this call may be
+wasteful.
+
+#### Returns
+
+`Promise`<`void`>
+
+***
+
 ### query()

 ```ts
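
Taken together with the full-text-search test added later in this change, usage of the new method looks roughly like the following sketch (the database path, table name, and `text_idx` index name are illustrative, not part of this change):

```ts
import { connect } from "@lancedb/lancedb";

async function main() {
  const db = await connect("/tmp/lancedb-demo");
  const table = await db.openTable("my_table");

  // Load the FTS index into memory ahead of the first query to cut cold-start latency.
  await table.prewarmIndex("text_idx");

  // Subsequent searches hit the already-warm index.
  const results = await table.search("lance").toArray();
  console.log(results.length);
}

main().catch(console.error);
```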
@@ -731,3 +753,26 @@ Retrieve the version of the table
 #### Returns

 `Promise`<`number`>
+
+***
+
+### waitForIndex()
+
+```ts
+abstract waitForIndex(indexNames, timeoutSeconds): Promise<void>
+```
+
+Waits for asynchronous indexing to complete on the table.
+
+#### Parameters
+
+* **indexNames**: `string`[]
+    The name of the indices to wait for
+
+* **timeoutSeconds**: `number`
+    The number of seconds to wait before timing out
+
+This will raise an error if the indices are not created and fully indexed within the timeout.
+
+#### Returns
+
+`Promise`<`void`>
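
The two parameters work together as in the Jest test added later in this change; a minimal sketch of the same flow (`table` is an open table handle as in the sketch above, and the five-second timeout mirrors the test):

```ts
// Kick off index creation, then discover the index name and block until the
// index is fully built — this rejects if it is not ready within 5 seconds.
await table.createIndex("vec");
const indices = await table.listIndices();
const idxName = indices[0].name;
await table.waitForIndex([idxName], 5);
```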
@@ -39,3 +39,11 @@ and the same name, then an error will be returned. This is true even if
 that index is out of date.

 The default is true
+
+***
+
+### waitTimeoutSeconds?
+
+```ts
+optional waitTimeoutSeconds: number;
+```
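
The option rides along with the usual `createIndex()` arguments; a sketch based on the IVF_PQ test updated later in this change (again assuming an open `table` handle):

```ts
import { Index } from "@lancedb/lancedb";

// Build an IVF_PQ index and wait up to 30 seconds for indexing to complete
// before createIndex() resolves.
await table.createIndex("vec", {
  config: Index.ivfPq({ numPartitions: 2, numSubVectors: 2 }),
  waitTimeoutSeconds: 30,
});
```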
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.19.0-beta.6</version>
+    <version>0.19.0-beta.8</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
@@ -6,7 +6,7 @@

   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.19.0-beta.6</version>
+  <version>0.19.0-beta.8</version>
   <packaging>pom</packaging>

   <name>LanceDB Parent</name>
node/package-lock.json (generated) — 44 changes

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.19.0-beta.6",
+      "version": "0.19.0-beta.8",
       "cpu": [
         "x64",
         "arm64"

@@ -52,11 +52,11 @@
         "uuid": "^9.0.0"
       },
       "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.19.0-beta.6",
-        "@lancedb/vectordb-darwin-x64": "0.19.0-beta.6",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.6",
-        "@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.6",
-        "@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.6"
+        "@lancedb/vectordb-darwin-arm64": "0.19.0-beta.8",
+        "@lancedb/vectordb-darwin-x64": "0.19.0-beta.8",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.8",
+        "@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.8",
+        "@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.8"
       },
       "peerDependencies": {
         "@apache-arrow/ts": "^14.0.2",

@@ -327,9 +327,9 @@
     }
     },
     "node_modules/@lancedb/vectordb-darwin-arm64": {
-      "version": "0.19.0-beta.6",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.0-beta.6.tgz",
-      "integrity": "sha512-fujUe3Gt1n1vgxXMDaUatZEQICh9VAmj1CJK/gQCMZo9ky/MH1TnxP0nA6hN7fkRvl28C2Ms2adlTdlnTxLSlw==",
+      "version": "0.19.0-beta.8",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.0-beta.8.tgz",
+      "integrity": "sha512-zNKTlHemHUyU3+WtIQ029tZSl5C5hXWvwI073kfKuYOWGSRZeOcrU8WAuS9b17nfFD40X28YUD5qPB10GbMrNQ==",
       "cpu": [
         "arm64"
       ],

@@ -340,9 +340,9 @@
     ]
     },
     "node_modules/@lancedb/vectordb-darwin-x64": {
-      "version": "0.19.0-beta.6",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.0-beta.6.tgz",
-      "integrity": "sha512-ZKUvPwKvnK5WfyCR3Asbm1XXXA5JWYfDVD2ovPU/mv/rqoroYEpxm7TH1OG8AQ8bvBmrCmPc0sPJP5kijd6BFg==",
+      "version": "0.19.0-beta.8",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.0-beta.8.tgz",
+      "integrity": "sha512-OdnduXdX5ZTZd2s+5wW5gssDYQKwEfUKxjOWOjjLS8SQeTlPM6pI0z9QP9K1sipbTYpYoCgokr5+PKKhvMPezw==",
       "cpu": [
         "x64"
       ],

@@ -353,9 +353,9 @@
     ]
     },
     "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.19.0-beta.6",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.0-beta.6.tgz",
-      "integrity": "sha512-m4DuGCEhEAy+EtamSBMF1ujiVkpJD3ybF/Yp1pYYo9FTFThczAeRiyUg7diRZYfahZExKsATj62PqHXNVo8x9A==",
+      "version": "0.19.0-beta.8",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.0-beta.8.tgz",
+      "integrity": "sha512-9Y52zhZYFbgCJA3Vxj8EFnZ8lVuvqAJNapQPo7bH56ZgnEcAnWikk8yWwT63PtI22T6XOcj1hWWYfWKrUXMggg==",
       "cpu": [
         "arm64"
       ],

@@ -366,9 +366,9 @@
     ]
     },
     "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.19.0-beta.6",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.0-beta.6.tgz",
-      "integrity": "sha512-npUR23GZJDVfkPUPtaxLuYUeqyAQ/vcp4R7RjCSdBo+hJNiQAG4TX31YAE8OKnOGskEO7XJ3BgEAxM+upiNmnA==",
+      "version": "0.19.0-beta.8",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.0-beta.8.tgz",
+      "integrity": "sha512-e0H+gSkvMGYx2DPcriXwwkALvZtmbWNtdpMAZceS8qHYv7xMtUPXG86od5vTbhKTrnC2hJLVj5E3JcAs8sJn6w==",
       "cpu": [
         "x64"
       ],

@@ -379,9 +379,9 @@
     ]
     },
     "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.19.0-beta.6",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.0-beta.6.tgz",
-      "integrity": "sha512-Ebas+phT0D7NoB1e3lMZn5h7WVyT5pPIwO1Kk1cZ93V4zaxn2BQRwjLTLxJwR9G+emQoLv659Ze0NtnFuEbXaA==",
+      "version": "0.19.0-beta.8",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.0-beta.8.tgz",
+      "integrity": "sha512-olQKVpoWKJWOuVsFM92hmtHYFpCtITiKhUQ8gZu7ngrgLe7ofAASyqvWp5THV2zSXpwYITqrYjHOrtLy1/I9Jw==",
       "cpu": [
         "x64"
       ],
@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "description": " Serverless, low-latency vector database for AI applications",
   "private": false,
   "main": "dist/index.js",

@@ -89,10 +89,10 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.19.0-beta.6",
-    "@lancedb/vectordb-darwin-arm64": "0.19.0-beta.6",
-    "@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.6",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.6",
-    "@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.6"
+    "@lancedb/vectordb-darwin-x64": "0.19.0-beta.8",
+    "@lancedb/vectordb-darwin-arm64": "0.19.0-beta.8",
+    "@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.8",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.8",
+    "@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.8"
   }
 }
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.19.0-beta.6"
+version = "0.19.0-beta.8"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
@@ -507,6 +507,15 @@ describe("When creating an index", () => {
     expect(indices2.length).toBe(0);
   });

+  it("should wait for index readiness", async () => {
+    // Create an index and then wait for it to be ready
+    await tbl.createIndex("vec");
+    const indices = await tbl.listIndices();
+    expect(indices.length).toBeGreaterThan(0);
+    const idxName = indices[0].name;
+    await expect(tbl.waitForIndex([idxName], 5)).resolves.toBeUndefined();
+  });
+
   it("should search with distance range", async () => {
     await tbl.createIndex("vec");

@@ -824,6 +833,7 @@ describe("When creating an index", () => {
     // Only build index over v1
     await tbl.createIndex("vec", {
       config: Index.ivfPq({ numPartitions: 2, numSubVectors: 2 }),
+      waitTimeoutSeconds: 30,
     });

     const rst = await tbl

@@ -1312,6 +1322,28 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
     expect(results2[0].text).toBe(data[1].text);
   });

+  test("prewarm full text search index", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [
+      { text: ["lance database", "the", "search"], vector: [0.1, 0.2, 0.3] },
+      { text: ["lance database"], vector: [0.4, 0.5, 0.6] },
+      { text: ["lance", "search"], vector: [0.7, 0.8, 0.9] },
+      { text: ["database", "search"], vector: [1.0, 1.1, 1.2] },
+      { text: ["unrelated", "doc"], vector: [1.3, 1.4, 1.5] },
+    ];
+    const table = await db.createTable("test", data);
+    await table.createIndex("text", {
+      config: Index.fts(),
+    });
+
+    // For the moment, we just confirm we can call prewarmIndex without error
+    // and still search it afterwards
+    await table.prewarmIndex("text_idx");
+
+    const results = await table.search("lance").toArray();
+    expect(results.length).toBe(3);
+  });
+
   test("full text index on list", async () => {
     const db = await connect(tmpDir.name);
     const data = [
@@ -681,4 +681,6 @@ export interface IndexOptions {
    * The default is true
    */
   replace?: boolean;
+
+  waitTimeoutSeconds?: number;
 }
@@ -235,6 +235,30 @@ export abstract class Table {
    */
   abstract dropIndex(name: string): Promise<void>;

+  /**
+   * Prewarm an index in the table.
+   *
+   * @param name The name of the index.
+   *
+   * This will load the index into memory. This may reduce the cold-start time for
+   * future queries. If the index does not fit in the cache then this call may be
+   * wasteful.
+   */
+  abstract prewarmIndex(name: string): Promise<void>;
+
+  /**
+   * Waits for asynchronous indexing to complete on the table.
+   *
+   * @param indexNames The name of the indices to wait for
+   * @param timeoutSeconds The number of seconds to wait before timing out
+   *
+   * This will raise an error if the indices are not created and fully indexed within the timeout.
+   */
+  abstract waitForIndex(
+    indexNames: string[],
+    timeoutSeconds: number,
+  ): Promise<void>;
+
   /**
    * Create a {@link Query} Builder.
    *

@@ -558,13 +582,29 @@ export class LocalTable extends Table {
     // Bit of a hack to get around the fact that TS has no package-scope.
     // biome-ignore lint/suspicious/noExplicitAny: skip
     const nativeIndex = (options?.config as any)?.inner;
-    await this.inner.createIndex(nativeIndex, column, options?.replace);
+    await this.inner.createIndex(
+      nativeIndex,
+      column,
+      options?.replace,
+      options?.waitTimeoutSeconds,
+    );
   }

   async dropIndex(name: string): Promise<void> {
     await this.inner.dropIndex(name);
   }

+  async prewarmIndex(name: string): Promise<void> {
+    await this.inner.prewarmIndex(name);
+  }
+
+  async waitForIndex(
+    indexNames: string[],
+    timeoutSeconds: number,
+  ): Promise<void> {
+    await this.inner.waitForIndex(indexNames, timeoutSeconds);
+  }
+
   query(): Query {
     return new Query(this.inner);
   }
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": [
     "win32"
   ],

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",
nodejs/package-lock.json (generated) — 4 changes

@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.19.0-beta.6",
+      "version": "0.19.0-beta.8",
       "cpu": [
         "x64",
         "arm64"

@@ -11,7 +11,7 @@
     "ann"
   ],
   "private": false,
-  "version": "0.19.0-beta.6",
+  "version": "0.19.0-beta.8",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
@@ -327,6 +327,7 @@ impl JsFullTextQuery {
     }

     #[napi(factory)]
+    #[allow(clippy::use_self)] // NAPI doesn't allow Self here but clippy reports it
     pub fn boost_query(
         positive: &JsFullTextQuery,
         negative: &JsFullTextQuery,

@@ -349,11 +350,8 @@ impl JsFullTextQuery {
         boosts: Option<Vec<f64>>,
     ) -> napi::Result<Self> {
         let q = match boosts {
-            Some(boosts) => MultiMatchQuery::try_new_with_boosts(
-                query,
-                columns,
-                boosts.into_iter().map(|v| v as f32).collect(),
-            ),
+            Some(boosts) => MultiMatchQuery::try_new(query, columns)
+                .and_then(|q| q.try_with_boosts(boosts.into_iter().map(|v| v as f32).collect())),
             None => MultiMatchQuery::try_new(query, columns),
         }
         .map_err(|e| {
@@ -111,6 +111,7 @@ impl Table {
         index: Option<&Index>,
         column: String,
         replace: Option<bool>,
+        wait_timeout_s: Option<i64>,
     ) -> napi::Result<()> {
         let lancedb_index = if let Some(index) = index {
             index.consume()?

@@ -121,6 +122,10 @@ impl Table {
         if let Some(replace) = replace {
             builder = builder.replace(replace);
         }
+        if let Some(timeout) = wait_timeout_s {
+            builder =
+                builder.wait_timeout(std::time::Duration::from_secs(timeout.try_into().unwrap()));
+        }
         builder.execute().await.default_error()
     }

@@ -132,6 +137,26 @@ impl Table {
             .default_error()
     }

+    #[napi(catch_unwind)]
+    pub async fn prewarm_index(&self, index_name: String) -> napi::Result<()> {
+        self.inner_ref()?
+            .prewarm_index(&index_name)
+            .await
+            .default_error()
+    }
+
+    #[napi(catch_unwind)]
+    pub async fn wait_for_index(&self, index_names: Vec<String>, timeout_s: i64) -> Result<()> {
+        let timeout = std::time::Duration::from_secs(timeout_s.try_into().unwrap());
+        let index_names: Vec<&str> = index_names.iter().map(|s| s.as_str()).collect();
+        let slice: &[&str] = &index_names;
+
+        self.inner_ref()?
+            .wait_for_index(slice, timeout)
+            .await
+            .default_error()
+    }
+
     #[napi(catch_unwind)]
     pub async fn update(
         &self,
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.22.0-beta.7"
+current_version = "0.22.0-beta.9"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.22.0-beta.7"
+version = "0.22.0-beta.9"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -44,7 +44,7 @@ repository = "https://github.com/lancedb/lancedb"

 [project.optional-dependencies]
 pylance = [
-    "pylance>=0.23.2",
+    "pylance>=0.25",
 ]
 tests = [
     "aiohttp",

@@ -58,7 +58,7 @@ tests = [
     "polars>=0.19, <=1.3.0",
     "tantivy",
     "pyarrow-stubs",
-    "pylance>=0.23.2",
+    "pylance>=0.25",
     "requests",
 ]
 dev = [

@@ -77,6 +77,7 @@ embeddings = [
     "pillow",
     "open-clip-torch",
     "cohere",
+    "colpali-engine>=0.3.10",
     "huggingface_hub",
     "InstructorEmbedding",
     "google.generativeai",
@@ -9,7 +9,7 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.dataset

-from .dependencies import pandas as pd
+from .dependencies import _check_for_pandas, pandas as pd

 DATA = Union[List[dict], "pd.DataFrame", pa.Table, Iterable[pa.RecordBatch]]
 VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray]

@@ -63,7 +63,7 @@ def data_to_reader(
     data: DATA, schema: Optional[pa.Schema] = None
 ) -> pa.RecordBatchReader:
     """Convert various types of input into a RecordBatchReader"""
-    if pd is not None and isinstance(data, pd.DataFrame):
+    if _check_for_pandas(data) and isinstance(data, pd.DataFrame):
         return pa.Table.from_pandas(data, schema=schema).to_reader()
     elif isinstance(data, pa.Table):
         return data.to_reader()
@@ -19,3 +19,4 @@ from .imagebind import ImageBindEmbeddings
 from .jinaai import JinaEmbeddings
 from .watsonx import WatsonxEmbeddings
 from .voyageai import VoyageAIEmbeddingFunction
+from .colpali import ColPaliEmbeddings
python/python/lancedb/embeddings/colpali.py (new file) — 255 lines

@@ -0,0 +1,255 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+
+from functools import lru_cache
+from typing import List, Union, Optional, Any
+import numpy as np
+import io
+
+from ..util import attempt_import_or_raise
+from .base import EmbeddingFunction
+from .registry import register
+from .utils import TEXT, IMAGES, is_flash_attn_2_available
+
+
+@register("colpali")
+class ColPaliEmbeddings(EmbeddingFunction):
+    """
+    An embedding function that uses the ColPali engine for
+    multimodal multi-vector embeddings.
+
+    This embedding function supports ColQwen2.5 models, producing multivector outputs
+    for both text and image inputs. The output embeddings are lists of vectors, each
+    vector being 128-dimensional by default, represented as List[List[float]].
+
+    Parameters
+    ----------
+    model_name : str
+        The name of the model to use (e.g., "Metric-AI/ColQwen2.5-3b-multilingual-v1.0")
+    device : str
+        The device for inference (default "cuda:0").
+    dtype : str
+        Data type for model weights (default "bfloat16").
+    use_token_pooling : bool
+        Whether to use token pooling to reduce embedding size (default True).
+    pool_factor : int
+        Factor to reduce sequence length if token pooling is enabled (default 2).
+    quantization_config : Optional[BitsAndBytesConfig]
+        Quantization configuration for the model. (default None, bitsandbytes needed)
+    batch_size : int
+        Batch size for processing inputs (default 2).
+    """
+
+    model_name: str = "Metric-AI/ColQwen2.5-3b-multilingual-v1.0"
+    device: str = "auto"
+    dtype: str = "bfloat16"
+    use_token_pooling: bool = True
+    pool_factor: int = 2
+    quantization_config: Optional[Any] = None
+    batch_size: int = 2
+
+    _model = None
+    _processor = None
+    _token_pooler = None
+    _vector_dim = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        (
+            self._model,
+            self._processor,
+            self._token_pooler,
+        ) = self._load_model(
+            self.model_name,
+            self.dtype,
+            self.device,
+            self.use_token_pooling,
+            self.quantization_config,
+        )
+
+    @staticmethod
+    @lru_cache(maxsize=1)
+    def _load_model(
+        model_name: str,
+        dtype: str,
+        device: str,
+        use_token_pooling: bool,
+        quantization_config: Optional[Any],
+    ):
+        """
+        Initialize and cache the ColPali model, processor, and token pooler.
+        """
+        torch = attempt_import_or_raise("torch", "torch")
+        transformers = attempt_import_or_raise("transformers", "transformers")
+        colpali_engine = attempt_import_or_raise("colpali_engine", "colpali_engine")
+        from colpali_engine.compression.token_pooling import HierarchicalTokenPooler
+
+        if quantization_config is not None:
+            if not isinstance(quantization_config, transformers.BitsAndBytesConfig):
+                raise ValueError("quantization_config must be a BitsAndBytesConfig")
+
+        if dtype == "bfloat16":
+            torch_dtype = torch.bfloat16
+        elif dtype == "float16":
+            torch_dtype = torch.float16
+        elif dtype == "float64":
+            torch_dtype = torch.float64
+        else:
+            torch_dtype = torch.float32
+
+        model = colpali_engine.models.ColQwen2_5.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            device_map=device,
+            quantization_config=quantization_config
+            if quantization_config is not None
+            else None,
+            attn_implementation="flash_attention_2"
+            if is_flash_attn_2_available()
+            else None,
+        ).eval()
+        processor = colpali_engine.models.ColQwen2_5_Processor.from_pretrained(
+            model_name
+        )
+        token_pooler = HierarchicalTokenPooler() if use_token_pooling else None
+        return model, processor, token_pooler
+
+    def ndims(self):
+        """
+        Return the dimension of a vector in the multivector output (e.g., 128).
+        """
+        torch = attempt_import_or_raise("torch", "torch")
+        if self._vector_dim is None:
+            dummy_query = "test"
+            batch_queries = self._processor.process_queries([dummy_query]).to(
+                self._model.device
+            )
+            with torch.no_grad():
+                query_embeddings = self._model(**batch_queries)
+
+            if self.use_token_pooling and self._token_pooler is not None:
+                query_embeddings = self._token_pooler.pool_embeddings(
+                    query_embeddings,
+                    pool_factor=self.pool_factor,
+                    padding=True,
+                    padding_side=self._processor.tokenizer.padding_side,
+                )
+
+            self._vector_dim = query_embeddings[0].shape[-1]
+        return self._vector_dim
+
+    def _process_embeddings(self, embeddings):
+        """
+        Format model embeddings into List[List[float]].
+        Use token pooling if enabled.
+        """
+        torch = attempt_import_or_raise("torch", "torch")
+        if self.use_token_pooling and self._token_pooler is not None:
+            embeddings = self._token_pooler.pool_embeddings(
+                embeddings,
+                pool_factor=self.pool_factor,
+                padding=True,
+                padding_side=self._processor.tokenizer.padding_side,
+            )
+
+        if isinstance(embeddings, torch.Tensor):
+            tensors = embeddings.detach().cpu()
+            if tensors.dtype == torch.bfloat16:
+                tensors = tensors.to(torch.float32)
+            return (
+                tensors.numpy()
+                .astype(np.float64 if self.dtype == "float64" else np.float32)
+                .tolist()
+            )
+        return []
+
+    def generate_text_embeddings(self, text: TEXT) -> List[List[List[float]]]:
+        """
+        Generate embeddings for text input.
+        """
+        torch = attempt_import_or_raise("torch", "torch")
+        text = self.sanitize_input(text)
+        all_embeddings = []
+
+        for i in range(0, len(text), self.batch_size):
+            batch_text = text[i : i + self.batch_size]
+            batch_queries = self._processor.process_queries(batch_text).to(
+                self._model.device
+            )
+            with torch.no_grad():
+                query_embeddings = self._model(**batch_queries)
+            all_embeddings.extend(self._process_embeddings(query_embeddings))
+        return all_embeddings
+
+    def _prepare_images(self, images: IMAGES) -> List:
+        """
+        Convert image inputs to PIL Images.
+        """
+        PIL = attempt_import_or_raise("PIL", "pillow")
+        requests = attempt_import_or_raise("requests", "requests")
+        images = self.sanitize_input(images)
+        pil_images = []
+        try:
+            for image in images:
+                if isinstance(image, str):
+                    if image.startswith(("http://", "https://")):
+                        response = requests.get(image, timeout=10)
+                        response.raise_for_status()
+                        pil_images.append(PIL.Image.open(io.BytesIO(response.content)))
+                    else:
+                        with PIL.Image.open(image) as im:
+                            pil_images.append(im.copy())
+                elif isinstance(image, bytes):
+                    pil_images.append(PIL.Image.open(io.BytesIO(image)))
+                else:
+                    # Assume it's a PIL Image; will raise if invalid
+                    pil_images.append(image)
+        except Exception as e:
+            raise ValueError(f"Failed to process image: {e}")
+
+        return pil_images
+
+    def generate_image_embeddings(self, images: IMAGES) -> List[List[List[float]]]:
+        """
+        Generate embeddings for a batch of images.
+        """
+        torch = attempt_import_or_raise("torch", "torch")
+        pil_images = self._prepare_images(images)
+        all_embeddings = []
+
+        for i in range(0, len(pil_images), self.batch_size):
+            batch_images = pil_images[i : i + self.batch_size]
+            batch_images = self._processor.process_images(batch_images).to(
+                self._model.device
+            )
+            with torch.no_grad():
+                image_embeddings = self._model(**batch_images)
+            all_embeddings.extend(self._process_embeddings(image_embeddings))
+        return all_embeddings
+
+    def compute_query_embeddings(
+        self, query: Union[str, IMAGES], *args, **kwargs
+    ) -> List[List[List[float]]]:
+        """
+        Compute embeddings for a single user query (text only).
+        """
+        if not isinstance(query, str):
+            raise ValueError(
+                "Query must be a string, image to image search is not supported"
+            )
+        return self.generate_text_embeddings([query])
+
+    def compute_source_embeddings(
+        self, images: IMAGES, *args, **kwargs
+    ) -> List[List[List[float]]]:
+        """
+        Compute embeddings for a batch of source images.
+
+        Parameters
+        ----------
+        images : Union[str, bytes, List, pa.Array, pa.ChunkedArray, np.ndarray]
+            Batch of images (paths, URLs, bytes, or PIL Images).
+        """
+        images = self.sanitize_input(images)
+        return self.generate_image_embeddings(images)
@@ -18,6 +18,7 @@ import numpy as np
 import pyarrow as pa

 from ..dependencies import pandas as pd
+from ..util import attempt_import_or_raise


 # ruff: noqa: PERF203

@@ -275,3 +276,12 @@ def url_retrieve(url: str):
 def api_key_not_found_help(provider):
     logging.error("Could not find API key for %s", provider)
     raise ValueError(f"Please set the {provider.upper()}_API_KEY environment variable.")
+
+
+def is_flash_attn_2_available():
+    try:
+        attempt_import_or_raise("flash_attn", "flash_attn")
+
+        return True
+    except ImportError:
+        return False
@@ -152,6 +152,104 @@ def Vector(
|
|||||||
return FixedSizeList
|
return FixedSizeList
|
||||||
|
|
||||||
|
|
||||||
|
def MultiVector(
|
||||||
|
dim: int, value_type: pa.DataType = pa.float32(), nullable: bool = True
|
||||||
|
) -> Type:
|
||||||
|
"""Pydantic MultiVector Type for multi-vector embeddings.
|
||||||
|
|
||||||
|
This type represents a list of vectors, each with the same dimension.
|
||||||
|
Useful for models that produce multiple embeddings per input, like ColPali.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dim : int
|
||||||
|
The dimension of each vector in the multi-vector.
|
||||||
|
value_type : pyarrow.DataType, optional
|
||||||
|
The value type of the vectors, by default pa.float32()
|
||||||
|
nullable : bool, optional
|
||||||
|
Whether the multi-vector is nullable, by default it is True.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
>>> import pydantic
|
||||||
|
>>> from lancedb.pydantic import MultiVector
|
||||||
|
...
|
||||||
|
>>> class MyModel(pydantic.BaseModel):
|
||||||
|
... id: int
|
||||||
|
... text: str
|
||||||
|
... embeddings: MultiVector(128) # List of 128-dimensional vectors
|
||||||
|
>>> schema = pydantic_to_schema(MyModel)
|
||||||
|
>>> assert schema == pa.schema([
|
||||||
|
... pa.field("id", pa.int64(), False),
|
||||||
|
... pa.field("text", pa.utf8(), False),
|
||||||
|
... pa.field("embeddings", pa.list_(pa.list_(pa.float32(), 128)))
|
||||||
|
... ])
|
||||||
|
"""
|
||||||
|
|
||||||
|
class MultiVectorList(list, FixedSizeListMixin):
|
||||||
|
def __repr__(self):
|
||||||
|
return f"MultiVector(dim={dim})"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def nullable() -> bool:
|
||||||
|
return nullable
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def dim() -> int:
|
||||||
|
return dim
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def value_arrow_type() -> pa.DataType:
|
||||||
|
return value_type
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def is_multi_vector() -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __get_pydantic_core_schema__(
|
||||||
|
cls, _source_type: Any, _handler: pydantic.GetCoreSchemaHandler
|
||||||
|
) -> CoreSchema:
|
||||||
|
return core_schema.no_info_after_validator_function(
|
||||||
|
cls,
|
||||||
|
core_schema.list_schema(
|
||||||
|
items_schema=core_schema.list_schema(
|
||||||
|
min_length=dim,
|
||||||
|
max_length=dim,
|
||||||
|
items_schema=core_schema.float_schema(),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __get_validators__(cls) -> Generator[Callable, None, None]:
|
||||||
|
yield cls.validate
|
||||||
|
|
||||||
|
# For pydantic v1
|
||||||
|
@classmethod
|
||||||
|
def validate(cls, v):
|
||||||
|
if not isinstance(v, (list, range)):
|
||||||
|
raise TypeError("A list of vectors is needed")
|
||||||
|
for vec in v:
|
||||||
|
if not isinstance(vec, (list, range, np.ndarray)) or len(vec) != dim:
|
||||||
|
raise TypeError(f"Each vector must be a list of {dim} numbers")
|
||||||
|
return cls(v)
|
||||||
|
|
||||||
|
if PYDANTIC_VERSION.major < 2:
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __modify_schema__(cls, field_schema: Dict[str, Any]):
|
||||||
|
field_schema["items"] = {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "number"},
|
||||||
|
"minItems": dim,
|
||||||
|
"maxItems": dim,
|
||||||
|
}
|
||||||
|
|
||||||
|
return MultiVectorList
|
||||||
|
|
||||||
|
|
||||||
def _py_type_to_arrow_type(py_type: Type[Any], field: FieldInfo) -> pa.DataType:
|
def _py_type_to_arrow_type(py_type: Type[Any], field: FieldInfo) -> pa.DataType:
|
||||||
"""Convert a field with native Python type to Arrow data type.
|
"""Convert a field with native Python type to Arrow data type.
|
||||||
|
|
||||||
@@ -206,6 +304,9 @@ def _pydantic_type_to_arrow_type(tp: Any, field: FieldInfo) -> pa.DataType:
        fields = _pydantic_model_to_fields(tp)
        return pa.struct(fields)
    if issubclass(tp, FixedSizeListMixin):
+        if getattr(tp, "is_multi_vector", lambda: False)():
+            return pa.list_(pa.list_(tp.value_arrow_type(), tp.dim()))
+        # For regular Vector
        return pa.list_(tp.value_arrow_type(), tp.dim())
    return _py_type_to_arrow_type(tp, field)
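With the factory above and the schema-conversion hook just shown, a MultiVector annotation both validates row data and maps to a nested Arrow list type. A minimal sketch of the round trip, mirroring the docstring example (model and field names are illustrative):

import pyarrow as pa
import pydantic

from lancedb.pydantic import MultiVector, pydantic_to_schema


class Doc(pydantic.BaseModel):
    id: int
    embeddings: MultiVector(4)  # each row stores many 4-dimensional vectors


# schema conversion yields list<list<float32, 4>>
schema = pydantic_to_schema(Doc)
assert schema.field("embeddings").type == pa.list_(pa.list_(pa.float32(), 4))

# validation enforces the inner dimension, not the outer count
doc = Doc(id=1, embeddings=[[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]])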
@@ -104,6 +104,7 @@ class RemoteTable(Table):
        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
        *,
        replace: bool = False,
+        wait_timeout: timedelta = None,
    ):
        """Creates a scalar index
        Parameters

@@ -126,13 +127,18 @@ class RemoteTable(Table):
        else:
            raise ValueError(f"Unknown index type: {index_type}")

-        LOOP.run(self._table.create_index(column, config=config, replace=replace))
+        LOOP.run(
+            self._table.create_index(
+                column, config=config, replace=replace, wait_timeout=wait_timeout
+            )
+        )

    def create_fts_index(
        self,
        column: str,
        *,
        replace: bool = False,
+        wait_timeout: timedelta = None,
        with_position: bool = True,
        # tokenizer configs:
        base_tokenizer: str = "simple",

@@ -153,7 +159,11 @@ class RemoteTable(Table):
            remove_stop_words=remove_stop_words,
            ascii_folding=ascii_folding,
        )
-        LOOP.run(self._table.create_index(column, config=config, replace=replace))
+        LOOP.run(
+            self._table.create_index(
+                column, config=config, replace=replace, wait_timeout=wait_timeout
+            )
+        )

    def create_index(
        self,

@@ -165,6 +175,7 @@ class RemoteTable(Table):
        replace: Optional[bool] = None,
        accelerator: Optional[str] = None,
        index_type="vector",
+        wait_timeout: Optional[timedelta] = None,
    ):
        """Create an index on the table.
        Currently, the only parameters that matter are

@@ -236,7 +247,11 @@ class RemoteTable(Table):
            " 'IVF_FLAT', 'IVF_PQ', 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
        )

-        LOOP.run(self._table.create_index(vector_column_name, config=config))
+        LOOP.run(
+            self._table.create_index(
+                vector_column_name, config=config, wait_timeout=wait_timeout
+            )
+        )

    def add(
        self,

@@ -554,6 +569,11 @@ class RemoteTable(Table):
    def drop_index(self, index_name: str):
        return LOOP.run(self._table.drop_index(index_name))

+    def wait_for_index(
+        self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
+    ):
+        return LOOP.run(self._table.wait_for_index(index_names, timeout))
+
    def uses_v2_manifest_paths(self) -> bool:
        raise NotImplementedError(
            "uses_v2_manifest_paths() is not supported on the LanceDB Cloud"
@@ -631,6 +631,7 @@ class Table(ABC):
        index_cache_size: Optional[int] = None,
        *,
        index_type: VectorIndexType = "IVF_PQ",
+        wait_timeout: Optional[timedelta] = None,
        num_bits: int = 8,
        max_iterations: int = 50,
        sample_rate: int = 256,

@@ -666,6 +667,8 @@ class Table(ABC):
        num_bits: int
            The number of bits to encode sub-vectors. Only used with the IVF_PQ index.
            Only 4 and 8 are supported.
+        wait_timeout: timedelta, optional
+            The timeout to wait if indexing is asynchronous.
        """
        raise NotImplementedError
@@ -689,6 +692,23 @@ class Table(ABC):
        """
        raise NotImplementedError

+    def wait_for_index(
+        self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
+    ) -> None:
+        """
+        Wait for indexing to complete for the given index names.
+        This will poll the table until all the indices are fully indexed,
+        or raise a timeout exception if the timeout is reached.
+
+        Parameters
+        ----------
+        index_names: Iterable[str]
+            The names of the indices to poll
+        timeout: timedelta
+            Timeout to wait for asynchronous indexing. The default is 5 minutes.
+        """
+        raise NotImplementedError
+
    @abstractmethod
    def create_scalar_index(
        self,
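Because indexing on LanceDB Cloud is asynchronous, the two additions compose: create the index without blocking, then poll until it is ready. A small sketch under the naming convention used elsewhere in this changeset (a column `id` yields an index named `id_idx`):

from datetime import timedelta

table.create_scalar_index("id")  # returns once the build is queued
table.wait_for_index(["id_idx"], timeout=timedelta(seconds=60))

# or block inside the create call itself:
table.create_scalar_index("category", wait_timeout=timedelta(seconds=60))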
@@ -696,6 +716,7 @@ class Table(ABC):
        *,
        replace: bool = True,
        index_type: ScalarIndexType = "BTREE",
+        wait_timeout: Optional[timedelta] = None,
    ):
        """Create a scalar index on a column.

@@ -708,7 +729,8 @@ class Table(ABC):
            Replace the existing index if it exists.
        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"], default "BTREE"
            The type of index to create.
+        wait_timeout: timedelta, optional
+            The timeout to wait if indexing is asynchronous.
        Examples
        --------

@@ -767,6 +789,7 @@ class Table(ABC):
        stem: bool = False,
        remove_stop_words: bool = False,
        ascii_folding: bool = False,
+        wait_timeout: Optional[timedelta] = None,
    ):
        """Create a full-text search index on the table.

@@ -822,6 +845,8 @@ class Table(ABC):
        ascii_folding : bool, default False
            Whether to fold ASCII characters. This converts accented characters to
            their ASCII equivalent. For example, "café" would be converted to "cafe".
+        wait_timeout: timedelta, optional
+            The timeout to wait if indexing is asynchronous.
        """
        raise NotImplementedError
@@ -1745,8 +1770,37 @@ class LanceTable(Table):
        )

    def drop_index(self, name: str) -> None:
+        """
+        Drops an index from the table
+
+        Parameters
+        ----------
+        name: str
+            The name of the index to drop
+        """
        return LOOP.run(self._table.drop_index(name))

+    def prewarm_index(self, name: str) -> None:
+        """
+        Prewarms an index in the table
+
+        This loads the entire index into memory
+
+        If the index does not fit into the available cache this call
+        may be wasteful
+
+        Parameters
+        ----------
+        name: str
+            The name of the index to prewarm
+        """
+        return LOOP.run(self._table.prewarm_index(name))
+
+    def wait_for_index(
+        self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
+    ) -> None:
+        return LOOP.run(self._table.wait_for_index(index_names, timeout))
+
    def create_scalar_index(
        self,
        column: str,
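prewarm_index is most useful right after a build, before the first latency-sensitive query. A hedged sketch (the `vector_idx` name assumes the default `<column>_idx` convention):

table.create_index(vector_column_name="vector")
table.prewarm_index("vector_idx")  # pull the whole index into cache
hits = table.search([0.1] * 128).limit(5).to_arrow()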
@@ -2940,6 +2994,7 @@ class AsyncTable:
        config: Optional[
            Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
        ] = None,
+        wait_timeout: Optional[timedelta] = None,
    ):
        """Create an index to speed up queries

@@ -2964,6 +3019,8 @@ class AsyncTable:
        For advanced configuration you can specify the type of index you would
        like to create. You can also specify index-specific parameters when
        creating an index object.
+        wait_timeout: timedelta, optional
+            The timeout to wait if indexing is asynchronous.
        """
        if config is not None:
            if not isinstance(

@@ -2974,7 +3031,9 @@ class AsyncTable:
                " Bitmap, LabelList, or FTS"
            )
        try:
-            await self._inner.create_index(column, index=config, replace=replace)
+            await self._inner.create_index(
+                column, index=config, replace=replace, wait_timeout=wait_timeout
+            )
        except ValueError as e:
            if "not support the requested language" in str(e):
                supported_langs = ", ".join(lang_mapping.values())
@@ -3002,6 +3061,40 @@ class AsyncTable:
        """
        await self._inner.drop_index(name)

+    async def prewarm_index(self, name: str) -> None:
+        """
+        Prewarm an index in the table.
+
+        Parameters
+        ----------
+        name: str
+            The name of the index to prewarm
+
+        Notes
+        -----
+        This will load the index into memory. This may reduce the cold-start time for
+        future queries. If the index does not fit in the cache then this call may be
+        wasteful.
+        """
+        await self._inner.prewarm_index(name)
+
+    async def wait_for_index(
+        self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
+    ) -> None:
+        """
+        Wait for indexing to complete for the given index names.
+        This will poll the table until all the indices are fully indexed,
+        or raise a timeout exception if the timeout is reached.
+
+        Parameters
+        ----------
+        index_names: Iterable[str]
+            The names of the indices to poll
+        timeout: timedelta
+            Timeout to wait for asynchronous indexing. The default is 5 minutes.
+        """
+        await self._inner.wait_for_index(index_names, timeout)
+
    async def add(
        self,
        data: DATA,
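The async surface mirrors the sync one. A minimal sketch (the fixture-style `table` and the IvfPq config are illustrative):

from datetime import timedelta

from lancedb.index import IvfPq


async def build_and_warm(table):
    await table.create_index(
        "vector", config=IvfPq(num_partitions=1, num_sub_vectors=2)
    )
    await table.wait_for_index(["vector_idx"], timedelta(seconds=120))
    await table.prewarm_index("vector_idx")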
@@ -11,7 +11,7 @@ import pandas as pd
import pyarrow as pa
import pytest
from lancedb.embeddings import get_registry
-from lancedb.pydantic import LanceModel, Vector
+from lancedb.pydantic import LanceModel, Vector, MultiVector
import requests

# These are integration tests for embedding functions.
@@ -575,3 +575,67 @@ def test_voyageai_multimodal_embedding_text_function():

    tbl.add(df)
    assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(
+    importlib.util.find_spec("colpali_engine") is None,
+    reason="colpali_engine not installed",
+)
+def test_colpali(tmp_path):
+    import requests
+    from lancedb.pydantic import LanceModel
+
+    db = lancedb.connect(tmp_path)
+    registry = get_registry()
+    func = registry.get("colpali").create()
+
+    class MediaItems(LanceModel):
+        text: str
+        image_uri: str = func.SourceField()
+        image_bytes: bytes = func.SourceField()
+        image_vectors: MultiVector(func.ndims()) = (
+            func.VectorField()
+        )  # Multivector image embeddings
+
+    table = db.create_table("media", schema=MediaItems)
+
+    texts = [
+        "a cute cat playing with yarn",
+        "a puppy in a flower field",
+        "a red sports car on the highway",
+        "a vintage bicycle leaning against a wall",
+        "a plate of delicious pasta",
+        "fresh fruit salad in a bowl",
+    ]
+
+    uris = [
+        "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
+        "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
+        "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
+        "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
+        "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
+        "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
+    ]
+
+    # Get images as bytes
+    image_bytes = [requests.get(uri).content for uri in uris]
+
+    table.add(
+        pd.DataFrame({"text": texts, "image_uri": uris, "image_bytes": image_bytes})
+    )
+
+    # Test text-to-image search
+    image_results = (
+        table.search("fluffy companion", vector_column_name="image_vectors")
+        .limit(1)
+        .to_pydantic(MediaItems)[0]
+    )
+    assert "cat" in image_results.text.lower() or "puppy" in image_results.text.lower()
+
+    # Verify multivector dimensions
+    first_row = table.to_arrow().to_pylist()[0]
+    assert len(first_row["image_vectors"]) > 1, "Should have multiple image vectors"
+    assert len(first_row["image_vectors"][0]) == func.ndims(), (
+        "Vector dimension mismatch"
+    )
@@ -8,7 +8,7 @@ import pyarrow as pa
import pytest
import pytest_asyncio
from lancedb import AsyncConnection, AsyncTable, connect_async
-from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq
+from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS


@pytest_asyncio.fixture

@@ -119,6 +119,18 @@ async def test_create_label_list_index(some_table: AsyncTable):
    assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'


+@pytest.mark.asyncio
+async def test_full_text_search_index(some_table: AsyncTable):
+    await some_table.create_index("tags", config=FTS(with_position=False))
+    indices = await some_table.list_indices()
+    assert str(indices) == '[Index(FTS, columns=["tags"], name="tags_idx")]'
+
+    await some_table.prewarm_index("tags_idx")
+
+    res = await (await some_table.search("tag0")).to_arrow()
+    assert res.num_rows > 0
+
+
@pytest.mark.asyncio
async def test_create_vector_index(some_table: AsyncTable):
    # Can create
@@ -9,7 +9,13 @@ from typing import List, Optional, Tuple
import pyarrow as pa
import pydantic
import pytest
-from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, Vector, pydantic_to_schema
+from lancedb.pydantic import (
+    PYDANTIC_VERSION,
+    LanceModel,
+    Vector,
+    pydantic_to_schema,
+    MultiVector,
+)
from pydantic import BaseModel
from pydantic import Field
@@ -354,3 +360,55 @@ def test_optional_nested_model():
            ),
        ]
    )
+
+
+def test_multi_vector():
+    class TestModel(pydantic.BaseModel):
+        vec: MultiVector(8)
+
+    schema = pydantic_to_schema(TestModel)
+    assert schema == pa.schema(
+        [pa.field("vec", pa.list_(pa.list_(pa.float32(), 8)), True)]
+    )
+
+    with pytest.raises(pydantic.ValidationError):
+        TestModel(vec=[[1.0] * 7])
+
+    with pytest.raises(pydantic.ValidationError):
+        TestModel(vec=[[1.0] * 9])
+
+    TestModel(vec=[[1.0] * 8])
+    TestModel(vec=[[1.0] * 8, [2.0] * 8])
+
+    TestModel(vec=[])
+
+
+def test_multi_vector_nullable():
+    class NullableModel(pydantic.BaseModel):
+        vec: MultiVector(16, nullable=False)
+
+    schema = pydantic_to_schema(NullableModel)
+    assert schema == pa.schema(
+        [pa.field("vec", pa.list_(pa.list_(pa.float32(), 16)), False)]
+    )
+
+    class DefaultModel(pydantic.BaseModel):
+        vec: MultiVector(16)
+
+    schema = pydantic_to_schema(DefaultModel)
+    assert schema == pa.schema(
+        [pa.field("vec", pa.list_(pa.list_(pa.float32(), 16)), True)]
+    )
+
+
+def test_multi_vector_in_lance_model():
+    class TestModel(LanceModel):
+        id: int
+        vectors: MultiVector(16) = Field(default=[[0.0] * 16])
+
+    schema = pydantic_to_schema(TestModel)
+    assert schema == TestModel.to_arrow_schema()
+    assert TestModel.field_names() == ["id", "vectors"]
+
+    t = TestModel(id=1)
+    assert t.vectors == [[0.0] * 16]
@@ -257,7 +257,9 @@ async def test_distance_range_with_new_rows_async():
        }
    )
    table = await conn.create_table("test", data)
-    table.create_index("vector", config=IvfPq(num_partitions=1, num_sub_vectors=2))
+    await table.create_index(
+        "vector", config=IvfPq(num_partitions=1, num_sub_vectors=2)
+    )

    q = [0, 0]
    rs = await table.query().nearest_to(q).to_arrow()
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
+import re
from concurrent.futures import ThreadPoolExecutor
import contextlib
from datetime import timedelta
@@ -235,6 +235,10 @@ def test_table_add_in_threadpool():

def test_table_create_indices():
    def handler(request):
+        index_stats = dict(
+            index_type="IVF_PQ", num_indexed_rows=1000, num_unindexed_rows=0
+        )
+
        if request.path == "/v1/table/test/create_index/":
            request.send_response(200)
            request.end_headers()
@@ -258,6 +262,47 @@ def test_table_create_indices():
                )
            )
            request.wfile.write(payload.encode())
+        elif request.path == "/v1/table/test/index/list/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(
+                dict(
+                    indexes=[
+                        {
+                            "index_name": "id_idx",
+                            "columns": ["id"],
+                        },
+                        {
+                            "index_name": "text_idx",
+                            "columns": ["text"],
+                        },
+                        {
+                            "index_name": "vector_idx",
+                            "columns": ["vector"],
+                        },
+                    ]
+                )
+            )
+            request.wfile.write(payload.encode())
+        elif request.path == "/v1/table/test/index/id_idx/stats/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(index_stats)
+            request.wfile.write(payload.encode())
+        elif request.path == "/v1/table/test/index/text_idx/stats/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(index_stats)
+            request.wfile.write(payload.encode())
+        elif request.path == "/v1/table/test/index/vector_idx/stats/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(index_stats)
+            request.wfile.write(payload.encode())
        elif "/drop/" in request.path:
            request.send_response(200)
            request.end_headers()
@@ -269,14 +314,81 @@ def test_table_create_indices():
        # Parameters are well-tested through local and async tests.
        # This is a smoke-test.
        table = db.create_table("test", [{"id": 1}])
-        table.create_scalar_index("id")
+        table.create_scalar_index("id", wait_timeout=timedelta(seconds=2))
-        table.create_fts_index("text")
+        table.create_fts_index("text", wait_timeout=timedelta(seconds=2))
-        table.create_scalar_index("vector")
+        table.create_index(
+            vector_column_name="vector", wait_timeout=timedelta(seconds=10)
+        )
+        table.wait_for_index(["id_idx"], timedelta(seconds=2))
+        table.wait_for_index(["text_idx", "vector_idx"], timedelta(seconds=2))
        table.drop_index("vector_idx")
        table.drop_index("id_idx")
        table.drop_index("text_idx")
+
+
+def test_table_wait_for_index_timeout():
+    def handler(request):
+        index_stats = dict(
+            index_type="BTREE", num_indexed_rows=1000, num_unindexed_rows=1
+        )
+
+        if request.path == "/v1/table/test/create/?mode=create":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            request.wfile.write(b"{}")
+        elif request.path == "/v1/table/test/describe/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(
+                dict(
+                    version=1,
+                    schema=dict(
+                        fields=[
+                            dict(name="id", type={"type": "int64"}, nullable=False),
+                        ]
+                    ),
+                )
+            )
+            request.wfile.write(payload.encode())
+        elif request.path == "/v1/table/test/index/list/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(
+                dict(
+                    indexes=[
+                        {
+                            "index_name": "id_idx",
+                            "columns": ["id"],
+                        },
+                    ]
+                )
+            )
+            request.wfile.write(payload.encode())
+        elif request.path == "/v1/table/test/index/id_idx/stats/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(index_stats)
+            print(f"{index_stats=}")
+            request.wfile.write(payload.encode())
+        else:
+            request.send_response(404)
+            request.end_headers()
+
+    with mock_lancedb_connection(handler) as db:
+        table = db.create_table("test", [{"id": 1}])
+        with pytest.raises(
+            RuntimeError,
+            match=re.escape(
+                'Timeout error: timed out waiting for indices: ["id_idx"] after 1s'
+            ),
+        ):
+            table.wait_for_index(["id_idx"], timedelta(seconds=1))


@contextlib.contextmanager
def query_test_table(query_handler, *, server_version=Version("0.1.0")):
    def handler(request):
@@ -9,9 +9,9 @@ from typing import List
from unittest.mock import patch

import lancedb
+from lancedb.dependencies import _PANDAS_AVAILABLE
from lancedb.index import HnswPq, HnswSq, IvfPq
import numpy as np
-import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.dataset
@@ -138,13 +138,16 @@ def test_create_table(mem_db: DBConnection):
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]
-    df = pd.DataFrame(rows)
-    pa_table = pa.Table.from_pandas(df, schema=schema)
+    pa_table = pa.Table.from_pylist(rows, schema=schema)

    data = [
        ("Rows", rows),
-        ("pd_DataFrame", df),
        ("pa_Table", pa_table),
    ]
+    if _PANDAS_AVAILABLE:
+        import pandas as pd
+
+        df = pd.DataFrame(rows)
+        data.append(("pd_DataFrame", df))
+
    for name, d in data:
        tbl = mem_db.create_table(name, data=d, schema=schema).to_arrow()
@@ -296,7 +299,7 @@ def test_add_subschema(mem_db: DBConnection):

    data = {"price": 10.0, "item": "foo"}
    table.add([data])
-    data = pd.DataFrame({"price": [2.0], "vector": [[3.1, 4.1]]})
+    data = pa.Table.from_pydict({"price": [2.0], "vector": [[3.1, 4.1]]})
    table.add(data)
    data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
    table.add([data])
@@ -405,6 +408,7 @@ def test_add_nullability(mem_db: DBConnection):


def test_add_pydantic_model(mem_db: DBConnection):
+    pytest.importorskip("pandas")
    # https://github.com/lancedb/lancedb/issues/562

    class Metadata(BaseModel):
@@ -473,10 +477,10 @@ def test_polars(mem_db: DBConnection):
    table = mem_db.create_table("test", data=pl.DataFrame(data))
    assert len(table) == 2

-    result = table.to_pandas()
+    result = table.to_arrow()
-    assert np.allclose(result["vector"].tolist(), data["vector"])
+    assert np.allclose(result["vector"].to_pylist(), data["vector"])
-    assert result["item"].tolist() == data["item"]
+    assert result["item"].to_pylist() == data["item"]
-    assert np.allclose(result["price"].tolist(), data["price"])
+    assert np.allclose(result["price"].to_pylist(), data["price"])

    schema = pa.schema(
        [
@@ -688,7 +692,7 @@ def test_delete(mem_db: DBConnection):
    assert len(table.list_versions()) == 2
    assert table.version == 2
    assert len(table) == 1
-    assert table.to_pandas()["id"].tolist() == [1]
+    assert table.to_arrow()["id"].to_pylist() == [1]


def test_update(mem_db: DBConnection):
@@ -852,6 +856,7 @@ def test_merge_insert(mem_db: DBConnection):
    ids=["pa.Table", "pd.DataFrame", "rows"],
)
def test_merge_insert_subschema(mem_db: DBConnection, data_format):
+    pytest.importorskip("pandas")
    initial_data = pa.table(
        {"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]}
    )
@@ -948,7 +953,7 @@ def test_create_with_embedding_function(mem_db: DBConnection):

    func = MockTextEmbeddingFunction.create()
    texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
-    df = pd.DataFrame({"text": texts, "vector": func.compute_source_embeddings(texts)})
+    df = pa.table({"text": texts, "vector": func.compute_source_embeddings(texts)})

    conf = EmbeddingFunctionConfig(
        source_column="text", vector_column="vector", function=func
@@ -973,7 +978,7 @@ def test_create_f16_table(mem_db: DBConnection):
        text: str
        vector: Vector(32, value_type=pa.float16())

-    df = pd.DataFrame(
+    df = pa.table(
        {
            "text": [f"s-{i}" for i in range(512)],
            "vector": [np.random.randn(32).astype(np.float16) for _ in range(512)],

@@ -986,7 +991,7 @@ def test_create_f16_table(mem_db: DBConnection):
    table.add(df)
    table.create_index(num_partitions=2, num_sub_vectors=2)

-    query = df.vector.iloc[2]
+    query = df["vector"][2].as_py()
    expected = table.search(query).limit(2).to_arrow()

    assert "s-2" in expected["text"].to_pylist()
@@ -1002,7 +1007,7 @@ def test_add_with_embedding_function(mem_db: DBConnection):
    table = mem_db.create_table("my_table", schema=MyTable)

    texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
-    df = pd.DataFrame({"text": texts})
+    df = pa.table({"text": texts})
    table.add(df)

    texts = ["the quick brown fox", "jumped over the lazy dog"]
@@ -1033,14 +1038,14 @@ def test_multiple_vector_columns(mem_db: DBConnection):
        {"vector1": v1, "vector2": v2, "text": "foo"},
        {"vector1": v2, "vector2": v1, "text": "bar"},
    ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
    table.add(df)

    q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas()
+    result1 = table.search(q, vector_column_name="vector1").limit(1).to_arrow()
-    result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas()
+    result2 = table.search(q, vector_column_name="vector2").limit(1).to_arrow()

-    assert result1["text"].iloc[0] != result2["text"].iloc[0]
+    assert result1["text"][0] != result2["text"][0]


def test_create_scalar_index(mem_db: DBConnection):
@@ -1078,22 +1083,22 @@ def test_empty_query(mem_db: DBConnection):
        "my_table",
        data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
    )
-    df = table.search().select(["id"]).where("text='bar'").limit(1).to_pandas()
+    df = table.search().select(["id"]).where("text='bar'").limit(1).to_arrow()
-    val = df.id.iloc[0]
+    val = df["id"][0].as_py()
    assert val == 1

    table = mem_db.create_table("my_table2", data=[{"id": i} for i in range(100)])
-    df = table.search().select(["id"]).to_pandas()
+    df = table.search().select(["id"]).to_arrow()
-    assert len(df) == 100
+    assert df.num_rows == 100
    # None is the same as default
-    df = table.search().select(["id"]).limit(None).to_pandas()
+    df = table.search().select(["id"]).limit(None).to_arrow()
-    assert len(df) == 100
+    assert df.num_rows == 100
    # an invalid limit is the same as None, which is the same as default
-    df = table.search().select(["id"]).limit(-1).to_pandas()
+    df = table.search().select(["id"]).limit(-1).to_arrow()
-    assert len(df) == 100
+    assert df.num_rows == 100
    # valid limit should work
-    df = table.search().select(["id"]).limit(42).to_pandas()
+    df = table.search().select(["id"]).limit(42).to_arrow()
-    assert len(df) == 42
+    assert df.num_rows == 42


def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
@@ -1112,14 +1117,14 @@ def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
        {"vector_col": v1, "text": "foo"},
        {"vector_col": v2, "text": "bar"},
    ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
    table.add(df)

    q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector_col").limit(1).to_pandas()
+    result1 = table.search(q, vector_column_name="vector_col").limit(1).to_arrow()
-    result2 = table.search(q).limit(1).to_pandas()
+    result2 = table.search(q).limit(1).to_arrow()

-    assert result1["text"].iloc[0] == result2["text"].iloc[0]
+    assert result1["text"][0].as_py() == result2["text"][0].as_py()


def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
@@ -1139,12 +1144,12 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
        {"vector1": v1, "vector2": v2, "text": "foo"},
        {"vector1": v2, "vector2": v1, "text": "bar"},
    ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
    table.add(df)

    q = np.random.randn(10)
    with pytest.raises(ValueError):
-        table.search(q).limit(1).to_pandas()
+        table.search(q).limit(1).to_arrow()


def test_compact_cleanup(tmp_db: DBConnection):
@@ -177,15 +177,19 @@ impl Table {
        })
    }

-    #[pyo3(signature = (column, index=None, replace=None))]
+    #[pyo3(signature = (column, index=None, replace=None, wait_timeout=None))]
    pub fn create_index<'a>(
        self_: PyRef<'a, Self>,
        column: String,
        index: Option<Bound<'_, PyAny>>,
        replace: Option<bool>,
+        wait_timeout: Option<Bound<'_, PyAny>>,
    ) -> PyResult<Bound<'a, PyAny>> {
        let index = extract_index_params(&index)?;
-        let mut op = self_.inner_ref()?.create_index(&[column], index);
+        let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
+        let mut op = self_
+            .inner_ref()?
+            .create_index_with_timeout(&[column], index, timeout);
        if let Some(replace) = replace {
            op = op.replace(replace);
        }
@@ -204,6 +208,34 @@ impl Table {
        })
    }

+    pub fn wait_for_index<'a>(
+        self_: PyRef<'a, Self>,
+        index_names: Vec<String>,
+        timeout: Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'a, PyAny>> {
+        let inner = self_.inner_ref()?.clone();
+        let timeout = timeout.extract::<std::time::Duration>()?;
+        future_into_py(self_.py(), async move {
+            let index_refs = index_names
+                .iter()
+                .map(String::as_str)
+                .collect::<Vec<&str>>();
+            inner
+                .wait_for_index(&index_refs, timeout)
+                .await
+                .infer_error()?;
+            Ok(())
+        })
+    }
+
+    pub fn prewarm_index(self_: PyRef<'_, Self>, index_name: String) -> PyResult<Bound<'_, PyAny>> {
+        let inner = self_.inner_ref()?.clone();
+        future_into_py(self_.py(), async move {
+            inner.prewarm_index(&index_name).await.infer_error()?;
+            Ok(())
+        })
+    }
+
    pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {
@@ -163,8 +163,9 @@ pub fn parse_fts_query(query: &Bound<'_, PyDict>) -> PyResult<FtsQuery> {
        .ok_or(PyValueError::new_err("boost not found"))?
        .extract::<Vec<f32>>()?;

-    let query =
-        MultiMatchQuery::try_new_with_boosts(query, columns, boost).map_err(|e| {
+    let query = MultiMatchQuery::try_new(query, columns)
+        .and_then(|q| q.try_with_boosts(boost))
+        .map_err(|e| {
            PyValueError::new_err(format!("Error creating MultiMatchQuery: {}", e))
        })?;
    Ok(query.into())
@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
-version = "0.19.0-beta.6"
+version = "0.19.0-beta.8"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true
@@ -1,6 +1,6 @@
[package]
name = "lancedb"
-version = "0.19.0-beta.6"
+version = "0.19.0-beta.8"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -35,6 +35,8 @@ pub enum Error {
    Schema { message: String },
    #[snafu(display("Runtime error: {message}"))]
    Runtime { message: String },
+    #[snafu(display("Timeout error: {message}"))]
+    Timeout { message: String },

    // 3rd party / external errors
    #[snafu(display("object_store error: {source}"))]
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

-use std::sync::Arc;
use scalar::FtsIndexBuilder;
use serde::Deserialize;
use serde_with::skip_serializing_none;
+use std::sync::Arc;
+use std::time::Duration;
use vector::IvfFlatIndexBuilder;

use crate::{table::BaseTable, DistanceType, Error, Result};

@@ -17,6 +17,7 @@ use self::{

pub mod scalar;
pub mod vector;
+pub mod waiter;

/// Supported index types.
#[derive(Debug, Clone)]
@@ -69,6 +70,7 @@ pub struct IndexBuilder {
    pub(crate) index: Index,
    pub(crate) columns: Vec<String>,
    pub(crate) replace: bool,
+    pub(crate) wait_timeout: Option<Duration>,
}

impl IndexBuilder {

@@ -78,6 +80,7 @@ impl IndexBuilder {
            index,
            columns,
            replace: true,
+            wait_timeout: None,
        }
    }

@@ -91,6 +94,15 @@ impl IndexBuilder {
        self
    }

+    /// Duration of time to wait for asynchronous indexing to complete. If not set,
+    /// `create_index()` will not wait.
+    ///
+    /// This is not supported for `NativeTable` since indexing is synchronous.
+    pub fn wait_timeout(mut self, d: Duration) -> Self {
+        self.wait_timeout = Some(d);
+        self
+    }
+
    pub async fn execute(self) -> Result<()> {
        self.parent.clone().create_index(self).await
    }
rust/lancedb/src/index/waiter.rs  (new file, 90 lines)
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+use crate::error::Result;
+use crate::table::BaseTable;
+use crate::Error;
+use log::debug;
+use std::time::{Duration, Instant};
+use tokio::time::sleep;
+
+const DEFAULT_SLEEP_MS: u64 = 1000;
+const MAX_WAIT: Duration = Duration::from_secs(2 * 60 * 60);
+
+/// Poll the table using list_indices() and index_stats() until all of the indices have 0 un-indexed rows.
+/// Will return Error::Timeout if the columns are not fully indexed within the timeout.
+pub async fn wait_for_index(
+    table: &dyn BaseTable,
+    index_names: &[&str],
+    timeout: Duration,
+) -> Result<()> {
+    if timeout > MAX_WAIT {
+        return Err(Error::InvalidInput {
+            message: format!("timeout must be less than {:?}", MAX_WAIT).to_string(),
+        });
+    }
+    let start = Instant::now();
+    let mut remaining = index_names.to_vec();
+
+    // poll via list_indices() and index_stats() until all indices are created and fully indexed
+    while start.elapsed() < timeout {
+        let mut completed = vec![];
+        let indices = table.list_indices().await?;
+
+        for &idx in &remaining {
+            if !indices.iter().any(|i| i.name == *idx) {
+                debug!("still waiting for new index '{}'", idx);
+                continue;
+            }
+
+            let stats = table.index_stats(idx.as_ref()).await?;
+            match stats {
+                None => {
+                    debug!("still waiting for new index '{}'", idx);
+                    continue;
+                }
+                Some(s) => {
+                    if s.num_unindexed_rows == 0 {
+                        // note: this may never stabilize under constant writes.
+                        // we should later replace this with a status/job model
+                        completed.push(idx);
+                        debug!(
+                            "fully indexed '{}'. indexed rows: {}",
+                            idx, s.num_indexed_rows
+                        );
+                    } else {
+                        debug!(
+                            "still waiting for index '{}'. unindexed rows: {}",
+                            idx, s.num_unindexed_rows
+                        );
+                    }
+                }
+            }
+        }
+        remaining.retain(|idx| !completed.contains(idx));
+        if remaining.is_empty() {
+            return Ok(());
+        }
+        sleep(Duration::from_millis(DEFAULT_SLEEP_MS)).await;
+    }
+
+    // debug log index diagnostics
+    for &r in &remaining {
+        let stats = table.index_stats(r.as_ref()).await?;
+        match stats {
+            Some(s) => debug!(
+                "index '{}' not fully indexed after {:?}. stats: {:?}",
+                r, timeout, s
+            ),
+            None => debug!("index '{}' not found after {:?}", r, timeout),
+        }
+    }
+
+    Err(Error::Timeout {
+        message: format!(
+            "timed out waiting for indices: {:?} after {:?}",
+            remaining, timeout
+        )
+        .to_string(),
+    })
+}
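Restated for Python readers, the waiter's contract is: list the indices, check each one's unindexed-row count, sleep, repeat until done or out of time. A rough illustrative equivalent (not part of the codebase; assumes the Python table API's list_indices()/index_stats()):

import time
from datetime import timedelta


def wait_for_index(table, index_names, timeout=timedelta(seconds=300)):
    deadline = time.monotonic() + timeout.total_seconds()
    remaining = set(index_names)
    while time.monotonic() < deadline:
        existing = {idx.name for idx in table.list_indices()}
        for name in list(remaining):
            stats = table.index_stats(name) if name in existing else None
            if stats is not None and stats.num_unindexed_rows == 0:
                remaining.discard(name)
        if not remaining:
            return
        time.sleep(1.0)  # mirrors DEFAULT_SLEEP_MS
    raise TimeoutError(f"timed out waiting for indices: {sorted(remaining)}")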
@@ -1,10 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

-use std::io::Cursor;
-use std::pin::Pin;
-use std::sync::{Arc, Mutex};
-
use crate::index::Index;
use crate::index::IndexStatistics;
use crate::query::{QueryFilter, QueryRequest, Select, VectorQueryRequest};

@@ -26,8 +22,17 @@ use lance::dataset::scanner::DatasetRecordBatchStream;
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
use lance_datafusion::exec::{execute_plan, OneShotExec};
use serde::{Deserialize, Serialize};
+use std::io::Cursor;
+use std::pin::Pin;
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
use tokio::sync::RwLock;

+use super::client::RequestResultExt;
+use super::client::{HttpSend, RestfulLanceDbClient, Sender};
+use super::db::ServerVersion;
+use super::ARROW_STREAM_CONTENT_TYPE;
+use crate::index::waiter::wait_for_index;
use crate::{
    connection::NoData,
    error::Result,

@@ -39,11 +44,6 @@ use crate::{
    },
};

-use super::client::RequestResultExt;
-use super::client::{HttpSend, RestfulLanceDbClient, Sender};
-use super::db::ServerVersion;
-use super::ARROW_STREAM_CONTENT_TYPE;
-
const REQUEST_TIMEOUT_HEADER: HeaderName = HeaderName::from_static("x-request-timeout-ms");

#[derive(Debug)]
@@ -339,8 +339,6 @@ impl<S: HttpSend> RemoteTable<S> {
        let mut request = self.client.post(&format!("/v1/table/{}/query/", self.name));

        if let Some(timeout) = options.timeout {
-            // Client side timeout
-            request = request.timeout(timeout);
            // Also send to server, so it can abort the query if it takes too long.
            // (If it doesn't fit into u64, it's not worth sending anyways.)
            if let Ok(timeout_ms) = u64::try_from(timeout.as_millis()) {

@@ -358,8 +356,26 @@ impl<S: HttpSend> RemoteTable<S> {
            let (request_id, response) = self.client.send(req, true).await?;
            self.read_arrow_stream(&request_id, response).await
        });
-        let streams = futures::future::try_join_all(futures).await?;
-        Ok(streams)
+        let streams = futures::future::try_join_all(futures);
+
+        if let Some(timeout) = options.timeout {
+            let timeout_future = tokio::time::sleep(timeout);
+            tokio::pin!(timeout_future);
+            tokio::pin!(streams);
+            tokio::select! {
+                _ = &mut timeout_future => {
+                    Err(Error::Other {
+                        message: format!("Query timeout after {} ms", timeout.as_millis()),
+                        source: None,
+                    })
+                }
+                result = &mut streams => {
+                    Ok(result?)
+                }
+            }
+        } else {
+            Ok(streams.await?)
+        }
    }

    async fn prepare_query_bodies(&self, query: &AnyQuery) -> Result<Vec<serde_json::Value>> {
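The change replaces per-request client timeouts with one race over the joined sub-query futures. In Python terms the new behavior is roughly asyncio.wait_for over a gather; an illustrative sketch (not part of the codebase):

import asyncio


async def run_queries_with_timeout(coros, timeout_s):
    # race the gathered sub-queries against a single overall deadline,
    # mirroring tokio::select! between sleep() and try_join_all()
    try:
        return await asyncio.wait_for(asyncio.gather(*coros), timeout=timeout_s)
    except asyncio.TimeoutError:
        raise RuntimeError(f"Query timeout after {int(timeout_s * 1000)} ms")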
@@ -800,9 +816,20 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {

        self.check_table_response(&request_id, response).await?;

+        if let Some(wait_timeout) = index.wait_timeout {
+            let name = format!("{}_idx", column);
+            self.wait_for_index(&[&name], wait_timeout).await?;
+        }
+
        Ok(())
    }

+    /// Poll until the columns are fully indexed. Will return Error::Timeout if the columns
+    /// are not fully indexed within the timeout.
+    async fn wait_for_index(&self, index_names: &[&str], timeout: Duration) -> Result<()> {
+        wait_for_index(self, index_names, timeout).await
+    }
+
    async fn merge_insert(
        &self,
        params: MergeInsertBuilder,

@@ -984,6 +1011,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {

        let body = response.text().await.err_to_http(request_id.clone())?;

+        println!("body: {:?}", body);
        let stats = serde_json::from_str(&body).map_err(|e| Error::Http {
            source: format!("Failed to parse index statistics: {}", e).into(),
            request_id,
@@ -1003,6 +1031,12 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
        Ok(())
    }

+    async fn prewarm_index(&self, _index_name: &str) -> Result<()> {
+        Err(Error::NotSupported {
+            message: "prewarm_index is not yet supported on LanceDB cloud.".into(),
+        })
+    }
+
    async fn table_definition(&self) -> Result<TableDefinition> {
        Err(Error::NotSupported {
            message: "table_definition is not supported on LanceDB cloud.".into(),

@@ -1769,6 +1803,7 @@ mod tests {
                "boost": 1.0,
                "fuzziness": 0,
                "max_expansions": 50,
+                "operator": "Or",
            },
        }
    },
@@ -2409,4 +2444,88 @@ mod tests {
         });
         table.drop_index("my_index").await.unwrap();
     }
+
+    #[tokio::test]
+    async fn test_wait_for_index() {
+        let table = _make_table_with_indices(0);
+        table
+            .wait_for_index(&["vector_idx", "my_idx"], Duration::from_secs(1))
+            .await
+            .unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_wait_for_index_timeout() {
+        let table = _make_table_with_indices(100);
+        let e = table
+            .wait_for_index(&["vector_idx", "my_idx"], Duration::from_secs(1))
+            .await
+            .unwrap_err();
+        assert_eq!(
+            e.to_string(),
+            "Timeout error: timed out waiting for indices: [\"vector_idx\", \"my_idx\"] after 1s"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_wait_for_index_timeout_never_created() {
+        let table = _make_table_with_indices(0);
+        let e = table
+            .wait_for_index(&["doesnt_exist_idx"], Duration::from_secs(1))
+            .await
+            .unwrap_err();
+        assert_eq!(
+            e.to_string(),
+            "Timeout error: timed out waiting for indices: [\"doesnt_exist_idx\"] after 1s"
+        );
+    }
+
+    fn _make_table_with_indices(unindexed_rows: usize) -> Table {
+        let table = Table::new_with_handler("my_table", move |request| {
+            assert_eq!(request.method(), "POST");
+
+            let response_body = match request.url().path() {
+                "/v1/table/my_table/index/list/" => {
+                    serde_json::json!({
+                        "indexes": [
+                            {
+                                "index_name": "vector_idx",
+                                "index_uuid": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
+                                "columns": ["vector"],
+                                "index_status": "done",
+                            },
+                            {
+                                "index_name": "my_idx",
+                                "index_uuid": "34255f64-5717-4562-b3fc-2c963f66afa6",
+                                "columns": ["my_column"],
+                                "index_status": "done",
+                            },
+                        ]
+                    })
+                }
+                "/v1/table/my_table/index/vector_idx/stats/" => {
+                    serde_json::json!({
+                        "num_indexed_rows": 100000,
+                        "num_unindexed_rows": unindexed_rows,
+                        "index_type": "IVF_PQ",
+                        "distance_type": "l2"
+                    })
+                }
+                "/v1/table/my_table/index/my_idx/stats/" => {
+                    serde_json::json!({
+                        "num_indexed_rows": 100000,
+                        "num_unindexed_rows": unindexed_rows,
+                        "index_type": "LABEL_LIST"
+                    })
+                }
+                _path => {
+                    serde_json::json!(None::<String>)
+                }
+            };
+            let body = serde_json::to_string(&response_body).unwrap();
+            let status = if body == "null" { 404 } else { 200 };
+            http::Response::builder().status(status).body(body).unwrap()
+        });
+        table
+    }
 }
@@ -3,10 +3,6 @@
 //! LanceDB Table APIs

-use std::collections::HashMap;
-use std::path::Path;
-use std::sync::Arc;
-
 use arrow::array::{AsArray, FixedSizeListBuilder, Float32Builder};
 use arrow::datatypes::{Float32Type, UInt8Type};
 use arrow_array::{RecordBatchIterator, RecordBatchReader};
@@ -45,6 +41,10 @@ use lance_table::format::Manifest;
 use lance_table::io::commit::ManifestNamingScheme;
 use log::info;
 use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::format;
+use std::path::Path;
+use std::sync::Arc;

 use crate::arrow::IntoArrow;
 use crate::connection::NoData;
@@ -78,6 +78,7 @@ pub mod datafusion;
 pub(crate) mod dataset;
 pub mod merge;

+use crate::index::waiter::wait_for_index;
 pub use chrono::Duration;
 pub use lance::dataset::optimize::CompactionOptions;
 pub use lance::dataset::scanner::DatasetRecordBatchStream;
@@ -455,6 +456,8 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
     async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
     /// Drop an index from the table.
     async fn drop_index(&self, name: &str) -> Result<()>;
+    /// Prewarm an index in the table.
+    async fn prewarm_index(&self, name: &str) -> Result<()>;
     /// Get statistics about the index.
     async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>>;
     /// Merge insert new records into the table.
@@ -489,6 +492,13 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
     async fn table_definition(&self) -> Result<TableDefinition>;
     /// Get the table URI
     fn dataset_uri(&self) -> &str;
+    /// Poll until the columns are fully indexed. Will return Error::Timeout if the columns
+    /// are not fully indexed within the timeout.
+    async fn wait_for_index(
+        &self,
+        index_names: &[&str],
+        timeout: std::time::Duration,
+    ) -> Result<()>;
 }

 /// A Table is a collection of strongly typed rows.
@@ -767,6 +777,28 @@ impl Table {
         )
     }

+    /// See [Table::create_index].
+    /// For remote tables, this allows an optional wait_timeout to poll until asynchronous indexing is complete.
+    pub fn create_index_with_timeout(
+        &self,
+        columns: &[impl AsRef<str>],
+        index: Index,
+        wait_timeout: Option<std::time::Duration>,
+    ) -> IndexBuilder {
+        let mut builder = IndexBuilder::new(
+            self.inner.clone(),
+            columns
+                .iter()
+                .map(|val| val.as_ref().to_string())
+                .collect::<Vec<_>>(),
+            index,
+        );
+        if let Some(timeout) = wait_timeout {
+            builder = builder.wait_timeout(timeout);
+        }
+        builder
+    }
+
     /// Create a builder for a merge insert operation
     ///
     /// This operation can add rows, update rows, and remove rows all in a single
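A hedged usage sketch of the new entry point, assuming an already-opened lancedb::Table and the crate's Index::Auto variant; on a remote table the builder will poll until the asynchronous build completes or the timeout elapses, and the "vector_idx" name below follows the "{column}_idx" default used by the remote implementation earlier in this diff:

use std::time::Duration;
use lancedb::index::Index;
use lancedb::{Result, Table};

// Create a vector index and, on remote tables, wait up to 60s for the
// asynchronous build to finish before returning.
async fn index_vectors(table: &Table) -> Result<()> {
    table
        .create_index_with_timeout(&["vector"], Index::Auto, Some(Duration::from_secs(60)))
        .execute()
        .await
}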
@@ -1086,6 +1118,32 @@ impl Table {
         self.inner.drop_index(name).await
     }

+    /// Prewarm an index in the table.
+    ///
+    /// This is a hint to fully load the index into memory. It can be used to
+    /// avoid cold starts.
+    ///
+    /// It is generally wasteful to call this if the index does not fit into the
+    /// available cache.
+    ///
+    /// Note: This function is not yet supported on all indices, in which case it
+    /// may do nothing.
+    ///
+    /// Use [`Self::list_indices()`] to find the names of the indices.
+    pub async fn prewarm_index(&self, name: &str) -> Result<()> {
+        self.inner.prewarm_index(name).await
+    }
+
+    /// Poll until the columns are fully indexed. Will return Error::Timeout if the columns
+    /// are not fully indexed within the timeout.
+    pub async fn wait_for_index(
+        &self,
+        index_names: &[&str],
+        timeout: std::time::Duration,
+    ) -> Result<()> {
+        self.inner.wait_for_index(index_names, timeout).await
+    }
+
     // Take many execution plans and map them into a single plan that adds
     // a query_index column and unions them.
     pub(crate) fn multi_vector_plan(
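Since prewarming is a best-effort hint, one plausible pattern is to warm every index after opening a table. A sketch assuming IndexConfig exposes the index name, and tolerating the NotSupported error that the remote implementation above returns:

use lancedb::{Error, Result, Table};

// Best-effort warm-up: prewarm every listed index, stopping early on
// backends (e.g. LanceDB Cloud) that report the operation as unsupported.
async fn prewarm_all(table: &Table) -> Result<()> {
    for config in table.list_indices().await? {
        match table.prewarm_index(&config.name).await {
            Ok(()) => {}
            Err(Error::NotSupported { .. }) => break, // backend-wide, stop early
            Err(e) => return Err(e),
        }
    }
    Ok(())
}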
@@ -2006,6 +2064,11 @@ impl BaseTable for NativeTable {
         Ok(())
     }

+    async fn prewarm_index(&self, index_name: &str) -> Result<()> {
+        let dataset = self.dataset.get().await?;
+        Ok(dataset.prewarm_index(index_name).await?)
+    }
+
     async fn update(&self, update: UpdateBuilder) -> Result<u64> {
         let dataset = self.dataset.get().await?.clone();
         let mut builder = LanceUpdateBuilder::new(Arc::new(dataset));
@@ -2407,6 +2470,16 @@ impl BaseTable for NativeTable {
             loss,
         }))
     }
+
+    /// Poll until the columns are fully indexed. Will return Error::Timeout if the columns
+    /// are not fully indexed within the timeout.
+    async fn wait_for_index(
+        &self,
+        index_names: &[&str],
+        timeout: std::time::Duration,
+    ) -> Result<()> {
+        wait_for_index(self, index_names, timeout).await
+    }
 }

 #[cfg(test)]
@@ -3190,7 +3263,10 @@ mod tests {
             .execute()
             .await
             .unwrap();
+        table
+            .wait_for_index(&["embeddings_idx"], Duration::from_millis(10))
+            .await
+            .unwrap();
         let index_configs = table.list_indices().await.unwrap();
         assert_eq!(index_configs.len(), 1);
         let index = index_configs.into_iter().next().unwrap();
@@ -3258,7 +3334,10 @@ mod tests {
             .execute()
             .await
             .unwrap();
+        table
+            .wait_for_index(&["i_idx"], Duration::from_millis(10))
+            .await
+            .unwrap();
         let index_configs = table.list_indices().await.unwrap();
         assert_eq!(index_configs.len(), 1);
         let index = index_configs.into_iter().next().unwrap();
@@ -3455,6 +3534,9 @@ mod tests {
         assert_eq!(stats.num_unindexed_rows, 0);
         assert_eq!(stats.index_type, crate::index::IndexType::FTS);
         assert_eq!(stats.distance_type, None);
+
+        // Make sure we can call prewarm without error
+        table.prewarm_index("text_idx").await.unwrap();
     }

     #[tokio::test]
@@ -3550,7 +3632,7 @@ mod tests {
         let native_tbl = table.as_native().unwrap();

         let manifest = native_tbl.manifest().await.unwrap();
-        assert_eq!(manifest.config.len(), 0);
+        let base_config_len = manifest.config.len();

         native_tbl
             .update_config(vec![("test_key1".to_string(), "test_val1".to_string())])
@@ -3558,7 +3640,7 @@ mod tests {
             .unwrap();

         let manifest = native_tbl.manifest().await.unwrap();
-        assert_eq!(manifest.config.len(), 1);
+        assert_eq!(manifest.config.len(), 1 + base_config_len);
         assert_eq!(
             manifest.config.get("test_key1"),
             Some(&"test_val1".to_string())
@@ -3569,7 +3651,7 @@ mod tests {
             .await
             .unwrap();
         let manifest = native_tbl.manifest().await.unwrap();
-        assert_eq!(manifest.config.len(), 2);
+        assert_eq!(manifest.config.len(), 2 + base_config_len);
         assert_eq!(
             manifest.config.get("test_key1"),
             Some(&"test_val1".to_string())
@@ -3587,7 +3669,7 @@ mod tests {
             .await
             .unwrap();
         let manifest = native_tbl.manifest().await.unwrap();
-        assert_eq!(manifest.config.len(), 2);
+        assert_eq!(manifest.config.len(), 2 + base_config_len);
         assert_eq!(
             manifest.config.get("test_key1"),
             Some(&"test_val1".to_string())
@@ -3599,7 +3681,7 @@ mod tests {

         native_tbl.delete_config_keys(&["test_key1"]).await.unwrap();
         let manifest = native_tbl.manifest().await.unwrap();
-        assert_eq!(manifest.config.len(), 1);
+        assert_eq!(manifest.config.len(), 1 + base_config_len);
         assert_eq!(
             manifest.config.get("test_key2"),
             Some(&"test_val2_update".to_string())