mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
82 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
04f962f6b0 | ||
|
|
19e896ff69 | ||
|
|
272e4103b2 | ||
|
|
75c257ebb6 | ||
|
|
9ee152eb42 | ||
|
|
c9ae1b1737 | ||
|
|
89dc80c42a | ||
|
|
7b020ac799 | ||
|
|
529e774bbb | ||
|
|
7c12239305 | ||
|
|
d83424d6b4 | ||
|
|
8bf89f887c | ||
|
|
b2160b2304 | ||
|
|
1bb82597be | ||
|
|
e4eee38b3c | ||
|
|
64fc2be503 | ||
|
|
dc8054e90d | ||
|
|
1684940946 | ||
|
|
695813463c | ||
|
|
ed594b0f76 | ||
|
|
cee2b5ea42 | ||
|
|
f315f9665a | ||
|
|
5deb26bc8b | ||
|
|
3cc670ac38 | ||
|
|
4ade3e31e2 | ||
|
|
a222d2cd91 | ||
|
|
508e621f3d | ||
|
|
a1a0472f3f | ||
|
|
3425a6d339 | ||
|
|
af54e0ce06 | ||
|
|
089905fe8f | ||
|
|
554939e5d2 | ||
|
|
7a13814922 | ||
|
|
e9f25f6a12 | ||
|
|
419a433244 | ||
|
|
a9311c4dc0 | ||
|
|
178bcf9c90 | ||
|
|
b9be092cb1 | ||
|
|
e8c0c52315 | ||
|
|
a60fa0d3b7 | ||
|
|
726d629b9b | ||
|
|
b493f56dee | ||
|
|
a8b5ad7e74 | ||
|
|
f8f6264883 | ||
|
|
d8517117f1 | ||
|
|
ab66dd5ed2 | ||
|
|
cbb9a7877c | ||
|
|
b7fc223535 | ||
|
|
1fdaf7a1a4 | ||
|
|
d11819c90c | ||
|
|
9b902272f1 | ||
|
|
8c0622fa2c | ||
|
|
2191f948c3 | ||
|
|
acc3b03004 | ||
|
|
7f091b8c8e | ||
|
|
c19bdd9a24 | ||
|
|
dad0ff5cd2 | ||
|
|
a705621067 | ||
|
|
39614fdb7d | ||
|
|
96d534d4bc | ||
|
|
5051d30d09 | ||
|
|
db853c4041 | ||
|
|
76d1d22bdc | ||
|
|
d8746c61c6 | ||
|
|
1a66df2627 | ||
|
|
44670076c1 | ||
|
|
92f0b16e46 | ||
|
|
1620ba3508 | ||
|
|
3ae90dde80 | ||
|
|
4f07fea6df | ||
|
|
3d7d82cf86 | ||
|
|
edc4e40a7b | ||
|
|
ca3806a02f | ||
|
|
35cff12e31 | ||
|
|
c6c20cb2bd | ||
|
|
26080ee4c1 | ||
|
|
ef3a2b5357 | ||
|
|
c42a201389 | ||
|
|
24e42ccd4d | ||
|
|
8a50944061 | ||
|
|
40e066bc7c | ||
|
|
b3ad105fa0 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.19.0-beta.6"
|
||||
current_version = "0.19.1-beta.3"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
13
.github/workflows/docs.yml
vendored
13
.github/workflows/docs.yml
vendored
@@ -18,17 +18,24 @@ concurrency:
|
||||
group: "pages"
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
# This reduces the disk space needed for the build
|
||||
RUSTFLAGS: "-C debuginfo=0"
|
||||
# according to: https://matklad.github.io/2021/09/04/fast-rust-builds.html
|
||||
# CI builds are faster with incremental disabled.
|
||||
CARGO_INCREMENTAL: "0"
|
||||
|
||||
jobs:
|
||||
# Single deploy job since we're just deploying
|
||||
build:
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
runs-on: buildjet-8vcpu-ubuntu-2204
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install dependecies needed for ubuntu
|
||||
- name: Install dependencies needed for ubuntu
|
||||
run: |
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
rustup update && rustup default
|
||||
@@ -38,6 +45,7 @@ jobs:
|
||||
python-version: "3.10"
|
||||
cache: "pip"
|
||||
cache-dependency-path: "docs/requirements.txt"
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Build Python
|
||||
working-directory: python
|
||||
run: |
|
||||
@@ -49,7 +57,6 @@ jobs:
|
||||
node-version: 20
|
||||
cache: 'npm'
|
||||
cache-dependency-path: node/package-lock.json
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Install node dependencies
|
||||
working-directory: node
|
||||
run: |
|
||||
|
||||
5
.github/workflows/python.yml
vendored
5
.github/workflows/python.yml
vendored
@@ -136,9 +136,9 @@ jobs:
|
||||
- uses: ./.github/workflows/run_tests
|
||||
with:
|
||||
integration: true
|
||||
- name: Test without pylance
|
||||
- name: Test without pylance or pandas
|
||||
run: |
|
||||
pip uninstall -y pylance
|
||||
pip uninstall -y pylance pandas
|
||||
pytest -vv python/tests/test_table.py
|
||||
# Make sure wheels are not included in the Rust cache
|
||||
- name: Delete wheels
|
||||
@@ -228,6 +228,7 @@ jobs:
|
||||
- name: Install lancedb
|
||||
run: |
|
||||
pip install "pydantic<2"
|
||||
pip install pyarrow==16
|
||||
pip install --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
|
||||
pip install tantivy
|
||||
- name: Run tests
|
||||
|
||||
484
Cargo.lock
generated
484
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
18
Cargo.toml
18
Cargo.toml
@@ -21,16 +21,14 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.25.3", "features" = [
|
||||
"dynamodb",
|
||||
], tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-io = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-index = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-linalg = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-table = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-testing = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-datafusion = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance-encoding = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
|
||||
lance = { "version" = "=0.27.0", "features" = ["dynamodb"], tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
lance-io = { version = "=0.27.0", tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
lance-index = { version = "=0.27.0", tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { version = "=0.27.0", tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
lance-table = { version = "=0.27.0", tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { version = "=0.27.0", tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { version = "=0.27.0", tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { version = "=0.27.0", tag = "v0.27.0-beta.5", git="https://github.com/lancedb/lance.git" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "54.1", optional = false }
|
||||
arrow-array = "54.1"
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
LanceDB docs are deployed to https://lancedb.github.io/lancedb/.
|
||||
|
||||
Docs is built and deployed automatically by [Github Actions](.github/workflows/docs.yml)
|
||||
Docs are built and deployed automatically by [GitHub Actions](../.github/workflows/docs.yml)
|
||||
whenever a commit is pushed to the `main` branch. So it is possible for the docs to show
|
||||
unreleased features.
|
||||
|
||||
|
||||
@@ -342,7 +342,7 @@ For **read and write access**, LanceDB will need a policy such as:
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::<bucket>/<prefix>/*"
|
||||
},
|
||||
@@ -374,7 +374,7 @@ For **read-only access**, LanceDB will need a policy such as:
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:GetObject",
|
||||
"s3:GetObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::<bucket>/<prefix>/*"
|
||||
},
|
||||
|
||||
@@ -765,7 +765,10 @@ This can be used to update zero to all rows depending on how many rows match the
|
||||
];
|
||||
const tbl = await db.createTable("my_table", data)
|
||||
|
||||
await tbl.update({vector: [10, 10]}, { where: "x = 2"})
|
||||
await tbl.update({
|
||||
values: { vector: [10, 10] },
|
||||
where: "x = 2"
|
||||
});
|
||||
```
|
||||
|
||||
=== "vectordb (deprecated)"
|
||||
@@ -784,7 +787,10 @@ This can be used to update zero to all rows depending on how many rows match the
|
||||
];
|
||||
const tbl = await db.createTable("my_table", data)
|
||||
|
||||
await tbl.update({ where: "x = 2", values: {vector: [10, 10]} })
|
||||
await tbl.update({
|
||||
where: "x = 2",
|
||||
values: { vector: [10, 10] }
|
||||
});
|
||||
```
|
||||
|
||||
#### Updating using a sql query
|
||||
|
||||
@@ -33,20 +33,22 @@ Construct a MergeInsertBuilder. __Internal use only.__
|
||||
### execute()
|
||||
|
||||
```ts
|
||||
execute(data): Promise<void>
|
||||
execute(data, execOptions?): Promise<MergeResult>
|
||||
```
|
||||
|
||||
Executes the merge insert operation
|
||||
|
||||
Nothing is returned but the `Table` is updated
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: [`Data`](../type-aliases/Data.md)
|
||||
|
||||
* **execOptions?**: `Partial`<[`WriteExecutionOptions`](../interfaces/WriteExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`MergeResult`](../interfaces/MergeResult.md)>
|
||||
|
||||
the merge result
|
||||
|
||||
***
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ Returns the name of the table
|
||||
### add()
|
||||
|
||||
```ts
|
||||
abstract add(data, options?): Promise<void>
|
||||
abstract add(data, options?): Promise<AddResult>
|
||||
```
|
||||
|
||||
Insert records into this Table.
|
||||
@@ -54,14 +54,17 @@ Insert records into this Table.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`AddResult`](../interfaces/AddResult.md)>
|
||||
|
||||
A promise that resolves to an object
|
||||
containing the new version number of the table
|
||||
|
||||
***
|
||||
|
||||
### addColumns()
|
||||
|
||||
```ts
|
||||
abstract addColumns(newColumnTransforms): Promise<void>
|
||||
abstract addColumns(newColumnTransforms): Promise<AddColumnsResult>
|
||||
```
|
||||
|
||||
Add new columns with defined values.
|
||||
@@ -76,14 +79,17 @@ Add new columns with defined values.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`AddColumnsResult`](../interfaces/AddColumnsResult.md)>
|
||||
|
||||
A promise that resolves to an object
|
||||
containing the new version number of the table after adding the columns.
|
||||
|
||||
***
|
||||
|
||||
### alterColumns()
|
||||
|
||||
```ts
|
||||
abstract alterColumns(columnAlterations): Promise<void>
|
||||
abstract alterColumns(columnAlterations): Promise<AlterColumnsResult>
|
||||
```
|
||||
|
||||
Alter the name or nullability of columns.
|
||||
@@ -96,7 +102,10 @@ Alter the name or nullability of columns.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`AlterColumnsResult`](../interfaces/AlterColumnsResult.md)>
|
||||
|
||||
A promise that resolves to an object
|
||||
containing the new version number of the table after altering the columns.
|
||||
|
||||
***
|
||||
|
||||
@@ -117,8 +126,8 @@ wish to return to standard mode, call `checkoutLatest`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **version**: `number`
|
||||
The version to checkout
|
||||
* **version**: `string` \| `number`
|
||||
The version to check out; this can be either a version number or a tag
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -252,7 +261,7 @@ await table.createIndex("my_float_col");
|
||||
### delete()
|
||||
|
||||
```ts
|
||||
abstract delete(predicate): Promise<void>
|
||||
abstract delete(predicate): Promise<DeleteResult>
|
||||
```
|
||||
|
||||
Delete the rows that satisfy the predicate.
|
||||
@@ -263,7 +272,10 @@ Delete the rows that satisfy the predicate.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`DeleteResult`](../interfaces/DeleteResult.md)>
|
||||
|
||||
A promise that resolves to an object
|
||||
containing the new version number of the table
|
||||
|
||||
***
|
||||
|
||||
@@ -284,7 +296,7 @@ Return a brief description of the table
|
||||
### dropColumns()
|
||||
|
||||
```ts
|
||||
abstract dropColumns(columnNames): Promise<void>
|
||||
abstract dropColumns(columnNames): Promise<DropColumnsResult>
|
||||
```
|
||||
|
||||
Drop one or more columns from the dataset
|
||||
@@ -303,7 +315,10 @@ then call ``cleanup_files`` to remove the old files.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`DropColumnsResult`](../interfaces/DropColumnsResult.md)>
|
||||
|
||||
A promise that resolves to an object
|
||||
containing the new version number of the table after dropping the columns.
|
||||
|
||||
***
|
||||
|
||||
@@ -454,6 +469,28 @@ Modeled after ``VACUUM`` in PostgreSQL.
|
||||
|
||||
***
|
||||
|
||||
### prewarmIndex()
|
||||
|
||||
```ts
|
||||
abstract prewarmIndex(name): Promise<void>
|
||||
```
|
||||
|
||||
Prewarm an index in the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
The name of the index.
|
||||
This will load the index into memory. This may reduce the cold-start time for
|
||||
future queries. If the index does not fit in the cache then this call may be
|
||||
wasteful.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
***
|
||||
|
||||
### query()
|
||||
|
||||
```ts
|
||||
@@ -593,6 +630,50 @@ of the given query
|
||||
|
||||
***
|
||||
|
||||
### stats()
|
||||
|
||||
```ts
|
||||
abstract stats(): Promise<TableStatistics>
|
||||
```
|
||||
|
||||
Returns table and fragment statistics
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`TableStatistics`](../interfaces/TableStatistics.md)>
|
||||
|
||||
The table and fragment statistics
|
||||
|
||||
***
|
||||
|
||||
### tags()
|
||||
|
||||
```ts
|
||||
abstract tags(): Promise<Tags>
|
||||
```
|
||||
|
||||
Get a tags manager for this table.
|
||||
|
||||
Tags allow you to label specific versions of a table with a human-readable name.
|
||||
The returned tags manager can be used to list, create, update, or delete tags.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Tags`](Tags.md)>
|
||||
|
||||
A tags manager for this table
|
||||
|
||||
#### Example
|
||||
|
||||
```typescript
|
||||
const tagsManager = await table.tags();
|
||||
await tagsManager.create("v1", 1);
|
||||
const tags = await tagsManager.list();
|
||||
console.log(tags); // { "v1": { version: 1, manifestSize: ... } }
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### toArrow()
|
||||
|
||||
```ts
|
||||
@@ -612,7 +693,7 @@ Return the table as an arrow table
|
||||
#### update(opts)
|
||||
|
||||
```ts
|
||||
abstract update(opts): Promise<void>
|
||||
abstract update(opts): Promise<UpdateResult>
|
||||
```
|
||||
|
||||
Update existing records in the Table
|
||||
@@ -623,7 +704,10 @@ Update existing records in the Table
|
||||
|
||||
##### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`UpdateResult`](../interfaces/UpdateResult.md)>
|
||||
|
||||
A promise that resolves to an object containing
|
||||
the number of rows updated and the new version number
|
||||
|
||||
##### Example
|
||||
|
||||
@@ -634,7 +718,7 @@ table.update({where:"x = 2", values:{"vector": [10, 10]}})
|
||||
#### update(opts)
|
||||
|
||||
```ts
|
||||
abstract update(opts): Promise<void>
|
||||
abstract update(opts): Promise<UpdateResult>
|
||||
```
|
||||
|
||||
Update existing records in the Table
|
||||
@@ -645,7 +729,10 @@ Update existing records in the Table
|
||||
|
||||
##### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`UpdateResult`](../interfaces/UpdateResult.md)>
|
||||
|
||||
A promise that resolves to an object containing
|
||||
the number of rows updated and the new version number
|
||||
|
||||
##### Example
|
||||
|
||||
@@ -656,7 +743,7 @@ table.update({where:"x = 2", valuesSql:{"x": "x + 1"}})
|
||||
#### update(updates, options)
|
||||
|
||||
```ts
|
||||
abstract update(updates, options?): Promise<void>
|
||||
abstract update(updates, options?): Promise<UpdateResult>
|
||||
```
|
||||
|
||||
Update existing records in the Table
|
||||
@@ -679,10 +766,6 @@ repeatedly calling this method.
|
||||
* **updates**: `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
the
|
||||
columns to update
|
||||
Keys in the map should specify the name of the column to update.
|
||||
Values in the map provide the new value of the column. These can
|
||||
be SQL literal strings (e.g. "7" or "'foo'") or they can be expressions
|
||||
based on the row being updated (e.g. "my_col + 1")
|
||||
|
||||
* **options?**: `Partial`<[`UpdateOptions`](../interfaces/UpdateOptions.md)>
|
||||
additional options to control
|
||||
@@ -690,7 +773,15 @@ repeatedly calilng this method.
|
||||
|
||||
##### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
`Promise`<[`UpdateResult`](../interfaces/UpdateResult.md)>
|
||||
|
||||
A promise that resolves to an object
|
||||
containing the number of rows updated and the new version number
|
||||
|
||||
Keys in the map should specify the name of the column to update.
|
||||
Values in the map provide the new value of the column. These can
|
||||
be SQL literal strings (e.g. "7" or "'foo'") or they can be expressions
|
||||
based on the row being updated (e.g. "my_col + 1")
|
||||
|
||||
***
|
||||
|
||||
@@ -731,3 +822,26 @@ Retrieve the version of the table
|
||||
#### Returns
|
||||
|
||||
`Promise`<`number`>
|
||||
|
||||
***
|
||||
|
||||
### waitForIndex()
|
||||
|
||||
```ts
|
||||
abstract waitForIndex(indexNames, timeoutSeconds): Promise<void>
|
||||
```
|
||||
|
||||
Waits for asynchronous indexing to complete on the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **indexNames**: `string`[]
|
||||
The names of the indices to wait for
|
||||
|
||||
* **timeoutSeconds**: `number`
|
||||
The number of seconds to wait before timing out
|
||||
This will raise an error if the indices are not created and fully indexed within the timeout.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
35
docs/src/js/classes/TagContents.md
Normal file
35
docs/src/js/classes/TagContents.md
Normal file
@@ -0,0 +1,35 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / TagContents
|
||||
|
||||
# Class: TagContents
|
||||
|
||||
## Constructors
|
||||
|
||||
### new TagContents()
|
||||
|
||||
```ts
|
||||
new TagContents(): TagContents
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`TagContents`](TagContents.md)
|
||||
|
||||
## Properties
|
||||
|
||||
### manifestSize
|
||||
|
||||
```ts
|
||||
manifestSize: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
99
docs/src/js/classes/Tags.md
Normal file
99
docs/src/js/classes/Tags.md
Normal file
@@ -0,0 +1,99 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / Tags
|
||||
|
||||
# Class: Tags
|
||||
|
||||
## Constructors
|
||||
|
||||
### new Tags()
|
||||
|
||||
```ts
|
||||
new Tags(): Tags
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`Tags`](Tags.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### create()
|
||||
|
||||
```ts
|
||||
create(tag, version): Promise<void>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
|
||||
* **version**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
***
|
||||
|
||||
### delete()
|
||||
|
||||
```ts
|
||||
delete(tag): Promise<void>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
***
|
||||
|
||||
### getVersion()
|
||||
|
||||
```ts
|
||||
getVersion(tag): Promise<number>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`number`>
|
||||
|
||||
***
|
||||
|
||||
### list()
|
||||
|
||||
```ts
|
||||
list(): Promise<Record<string, TagContents>>
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`Record`<`string`, [`TagContents`](TagContents.md)>>
|
||||
|
||||
***
|
||||
|
||||
### update()
|
||||
|
||||
```ts
|
||||
update(tag, version): Promise<void>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
|
||||
* **version**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
@@ -27,19 +27,28 @@
|
||||
- [QueryBase](classes/QueryBase.md)
|
||||
- [RecordBatchIterator](classes/RecordBatchIterator.md)
|
||||
- [Table](classes/Table.md)
|
||||
- [TagContents](classes/TagContents.md)
|
||||
- [Tags](classes/Tags.md)
|
||||
- [VectorColumnOptions](classes/VectorColumnOptions.md)
|
||||
- [VectorQuery](classes/VectorQuery.md)
|
||||
|
||||
## Interfaces
|
||||
|
||||
- [AddColumnsResult](interfaces/AddColumnsResult.md)
|
||||
- [AddColumnsSql](interfaces/AddColumnsSql.md)
|
||||
- [AddDataOptions](interfaces/AddDataOptions.md)
|
||||
- [AddResult](interfaces/AddResult.md)
|
||||
- [AlterColumnsResult](interfaces/AlterColumnsResult.md)
|
||||
- [ClientConfig](interfaces/ClientConfig.md)
|
||||
- [ColumnAlteration](interfaces/ColumnAlteration.md)
|
||||
- [CompactionStats](interfaces/CompactionStats.md)
|
||||
- [ConnectionOptions](interfaces/ConnectionOptions.md)
|
||||
- [CreateTableOptions](interfaces/CreateTableOptions.md)
|
||||
- [DeleteResult](interfaces/DeleteResult.md)
|
||||
- [DropColumnsResult](interfaces/DropColumnsResult.md)
|
||||
- [ExecutableQuery](interfaces/ExecutableQuery.md)
|
||||
- [FragmentStatistics](interfaces/FragmentStatistics.md)
|
||||
- [FragmentSummaryStats](interfaces/FragmentSummaryStats.md)
|
||||
- [FtsOptions](interfaces/FtsOptions.md)
|
||||
- [FullTextQuery](interfaces/FullTextQuery.md)
|
||||
- [FullTextSearchOptions](interfaces/FullTextSearchOptions.md)
|
||||
@@ -50,6 +59,7 @@
|
||||
- [IndexStatistics](interfaces/IndexStatistics.md)
|
||||
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
|
||||
- [IvfPqOptions](interfaces/IvfPqOptions.md)
|
||||
- [MergeResult](interfaces/MergeResult.md)
|
||||
- [OpenTableOptions](interfaces/OpenTableOptions.md)
|
||||
- [OptimizeOptions](interfaces/OptimizeOptions.md)
|
||||
- [OptimizeStats](interfaces/OptimizeStats.md)
|
||||
@@ -57,9 +67,12 @@
|
||||
- [RemovalStats](interfaces/RemovalStats.md)
|
||||
- [RetryConfig](interfaces/RetryConfig.md)
|
||||
- [TableNamesOptions](interfaces/TableNamesOptions.md)
|
||||
- [TableStatistics](interfaces/TableStatistics.md)
|
||||
- [TimeoutConfig](interfaces/TimeoutConfig.md)
|
||||
- [UpdateOptions](interfaces/UpdateOptions.md)
|
||||
- [UpdateResult](interfaces/UpdateResult.md)
|
||||
- [Version](interfaces/Version.md)
|
||||
- [WriteExecutionOptions](interfaces/WriteExecutionOptions.md)
|
||||
|
||||
## Type Aliases
|
||||
|
||||
|
||||
15
docs/src/js/interfaces/AddColumnsResult.md
Normal file
15
docs/src/js/interfaces/AddColumnsResult.md
Normal file
@@ -0,0 +1,15 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / AddColumnsResult
|
||||
|
||||
# Interface: AddColumnsResult
|
||||
|
||||
## Properties
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
15
docs/src/js/interfaces/AddResult.md
Normal file
15
docs/src/js/interfaces/AddResult.md
Normal file
@@ -0,0 +1,15 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / AddResult
|
||||
|
||||
# Interface: AddResult
|
||||
|
||||
## Properties
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
15
docs/src/js/interfaces/AlterColumnsResult.md
Normal file
15
docs/src/js/interfaces/AlterColumnsResult.md
Normal file
@@ -0,0 +1,15 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / AlterColumnsResult
|
||||
|
||||
# Interface: AlterColumnsResult
|
||||
|
||||
## Properties
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
15
docs/src/js/interfaces/DeleteResult.md
Normal file
15
docs/src/js/interfaces/DeleteResult.md
Normal file
@@ -0,0 +1,15 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / DeleteResult
|
||||
|
||||
# Interface: DeleteResult
|
||||
|
||||
## Properties
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
15
docs/src/js/interfaces/DropColumnsResult.md
Normal file
15
docs/src/js/interfaces/DropColumnsResult.md
Normal file
@@ -0,0 +1,15 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / DropColumnsResult
|
||||
|
||||
# Interface: DropColumnsResult
|
||||
|
||||
## Properties
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
37
docs/src/js/interfaces/FragmentStatistics.md
Normal file
37
docs/src/js/interfaces/FragmentStatistics.md
Normal file
@@ -0,0 +1,37 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / FragmentStatistics
|
||||
|
||||
# Interface: FragmentStatistics
|
||||
|
||||
## Properties
|
||||
|
||||
### lengths
|
||||
|
||||
```ts
|
||||
lengths: FragmentSummaryStats;
|
||||
```
|
||||
|
||||
Statistics on the number of rows in the table fragments
|
||||
|
||||
***
|
||||
|
||||
### numFragments
|
||||
|
||||
```ts
|
||||
numFragments: number;
|
||||
```
|
||||
|
||||
The number of fragments in the table
|
||||
|
||||
***
|
||||
|
||||
### numSmallFragments
|
||||
|
||||
```ts
|
||||
numSmallFragments: number;
|
||||
```
|
||||
|
||||
The number of uncompacted fragments in the table
|
||||
77
docs/src/js/interfaces/FragmentSummaryStats.md
Normal file
77
docs/src/js/interfaces/FragmentSummaryStats.md
Normal file
@@ -0,0 +1,77 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / FragmentSummaryStats
|
||||
|
||||
# Interface: FragmentSummaryStats
|
||||
|
||||
## Properties
|
||||
|
||||
### max
|
||||
|
||||
```ts
|
||||
max: number;
|
||||
```
|
||||
|
||||
The number of rows in the fragment with the most rows
|
||||
|
||||
***
|
||||
|
||||
### mean
|
||||
|
||||
```ts
|
||||
mean: number;
|
||||
```
|
||||
|
||||
The mean number of rows in the fragments
|
||||
|
||||
***
|
||||
|
||||
### min
|
||||
|
||||
```ts
|
||||
min: number;
|
||||
```
|
||||
|
||||
The number of rows in the fragment with the fewest rows
|
||||
|
||||
***
|
||||
|
||||
### p25
|
||||
|
||||
```ts
|
||||
p25: number;
|
||||
```
|
||||
|
||||
The 25th percentile of number of rows in the fragments
|
||||
|
||||
***
|
||||
|
||||
### p50
|
||||
|
||||
```ts
|
||||
p50: number;
|
||||
```
|
||||
|
||||
The 50th percentile of number of rows in the fragments
|
||||
|
||||
***
|
||||
|
||||
### p75
|
||||
|
||||
```ts
|
||||
p75: number;
|
||||
```
|
||||
|
||||
The 75th percentile of number of rows in the fragments
|
||||
|
||||
***
|
||||
|
||||
### p99
|
||||
|
||||
```ts
|
||||
p99: number;
|
||||
```
|
||||
|
||||
The 99th percentile of number of rows in the fragments
|
||||
@@ -39,3 +39,11 @@ and the same name, then an error will be returned. This is true even if
|
||||
that index is out of date.
|
||||
|
||||
The default is true
|
||||
|
||||
***
|
||||
|
||||
### waitTimeoutSeconds?
|
||||
|
||||
```ts
|
||||
optional waitTimeoutSeconds: number;
|
||||
```
|
||||
|
||||
39
docs/src/js/interfaces/MergeResult.md
Normal file
39
docs/src/js/interfaces/MergeResult.md
Normal file
@@ -0,0 +1,39 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / MergeResult
|
||||
|
||||
# Interface: MergeResult
|
||||
|
||||
## Properties
|
||||
|
||||
### numDeletedRows
|
||||
|
||||
```ts
|
||||
numDeletedRows: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### numInsertedRows
|
||||
|
||||
```ts
|
||||
numInsertedRows: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### numUpdatedRows
|
||||
|
||||
```ts
|
||||
numUpdatedRows: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
47
docs/src/js/interfaces/TableStatistics.md
Normal file
47
docs/src/js/interfaces/TableStatistics.md
Normal file
@@ -0,0 +1,47 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / TableStatistics
|
||||
|
||||
# Interface: TableStatistics
|
||||
|
||||
## Properties
|
||||
|
||||
### fragmentStats
|
||||
|
||||
```ts
|
||||
fragmentStats: FragmentStatistics;
|
||||
```
|
||||
|
||||
Statistics on table fragments
|
||||
|
||||
***
|
||||
|
||||
### numIndices
|
||||
|
||||
```ts
|
||||
numIndices: number;
|
||||
```
|
||||
|
||||
The number of indices in the table
|
||||
|
||||
***
|
||||
|
||||
### numRows
|
||||
|
||||
```ts
|
||||
numRows: number;
|
||||
```
|
||||
|
||||
The number of rows in the table
|
||||
|
||||
***
|
||||
|
||||
### totalBytes
|
||||
|
||||
```ts
|
||||
totalBytes: number;
|
||||
```
|
||||
|
||||
The total number of bytes in the table
|
||||
23
docs/src/js/interfaces/UpdateResult.md
Normal file
23
docs/src/js/interfaces/UpdateResult.md
Normal file
@@ -0,0 +1,23 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / UpdateResult
|
||||
|
||||
# Interface: UpdateResult
|
||||
|
||||
## Properties
|
||||
|
||||
### rowsUpdated
|
||||
|
||||
```ts
|
||||
rowsUpdated: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### version
|
||||
|
||||
```ts
|
||||
version: number;
|
||||
```
|
||||
26
docs/src/js/interfaces/WriteExecutionOptions.md
Normal file
26
docs/src/js/interfaces/WriteExecutionOptions.md
Normal file
@@ -0,0 +1,26 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / WriteExecutionOptions
|
||||
|
||||
# Interface: WriteExecutionOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### timeoutMs?
|
||||
|
||||
```ts
|
||||
optional timeoutMs: number;
|
||||
```
|
||||
|
||||
Maximum time to run the operation before cancelling it.
|
||||
|
||||
By default, there is a 30-second timeout that is only enforced after the
|
||||
first attempt. This is to prevent spending too long retrying to resolve
|
||||
conflicts. For example, if a write attempt takes 20 seconds and fails,
|
||||
the second attempt will be cancelled after 10 seconds, hitting the
|
||||
30-second timeout. However, a write that takes one hour and succeeds on the
|
||||
first attempt will not be cancelled.
|
||||
|
||||
When this is set, the timeout is enforced on all attempts, including the first.
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.19.0-beta.6</version>
|
||||
<version>0.19.1-beta.3</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.19.0-beta.6</version>
|
||||
<version>0.19.1-beta.3</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>LanceDB Parent</name>
|
||||
|
||||
44
node/package-lock.json
generated
44
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,11 +52,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.6"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -327,9 +327,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.19.0-beta.6",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.0-beta.6.tgz",
|
||||
"integrity": "sha512-fujUe3Gt1n1vgxXMDaUatZEQICh9VAmj1CJK/gQCMZo9ky/MH1TnxP0nA6hN7fkRvl28C2Ms2adlTdlnTxLSlw==",
|
||||
"version": "0.19.1-beta.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.1-beta.3.tgz",
|
||||
"integrity": "sha512-TglTNkvgxxHHhh8YbEwj5t9XuInNVUNeFN34Zyk+7ab/rDdMASiKv6ZvDkwacVm7aXeBbLw39/6+IegStJfFCg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -340,9 +340,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.19.0-beta.6",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.0-beta.6.tgz",
|
||||
"integrity": "sha512-ZKUvPwKvnK5WfyCR3Asbm1XXXA5JWYfDVD2ovPU/mv/rqoroYEpxm7TH1OG8AQ8bvBmrCmPc0sPJP5kijd6BFg==",
|
||||
"version": "0.19.1-beta.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.1-beta.3.tgz",
|
||||
"integrity": "sha512-mwBbOVgeUT3xyegzga0gTBJ+DXI3dP1zPKcOQRQDRJk+GkfHk1CblGXT3h/YL18NWfR1FGMe9s59PNJR6r6l8A==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -353,9 +353,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.19.0-beta.6",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.0-beta.6.tgz",
|
||||
"integrity": "sha512-m4DuGCEhEAy+EtamSBMF1ujiVkpJD3ybF/Yp1pYYo9FTFThczAeRiyUg7diRZYfahZExKsATj62PqHXNVo8x9A==",
|
||||
"version": "0.19.1-beta.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.1-beta.3.tgz",
|
||||
"integrity": "sha512-amihspQ5ThSKRJsPpeAte/edWDGAN5ZjdqhtX8AUuuOmoJ5EekfsgXZc+fyFNwl6RzGT7PKqpL7SQzOdLKMijQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -366,9 +366,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.19.0-beta.6",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.0-beta.6.tgz",
|
||||
"integrity": "sha512-npUR23GZJDVfkPUPtaxLuYUeqyAQ/vcp4R7RjCSdBo+hJNiQAG4TX31YAE8OKnOGskEO7XJ3BgEAxM+upiNmnA==",
|
||||
"version": "0.19.1-beta.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.1-beta.3.tgz",
|
||||
"integrity": "sha512-mZzOETBii+UUu7D2TOohhukXNjjOfldbNADRB20FF2a3hYzrVteiFudCQRYtbVunpHE0qvNRTkyuRqM7DwOygw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -379,9 +379,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.19.0-beta.6",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.0-beta.6.tgz",
|
||||
"integrity": "sha512-Ebas+phT0D7NoB1e3lMZn5h7WVyT5pPIwO1Kk1cZ93V4zaxn2BQRwjLTLxJwR9G+emQoLv659Ze0NtnFuEbXaA==",
|
||||
"version": "0.19.1-beta.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.1-beta.3.tgz",
|
||||
"integrity": "sha512-LHsKFtJZRRZ4MVa6uSeWqPJ9vfw0atmp6bvVDByxgouVN4CwdlnAxOu69YJtwDPxnfg8Pn+eQ5txIFvhFtCAnA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"private": false,
|
||||
"main": "dist/index.js",
|
||||
@@ -89,10 +89,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.0-beta.6",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.0-beta.6"
|
||||
"@lancedb/vectordb-darwin-x64": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.3",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.3"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.19.0-beta.6"
|
||||
version = "0.19.1-beta.3"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
@@ -28,6 +28,9 @@ napi-derive = "2.16.4"
|
||||
lzma-sys = { version = "*", features = ["static"] }
|
||||
log.workspace = true
|
||||
|
||||
# Workaround for build failure until we can fix it.
|
||||
aws-lc-sys = "=0.28.0"
|
||||
|
||||
[build-dependencies]
|
||||
napi-build = "2.1"
|
||||
|
||||
|
||||
@@ -374,6 +374,71 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(table2.numRows).toBe(4);
|
||||
expect(table2.schema).toEqual(schema);
|
||||
});
|
||||
|
||||
it("should correctly retain values in nested struct fields", async function () {
|
||||
// Define test data with nested struct
|
||||
const testData = [
|
||||
{
|
||||
id: "doc1",
|
||||
vector: [1, 2, 3],
|
||||
metadata: {
|
||||
filePath: "/path/to/file1.ts",
|
||||
startLine: 10,
|
||||
endLine: 20,
|
||||
text: "function test() { return true; }",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "doc2",
|
||||
vector: [4, 5, 6],
|
||||
metadata: {
|
||||
filePath: "/path/to/file2.ts",
|
||||
startLine: 30,
|
||||
endLine: 40,
|
||||
text: "function test2() { return false; }",
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
// Create Arrow table from the data
|
||||
const table = makeArrowTable(testData);
|
||||
|
||||
// Verify schema has the nested struct fields
|
||||
const metadataField = table.schema.fields.find(
|
||||
(f) => f.name === "metadata",
|
||||
);
|
||||
expect(metadataField).toBeDefined();
|
||||
// biome-ignore lint/suspicious/noExplicitAny: accessing fields in different Arrow versions
|
||||
const childNames = metadataField?.type.children.map((c: any) => c.name);
|
||||
expect(childNames).toEqual([
|
||||
"filePath",
|
||||
"startLine",
|
||||
"endLine",
|
||||
"text",
|
||||
]);
|
||||
|
||||
// Convert to buffer and back (simulating storage and retrieval)
|
||||
const buf = await fromTableToBuffer(table);
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
|
||||
// Verify the retrieved table has the same structure
|
||||
const rows = [];
|
||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
||||
rows.push(retrievedTable.get(i));
|
||||
}
|
||||
|
||||
// Check values in the first row
|
||||
const firstRow = rows[0];
|
||||
expect(firstRow.id).toBe("doc1");
|
||||
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
|
||||
|
||||
// Verify metadata values are preserved (this is where the bug is)
|
||||
expect(firstRow.metadata).toBeDefined();
|
||||
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
|
||||
expect(firstRow.metadata.startLine).toBe(10);
|
||||
expect(firstRow.metadata.endLine).toBe(20);
|
||||
expect(firstRow.metadata.text).toBe("function test() { return true; }");
|
||||
});
|
||||
});
|
||||
|
||||
class DummyEmbedding extends EmbeddingFunction<string> {
|
||||
|
||||
@@ -34,6 +34,7 @@ import {
|
||||
} from "../lancedb/embedding";
|
||||
import { Index } from "../lancedb/indices";
|
||||
import { instanceOfFullTextQuery } from "../lancedb/query";
|
||||
import exp = require("constants");
|
||||
|
||||
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
"Given a table",
|
||||
@@ -71,8 +72,33 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
await expect(table.countRows()).resolves.toBe(3);
|
||||
});
|
||||
|
||||
it("should overwrite data if asked", async () => {
|
||||
it("should show table stats", async () => {
|
||||
await table.add([{ id: 1 }, { id: 2 }]);
|
||||
await table.add([{ id: 1 }]);
|
||||
await expect(table.stats()).resolves.toEqual({
|
||||
fragmentStats: {
|
||||
lengths: {
|
||||
max: 2,
|
||||
mean: 1,
|
||||
min: 1,
|
||||
p25: 1,
|
||||
p50: 2,
|
||||
p75: 2,
|
||||
p99: 2,
|
||||
},
|
||||
numFragments: 2,
|
||||
numSmallFragments: 2,
|
||||
},
|
||||
numIndices: 0,
|
||||
numRows: 3,
|
||||
totalBytes: 24,
|
||||
});
|
||||
});
|
||||
|
||||
it("should overwrite data if asked", async () => {
|
||||
const addRes = await table.add([{ id: 1 }, { id: 2 }]);
|
||||
expect(addRes).toHaveProperty("version");
|
||||
expect(addRes.version).toBe(2);
|
||||
await table.add([{ id: 1 }], { mode: "overwrite" });
|
||||
await expect(table.countRows()).resolves.toBe(1);
|
||||
});
|
||||
@@ -88,7 +114,11 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
await table.add([{ id: 1 }]);
|
||||
expect(await table.countRows("id == 1")).toBe(1);
|
||||
expect(await table.countRows("id == 7")).toBe(0);
|
||||
await table.update({ id: "7" });
|
||||
const updateRes = await table.update({ id: "7" });
|
||||
expect(updateRes).toHaveProperty("version");
|
||||
expect(updateRes.version).toBe(3);
|
||||
expect(updateRes).toHaveProperty("rowsUpdated");
|
||||
expect(updateRes.rowsUpdated).toBe(1);
|
||||
expect(await table.countRows("id == 1")).toBe(0);
|
||||
expect(await table.countRows("id == 7")).toBe(1);
|
||||
await table.add([{ id: 2 }]);
|
||||
@@ -315,11 +345,17 @@ describe("merge insert", () => {
|
||||
{ a: 3, b: "y" },
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
await table
|
||||
const mergeInsertRes = await table
|
||||
.mergeInsert("a")
|
||||
.whenMatchedUpdateAll()
|
||||
.whenNotMatchedInsertAll()
|
||||
.execute(newData);
|
||||
.execute(newData, { timeoutMs: 10_000 });
|
||||
expect(mergeInsertRes).toHaveProperty("version");
|
||||
expect(mergeInsertRes.version).toBe(2);
|
||||
expect(mergeInsertRes.numInsertedRows).toBe(1);
|
||||
expect(mergeInsertRes.numUpdatedRows).toBe(2);
|
||||
expect(mergeInsertRes.numDeletedRows).toBe(0);
|
||||
|
||||
const expected = [
|
||||
{ a: 1, b: "a" },
|
||||
{ a: 2, b: "x" },
|
||||
@@ -337,10 +373,12 @@ describe("merge insert", () => {
|
||||
{ a: 3, b: "y" },
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
await table
|
||||
const mergeInsertRes = await table
|
||||
.mergeInsert("a")
|
||||
.whenMatchedUpdateAll({ where: "target.b = 'b'" })
|
||||
.execute(newData);
|
||||
expect(mergeInsertRes).toHaveProperty("version");
|
||||
expect(mergeInsertRes.version).toBe(2);
|
||||
|
||||
const expected = [
|
||||
{ a: 1, b: "a" },
|
||||
@@ -425,6 +463,20 @@ describe("merge insert", () => {
|
||||
res = res.sort((a, b) => a.a - b.a);
|
||||
expect(res).toEqual(expected);
|
||||
});
|
||||
|
||||
test("timeout", async () => {
|
||||
const newData = [
|
||||
{ a: 2, b: "x" },
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
await expect(
|
||||
table
|
||||
.mergeInsert("a")
|
||||
.whenMatchedUpdateAll()
|
||||
.whenNotMatchedInsertAll()
|
||||
.execute(newData, { timeoutMs: 0 }),
|
||||
).rejects.toThrow("merge insert timed out");
|
||||
});
|
||||
});
|
||||
|
||||
describe("When creating an index", () => {
|
||||
@@ -507,6 +559,15 @@ describe("When creating an index", () => {
|
||||
expect(indices2.length).toBe(0);
|
||||
});
|
||||
|
||||
it("should wait for index readiness", async () => {
|
||||
// Create an index and then wait for it to be ready
|
||||
await tbl.createIndex("vec");
|
||||
const indices = await tbl.listIndices();
|
||||
expect(indices.length).toBeGreaterThan(0);
|
||||
const idxName = indices[0].name;
|
||||
await expect(tbl.waitForIndex([idxName], 5)).resolves.toBeUndefined();
|
||||
});
|
||||
|
||||
it("should search with distance range", async () => {
|
||||
await tbl.createIndex("vec");
|
||||
|
||||
@@ -824,6 +885,7 @@ describe("When creating an index", () => {
|
||||
// Only build index over v1
|
||||
await tbl.createIndex("vec", {
|
||||
config: Index.ivfPq({ numPartitions: 2, numSubVectors: 2 }),
|
||||
waitTimeoutSeconds: 30,
|
||||
});
|
||||
|
||||
const rst = await tbl
|
||||
@@ -990,15 +1052,19 @@ describe("schema evolution", function () {
|
||||
{ id: 1n, vector: [0.1, 0.2] },
|
||||
]);
|
||||
// Can create a non-nullable column only through addColumns at the moment.
|
||||
await table.addColumns([
|
||||
const addColumnsRes = await table.addColumns([
|
||||
{ name: "price", valueSql: "cast(10.0 as double)" },
|
||||
]);
|
||||
expect(addColumnsRes).toHaveProperty("version");
|
||||
expect(addColumnsRes.version).toBe(2);
|
||||
expect(await table.schema()).toEqual(schema);
|
||||
|
||||
await table.alterColumns([
|
||||
const alterColumnsRes = await table.alterColumns([
|
||||
{ path: "id", rename: "new_id" },
|
||||
{ path: "price", nullable: true },
|
||||
]);
|
||||
expect(alterColumnsRes).toHaveProperty("version");
|
||||
expect(alterColumnsRes.version).toBe(3);
|
||||
|
||||
const expectedSchema = new Schema([
|
||||
new Field("new_id", new Int64(), true),
|
||||
@@ -1116,7 +1182,9 @@ describe("schema evolution", function () {
|
||||
const table = await con.createTable("vectors", [
|
||||
{ id: 1n, vector: [0.1, 0.2] },
|
||||
]);
|
||||
await table.dropColumns(["vector"]);
|
||||
const dropColumnsRes = await table.dropColumns(["vector"]);
|
||||
expect(dropColumnsRes).toHaveProperty("version");
|
||||
expect(dropColumnsRes.version).toBe(2);
|
||||
|
||||
const expectedSchema = new Schema([new Field("id", new Int64(), true)]);
|
||||
expect(await table.schema()).toEqual(expectedSchema);
|
||||
@@ -1168,6 +1236,99 @@ describe("when dealing with versioning", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("when dealing with tags", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
beforeEach(() => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
});
|
||||
afterEach(() => {
|
||||
tmpDir.removeCallback();
|
||||
});
|
||||
|
||||
it("can manage tags", async () => {
|
||||
const conn = await connect(tmpDir.name, {
|
||||
readConsistencyInterval: 0,
|
||||
});
|
||||
|
||||
const table = await conn.createTable("my_table", [
|
||||
{ id: 1n, vector: [0.1, 0.2] },
|
||||
]);
|
||||
expect(await table.version()).toBe(1);
|
||||
|
||||
await table.add([{ id: 2n, vector: [0.3, 0.4] }]);
|
||||
expect(await table.version()).toBe(2);
|
||||
|
||||
const tagsManager = await table.tags();
|
||||
|
||||
const initialTags = await tagsManager.list();
|
||||
expect(Object.keys(initialTags).length).toBe(0);
|
||||
|
||||
const tag1 = "tag1";
|
||||
await tagsManager.create(tag1, 1);
|
||||
expect(await tagsManager.getVersion(tag1)).toBe(1);
|
||||
|
||||
const tagsAfterFirst = await tagsManager.list();
|
||||
expect(Object.keys(tagsAfterFirst).length).toBe(1);
|
||||
expect(tagsAfterFirst).toHaveProperty(tag1);
|
||||
expect(tagsAfterFirst[tag1].version).toBe(1);
|
||||
|
||||
await tagsManager.create("tag2", 2);
|
||||
expect(await tagsManager.getVersion("tag2")).toBe(2);
|
||||
|
||||
const tagsAfterSecond = await tagsManager.list();
|
||||
expect(Object.keys(tagsAfterSecond).length).toBe(2);
|
||||
expect(tagsAfterSecond).toHaveProperty(tag1);
|
||||
expect(tagsAfterSecond[tag1].version).toBe(1);
|
||||
expect(tagsAfterSecond).toHaveProperty("tag2");
|
||||
expect(tagsAfterSecond["tag2"].version).toBe(2);
|
||||
|
||||
await table.add([{ id: 3n, vector: [0.5, 0.6] }]);
|
||||
await tagsManager.update(tag1, 3);
|
||||
expect(await tagsManager.getVersion(tag1)).toBe(3);
|
||||
|
||||
await tagsManager.delete("tag2");
|
||||
const tagsAfterDelete = await tagsManager.list();
|
||||
expect(Object.keys(tagsAfterDelete).length).toBe(1);
|
||||
expect(tagsAfterDelete).toHaveProperty(tag1);
|
||||
expect(tagsAfterDelete[tag1].version).toBe(3);
|
||||
|
||||
await table.add([{ id: 4n, vector: [0.7, 0.8] }]);
|
||||
expect(await table.version()).toBe(4);
|
||||
|
||||
await table.checkout(tag1);
|
||||
expect(await table.version()).toBe(3);
|
||||
|
||||
await table.checkoutLatest();
|
||||
expect(await table.version()).toBe(4);
|
||||
});
|
||||
|
||||
it("can checkout and restore tags", async () => {
|
||||
const conn = await connect(tmpDir.name, {
|
||||
readConsistencyInterval: 0,
|
||||
});
|
||||
|
||||
const table = await conn.createTable("my_table", [
|
||||
{ id: 1n, vector: [0.1, 0.2] },
|
||||
]);
|
||||
expect(await table.version()).toBe(1);
|
||||
expect(await table.countRows()).toBe(1);
|
||||
const tagsManager = await table.tags();
|
||||
const tag1 = "tag1";
|
||||
await tagsManager.create(tag1, 1);
|
||||
await table.add([{ id: 2n, vector: [0.3, 0.4] }]);
|
||||
const tag2 = "tag2";
|
||||
await tagsManager.create(tag2, 2);
|
||||
expect(await table.version()).toBe(2);
|
||||
await table.checkout(tag1);
|
||||
expect(await table.version()).toBe(1);
|
||||
await table.restore();
|
||||
expect(await table.version()).toBe(3);
|
||||
expect(await table.countRows()).toBe(1);
|
||||
await table.add([{ id: 3n, vector: [0.5, 0.6] }]);
|
||||
expect(await table.countRows()).toBe(2);
|
||||
});
|
||||
});
|
||||
|
||||
describe("when optimizing a dataset", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
let table: Table;
|
||||
@@ -1312,6 +1473,28 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(results2[0].text).toBe(data[1].text);
|
||||
});
|
||||
|
||||
test("prewarm full text search index", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: ["lance database", "the", "search"], vector: [0.1, 0.2, 0.3] },
|
||||
{ text: ["lance database"], vector: [0.4, 0.5, 0.6] },
|
||||
{ text: ["lance", "search"], vector: [0.7, 0.8, 0.9] },
|
||||
{ text: ["database", "search"], vector: [1.0, 1.1, 1.2] },
|
||||
{ text: ["unrelated", "doc"], vector: [1.3, 1.4, 1.5] },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts(),
|
||||
});
|
||||
|
||||
// For the moment, we just confirm we can call prewarmIndex without error
|
||||
// and still search it afterwards
|
||||
await table.prewarmIndex("text_idx");
|
||||
|
||||
const results = await table.search("lance").toArray();
|
||||
expect(results.length).toBe(3);
|
||||
});
|
||||
|
||||
test("full text index on list", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
|
||||
@@ -639,8 +639,9 @@ function transposeData(
|
||||
): Vector {
|
||||
if (field.type instanceof Struct) {
|
||||
const childFields = field.type.children;
|
||||
const fullPath = [...path, field.name];
|
||||
const childVectors = childFields.map((child) => {
|
||||
return transposeData(data, child, [...path, child.name]);
|
||||
return transposeData(data, child, fullPath);
|
||||
});
|
||||
const structData = makeData({
|
||||
type: field.type,
|
||||
@@ -652,7 +653,14 @@ function transposeData(
|
||||
const values = data.map((datum) => {
|
||||
let current: unknown = datum;
|
||||
for (const key of valuesPath) {
|
||||
if (isObject(current) && Object.hasOwn(current, key)) {
|
||||
if (current == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (
|
||||
isObject(current) &&
|
||||
(Object.hasOwn(current, key) || key in current)
|
||||
) {
|
||||
current = current[key];
|
||||
} else {
|
||||
return null;
|
||||
|
||||
@@ -23,6 +23,18 @@ export {
|
||||
OptimizeStats,
|
||||
CompactionStats,
|
||||
RemovalStats,
|
||||
TableStatistics,
|
||||
FragmentStatistics,
|
||||
FragmentSummaryStats,
|
||||
Tags,
|
||||
TagContents,
|
||||
MergeResult,
|
||||
AddResult,
|
||||
AddColumnsResult,
|
||||
AlterColumnsResult,
|
||||
DeleteResult,
|
||||
DropColumnsResult,
|
||||
UpdateResult,
|
||||
} from "./native.js";
|
||||
|
||||
export {
|
||||
@@ -74,7 +86,7 @@ export {
|
||||
ColumnAlteration,
|
||||
} from "./table";
|
||||
|
||||
export { MergeInsertBuilder } from "./merge";
|
||||
export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";
|
||||
|
||||
export * as embedding from "./embedding";
|
||||
export * as rerankers from "./rerankers";
|
||||
|
||||
@@ -681,4 +681,6 @@ export interface IndexOptions {
|
||||
* The default is true
|
||||
*/
|
||||
replace?: boolean;
|
||||
|
||||
waitTimeoutSeconds?: number;
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
import { Data, Schema, fromDataToBuffer } from "./arrow";
|
||||
import { NativeMergeInsertBuilder } from "./native";
|
||||
import { MergeResult, NativeMergeInsertBuilder } from "./native";
|
||||
|
||||
/** A builder used to create and run a merge insert operation */
|
||||
export class MergeInsertBuilder {
|
||||
@@ -73,9 +73,12 @@ export class MergeInsertBuilder {
|
||||
/**
|
||||
* Executes the merge insert operation
|
||||
*
|
||||
* Nothing is returned but the `Table` is updated
|
||||
* @returns {Promise<MergeResult>} the merge result
|
||||
*/
|
||||
async execute(data: Data): Promise<void> {
|
||||
async execute(
|
||||
data: Data,
|
||||
execOptions?: Partial<WriteExecutionOptions>,
|
||||
): Promise<MergeResult> {
|
||||
let schema: Schema;
|
||||
if (this.#schema instanceof Promise) {
|
||||
schema = await this.#schema;
|
||||
@@ -83,7 +86,28 @@ export class MergeInsertBuilder {
|
||||
} else {
|
||||
schema = this.#schema;
|
||||
}
|
||||
|
||||
if (execOptions?.timeoutMs !== undefined) {
|
||||
this.#native.setTimeout(execOptions.timeoutMs);
|
||||
}
|
||||
|
||||
const buffer = await fromDataToBuffer(data, undefined, schema);
|
||||
await this.#native.execute(buffer);
|
||||
return await this.#native.execute(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
export interface WriteExecutionOptions {
|
||||
/**
|
||||
* Maximum time to run the operation before cancelling it.
|
||||
*
|
||||
* By default, there is a 30-second timeout that is only enforced after the
|
||||
* first attempt. This is to prevent spending too long retrying to resolve
|
||||
* conflicts. For example, if a write attempt takes 20 seconds and fails,
|
||||
* the second attempt will be cancelled after 10 seconds, hitting the
|
||||
* 30-second timeout. However, a write that takes one hour and succeeds on the
|
||||
* first attempt will not be cancelled.
|
||||
*
|
||||
* When this is set, the timeout is enforced on all attempts, including the first.
|
||||
*/
|
||||
timeoutMs?: number;
|
||||
}
|
||||
|
||||
@@ -16,10 +16,18 @@ import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
||||
import { IndexOptions } from "./indices";
|
||||
import { MergeInsertBuilder } from "./merge";
|
||||
import {
|
||||
AddColumnsResult,
|
||||
AddColumnsSql,
|
||||
AddResult,
|
||||
AlterColumnsResult,
|
||||
DeleteResult,
|
||||
DropColumnsResult,
|
||||
IndexConfig,
|
||||
IndexStatistics,
|
||||
OptimizeStats,
|
||||
TableStatistics,
|
||||
Tags,
|
||||
UpdateResult,
|
||||
Table as _NativeTable,
|
||||
} from "./native";
|
||||
import {
|
||||
@@ -124,12 +132,19 @@ export abstract class Table {
|
||||
/**
|
||||
* Insert records into this Table.
|
||||
* @param {Data} data Records to be inserted into the Table
|
||||
* @returns {Promise<AddResult>} A promise that resolves to an object
|
||||
* containing the new version number of the table
|
||||
*/
|
||||
abstract add(data: Data, options?: Partial<AddDataOptions>): Promise<void>;
|
||||
abstract add(
|
||||
data: Data,
|
||||
options?: Partial<AddDataOptions>,
|
||||
): Promise<AddResult>;
|
||||
/**
|
||||
* Update existing records in the Table
|
||||
* @param opts.values The values to update. The keys are the column names and the values
|
||||
* are the values to set.
|
||||
* @returns {Promise<UpdateResult>} A promise that resolves to an object containing
|
||||
* the number of rows updated and the new version number
|
||||
* @example
|
||||
* ```ts
|
||||
* table.update({where:"x = 2", values:{"vector": [10, 10]}})
|
||||
@@ -139,11 +154,13 @@ export abstract class Table {
|
||||
opts: {
|
||||
values: Map<string, IntoSql> | Record<string, IntoSql>;
|
||||
} & Partial<UpdateOptions>,
|
||||
): Promise<void>;
|
||||
): Promise<UpdateResult>;
|
||||
/**
|
||||
* Update existing records in the Table
|
||||
* @param opts.valuesSql The values to update. The keys are the column names and the values
|
||||
* are the values to set. The values are SQL expressions.
|
||||
* @returns {Promise<UpdateResult>} A promise that resolves to an object containing
|
||||
* the number of rows updated and the new version number
|
||||
* @example
|
||||
* ```ts
|
||||
* table.update({where:"x = 2", valuesSql:{"x": "x + 1"}})
|
||||
@@ -153,7 +170,7 @@ export abstract class Table {
|
||||
opts: {
|
||||
valuesSql: Map<string, string> | Record<string, string>;
|
||||
} & Partial<UpdateOptions>,
|
||||
): Promise<void>;
|
||||
): Promise<UpdateResult>;
|
||||
/**
|
||||
* Update existing records in the Table
|
||||
*
|
||||
@@ -171,6 +188,8 @@ export abstract class Table {
|
||||
* repeatedly calling this method.
|
||||
* @param {Map<string, string> | Record<string, string>} updates - the
|
||||
* columns to update
|
||||
* @returns {Promise<UpdateResult>} A promise that resolves to an object
|
||||
* containing the number of rows updated and the new version number
|
||||
*
|
||||
* Keys in the map should specify the name of the column to update.
|
||||
* Values in the map provide the new value of the column. These can
|
||||
@@ -182,12 +201,16 @@ export abstract class Table {
|
||||
abstract update(
|
||||
updates: Map<string, string> | Record<string, string>,
|
||||
options?: Partial<UpdateOptions>,
|
||||
): Promise<void>;
|
||||
): Promise<UpdateResult>;
|
||||
|
||||
/** Count the total number of rows in the dataset. */
|
||||
abstract countRows(filter?: string): Promise<number>;
|
||||
/** Delete the rows that satisfy the predicate. */
|
||||
abstract delete(predicate: string): Promise<void>;
|
||||
/**
|
||||
* Delete the rows that satisfy the predicate.
|
||||
* @returns {Promise<DeleteResult>} A promise that resolves to an object
|
||||
* containing the new version number of the table
|
||||
*/
|
||||
abstract delete(predicate: string): Promise<DeleteResult>;
|
||||
/**
|
||||
* Create an index to speed up queries.
|
||||
*
|
||||
@@ -235,6 +258,30 @@ export abstract class Table {
|
||||
*/
|
||||
abstract dropIndex(name: string): Promise<void>;
|
||||
|
||||
/**
|
||||
* Prewarm an index in the table.
|
||||
*
|
||||
* @param name The name of the index.
|
||||
*
|
||||
* This will load the index into memory. This may reduce the cold-start time for
|
||||
* future queries. If the index does not fit in the cache then this call may be
|
||||
* wasteful.
|
||||
*/
|
||||
abstract prewarmIndex(name: string): Promise<void>;
|
||||
|
||||
/**
|
||||
* Waits for asynchronous indexing to complete on the table.
|
||||
*
|
||||
* @param indexNames The names of the indices to wait for
|
||||
* @param timeoutSeconds The number of seconds to wait before timing out
|
||||
*
|
||||
* This will raise an error if the indices are not created and fully indexed within the timeout.
|
||||
*/
|
||||
abstract waitForIndex(
|
||||
indexNames: string[],
|
||||
timeoutSeconds: number,
|
||||
): Promise<void>;
|
||||
|
||||
/**
|
||||
* Create a {@link Query} Builder.
|
||||
*
|
||||
@@ -317,15 +364,23 @@ export abstract class Table {
|
||||
* the SQL expression to use to calculate the value of the new column. These
|
||||
* expressions will be evaluated for each row in the table, and can
|
||||
* reference existing columns in the table.
|
||||
* @returns {Promise<AddColumnsResult>} A promise that resolves to an object
|
||||
* containing the new version number of the table after adding the columns.
|
||||
*/
|
||||
abstract addColumns(newColumnTransforms: AddColumnsSql[]): Promise<void>;
|
||||
abstract addColumns(
|
||||
newColumnTransforms: AddColumnsSql[],
|
||||
): Promise<AddColumnsResult>;
|
||||
|
||||
/**
|
||||
* Alter the name or nullability of columns.
|
||||
* @param {ColumnAlteration[]} columnAlterations One or more alterations to
|
||||
* apply to columns.
|
||||
* @returns {Promise<AlterColumnsResult>} A promise that resolves to an object
|
||||
* containing the new version number of the table after altering the columns.
|
||||
*/
|
||||
abstract alterColumns(columnAlterations: ColumnAlteration[]): Promise<void>;
|
||||
abstract alterColumns(
|
||||
columnAlterations: ColumnAlteration[],
|
||||
): Promise<AlterColumnsResult>;
|
||||
/**
|
||||
* Drop one or more columns from the dataset
|
||||
*
|
||||
@@ -336,8 +391,10 @@ export abstract class Table {
|
||||
* @param {string[]} columnNames The names of the columns to drop. These can
|
||||
* be nested column references (e.g. "a.b.c") or top-level column names
|
||||
* (e.g. "a").
|
||||
* @returns {Promise<DropColumnsResult>} A promise that resolves to an object
|
||||
* containing the new version number of the table after dropping the columns.
|
||||
*/
|
||||
abstract dropColumns(columnNames: string[]): Promise<void>;
|
||||
abstract dropColumns(columnNames: string[]): Promise<DropColumnsResult>;
|
||||
/** Retrieve the version of the table */
|
||||
|
||||
abstract version(): Promise<number>;
|
||||
@@ -350,7 +407,7 @@ export abstract class Table {
|
||||
*
|
||||
* Calling this method will set the table into time-travel mode. If you
|
||||
* wish to return to standard mode, call `checkoutLatest`.
|
||||
* @param {number} version The version to checkout
|
||||
* @param {number | string} version The version to check out: either a version number or a tag name
|
||||
* @example
|
||||
* ```typescript
|
||||
* import * as lancedb from "@lancedb/lancedb"
|
||||
@@ -366,7 +423,8 @@ export abstract class Table {
|
||||
* console.log(await table.version()); // 2
|
||||
* ```
|
||||
*/
|
||||
abstract checkout(version: number): Promise<void>;
|
||||
abstract checkout(version: number | string): Promise<void>;
|
||||
|
||||
/**
|
||||
* Checkout the latest version of the table. _This is an in-place operation._
|
||||
*
|
||||
@@ -380,6 +438,23 @@ export abstract class Table {
|
||||
*/
|
||||
abstract listVersions(): Promise<Version[]>;
|
||||
|
||||
/**
|
||||
* Get a tags manager for this table.
|
||||
*
|
||||
* Tags allow you to label specific versions of a table with a human-readable name.
|
||||
* The returned tags manager can be used to list, create, update, or delete tags.
|
||||
*
|
||||
* @returns {Tags} A tags manager for this table
|
||||
* @example
|
||||
* ```typescript
|
||||
* const tagsManager = await table.tags();
|
||||
* await tagsManager.create("v1", 1);
|
||||
* const tags = await tagsManager.list();
|
||||
* console.log(tags); // { "v1": { version: 1, manifestSize: ... } }
|
||||
* ```
|
||||
*/
|
||||
abstract tags(): Promise<Tags>;
|
||||
|
||||
/**
|
||||
* Restore the table to the currently checked out version
|
||||
*
|
||||
@@ -439,6 +514,13 @@ export abstract class Table {
|
||||
* Use {@link Table.listIndices} to find the names of the indices.
|
||||
*/
|
||||
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
|
||||
|
||||
/** Returns table and fragment statistics
|
||||
*
|
||||
* @returns {TableStatistics} The table and fragment statistics
|
||||
*
|
||||
*/
|
||||
abstract stats(): Promise<TableStatistics>;
|
||||
}
|
||||
|
||||
export class LocalTable extends Table {
|
||||
@@ -478,12 +560,12 @@ export class LocalTable extends Table {
|
||||
return tbl.schema;
|
||||
}
|
||||
|
||||
async add(data: Data, options?: Partial<AddDataOptions>): Promise<void> {
|
||||
async add(data: Data, options?: Partial<AddDataOptions>): Promise<AddResult> {
|
||||
const mode = options?.mode ?? "append";
|
||||
const schema = await this.schema();
|
||||
|
||||
const buffer = await fromDataToBuffer(data, undefined, schema);
|
||||
await this.inner.add(buffer, mode);
|
||||
return await this.inner.add(buffer, mode);
|
||||
}
|
||||
|
||||
async update(
|
||||
@@ -496,7 +578,7 @@ export class LocalTable extends Table {
|
||||
valuesSql: Map<string, string> | Record<string, string>;
|
||||
} & Partial<UpdateOptions>),
|
||||
options?: Partial<UpdateOptions>,
|
||||
) {
|
||||
): Promise<UpdateResult> {
|
||||
const isValues =
|
||||
"values" in optsOrUpdates && typeof optsOrUpdates.values !== "string";
|
||||
const isValuesSql =
|
||||
@@ -543,28 +625,44 @@ export class LocalTable extends Table {
|
||||
columns = Object.entries(optsOrUpdates as Record<string, string>);
|
||||
predicate = options?.where;
|
||||
}
|
||||
await this.inner.update(predicate, columns);
|
||||
return await this.inner.update(predicate, columns);
|
||||
}
|
||||
|
||||
async countRows(filter?: string): Promise<number> {
|
||||
return await this.inner.countRows(filter);
|
||||
}
|
||||
|
||||
async delete(predicate: string): Promise<void> {
|
||||
await this.inner.delete(predicate);
|
||||
async delete(predicate: string): Promise<DeleteResult> {
|
||||
return await this.inner.delete(predicate);
|
||||
}
|
||||
|
||||
async createIndex(column: string, options?: Partial<IndexOptions>) {
|
||||
// Bit of a hack to get around the fact that TS has no package-scope.
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
const nativeIndex = (options?.config as any)?.inner;
|
||||
await this.inner.createIndex(nativeIndex, column, options?.replace);
|
||||
await this.inner.createIndex(
|
||||
nativeIndex,
|
||||
column,
|
||||
options?.replace,
|
||||
options?.waitTimeoutSeconds,
|
||||
);
|
||||
}
|
||||
|
||||
async dropIndex(name: string): Promise<void> {
|
||||
await this.inner.dropIndex(name);
|
||||
}
|
||||
|
||||
async prewarmIndex(name: string): Promise<void> {
|
||||
await this.inner.prewarmIndex(name);
|
||||
}
|
||||
|
||||
async waitForIndex(
|
||||
indexNames: string[],
|
||||
timeoutSeconds: number,
|
||||
): Promise<void> {
|
||||
await this.inner.waitForIndex(indexNames, timeoutSeconds);
|
||||
}
|
||||
|
||||
query(): Query {
|
||||
return new Query(this.inner);
|
||||
}
|
||||
@@ -623,11 +721,15 @@ export class LocalTable extends Table {
|
||||
|
||||
// TODO: Support BatchUDF
|
||||
|
||||
async addColumns(newColumnTransforms: AddColumnsSql[]): Promise<void> {
|
||||
await this.inner.addColumns(newColumnTransforms);
|
||||
async addColumns(
|
||||
newColumnTransforms: AddColumnsSql[],
|
||||
): Promise<AddColumnsResult> {
|
||||
return await this.inner.addColumns(newColumnTransforms);
|
||||
}
|
||||
|
||||
async alterColumns(columnAlterations: ColumnAlteration[]): Promise<void> {
|
||||
async alterColumns(
|
||||
columnAlterations: ColumnAlteration[],
|
||||
): Promise<AlterColumnsResult> {
|
||||
const processedAlterations = columnAlterations.map((alteration) => {
|
||||
if (typeof alteration.dataType === "string") {
|
||||
return {
|
||||
@@ -648,19 +750,22 @@ export class LocalTable extends Table {
|
||||
}
|
||||
});
|
||||
|
||||
await this.inner.alterColumns(processedAlterations);
|
||||
return await this.inner.alterColumns(processedAlterations);
|
||||
}
|
||||
|
||||
async dropColumns(columnNames: string[]): Promise<void> {
|
||||
await this.inner.dropColumns(columnNames);
|
||||
async dropColumns(columnNames: string[]): Promise<DropColumnsResult> {
|
||||
return await this.inner.dropColumns(columnNames);
|
||||
}
|
||||
|
||||
async version(): Promise<number> {
|
||||
return await this.inner.version();
|
||||
}
|
||||
|
||||
async checkout(version: number): Promise<void> {
|
||||
await this.inner.checkout(version);
|
||||
async checkout(version: number | string): Promise<void> {
|
||||
if (typeof version === "string") {
|
||||
return this.inner.checkoutTag(version);
|
||||
}
|
||||
return this.inner.checkout(version);
|
||||
}
|
||||
|
||||
async checkoutLatest(): Promise<void> {
|
||||
@@ -679,6 +784,10 @@ export class LocalTable extends Table {
|
||||
await this.inner.restore();
|
||||
}
|
||||
|
||||
async tags(): Promise<Tags> {
|
||||
return await this.inner.tags();
|
||||
}
|
||||
|
||||
async optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats> {
|
||||
let cleanupOlderThanMs;
|
||||
if (
|
||||
@@ -709,6 +818,11 @@ export class LocalTable extends Table {
|
||||
}
|
||||
return stats;
|
||||
}
|
||||
|
||||
async stats(): Promise<TableStatistics> {
|
||||
return await this.inner.stats();
|
||||
}
|
||||
|
||||
mergeInsert(on: string | string[]): MergeInsertBuilder {
|
||||
on = Array.isArray(on) ? on : [on];
|
||||
return new MergeInsertBuilder(this.inner.mergeInsert(on), this.schema());
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.19.0-beta.6",
|
||||
"version": "0.19.1-beta.3",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use lancedb::{arrow::IntoArrow, ipc::ipc_file_to_batches, table::merge::MergeInsertBuilder};
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi_derive::napi;
|
||||
|
||||
use crate::error::convert_error;
|
||||
use crate::{error::convert_error, table::MergeResult};
|
||||
|
||||
#[napi]
|
||||
#[derive(Clone)]
|
||||
@@ -36,8 +38,13 @@ impl NativeMergeInsertBuilder {
|
||||
this
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn set_timeout(&mut self, timeout: u32) {
|
||||
self.inner.timeout(Duration::from_millis(timeout as u64));
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(&self, buf: Buffer) -> napi::Result<()> {
|
||||
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
|
||||
let data = ipc_file_to_batches(buf.to_vec())
|
||||
.and_then(IntoArrow::into_arrow)
|
||||
.map_err(|e| {
|
||||
@@ -46,12 +53,13 @@ impl NativeMergeInsertBuilder {
|
||||
|
||||
let this = self.clone();
|
||||
|
||||
this.inner.execute(data).await.map_err(|e| {
|
||||
let res = this.inner.execute(data).await.map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to execute merge insert: {}",
|
||||
convert_error(&e)
|
||||
))
|
||||
})
|
||||
})?;
|
||||
Ok(res.into())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -327,6 +327,7 @@ impl JsFullTextQuery {
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
#[allow(clippy::use_self)] // NAPI doesn't allow Self here but clippy reports it
|
||||
pub fn boost_query(
|
||||
positive: &JsFullTextQuery,
|
||||
negative: &JsFullTextQuery,
|
||||
@@ -349,11 +350,8 @@ impl JsFullTextQuery {
|
||||
boosts: Option<Vec<f64>>,
|
||||
) -> napi::Result<Self> {
|
||||
let q = match boosts {
|
||||
Some(boosts) => MultiMatchQuery::try_new_with_boosts(
|
||||
query,
|
||||
columns,
|
||||
boosts.into_iter().map(|v| v as f32).collect(),
|
||||
),
|
||||
Some(boosts) => MultiMatchQuery::try_new(query, columns)
|
||||
.and_then(|q| q.try_with_boosts(boosts.into_iter().map(|v| v as f32).collect())),
|
||||
None => MultiMatchQuery::try_new(query, columns),
|
||||
}
|
||||
.map_err(|e| {
|
||||
|
||||
@@ -75,7 +75,7 @@ impl Table {
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn add(&self, buf: Buffer, mode: String) -> napi::Result<()> {
|
||||
pub async fn add(&self, buf: Buffer, mode: String) -> napi::Result<AddResult> {
|
||||
let batches = ipc_file_to_batches(buf.to_vec())
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
|
||||
let mut op = self.inner_ref()?.add(batches);
|
||||
@@ -88,7 +88,8 @@ impl Table {
|
||||
return Err(napi::Error::from_reason(format!("Invalid mode: {}", mode)));
|
||||
};
|
||||
|
||||
op.execute().await.default_error()
|
||||
let res = op.execute().await.default_error()?;
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
@@ -101,8 +102,9 @@ impl Table {
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn delete(&self, predicate: String) -> napi::Result<()> {
|
||||
self.inner_ref()?.delete(&predicate).await.default_error()
|
||||
pub async fn delete(&self, predicate: String) -> napi::Result<DeleteResult> {
|
||||
let res = self.inner_ref()?.delete(&predicate).await.default_error()?;
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
@@ -111,6 +113,7 @@ impl Table {
|
||||
index: Option<&Index>,
|
||||
column: String,
|
||||
replace: Option<bool>,
|
||||
wait_timeout_s: Option<i64>,
|
||||
) -> napi::Result<()> {
|
||||
let lancedb_index = if let Some(index) = index {
|
||||
index.consume()?
|
||||
@@ -121,6 +124,10 @@ impl Table {
|
||||
if let Some(replace) = replace {
|
||||
builder = builder.replace(replace);
|
||||
}
|
||||
if let Some(timeout) = wait_timeout_s {
|
||||
builder =
|
||||
builder.wait_timeout(std::time::Duration::from_secs(timeout.try_into().unwrap()));
|
||||
}
|
||||
builder.execute().await.default_error()
|
||||
}
|
||||
|
||||
@@ -132,12 +139,38 @@ impl Table {
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn prewarm_index(&self, index_name: String) -> napi::Result<()> {
|
||||
self.inner_ref()?
|
||||
.prewarm_index(&index_name)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn wait_for_index(&self, index_names: Vec<String>, timeout_s: i64) -> Result<()> {
|
||||
let timeout = std::time::Duration::from_secs(timeout_s.try_into().unwrap());
|
||||
let index_names: Vec<&str> = index_names.iter().map(|s| s.as_str()).collect();
|
||||
let slice: &[&str] = &index_names;
|
||||
|
||||
self.inner_ref()?
|
||||
.wait_for_index(slice, timeout)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn stats(&self) -> Result<TableStatistics> {
|
||||
let stats = self.inner_ref()?.stats().await.default_error()?;
|
||||
Ok(stats.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn update(
|
||||
&self,
|
||||
only_if: Option<String>,
|
||||
columns: Vec<(String, String)>,
|
||||
) -> napi::Result<u64> {
|
||||
) -> napi::Result<UpdateResult> {
|
||||
let mut op = self.inner_ref()?.update();
|
||||
if let Some(only_if) = only_if {
|
||||
op = op.only_if(only_if);
|
||||
@@ -145,7 +178,8 @@ impl Table {
|
||||
for (column_name, value) in columns {
|
||||
op = op.column(column_name, value);
|
||||
}
|
||||
op.execute().await.default_error()
|
||||
let res = op.execute().await.default_error()?;
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
@@ -159,21 +193,28 @@ impl Table {
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn add_columns(&self, transforms: Vec<AddColumnsSql>) -> napi::Result<()> {
|
||||
pub async fn add_columns(
|
||||
&self,
|
||||
transforms: Vec<AddColumnsSql>,
|
||||
) -> napi::Result<AddColumnsResult> {
|
||||
let transforms = transforms
|
||||
.into_iter()
|
||||
.map(|sql| (sql.name, sql.value_sql))
|
||||
.collect::<Vec<_>>();
|
||||
let transforms = NewColumnTransform::SqlExpressions(transforms);
|
||||
self.inner_ref()?
|
||||
let res = self
|
||||
.inner_ref()?
|
||||
.add_columns(transforms, None)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(())
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn alter_columns(&self, alterations: Vec<ColumnAlteration>) -> napi::Result<()> {
|
||||
pub async fn alter_columns(
|
||||
&self,
|
||||
alterations: Vec<ColumnAlteration>,
|
||||
) -> napi::Result<AlterColumnsResult> {
|
||||
for alteration in &alterations {
|
||||
if alteration.rename.is_none()
|
||||
&& alteration.nullable.is_none()
|
||||
@@ -190,21 +231,23 @@ impl Table {
|
||||
.collect::<std::result::Result<Vec<_>, String>>()
|
||||
.map_err(napi::Error::from_reason)?;
|
||||
|
||||
self.inner_ref()?
|
||||
let res = self
|
||||
.inner_ref()?
|
||||
.alter_columns(&alterations)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(())
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn drop_columns(&self, columns: Vec<String>) -> napi::Result<()> {
|
||||
pub async fn drop_columns(&self, columns: Vec<String>) -> napi::Result<DropColumnsResult> {
|
||||
let col_refs = columns.iter().map(String::as_str).collect::<Vec<_>>();
|
||||
self.inner_ref()?
|
||||
let res = self
|
||||
.inner_ref()?
|
||||
.drop_columns(&col_refs)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(())
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
@@ -224,6 +267,14 @@ impl Table {
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn checkout_tag(&self, tag: String) -> napi::Result<()> {
|
||||
self.inner_ref()?
|
||||
.checkout_tag(tag.as_str())
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn checkout_latest(&self) -> napi::Result<()> {
|
||||
self.inner_ref()?.checkout_latest().await.default_error()
|
||||
@@ -256,6 +307,13 @@ impl Table {
|
||||
self.inner_ref()?.restore().await.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn tags(&self) -> napi::Result<Tags> {
|
||||
Ok(Tags {
|
||||
inner: self.inner_ref()?.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn optimize(
|
||||
&self,
|
||||
@@ -515,9 +573,257 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct TableStatistics {
|
||||
/// The total number of bytes in the table
|
||||
pub total_bytes: i64,
|
||||
|
||||
/// The number of rows in the table
|
||||
pub num_rows: i64,
|
||||
|
||||
/// The number of indices in the table
|
||||
pub num_indices: i64,
|
||||
|
||||
/// Statistics on table fragments
|
||||
pub fragment_stats: FragmentStatistics,
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct FragmentStatistics {
|
||||
/// The number of fragments in the table
|
||||
pub num_fragments: i64,
|
||||
|
||||
/// The number of uncompacted fragments in the table
|
||||
pub num_small_fragments: i64,
|
||||
|
||||
/// Statistics on the number of rows in the table fragments
|
||||
pub lengths: FragmentSummaryStats,
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct FragmentSummaryStats {
|
||||
/// The number of rows in the fragment with the fewest rows
|
||||
pub min: i64,
|
||||
|
||||
/// The number of rows in the fragment with the most rows
|
||||
pub max: i64,
|
||||
|
||||
/// The mean number of rows in the fragments
|
||||
pub mean: i64,
|
||||
|
||||
/// The 25th percentile of number of rows in the fragments
|
||||
pub p25: i64,
|
||||
|
||||
/// The 50th percentile of number of rows in the fragments
|
||||
pub p50: i64,
|
||||
|
||||
/// The 75th percentile of number of rows in the fragments
|
||||
pub p75: i64,
|
||||
|
||||
/// The 99th percentile of number of rows in the fragments
|
||||
pub p99: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::TableStatistics> for TableStatistics {
|
||||
fn from(v: lancedb::table::TableStatistics) -> Self {
|
||||
Self {
|
||||
total_bytes: v.total_bytes as i64,
|
||||
num_rows: v.num_rows as i64,
|
||||
num_indices: v.num_indices as i64,
|
||||
fragment_stats: FragmentStatistics {
|
||||
num_fragments: v.fragment_stats.num_fragments as i64,
|
||||
num_small_fragments: v.fragment_stats.num_small_fragments as i64,
|
||||
lengths: FragmentSummaryStats {
|
||||
min: v.fragment_stats.lengths.min as i64,
|
||||
max: v.fragment_stats.lengths.max as i64,
|
||||
mean: v.fragment_stats.lengths.mean as i64,
|
||||
p25: v.fragment_stats.lengths.p25 as i64,
|
||||
p50: v.fragment_stats.lengths.p50 as i64,
|
||||
p75: v.fragment_stats.lengths.p75 as i64,
|
||||
p99: v.fragment_stats.lengths.p99 as i64,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct Version {
|
||||
pub version: i64,
|
||||
pub timestamp: i64,
|
||||
pub metadata: HashMap<String, String>,
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct UpdateResult {
|
||||
pub rows_updated: i64,
|
||||
pub version: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::UpdateResult> for UpdateResult {
|
||||
fn from(value: lancedb::table::UpdateResult) -> Self {
|
||||
Self {
|
||||
rows_updated: value.rows_updated as i64,
|
||||
version: value.version as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct AddResult {
|
||||
pub version: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::AddResult> for AddResult {
|
||||
fn from(value: lancedb::table::AddResult) -> Self {
|
||||
Self {
|
||||
version: value.version as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct DeleteResult {
|
||||
pub version: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::DeleteResult> for DeleteResult {
|
||||
fn from(value: lancedb::table::DeleteResult) -> Self {
|
||||
Self {
|
||||
version: value.version as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct MergeResult {
|
||||
pub version: i64,
|
||||
pub num_inserted_rows: i64,
|
||||
pub num_updated_rows: i64,
|
||||
pub num_deleted_rows: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::MergeResult> for MergeResult {
|
||||
fn from(value: lancedb::table::MergeResult) -> Self {
|
||||
Self {
|
||||
version: value.version as i64,
|
||||
num_inserted_rows: value.num_inserted_rows as i64,
|
||||
num_updated_rows: value.num_updated_rows as i64,
|
||||
num_deleted_rows: value.num_deleted_rows as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct AddColumnsResult {
|
||||
pub version: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::AddColumnsResult> for AddColumnsResult {
|
||||
fn from(value: lancedb::table::AddColumnsResult) -> Self {
|
||||
Self {
|
||||
version: value.version as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct AlterColumnsResult {
|
||||
pub version: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::AlterColumnsResult> for AlterColumnsResult {
|
||||
fn from(value: lancedb::table::AlterColumnsResult) -> Self {
|
||||
Self {
|
||||
version: value.version as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct DropColumnsResult {
|
||||
pub version: i64,
|
||||
}
|
||||
|
||||
impl From<lancedb::table::DropColumnsResult> for DropColumnsResult {
|
||||
fn from(value: lancedb::table::DropColumnsResult) -> Self {
|
||||
Self {
|
||||
version: value.version as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub struct TagContents {
|
||||
pub version: i64,
|
||||
pub manifest_size: i64,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub struct Tags {
|
||||
inner: LanceDbTable,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
impl Tags {
|
||||
#[napi]
|
||||
pub async fn list(&self) -> napi::Result<HashMap<String, TagContents>> {
|
||||
let rust_tags = self.inner.tags().await.default_error()?;
|
||||
let tag_list = rust_tags.as_ref().list().await.default_error()?;
|
||||
let tag_contents = tag_list
|
||||
.into_iter()
|
||||
.map(|(k, v)| {
|
||||
(
|
||||
k,
|
||||
TagContents {
|
||||
version: v.version as i64,
|
||||
manifest_size: v.manifest_size as i64,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(tag_contents)
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async fn get_version(&self, tag: String) -> napi::Result<i64> {
|
||||
let rust_tags = self.inner.tags().await.default_error()?;
|
||||
rust_tags
|
||||
.as_ref()
|
||||
.get_version(tag.as_str())
|
||||
.await
|
||||
.map(|v| v as i64)
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async unsafe fn create(&mut self, tag: String, version: i64) -> napi::Result<()> {
|
||||
let mut rust_tags = self.inner.tags().await.default_error()?;
|
||||
rust_tags
|
||||
.as_mut()
|
||||
.create(tag.as_str(), version as u64)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async unsafe fn delete(&mut self, tag: String) -> napi::Result<()> {
|
||||
let mut rust_tags = self.inner.tags().await.default_error()?;
|
||||
rust_tags
|
||||
.as_mut()
|
||||
.delete(tag.as_str())
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async unsafe fn update(&mut self, tag: String, version: i64) -> napi::Result<()> {
|
||||
let mut rust_tags = self.inner.tags().await.default_error()?;
|
||||
rust_tags
|
||||
.as_mut()
|
||||
.update(tag.as_str(), version as u64)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.22.0-beta.7"
|
||||
current_version = "0.22.1-beta.4"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.22.0-beta.7"
|
||||
version = "0.22.1-beta.4"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -7,7 +7,7 @@ dependencies = [
|
||||
"numpy",
|
||||
"overrides>=0.7",
|
||||
"packaging",
|
||||
"pyarrow>=14",
|
||||
"pyarrow>=16",
|
||||
"pydantic>=1.10",
|
||||
"tqdm>=4.27.0",
|
||||
]
|
||||
@@ -44,7 +44,7 @@ repository = "https://github.com/lancedb/lancedb"
|
||||
|
||||
[project.optional-dependencies]
|
||||
pylance = [
|
||||
"pylance>=0.23.2",
|
||||
"pylance>=0.25",
|
||||
]
|
||||
tests = [
|
||||
"aiohttp",
|
||||
@@ -58,7 +58,7 @@ tests = [
|
||||
"polars>=0.19, <=1.3.0",
|
||||
"tantivy",
|
||||
"pyarrow-stubs",
|
||||
"pylance>=0.23.2",
|
||||
"pylance>=0.25",
|
||||
"requests",
|
||||
]
|
||||
dev = [
|
||||
@@ -77,6 +77,7 @@ embeddings = [
|
||||
"pillow",
|
||||
"open-clip-torch",
|
||||
"cohere",
|
||||
"colpali-engine>=0.3.10",
|
||||
"huggingface_hub",
|
||||
"InstructorEmbedding",
|
||||
"google.generativeai",
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from datetime import timedelta
|
||||
from typing import Dict, List, Optional, Tuple, Any, Union, Literal
|
||||
from typing import Dict, List, Optional, Tuple, Any, TypedDict, Union, Literal
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -36,8 +36,10 @@ class Table:
|
||||
async def schema(self) -> pa.Schema: ...
|
||||
async def add(
|
||||
self, data: pa.RecordBatchReader, mode: Literal["append", "overwrite"]
|
||||
) -> None: ...
|
||||
async def update(self, updates: Dict[str, str], where: Optional[str]) -> None: ...
|
||||
) -> AddResult: ...
|
||||
async def update(
|
||||
self, updates: Dict[str, str], where: Optional[str]
|
||||
) -> UpdateResult: ...
|
||||
async def count_rows(self, filter: Optional[str]) -> int: ...
|
||||
async def create_index(
|
||||
self,
|
||||
@@ -47,23 +49,34 @@ class Table:
|
||||
): ...
|
||||
async def list_versions(self) -> List[Dict[str, Any]]: ...
|
||||
async def version(self) -> int: ...
|
||||
async def checkout(self, version: int): ...
|
||||
async def checkout(self, version: Union[int, str]): ...
|
||||
async def checkout_latest(self): ...
|
||||
async def restore(self, version: Optional[int] = None): ...
|
||||
async def restore(self, version: Optional[Union[int, str]] = None): ...
|
||||
async def list_indices(self) -> list[IndexConfig]: ...
|
||||
async def delete(self, filter: str): ...
|
||||
async def add_columns(self, columns: list[tuple[str, str]]) -> None: ...
|
||||
async def add_columns_with_schema(self, schema: pa.Schema) -> None: ...
|
||||
async def alter_columns(self, columns: list[dict[str, Any]]) -> None: ...
|
||||
async def delete(self, filter: str) -> DeleteResult: ...
|
||||
async def add_columns(self, columns: list[tuple[str, str]]) -> AddColumnsResult: ...
|
||||
async def add_columns_with_schema(self, schema: pa.Schema) -> AddColumnsResult: ...
|
||||
async def alter_columns(
|
||||
self, columns: list[dict[str, Any]]
|
||||
) -> AlterColumnsResult: ...
|
||||
async def optimize(
|
||||
self,
|
||||
*,
|
||||
cleanup_since_ms: Optional[int] = None,
|
||||
delete_unverified: Optional[bool] = None,
|
||||
) -> OptimizeStats: ...
|
||||
@property
|
||||
def tags(self) -> Tags: ...
|
||||
def query(self) -> Query: ...
|
||||
def vector_search(self) -> VectorQuery: ...
|
||||
|
||||
class Tags:
|
||||
async def list(self) -> Dict[str, Tag]: ...
|
||||
async def get_version(self, tag: str) -> int: ...
|
||||
async def create(self, tag: str, version: int): ...
|
||||
async def delete(self, tag: str): ...
|
||||
async def update(self, tag: str, version: int): ...
|
||||
|
||||
class IndexConfig:
|
||||
index_type: str
|
||||
columns: List[str]
|
||||
@@ -195,3 +208,32 @@ class RemovalStats:
|
||||
class OptimizeStats:
|
||||
compaction: CompactionStats
|
||||
prune: RemovalStats
|
||||
|
||||
class Tag(TypedDict):
|
||||
version: int
|
||||
manifest_size: int
|
||||
|
||||
class AddResult:
|
||||
version: int
|
||||
|
||||
class DeleteResult:
|
||||
version: int
|
||||
|
||||
class UpdateResult:
|
||||
rows_updated: int
|
||||
version: int
|
||||
|
||||
class MergeResult:
|
||||
version: int
|
||||
num_updated_rows: int
|
||||
num_inserted_rows: int
|
||||
num_deleted_rows: int
|
||||
|
||||
class AddColumnsResult:
|
||||
version: int
|
||||
|
||||
class AlterColumnsResult:
|
||||
version: int
|
||||
|
||||
class DropColumnsResult:
|
||||
version: int
|
||||
|
||||
@@ -9,7 +9,7 @@ import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.dataset
|
||||
|
||||
from .dependencies import pandas as pd
|
||||
from .dependencies import _check_for_pandas, pandas as pd
|
||||
|
||||
DATA = Union[List[dict], "pd.DataFrame", pa.Table, Iterable[pa.RecordBatch]]
|
||||
VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray]
|
||||
@@ -63,7 +63,7 @@ def data_to_reader(
|
||||
data: DATA, schema: Optional[pa.Schema] = None
|
||||
) -> pa.RecordBatchReader:
|
||||
"""Convert various types of input into a RecordBatchReader"""
|
||||
if pd is not None and isinstance(data, pd.DataFrame):
|
||||
if _check_for_pandas(data) and isinstance(data, pd.DataFrame):
|
||||
return pa.Table.from_pandas(data, schema=schema).to_reader()
|
||||
elif isinstance(data, pa.Table):
|
||||
return data.to_reader()
|
||||
|
||||
@@ -19,3 +19,4 @@ from .imagebind import ImageBindEmbeddings
|
||||
from .jinaai import JinaEmbeddings
|
||||
from .watsonx import WatsonxEmbeddings
|
||||
from .voyageai import VoyageAIEmbeddingFunction
|
||||
from .colpali import ColPaliEmbeddings
|
||||
|
||||
255
python/python/lancedb/embeddings/colpali.py
Normal file
255
python/python/lancedb/embeddings/colpali.py
Normal file
@@ -0,0 +1,255 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import List, Union, Optional, Any
|
||||
import numpy as np
|
||||
import io
|
||||
|
||||
from ..util import attempt_import_or_raise
|
||||
from .base import EmbeddingFunction
|
||||
from .registry import register
|
||||
from .utils import TEXT, IMAGES, is_flash_attn_2_available
|
||||
|
||||
|
||||
@register("colpali")
class ColPaliEmbeddings(EmbeddingFunction):
    """
    An embedding function that uses the ColPali engine for
    multimodal multi-vector embeddings.

    This embedding function supports ColQwen2.5 models, producing multivector outputs
    for both text and image inputs. The output embeddings are lists of vectors, each
    vector being 128-dimensional by default, represented as List[List[float]].

    Parameters
    ----------
    model_name : str
        The name of the model to use (e.g., "Metric-AI/ColQwen2.5-3b-multilingual-v1.0")
    device : str
        The device map passed to the model loader (default "auto").
    dtype : str
        Data type for model weights (default "bfloat16"). Recognised values are
        "bfloat16", "float16" and "float64"; any other value falls back to float32.
    use_token_pooling : bool
        Whether to use token pooling to reduce embedding size (default True).
    pool_factor : int
        Factor to reduce sequence length if token pooling is enabled (default 2).
    quantization_config : Optional[BitsAndBytesConfig]
        Quantization configuration for the model. (default None, bitsandbytes needed)
    batch_size : int
        Batch size for processing inputs (default 2).
    """

    model_name: str = "Metric-AI/ColQwen2.5-3b-multilingual-v1.0"
    device: str = "auto"
    dtype: str = "bfloat16"
    use_token_pooling: bool = True
    pool_factor: int = 2
    quantization_config: Optional[Any] = None
    batch_size: int = 2

    _model = None
    _processor = None
    _token_pooler = None
    _vector_dim = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        (
            self._model,
            self._processor,
            self._token_pooler,
        ) = self._load_model(
            self.model_name,
            self.dtype,
            self.device,
            self.use_token_pooling,
            self.quantization_config,
        )

    @staticmethod
    @lru_cache(maxsize=1)
    def _load_model(
        model_name: str,
        dtype: str,
        device: str,
        use_token_pooling: bool,
        quantization_config: Optional[Any],
    ):
        """
        Initialize and cache the ColPali model, processor, and token pooler.

        Returns a ``(model, processor, token_pooler)`` tuple; ``token_pooler`` is
        None when ``use_token_pooling`` is False.

        Raises
        ------
        ValueError
            If ``quantization_config`` is given but is not a BitsAndBytesConfig.
        """
        # NOTE(review): lru_cache hashes every argument, so a non-hashable
        # quantization_config would raise TypeError before this body runs --
        # confirm BitsAndBytesConfig instances are hashable.
        torch = attempt_import_or_raise("torch", "torch")
        transformers = attempt_import_or_raise("transformers", "transformers")
        colpali_engine = attempt_import_or_raise("colpali_engine", "colpali_engine")
        from colpali_engine.compression.token_pooling import HierarchicalTokenPooler

        if quantization_config is not None:
            if not isinstance(quantization_config, transformers.BitsAndBytesConfig):
                raise ValueError("quantization_config must be a BitsAndBytesConfig")

        # Unrecognised dtype names deliberately fall back to float32.
        if dtype == "bfloat16":
            torch_dtype = torch.bfloat16
        elif dtype == "float16":
            torch_dtype = torch.float16
        elif dtype == "float64":
            torch_dtype = torch.float64
        else:
            torch_dtype = torch.float32

        model = colpali_engine.models.ColQwen2_5.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            device_map=device,
            # Already None when no quantization was requested.
            quantization_config=quantization_config,
            attn_implementation="flash_attention_2"
            if is_flash_attn_2_available()
            else None,
        ).eval()
        processor = colpali_engine.models.ColQwen2_5_Processor.from_pretrained(
            model_name
        )
        token_pooler = HierarchicalTokenPooler() if use_token_pooling else None
        return model, processor, token_pooler

    def ndims(self):
        """
        Return the dimension of a vector in the multivector output (e.g., 128).

        The value is measured once by embedding a dummy query and is cached on
        the instance afterwards.
        """
        torch = attempt_import_or_raise("torch", "torch")
        if self._vector_dim is None:
            dummy_query = "test"
            batch_queries = self._processor.process_queries([dummy_query]).to(
                self._model.device
            )
            with torch.no_grad():
                query_embeddings = self._model(**batch_queries)

            if self.use_token_pooling and self._token_pooler is not None:
                query_embeddings = self._token_pooler.pool_embeddings(
                    query_embeddings,
                    pool_factor=self.pool_factor,
                    padding=True,
                    padding_side=self._processor.tokenizer.padding_side,
                )

            self._vector_dim = query_embeddings[0].shape[-1]
        return self._vector_dim

    def _process_embeddings(self, embeddings):
        """
        Format model embeddings into List[List[float]].
        Use token pooling if enabled.

        Returns an empty list when the (possibly pooled) embeddings are not a
        torch.Tensor.
        """
        torch = attempt_import_or_raise("torch", "torch")
        if self.use_token_pooling and self._token_pooler is not None:
            embeddings = self._token_pooler.pool_embeddings(
                embeddings,
                pool_factor=self.pool_factor,
                padding=True,
                padding_side=self._processor.tokenizer.padding_side,
            )

        if isinstance(embeddings, torch.Tensor):
            tensors = embeddings.detach().cpu()
            # bfloat16 is widened to float32 before the numpy conversion.
            if tensors.dtype == torch.bfloat16:
                tensors = tensors.to(torch.float32)
            return (
                tensors.numpy()
                .astype(np.float64 if self.dtype == "float64" else np.float32)
                .tolist()
            )
        return []

    def generate_text_embeddings(self, text: TEXT) -> List[List[List[float]]]:
        """
        Generate embeddings for text input, processed ``batch_size`` items at a time.
        """
        torch = attempt_import_or_raise("torch", "torch")
        text = self.sanitize_input(text)
        all_embeddings = []

        for i in range(0, len(text), self.batch_size):
            batch_text = text[i : i + self.batch_size]
            batch_queries = self._processor.process_queries(batch_text).to(
                self._model.device
            )
            with torch.no_grad():
                query_embeddings = self._model(**batch_queries)
            all_embeddings.extend(self._process_embeddings(query_embeddings))
        return all_embeddings

    def _prepare_images(self, images: IMAGES) -> List:
        """
        Convert image inputs to PIL Images.

        Strings starting with http:// or https:// are downloaded, other strings
        are opened as file paths, bytes are decoded in memory, and anything else
        is passed through unchanged.

        Raises
        ------
        ValueError
            If any image cannot be fetched or opened; the original exception is
            attached as the cause.
        """
        PIL = attempt_import_or_raise("PIL", "pillow")
        requests = attempt_import_or_raise("requests", "requests")
        images = self.sanitize_input(images)
        pil_images = []
        try:
            for image in images:
                if isinstance(image, str):
                    if image.startswith(("http://", "https://")):
                        response = requests.get(image, timeout=10)
                        response.raise_for_status()
                        pil_images.append(PIL.Image.open(io.BytesIO(response.content)))
                    else:
                        # Copy so the file handle can be closed right away.
                        with PIL.Image.open(image) as im:
                            pil_images.append(im.copy())
                elif isinstance(image, bytes):
                    pil_images.append(PIL.Image.open(io.BytesIO(image)))
                else:
                    # Assume it's a PIL Image; will raise if invalid
                    pil_images.append(image)
        except Exception as e:
            # Chain explicitly so the underlying failure stays in the traceback.
            raise ValueError(f"Failed to process image: {e}") from e

        return pil_images

    def generate_image_embeddings(self, images: IMAGES) -> List[List[List[float]]]:
        """
        Generate embeddings for a batch of images, processed ``batch_size`` at a time.
        """
        torch = attempt_import_or_raise("torch", "torch")
        pil_images = self._prepare_images(images)
        all_embeddings = []

        for i in range(0, len(pil_images), self.batch_size):
            batch_images = pil_images[i : i + self.batch_size]
            batch_images = self._processor.process_images(batch_images).to(
                self._model.device
            )
            with torch.no_grad():
                image_embeddings = self._model(**batch_images)
            all_embeddings.extend(self._process_embeddings(image_embeddings))
        return all_embeddings

    def compute_query_embeddings(
        self, query: Union[str, IMAGES], *args, **kwargs
    ) -> List[List[List[float]]]:
        """
        Compute embeddings for a single user query (text only).

        Raises
        ------
        ValueError
            If ``query`` is not a string.
        """
        if not isinstance(query, str):
            raise ValueError(
                "Query must be a string, image to image search is not supported"
            )
        return self.generate_text_embeddings([query])

    def compute_source_embeddings(
        self, images: IMAGES, *args, **kwargs
    ) -> List[List[List[float]]]:
        """
        Compute embeddings for a batch of source images.

        Parameters
        ----------
        images : Union[str, bytes, List, pa.Array, pa.ChunkedArray, np.ndarray]
            Batch of images (paths, URLs, bytes, or PIL Images).
        """
        images = self.sanitize_input(images)
        return self.generate_image_embeddings(images)
|
||||
@@ -18,6 +18,7 @@ import numpy as np
|
||||
import pyarrow as pa
|
||||
|
||||
from ..dependencies import pandas as pd
|
||||
from ..util import attempt_import_or_raise
|
||||
|
||||
|
||||
# ruff: noqa: PERF203
|
||||
@@ -275,3 +276,12 @@ def url_retrieve(url: str):
|
||||
def api_key_not_found_help(provider):
    """Log that no API key was found for ``provider`` and raise ValueError.

    The error message names the ``<PROVIDER>_API_KEY`` environment variable,
    built from the upper-cased provider name.
    """
    logging.error("Could not find API key for %s", provider)
    env_var = f"{provider.upper()}_API_KEY"
    raise ValueError(f"Please set the {env_var} environment variable.")
|
||||
|
||||
|
||||
def is_flash_attn_2_available():
    """Return True if the ``flash_attn`` package can be imported, else False."""
    try:
        attempt_import_or_raise("flash_attn", "flash_attn")
    except ImportError:
        return False
    else:
        return True
|
||||
|
||||
@@ -4,10 +4,14 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .common import DATA
|
||||
from ._lancedb import (
|
||||
MergeInsertResult,
|
||||
)
|
||||
|
||||
|
||||
class LanceMergeInsertBuilder(object):
|
||||
@@ -28,6 +32,7 @@ class LanceMergeInsertBuilder(object):
|
||||
self._when_not_matched_insert_all = False
|
||||
self._when_not_matched_by_source_delete = False
|
||||
self._when_not_matched_by_source_condition = None
|
||||
self._timeout = None
|
||||
|
||||
def when_matched_update_all(
|
||||
self, *, where: Optional[str] = None
|
||||
@@ -78,7 +83,8 @@ class LanceMergeInsertBuilder(object):
|
||||
new_data: DATA,
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
):
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> MergeInsertResult:
|
||||
"""
|
||||
Executes the merge insert operation
|
||||
|
||||
@@ -95,5 +101,24 @@ class LanceMergeInsertBuilder(object):
|
||||
One of "error", "drop", "fill".
|
||||
fill_value: float, default 0.
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
timeout: Optional[timedelta], default None
|
||||
Maximum time to run the operation before cancelling it.
|
||||
|
||||
By default, there is a 30-second timeout that is only enforced after the
|
||||
first attempt. This is to prevent spending too long retrying to resolve
|
||||
conflicts. For example, if a write attempt takes 20 seconds and fails,
|
||||
the second attempt will be cancelled after 10 seconds, hitting the
|
||||
30-second timeout. However, a write that takes one hour and succeeds on the
|
||||
first attempt will not be cancelled.
|
||||
|
||||
When this is set, the timeout is enforced on all attempts, including
|
||||
the first.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MergeInsertResult
|
||||
version: the new version number of the table after doing merge insert.
|
||||
"""
|
||||
if timeout is not None:
|
||||
self._timeout = timeout
|
||||
return self._table._do_merge(self, new_data, on_bad_vectors, fill_value)
|
||||
|
||||
@@ -152,6 +152,104 @@ def Vector(
|
||||
return FixedSizeList
|
||||
|
||||
|
||||
def MultiVector(
    dim: int, value_type: pa.DataType = pa.float32(), nullable: bool = True
) -> Type:
    """Pydantic MultiVector Type for multi-vector embeddings.

    Builds a list subclass whose values are lists of vectors that all share
    the same dimension. Useful for models that emit several embeddings per
    input, like ColPali.

    Parameters
    ----------
    dim : int
        The dimension of each vector in the multi-vector.
    value_type : pyarrow.DataType, optional
        The value type of the vectors, by default pa.float32()
    nullable : bool, optional
        Whether the multi-vector is nullable, by default it is True.

    Examples
    --------

    >>> import pydantic
    >>> from lancedb.pydantic import MultiVector
    ...
    >>> class MyModel(pydantic.BaseModel):
    ...     id: int
    ...     text: str
    ...     embeddings: MultiVector(128)  # List of 128-dimensional vectors
    >>> schema = pydantic_to_schema(MyModel)
    >>> assert schema == pa.schema([
    ...     pa.field("id", pa.int64(), False),
    ...     pa.field("text", pa.utf8(), False),
    ...     pa.field("embeddings", pa.list_(pa.list_(pa.float32(), 128)))
    ... ])
    """

    class MultiVectorList(list, FixedSizeListMixin):
        def __repr__(self):
            return f"MultiVector(dim={dim})"

        # The accessors below read the enclosing function's arguments.
        @staticmethod
        def nullable() -> bool:
            return nullable

        @staticmethod
        def dim() -> int:
            return dim

        @staticmethod
        def value_arrow_type() -> pa.DataType:
            return value_type

        @staticmethod
        def is_multi_vector() -> bool:
            return True

        @classmethod
        def __get_pydantic_core_schema__(
            cls, _source_type: Any, _handler: pydantic.GetCoreSchemaHandler
        ) -> CoreSchema:
            # Inner schema: one vector of exactly `dim` floats.
            single_vector = core_schema.list_schema(
                min_length=dim,
                max_length=dim,
                items_schema=core_schema.float_schema(),
            )
            # Outer schema: any number of such vectors, then wrapped in `cls`.
            vector_list = core_schema.list_schema(items_schema=single_vector)
            return core_schema.no_info_after_validator_function(cls, vector_list)

        @classmethod
        def __get_validators__(cls) -> Generator[Callable, None, None]:
            yield cls.validate

        # For pydantic v1
        @classmethod
        def validate(cls, v):
            if not isinstance(v, (list, range)):
                raise TypeError("A list of vectors is needed")
            has_bad_vector = any(
                not isinstance(vec, (list, range, np.ndarray)) or len(vec) != dim
                for vec in v
            )
            if has_bad_vector:
                raise TypeError(f"Each vector must be a list of {dim} numbers")
            return cls(v)

        if PYDANTIC_VERSION.major < 2:

            @classmethod
            def __modify_schema__(cls, field_schema: Dict[str, Any]):
                field_schema["items"] = {
                    "type": "array",
                    "items": {"type": "number"},
                    "minItems": dim,
                    "maxItems": dim,
                }

    return MultiVectorList
|
||||
|
||||
|
||||
def _py_type_to_arrow_type(py_type: Type[Any], field: FieldInfo) -> pa.DataType:
|
||||
"""Convert a field with native Python type to Arrow data type.
|
||||
|
||||
@@ -206,6 +304,9 @@ def _pydantic_type_to_arrow_type(tp: Any, field: FieldInfo) -> pa.DataType:
|
||||
fields = _pydantic_model_to_fields(tp)
|
||||
return pa.struct(fields)
|
||||
if issubclass(tp, FixedSizeListMixin):
|
||||
if getattr(tp, "is_multi_vector", lambda: False)():
|
||||
return pa.list_(pa.list_(tp.value_arrow_type(), tp.dim()))
|
||||
# For regular Vector
|
||||
return pa.list_(tp.value_arrow_type(), tp.dim())
|
||||
return _py_type_to_arrow_type(tp, field)
|
||||
|
||||
@@ -314,6 +415,7 @@ class LanceModel(pydantic.BaseModel):
|
||||
>>> table.add([
|
||||
... TestModel(name="test", vector=[1.0, 2.0])
|
||||
... ])
|
||||
AddResult(version=2)
|
||||
>>> table.search([0., 0.]).limit(1).to_pydantic(TestModel)
|
||||
[TestModel(name='test', vector=FixedSizeList(dim=2))]
|
||||
"""
|
||||
|
||||
@@ -28,6 +28,8 @@ import pyarrow.compute as pc
|
||||
import pyarrow.fs as pa_fs
|
||||
import pydantic
|
||||
|
||||
from lancedb.pydantic import PYDANTIC_VERSION
|
||||
|
||||
from . import __version__
|
||||
from .arrow import AsyncRecordBatchReader
|
||||
from .dependencies import pandas as pd
|
||||
@@ -498,10 +500,14 @@ class Query(pydantic.BaseModel):
|
||||
)
|
||||
return query
|
||||
|
||||
class Config:
|
||||
# This tells pydantic to allow custom types (needed for the `vector` query since
|
||||
# pa.Array wouldn't be allowed otherwise)
|
||||
arbitrary_types_allowed = True
|
||||
# This tells pydantic to allow custom types (needed for the `vector` query since
|
||||
# pa.Array wouldn't be allowed otherwise)
|
||||
if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
else:
|
||||
model_config = {"arbitrary_types_allowed": True}
|
||||
|
||||
|
||||
class LanceQueryBuilder(ABC):
|
||||
@@ -1586,6 +1592,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
self._refine_factor = None
|
||||
self._distance_type = None
|
||||
self._phrase_query = None
|
||||
self._lower_bound = None
|
||||
self._upper_bound = None
|
||||
|
||||
def _validate_query(self, query, vector=None, text=None):
|
||||
if query is not None and (vector is not None or text is not None):
|
||||
@@ -1628,47 +1636,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
raise NotImplementedError("to_query_object not yet supported on a hybrid query")
|
||||
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
vector_query, fts_query = self._validate_query(
|
||||
self._query, self._vector, self._text
|
||||
)
|
||||
self._fts_query = LanceFtsQueryBuilder(
|
||||
self._table, fts_query, fts_columns=self._fts_columns
|
||||
)
|
||||
vector_query = self._query_to_vector(
|
||||
self._table, vector_query, self._vector_column
|
||||
)
|
||||
self._vector_query = LanceVectorQueryBuilder(
|
||||
self._table, vector_query, self._vector_column
|
||||
)
|
||||
|
||||
if self._limit:
|
||||
self._vector_query.limit(self._limit)
|
||||
self._fts_query.limit(self._limit)
|
||||
if self._columns:
|
||||
self._vector_query.select(self._columns)
|
||||
self._fts_query.select(self._columns)
|
||||
if self._where:
|
||||
self._vector_query.where(self._where, self._postfilter)
|
||||
self._fts_query.where(self._where, self._postfilter)
|
||||
if self._with_row_id:
|
||||
self._vector_query.with_row_id(True)
|
||||
self._fts_query.with_row_id(True)
|
||||
if self._phrase_query:
|
||||
self._fts_query.phrase_query(True)
|
||||
if self._distance_type:
|
||||
self._vector_query.metric(self._distance_type)
|
||||
if self._nprobes:
|
||||
self._vector_query.nprobes(self._nprobes)
|
||||
if self._refine_factor:
|
||||
self._vector_query.refine_factor(self._refine_factor)
|
||||
if self._ef:
|
||||
self._vector_query.ef(self._ef)
|
||||
if self._bypass_vector_index:
|
||||
self._vector_query.bypass_vector_index()
|
||||
|
||||
if self._reranker is None:
|
||||
self._reranker = RRFReranker()
|
||||
|
||||
self._create_query_builders()
|
||||
with ThreadPoolExecutor() as executor:
|
||||
fts_future = executor.submit(
|
||||
self._fts_query.with_row_id(True).to_arrow, timeout=timeout
|
||||
@@ -1991,6 +1959,112 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
self._bypass_vector_index = True
|
||||
return self
|
||||
|
||||
def explain_plan(self, verbose: Optional[bool] = False) -> str:
    """Return the execution plan for this query.

    The vector and FTS sub-queries are (re)built first; the result is the
    vector plan followed by the FTS plan, each under its own heading line.

    Examples
    --------
    >>> import lancedb
    >>> db = lancedb.connect("./.lancedb")
    >>> table = db.create_table("my_table", [{"vector": [99.0, 99]}])
    >>> query = [100, 100]
    >>> plan = table.search(query).explain_plan(True)
    >>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
      GlobalLimitExec: skip=0, fetch=10
        FilterExec: _distance@2 IS NOT NULL
          SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
            KNNVectorDistance: metric=l2
              LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false

    Parameters
    ----------
    verbose : bool, default False
        Use a verbose output format.

    Returns
    -------
    plan : str
    """  # noqa: E501
    self._create_query_builders()

    vector_plan = self._table._explain_plan(
        self._vector_query.to_query_object(), verbose=verbose
    )
    fts_plan = self._table._explain_plan(
        self._fts_query.to_query_object(), verbose=verbose
    )
    return "\n".join(
        ["Vector Search Plan:", vector_plan, "FTS Search Plan:", fts_plan]
    )
|
||||
|
||||
def analyze_plan(self):
    """Execute the query and display with runtime metrics.

    The vector and FTS sub-queries are (re)built first; the result is the
    vector plan followed by the FTS plan, each under its own heading line.

    Returns
    -------
    plan : str
    """
    self._create_query_builders()

    vector_plan = self._table._analyze_plan(self._vector_query.to_query_object())
    fts_plan = self._table._analyze_plan(self._fts_query.to_query_object())
    return "\n".join(
        ["Vector Search Plan:", vector_plan, "FTS Search Plan:", fts_plan]
    )
|
||||
|
||||
def _create_query_builders(self):
    """Set up and configure the vector and FTS query builders.

    Rebuilds ``self._fts_query`` and ``self._vector_query`` from the current
    query state, copies the builder settings onto them, and installs an
    ``RRFReranker`` when no reranker has been set.
    """
    vector_query, fts_query = self._validate_query(
        self._query, self._vector, self._text
    )
    self._fts_query = LanceFtsQueryBuilder(
        self._table, fts_query, fts_columns=self._fts_columns
    )
    vector_query = self._query_to_vector(
        self._table, vector_query, self._vector_column
    )
    self._vector_query = LanceVectorQueryBuilder(
        self._table, vector_query, self._vector_column
    )

    # Apply common configurations
    if self._limit:
        self._vector_query.limit(self._limit)
        self._fts_query.limit(self._limit)
    if self._columns:
        self._vector_query.select(self._columns)
        self._fts_query.select(self._columns)
    if self._where:
        self._vector_query.where(self._where, self._postfilter)
        self._fts_query.where(self._where, self._postfilter)
    if self._with_row_id:
        self._vector_query.with_row_id(True)
        self._fts_query.with_row_id(True)

    # FTS-only and vector-only settings
    if self._phrase_query:
        self._fts_query.phrase_query(True)
    if self._distance_type:
        self._vector_query.metric(self._distance_type)
    if self._nprobes:
        self._vector_query.nprobes(self._nprobes)
    if self._refine_factor:
        self._vector_query.refine_factor(self._refine_factor)
    if self._ef:
        self._vector_query.ef(self._ef)
    if self._bypass_vector_index:
        self._vector_query.bypass_vector_index()
    # Compare against None rather than truthiness: a bound of 0.0 is a
    # legitimate value and must not be silently dropped.
    if self._lower_bound is not None or self._upper_bound is not None:
        self._vector_query.distance_range(
            lower_bound=self._lower_bound, upper_bound=self._upper_bound
        )

    if self._reranker is None:
        self._reranker = RRFReranker()
|
||||
|
||||
|
||||
class AsyncQueryBase(object):
|
||||
def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
|
||||
|
||||
@@ -7,7 +7,16 @@ from functools import cached_property
|
||||
from typing import Dict, Iterable, List, Optional, Union, Literal
|
||||
import warnings
|
||||
|
||||
from lancedb._lancedb import IndexConfig
|
||||
from lancedb._lancedb import (
|
||||
AddColumnsResult,
|
||||
AddResult,
|
||||
AlterColumnsResult,
|
||||
DeleteResult,
|
||||
DropColumnsResult,
|
||||
IndexConfig,
|
||||
MergeResult,
|
||||
UpdateResult,
|
||||
)
|
||||
from lancedb.embeddings.base import EmbeddingFunctionConfig
|
||||
from lancedb.index import FTS, BTree, Bitmap, HnswPq, HnswSq, IvfFlat, IvfPq, LabelList
|
||||
from lancedb.remote.db import LOOP
|
||||
@@ -18,7 +27,7 @@ from lancedb.merge import LanceMergeInsertBuilder
|
||||
from lancedb.embeddings import EmbeddingFunctionRegistry
|
||||
|
||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder
|
||||
from ..table import AsyncTable, IndexStatistics, Query, Table
|
||||
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
||||
|
||||
|
||||
class RemoteTable(Table):
|
||||
@@ -38,9 +47,6 @@ class RemoteTable(Table):
|
||||
def __repr__(self) -> str:
|
||||
return f"RemoteTable({self.db_name}.{self.name})"
|
||||
|
||||
def __len__(self) -> int:
|
||||
self.count_rows(None)
|
||||
|
||||
@property
|
||||
def schema(self) -> pa.Schema:
|
||||
"""The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
|
||||
@@ -54,6 +60,10 @@ class RemoteTable(Table):
|
||||
"""Get the current version of the table"""
|
||||
return LOOP.run(self._table.version())
|
||||
|
||||
@property
def tags(self) -> Tags:
    """Return a ``Tags`` object constructed from this table's ``_table`` handle."""
    inner_table = self._table
    return Tags(inner_table)
|
||||
|
||||
@cached_property
|
||||
def embedding_functions(self) -> Dict[str, EmbeddingFunctionConfig]:
|
||||
"""
|
||||
@@ -81,13 +91,13 @@ class RemoteTable(Table):
|
||||
"""to_pandas() is not yet supported on LanceDB cloud."""
|
||||
return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||
|
||||
def checkout(self, version: int):
|
||||
def checkout(self, version: Union[int, str]):
|
||||
return LOOP.run(self._table.checkout(version))
|
||||
|
||||
def checkout_latest(self):
    """Run the wrapped table's ``checkout_latest()`` on ``LOOP`` and return its result."""
    pending = self._table.checkout_latest()
    return LOOP.run(pending)
|
||||
|
||||
def restore(self, version: Optional[int] = None):
|
||||
def restore(self, version: Optional[Union[int, str]] = None):
|
||||
return LOOP.run(self._table.restore(version))
|
||||
|
||||
def list_indices(self) -> Iterable[IndexConfig]:
|
||||
@@ -104,6 +114,7 @@ class RemoteTable(Table):
|
||||
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
|
||||
*,
|
||||
replace: bool = False,
|
||||
wait_timeout: timedelta = None,
|
||||
):
|
||||
"""Creates a scalar index
|
||||
Parameters
|
||||
@@ -126,13 +137,18 @@ class RemoteTable(Table):
|
||||
else:
|
||||
raise ValueError(f"Unknown index type: {index_type}")
|
||||
|
||||
LOOP.run(self._table.create_index(column, config=config, replace=replace))
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
column, config=config, replace=replace, wait_timeout=wait_timeout
|
||||
)
|
||||
)
|
||||
|
||||
def create_fts_index(
|
||||
self,
|
||||
column: str,
|
||||
*,
|
||||
replace: bool = False,
|
||||
wait_timeout: timedelta = None,
|
||||
with_position: bool = True,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
@@ -153,7 +169,11 @@ class RemoteTable(Table):
|
||||
remove_stop_words=remove_stop_words,
|
||||
ascii_folding=ascii_folding,
|
||||
)
|
||||
LOOP.run(self._table.create_index(column, config=config, replace=replace))
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
column, config=config, replace=replace, wait_timeout=wait_timeout
|
||||
)
|
||||
)
|
||||
|
||||
def create_index(
|
||||
self,
|
||||
@@ -165,6 +185,7 @@ class RemoteTable(Table):
|
||||
replace: Optional[bool] = None,
|
||||
accelerator: Optional[str] = None,
|
||||
index_type="vector",
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
):
|
||||
"""Create an index on the table.
|
||||
Currently, the only parameters that matter are
|
||||
@@ -236,7 +257,11 @@ class RemoteTable(Table):
|
||||
" 'IVF_FLAT', 'IVF_PQ', 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
|
||||
)
|
||||
|
||||
LOOP.run(self._table.create_index(vector_column_name, config=config))
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
vector_column_name, config=config, wait_timeout=wait_timeout
|
||||
)
|
||||
)
|
||||
|
||||
def add(
|
||||
self,
|
||||
@@ -244,7 +269,7 @@ class RemoteTable(Table):
|
||||
mode: str = "append",
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
) -> int:
|
||||
) -> AddResult:
|
||||
"""Add more data to the [Table](Table). It has the same API signature as
|
||||
the OSS version.
|
||||
|
||||
@@ -267,8 +292,12 @@ class RemoteTable(Table):
|
||||
fill_value: float, default 0.
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
|
||||
Returns
|
||||
-------
|
||||
AddResult
|
||||
An object containing the new version number of the table after adding data.
|
||||
"""
|
||||
LOOP.run(
|
||||
return LOOP.run(
|
||||
self._table.add(
|
||||
data, mode=mode, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
||||
)
|
||||
@@ -394,10 +423,12 @@ class RemoteTable(Table):
|
||||
new_data: DATA,
|
||||
on_bad_vectors: str,
|
||||
fill_value: float,
|
||||
):
|
||||
LOOP.run(self._table._do_merge(merge, new_data, on_bad_vectors, fill_value))
|
||||
) -> MergeResult:
|
||||
return LOOP.run(
|
||||
self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)
|
||||
)
|
||||
|
||||
def delete(self, predicate: str):
|
||||
def delete(self, predicate: str) -> DeleteResult:
|
||||
"""Delete rows from the table.
|
||||
|
||||
This can be used to delete a single row, many rows, all rows, or
|
||||
@@ -412,6 +443,11 @@ class RemoteTable(Table):
|
||||
|
||||
The filter must not be empty, or it will error.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DeleteResult
|
||||
An object containing the new version number of the table after deletion.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import lancedb
|
||||
@@ -444,7 +480,7 @@ class RemoteTable(Table):
|
||||
x vector _distance # doctest: +SKIP
|
||||
0 2 [3.0, 4.0] 85.0 # doctest: +SKIP
|
||||
"""
|
||||
LOOP.run(self._table.delete(predicate))
|
||||
return LOOP.run(self._table.delete(predicate))
|
||||
|
||||
def update(
|
||||
self,
|
||||
@@ -452,7 +488,7 @@ class RemoteTable(Table):
|
||||
values: Optional[dict] = None,
|
||||
*,
|
||||
values_sql: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
) -> UpdateResult:
|
||||
"""
|
||||
This can be used to update zero to all rows depending on how many
|
||||
rows match the where clause.
|
||||
@@ -470,6 +506,12 @@ class RemoteTable(Table):
|
||||
reference existing columns. For example, {"x": "x + 1"} will increment
|
||||
the x column by 1.
|
||||
|
||||
Returns
|
||||
-------
|
||||
UpdateResult
|
||||
- rows_updated: The number of rows that were updated
|
||||
- version: The new version number of the table after the update
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import lancedb
|
||||
@@ -494,7 +536,7 @@ class RemoteTable(Table):
|
||||
2 2 [10.0, 10.0] # doctest: +SKIP
|
||||
|
||||
"""
|
||||
LOOP.run(
|
||||
return LOOP.run(
|
||||
self._table.update(where=where, updates=values, updates_sql=values_sql)
|
||||
)
|
||||
|
||||
@@ -542,18 +584,28 @@ class RemoteTable(Table):
|
||||
def count_rows(self, filter: Optional[str] = None) -> int:
    """Run the wrapped table's ``count_rows(filter)`` on ``LOOP`` and return its result."""
    pending = self._table.count_rows(filter)
    return LOOP.run(pending)
|
||||
|
||||
def add_columns(self, transforms: Dict[str, str]):
|
||||
def add_columns(self, transforms: Dict[str, str]) -> AddColumnsResult:
|
||||
return LOOP.run(self._table.add_columns(transforms))
|
||||
|
||||
def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
|
||||
def alter_columns(
|
||||
self, *alterations: Iterable[Dict[str, str]]
|
||||
) -> AlterColumnsResult:
|
||||
return LOOP.run(self._table.alter_columns(*alterations))
|
||||
|
||||
def drop_columns(self, columns: Iterable[str]):
|
||||
def drop_columns(self, columns: Iterable[str]) -> DropColumnsResult:
|
||||
return LOOP.run(self._table.drop_columns(columns))
|
||||
|
||||
def drop_index(self, index_name: str):
    """Run the wrapped table's ``drop_index(index_name)`` on ``LOOP`` and return its result."""
    pending = self._table.drop_index(index_name)
    return LOOP.run(pending)
|
||||
|
||||
def wait_for_index(
    self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
):
    """Run the wrapped table's ``wait_for_index`` on ``LOOP`` and return its result.

    Parameters
    ----------
    index_names : Iterable[str]
        Names of the indices to wait for.
    timeout : timedelta, default 300 seconds
        Timeout forwarded to the wrapped table.
    """
    pending = self._table.wait_for_index(index_names, timeout)
    return LOOP.run(pending)
|
||||
|
||||
def stats(self):
    """Run the wrapped table's ``stats()`` on ``LOOP`` and return its result."""
    pending = self._table.stats()
    return LOOP.run(pending)
|
||||
|
||||
def uses_v2_manifest_paths(self) -> bool:
|
||||
raise NotImplementedError(
|
||||
"uses_v2_manifest_paths() is not supported on the LanceDB Cloud"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -18,15 +18,19 @@ def test_upsert(mem_db):
|
||||
{"id": 1, "name": "Bobby"},
|
||||
{"id": 2, "name": "Charlie"},
|
||||
]
|
||||
(
|
||||
res = (
|
||||
table.merge_insert("id")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.execute(new_users)
|
||||
)
|
||||
table.count_rows() # 3
|
||||
res # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}
|
||||
# --8<-- [end:upsert_basic]
|
||||
assert table.count_rows() == 3
|
||||
assert res.num_inserted_rows == 1
|
||||
assert res.num_deleted_rows == 0
|
||||
assert res.num_updated_rows == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -44,15 +48,22 @@ async def test_upsert_async(mem_db_async):
|
||||
{"id": 1, "name": "Bobby"},
|
||||
{"id": 2, "name": "Charlie"},
|
||||
]
|
||||
await (
|
||||
res = await (
|
||||
table.merge_insert("id")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.execute(new_users)
|
||||
)
|
||||
await table.count_rows() # 3
|
||||
res
|
||||
# MergeResult(version=2, num_updated_rows=1,
|
||||
# num_inserted_rows=1, num_deleted_rows=0)
|
||||
# --8<-- [end:upsert_basic_async]
|
||||
assert await table.count_rows() == 3
|
||||
assert res.version == 2
|
||||
assert res.num_inserted_rows == 1
|
||||
assert res.num_deleted_rows == 0
|
||||
assert res.num_updated_rows == 1
|
||||
|
||||
|
||||
def test_insert_if_not_exists(mem_db):
|
||||
@@ -69,10 +80,19 @@ def test_insert_if_not_exists(mem_db):
|
||||
{"domain": "google.com", "name": "Google"},
|
||||
{"domain": "facebook.com", "name": "Facebook"},
|
||||
]
|
||||
(table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains))
|
||||
res = (
|
||||
table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
|
||||
)
|
||||
table.count_rows() # 3
|
||||
res
|
||||
# MergeResult(version=2, num_updated_rows=0,
|
||||
# num_inserted_rows=1, num_deleted_rows=0)
|
||||
# --8<-- [end:insert_if_not_exists]
|
||||
assert table.count_rows() == 3
|
||||
assert res.version == 2
|
||||
assert res.num_inserted_rows == 1
|
||||
assert res.num_deleted_rows == 0
|
||||
assert res.num_updated_rows == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -90,12 +110,19 @@ async def test_insert_if_not_exists_async(mem_db_async):
|
||||
{"domain": "google.com", "name": "Google"},
|
||||
{"domain": "facebook.com", "name": "Facebook"},
|
||||
]
|
||||
await (
|
||||
res = await (
|
||||
table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
|
||||
)
|
||||
await table.count_rows() # 3
|
||||
# --8<-- [end:insert_if_not_exists_async]
|
||||
res
|
||||
# MergeResult(version=2, num_updated_rows=0,
|
||||
# num_inserted_rows=1, num_deleted_rows=0)
|
||||
# --8<-- [end:insert_if_not_exists_async]
|
||||
assert await table.count_rows() == 3
|
||||
assert res.version == 2
|
||||
assert res.num_inserted_rows == 1
|
||||
assert res.num_deleted_rows == 0
|
||||
assert res.num_updated_rows == 0
|
||||
|
||||
|
||||
def test_replace_range(mem_db):
|
||||
@@ -113,7 +140,7 @@ def test_replace_range(mem_db):
|
||||
new_chunks = [
|
||||
{"doc_id": 1, "chunk_id": 0, "text": "Baz"},
|
||||
]
|
||||
(
|
||||
res = (
|
||||
table.merge_insert(["doc_id", "chunk_id"])
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
@@ -121,8 +148,15 @@ def test_replace_range(mem_db):
|
||||
.execute(new_chunks)
|
||||
)
|
||||
table.count_rows("doc_id = 1") # 1
|
||||
# --8<-- [end:replace_range]
|
||||
res
|
||||
# MergeResult(version=2, num_updated_rows=1,
|
||||
# num_inserted_rows=0, num_deleted_rows=1)
|
||||
# --8<-- [end:replace_range]
|
||||
assert table.count_rows("doc_id = 1") == 1
|
||||
assert res.version == 2
|
||||
assert res.num_inserted_rows == 0
|
||||
assert res.num_deleted_rows == 1
|
||||
assert res.num_updated_rows == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -141,7 +175,7 @@ async def test_replace_range_async(mem_db_async):
|
||||
new_chunks = [
|
||||
{"doc_id": 1, "chunk_id": 0, "text": "Baz"},
|
||||
]
|
||||
await (
|
||||
res = await (
|
||||
table.merge_insert(["doc_id", "chunk_id"])
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
@@ -149,5 +183,12 @@ async def test_replace_range_async(mem_db_async):
|
||||
.execute(new_chunks)
|
||||
)
|
||||
await table.count_rows("doc_id = 1") # 1
|
||||
# --8<-- [end:replace_range_async]
|
||||
res
|
||||
# MergeResult(version=2, num_updated_rows=1,
|
||||
# num_inserted_rows=0, num_deleted_rows=1)
|
||||
# --8<-- [end:replace_range_async]
|
||||
assert await table.count_rows("doc_id = 1") == 1
|
||||
assert res.version == 2
|
||||
assert res.num_inserted_rows == 0
|
||||
assert res.num_deleted_rows == 1
|
||||
assert res.num_updated_rows == 1
|
||||
|
||||
@@ -11,7 +11,7 @@ import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from lancedb.embeddings import get_registry
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.pydantic import LanceModel, Vector, MultiVector
|
||||
import requests
|
||||
|
||||
# These are integration tests for embedding functions.
|
||||
@@ -575,3 +575,67 @@ def test_voyageai_multimodal_embedding_text_function():
|
||||
|
||||
tbl.add(df)
|
||||
assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.skipif(
    importlib.util.find_spec("colpali_engine") is None,
    reason="colpali_engine not installed",
)
def test_colpali(tmp_path):
    """Integration test for the "colpali" embedding function.

    Six sample images are downloaded and stored in a table whose
    ``image_vectors`` column is declared as a multivector filled in by the
    embedding function. A text query is then run against that column and the
    stored multivector's shape is checked.
    """
    import requests
    from lancedb.pydantic import LanceModel

    colpali = get_registry().get("colpali").create()
    db = lancedb.connect(tmp_path)

    class MediaItems(LanceModel):
        text: str
        image_uri: str = colpali.SourceField()
        image_bytes: bytes = colpali.SourceField()
        # Multivector image embeddings
        image_vectors: MultiVector(colpali.ndims()) = colpali.VectorField()

    table = db.create_table("media", schema=MediaItems)

    captions = [
        "a cute cat playing with yarn",
        "a puppy in a flower field",
        "a red sports car on the highway",
        "a vintage bicycle leaning against a wall",
        "a plate of delicious pasta",
        "fresh fruit salad in a bowl",
    ]
    image_urls = [
        "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
        "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
        "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
        "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
        "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
        "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
    ]

    # Fetch the raw bytes of every image, in the same order as the captions.
    raw_images = []
    for url in image_urls:
        raw_images.append(requests.get(url).content)

    frame = pd.DataFrame(
        {"text": captions, "image_uri": image_urls, "image_bytes": raw_images}
    )
    table.add(frame)

    # Text-to-image search: the best hit should be one of the two pet photos.
    query = table.search("fluffy companion", vector_column_name="image_vectors")
    top_hit = query.limit(1).to_pydantic(MediaItems)[0]
    assert "cat" in top_hit.text.lower() or "puppy" in top_hit.text.lower()

    # The stored embedding must be a list of several vectors, each ndims() wide.
    stored_vectors = table.to_arrow().to_pylist()[0]["image_vectors"]
    assert len(stored_vectors) > 1, "Should have multiple image vectors"
    assert len(stored_vectors[0]) == colpali.ndims(), "Vector dimension mismatch"
|
||||
|
||||
@@ -4,13 +4,32 @@
|
||||
import lancedb
|
||||
|
||||
from lancedb.query import LanceHybridQueryBuilder
|
||||
from lancedb.rerankers.rrf import RRFReranker
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from lancedb.index import FTS
|
||||
from lancedb.table import AsyncTable
|
||||
from lancedb.table import AsyncTable, Table
|
||||
|
||||
|
||||
@pytest.fixture
def sync_table(tmpdir_factory) -> Table:
    """Synchronous four-row table with a native FTS index on ``text``.

    The table has a string column ``text`` and a fixed-size (2) float32
    ``vector`` column, and lives in a fresh temporary directory.
    """
    text_column = pa.array(["a", "b", "cat", "dog"])
    vector_column = pa.array(
        [[0.1, 0.1], [2, 2], [-0.1, -0.1], [0.5, -0.5]],
        type=pa.list_(pa.float32(), list_size=2),
    )
    rows = pa.table({"text": text_column, "vector": vector_column})

    db_dir = str(tmpdir_factory.mktemp("data"))
    tbl = lancedb.connect(db_dir).create_table("test", rows)
    tbl.create_fts_index("text", with_position=False, use_tantivy=False)
    return tbl
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
@@ -102,6 +121,42 @@ async def test_async_hybrid_query_default_limit(table: AsyncTable):
|
||||
assert texts.count("a") == 1
|
||||
|
||||
|
||||
def test_hybrid_query_distance_range(sync_table: Table):
    """A sync hybrid query honours ``distance_range`` on its vector results."""
    rrf = RRFReranker(return_score="all")
    query = (
        sync_table.search(query_type="hybrid")
        .vector([0.0, 0.4])
        .text("cat and dog")
        .distance_range(lower_bound=0.2, upper_bound=0.5)
        .rerank(rrf)
        .limit(2)
    )
    hits = query.to_arrow()
    assert len(hits) == 2
    print(hits)

    # Null distances are skipped; presumably these are rows matched only by
    # the full-text side of the query -- TODO confirm.
    distances = [d.as_py() for d in hits["_distance"] if d.is_valid]
    assert all(0.2 <= value <= 0.5 for value in distances)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_hybrid_query_distance_range_async(table: AsyncTable):
    """An async hybrid query honours ``distance_range`` on its vector results."""
    rrf = RRFReranker(return_score="all")
    query = (
        table.query()
        .nearest_to([0.0, 0.4])
        .nearest_to_text("cat and dog")
        .distance_range(lower_bound=0.2, upper_bound=0.5)
        .rerank(rrf)
        .limit(2)
    )
    hits = await query.to_arrow()
    assert len(hits) == 2

    # Only non-null distances are checked against the requested bounds.
    distances = [d.as_py() for d in hits["_distance"] if d.is_valid]
    assert all(0.2 <= value <= 0.5 for value in distances)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan(table: AsyncTable):
|
||||
plan = await (
|
||||
|
||||
@@ -8,7 +8,7 @@ import pyarrow as pa
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from lancedb import AsyncConnection, AsyncTable, connect_async
|
||||
from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq
|
||||
from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
@@ -119,6 +119,18 @@ async def test_create_label_list_index(some_table: AsyncTable):
|
||||
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_full_text_search_index(some_table: AsyncTable):
    """Create an FTS index on ``tags``, prewarm it, and search through it."""
    fts_config = FTS(with_position=False)
    await some_table.create_index("tags", config=fts_config)

    listed = await some_table.list_indices()
    assert str(listed) == '[Index(FTS, columns=["tags"], name="tags_idx")]'

    await some_table.prewarm_index("tags_idx")

    # search() is itself awaitable and yields the query to execute.
    query = await some_table.search("tag0")
    hits = await query.to_arrow()
    assert hits.num_rows > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_vector_index(some_table: AsyncTable):
|
||||
# Can create
|
||||
|
||||
@@ -9,7 +9,13 @@ from typing import List, Optional, Tuple
|
||||
import pyarrow as pa
|
||||
import pydantic
|
||||
import pytest
|
||||
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, Vector, pydantic_to_schema
|
||||
from lancedb.pydantic import (
|
||||
PYDANTIC_VERSION,
|
||||
LanceModel,
|
||||
Vector,
|
||||
pydantic_to_schema,
|
||||
MultiVector,
|
||||
)
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
|
||||
@@ -354,3 +360,55 @@ def test_optional_nested_model():
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def test_multi_vector():
    """``MultiVector(8)`` maps to list<fixed_size_list<float32, 8>> and validates width."""

    class TestModel(pydantic.BaseModel):
        vec: MultiVector(8)

    inner_type = pa.list_(pa.float32(), 8)
    expected = pa.schema([pa.field("vec", pa.list_(inner_type), True)])
    assert pydantic_to_schema(TestModel) == expected

    # Inner vectors that are too short or too long must be rejected.
    for bad_width in (7, 9):
        with pytest.raises(pydantic.ValidationError):
            TestModel(vec=[[1.0] * bad_width])

    # One vector, several vectors, and no vectors at all are all accepted.
    for accepted in ([[1.0] * 8], [[1.0] * 8, [2.0] * 8], []):
        TestModel(vec=accepted)
|
||||
|
||||
|
||||
def test_multi_vector_nullable():
    """The ``nullable`` flag of ``MultiVector`` reaches the Arrow field; default is True."""

    def expected_schema(nullable):
        inner_type = pa.list_(pa.float32(), 16)
        return pa.schema([pa.field("vec", pa.list_(inner_type), nullable)])

    class NullableModel(pydantic.BaseModel):
        vec: MultiVector(16, nullable=False)

    assert pydantic_to_schema(NullableModel) == expected_schema(False)

    class DefaultModel(pydantic.BaseModel):
        vec: MultiVector(16)

    assert pydantic_to_schema(DefaultModel) == expected_schema(True)
|
||||
|
||||
|
||||
def test_multi_vector_in_lance_model():
    """A ``MultiVector`` field with a default works inside a ``LanceModel``."""

    class TestModel(LanceModel):
        id: int
        vectors: MultiVector(16) = Field(default=[[0.0] * 16])

    # Both schema routes must agree, and field order must be preserved.
    assert pydantic_to_schema(TestModel) == TestModel.to_arrow_schema()
    assert TestModel.field_names() == ["id", "vectors"]

    # Omitting ``vectors`` falls back to the declared default.
    instance = TestModel(id=1)
    assert instance.vectors == [[0.0] * 16]
|
||||
|
||||
@@ -257,7 +257,9 @@ async def test_distance_range_with_new_rows_async():
|
||||
}
|
||||
)
|
||||
table = await conn.create_table("test", data)
|
||||
table.create_index("vector", config=IvfPq(num_partitions=1, num_sub_vectors=2))
|
||||
await table.create_index(
|
||||
"vector", config=IvfPq(num_partitions=1, num_sub_vectors=2)
|
||||
)
|
||||
|
||||
q = [0, 0]
|
||||
rs = await table.query().nearest_to(q).to_arrow()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import re
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import contextlib
|
||||
from datetime import timedelta
|
||||
@@ -149,6 +149,24 @@ async def test_async_checkout():
|
||||
assert await table.count_rows() == 300
|
||||
|
||||
|
||||
def test_table_len_sync():
    """``len(table)`` on a remote table returns the row count sent by the server."""

    def handler(request):
        # Table creation is acknowledged with an empty JSON object; every
        # other request is answered with the JSON number 1.
        if request.path == "/v1/table/test/create/?mode=create":
            body = b"{}"
        else:
            body = json.dumps(1).encode()

        # Emit exactly one response per request. Previously the create branch
        # fell through and wrote a second status line, headers and body onto
        # the same connection.
        request.send_response(200)
        request.send_header("Content-Type", "application/json")
        request.end_headers()
        request.wfile.write(body)

    with mock_lancedb_connection(handler) as db:
        table = db.create_table("test", [{"id": 1}])
        assert len(table) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_http_error():
|
||||
request_id_holder = {"request_id": None}
|
||||
@@ -235,6 +253,10 @@ def test_table_add_in_threadpool():
|
||||
|
||||
def test_table_create_indices():
|
||||
def handler(request):
|
||||
index_stats = dict(
|
||||
index_type="IVF_PQ", num_indexed_rows=1000, num_unindexed_rows=0
|
||||
)
|
||||
|
||||
if request.path == "/v1/table/test/create_index/":
|
||||
request.send_response(200)
|
||||
request.end_headers()
|
||||
@@ -258,6 +280,47 @@ def test_table_create_indices():
|
||||
)
|
||||
)
|
||||
request.wfile.write(payload.encode())
|
||||
elif request.path == "/v1/table/test/index/list/":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
payload = json.dumps(
|
||||
dict(
|
||||
indexes=[
|
||||
{
|
||||
"index_name": "id_idx",
|
||||
"columns": ["id"],
|
||||
},
|
||||
{
|
||||
"index_name": "text_idx",
|
||||
"columns": ["text"],
|
||||
},
|
||||
{
|
||||
"index_name": "vector_idx",
|
||||
"columns": ["vector"],
|
||||
},
|
||||
]
|
||||
)
|
||||
)
|
||||
request.wfile.write(payload.encode())
|
||||
elif request.path == "/v1/table/test/index/id_idx/stats/":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
payload = json.dumps(index_stats)
|
||||
request.wfile.write(payload.encode())
|
||||
elif request.path == "/v1/table/test/index/text_idx/stats/":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
payload = json.dumps(index_stats)
|
||||
request.wfile.write(payload.encode())
|
||||
elif request.path == "/v1/table/test/index/vector_idx/stats/":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
payload = json.dumps(index_stats)
|
||||
request.wfile.write(payload.encode())
|
||||
elif "/drop/" in request.path:
|
||||
request.send_response(200)
|
||||
request.end_headers()
|
||||
@@ -269,14 +332,125 @@ def test_table_create_indices():
|
||||
# Parameters are well-tested through local and async tests.
|
||||
# This is a smoke-test.
|
||||
table = db.create_table("test", [{"id": 1}])
|
||||
table.create_scalar_index("id")
|
||||
table.create_fts_index("text")
|
||||
table.create_scalar_index("vector")
|
||||
table.create_scalar_index("id", wait_timeout=timedelta(seconds=2))
|
||||
table.create_fts_index("text", wait_timeout=timedelta(seconds=2))
|
||||
table.create_index(
|
||||
vector_column_name="vector", wait_timeout=timedelta(seconds=10)
|
||||
)
|
||||
table.wait_for_index(["id_idx"], timedelta(seconds=2))
|
||||
table.wait_for_index(["text_idx", "vector_idx"], timedelta(seconds=2))
|
||||
table.drop_index("vector_idx")
|
||||
table.drop_index("id_idx")
|
||||
table.drop_index("text_idx")
|
||||
|
||||
|
||||
def test_table_wait_for_index_timeout():
    """``wait_for_index`` raises a timeout error while rows remain unindexed.

    The mock server always reports one unindexed row for ``id_idx``, so the
    one-second wait is expected to fail with a ``RuntimeError``.
    """
    create_path = "/v1/table/test/create/?mode=create"
    describe_path = "/v1/table/test/describe/"
    list_path = "/v1/table/test/index/list/"
    stats_path = "/v1/table/test/index/id_idx/stats/"

    def handler(request):
        index_stats = {
            "index_type": "BTREE",
            "num_indexed_rows": 1000,
            "num_unindexed_rows": 1,
        }
        describe_body = {
            "version": 1,
            "schema": {
                "fields": [
                    {"name": "id", "type": {"type": "int64"}, "nullable": False},
                ]
            },
        }
        list_body = {"indexes": [{"index_name": "id_idx", "columns": ["id"]}]}

        json_bodies = {
            create_path: {},
            describe_path: describe_body,
            list_path: list_body,
            stats_path: index_stats,
        }

        # Any route the test does not expect is answered with 404.
        if request.path not in json_bodies:
            request.send_response(404)
            request.end_headers()
            return

        request.send_response(200)
        request.send_header("Content-Type", "application/json")
        request.end_headers()
        payload = json.dumps(json_bodies[request.path])
        if request.path == stats_path:
            print(f"{index_stats=}")
        request.wfile.write(payload.encode())

    expected_message = re.escape(
        'Timeout error: timed out waiting for indices: ["id_idx"] after 1s'
    )
    with mock_lancedb_connection(handler) as db:
        table = db.create_table("test", [{"id": 1}])
        with pytest.raises(RuntimeError, match=expected_message):
            table.wait_for_index(["id_idx"], timedelta(seconds=1))
|
||||
|
||||
|
||||
def test_stats():
    """``Table.stats()`` on a remote table returns the server's stats payload."""
    # Every fragment-length statistic is 2 in this fixture.
    length_keys = ("min", "max", "mean", "p25", "p50", "p75", "p99")
    stats = {
        "total_bytes": 38,
        "num_rows": 2,
        "num_indices": 0,
        "fragment_stats": {
            "num_fragments": 1,
            "num_small_fragments": 1,
            "lengths": dict.fromkeys(length_keys, 2),
        },
    }

    def handler(request):
        if request.path == "/v1/table/test/create/?mode=create":
            body = b"{}"
        elif request.path == "/v1/table/test/stats/":
            body = json.dumps(stats).encode()
        else:
            # Unexpected route: log it and answer 404.
            print(request.path)
            request.send_response(404)
            request.end_headers()
            return

        request.send_response(200)
        request.send_header("Content-Type", "application/json")
        request.end_headers()
        request.wfile.write(body)

    with mock_lancedb_connection(handler) as db:
        table = db.create_table("test", [{"id": 1}])
        res = table.stats()
        print(f"{res=}")
        assert res == stats
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def query_test_table(query_handler, *, server_version=Version("0.1.0")):
|
||||
def handler(request):
|
||||
|
||||
@@ -9,9 +9,9 @@ from typing import List
|
||||
from unittest.mock import patch
|
||||
|
||||
import lancedb
|
||||
from lancedb.dependencies import _PANDAS_AVAILABLE
|
||||
from lancedb.index import HnswPq, HnswSq, IvfPq
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import pyarrow as pa
|
||||
import pyarrow.dataset
|
||||
@@ -106,15 +106,22 @@ async def test_update_async(mem_db_async: AsyncConnection):
|
||||
table = await mem_db_async.create_table("some_table", data=[{"id": 0}])
|
||||
assert await table.count_rows("id == 0") == 1
|
||||
assert await table.count_rows("id == 7") == 0
|
||||
await table.update({"id": 7})
|
||||
update_res = await table.update({"id": 7})
|
||||
assert update_res.rows_updated == 1
|
||||
assert update_res.version == 2
|
||||
assert await table.count_rows("id == 7") == 1
|
||||
assert await table.count_rows("id == 0") == 0
|
||||
await table.add([{"id": 2}])
|
||||
await table.update(where="id % 2 == 0", updates_sql={"id": "5"})
|
||||
add_res = await table.add([{"id": 2}])
|
||||
assert add_res.version == 3
|
||||
update_res = await table.update(where="id % 2 == 0", updates_sql={"id": "5"})
|
||||
assert update_res.rows_updated == 1
|
||||
assert update_res.version == 4
|
||||
assert await table.count_rows("id == 7") == 1
|
||||
assert await table.count_rows("id == 2") == 0
|
||||
assert await table.count_rows("id == 5") == 1
|
||||
await table.update({"id": 10}, where="id == 5")
|
||||
update_res = await table.update({"id": 10}, where="id == 5")
|
||||
assert update_res.rows_updated == 1
|
||||
assert update_res.version == 5
|
||||
assert await table.count_rows("id == 10") == 1
|
||||
|
||||
|
||||
@@ -138,13 +145,16 @@ def test_create_table(mem_db: DBConnection):
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
||||
]
|
||||
df = pd.DataFrame(rows)
|
||||
pa_table = pa.Table.from_pandas(df, schema=schema)
|
||||
pa_table = pa.Table.from_pylist(rows, schema=schema)
|
||||
data = [
|
||||
("Rows", rows),
|
||||
("pd_DataFrame", df),
|
||||
("pa_Table", pa_table),
|
||||
]
|
||||
if _PANDAS_AVAILABLE:
|
||||
import pandas as pd
|
||||
|
||||
df = pd.DataFrame(rows)
|
||||
data.append(("pd_DataFrame", df))
|
||||
|
||||
for name, d in data:
|
||||
tbl = mem_db.create_table(name, data=d, schema=schema).to_arrow()
|
||||
@@ -296,7 +306,7 @@ def test_add_subschema(mem_db: DBConnection):
|
||||
|
||||
data = {"price": 10.0, "item": "foo"}
|
||||
table.add([data])
|
||||
data = pd.DataFrame({"price": [2.0], "vector": [[3.1, 4.1]]})
|
||||
data = pa.Table.from_pydict({"price": [2.0], "vector": [[3.1, 4.1]]})
|
||||
table.add(data)
|
||||
data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
|
||||
table.add([data])
|
||||
@@ -405,6 +415,7 @@ def test_add_nullability(mem_db: DBConnection):
|
||||
|
||||
|
||||
def test_add_pydantic_model(mem_db: DBConnection):
|
||||
pytest.importorskip("pandas")
|
||||
# https://github.com/lancedb/lancedb/issues/562
|
||||
|
||||
class Metadata(BaseModel):
|
||||
@@ -433,7 +444,8 @@ def test_add_pydantic_model(mem_db: DBConnection):
|
||||
content="foo", meta=Metadata(source="bar", timestamp=datetime.now())
|
||||
),
|
||||
)
|
||||
tbl.add([expected])
|
||||
add_res = tbl.add([expected])
|
||||
assert add_res.version == 2
|
||||
|
||||
result = tbl.search([0.0, 0.0]).limit(1).to_pydantic(LanceSchema)[0]
|
||||
assert result == expected
|
||||
@@ -455,11 +467,12 @@ async def test_add_async(mem_db_async: AsyncConnection):
|
||||
],
|
||||
)
|
||||
assert await table.count_rows() == 2
|
||||
await table.add(
|
||||
add_res = await table.add(
|
||||
data=[
|
||||
{"vector": [10.0, 11.0], "item": "baz", "price": 30.0},
|
||||
],
|
||||
)
|
||||
assert add_res.version == 2
|
||||
assert await table.count_rows() == 3
|
||||
|
||||
|
||||
@@ -473,10 +486,10 @@ def test_polars(mem_db: DBConnection):
|
||||
table = mem_db.create_table("test", data=pl.DataFrame(data))
|
||||
assert len(table) == 2
|
||||
|
||||
result = table.to_pandas()
|
||||
assert np.allclose(result["vector"].tolist(), data["vector"])
|
||||
assert result["item"].tolist() == data["item"]
|
||||
assert np.allclose(result["price"].tolist(), data["price"])
|
||||
result = table.to_arrow()
|
||||
assert np.allclose(result["vector"].to_pylist(), data["vector"])
|
||||
assert result["item"].to_pylist() == data["item"]
|
||||
assert np.allclose(result["price"].to_pylist(), data["price"])
|
||||
|
||||
schema = pa.schema(
|
||||
[
|
||||
@@ -525,6 +538,113 @@ def test_versioning(mem_db: DBConnection):
|
||||
assert len(table) == 2
|
||||
|
||||
|
||||
def test_tags(mem_db: DBConnection):
    """Create, list, update and delete tags, then check out versions by tag name."""
    table = mem_db.create_table(
        "test",
        data=[
            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
        ],
    )

    def assert_tag_versions(**expected):
        # Each named tag must be listed and must point at the given version.
        listing = table.tags.list()
        for tag_name, tag_version in expected.items():
            assert tag_name in listing
            assert listing[tag_name]["version"] == tag_version

    table.tags.create("tag1", 1)
    assert_tag_versions(tag1=1)

    # Adding a row produces version 2, which gets its own tag.
    table.add(data=[{"vector": [10.0, 11.0], "item": "baz", "price": 30.0}])
    table.tags.create("tag2", 2)
    assert_tag_versions(tag1=1, tag2=2)

    # Drop tag2, then move tag1 forward and back again.
    table.tags.delete("tag2")
    table.tags.update("tag1", 2)
    assert_tag_versions(tag1=2)

    table.tags.update("tag1", 1)
    assert_tag_versions(tag1=1)

    # Checking out by tag name selects the tagged version.
    table.checkout("tag1")
    assert table.version == 1
    assert table.count_rows() == 2

    table.tags.create("tag2", 2)
    table.checkout("tag2")
    assert table.version == 2
    assert table.count_rows() == 3

    # Back on the latest version, writes must work again.
    table.checkout_latest()
    table.add(data=[{"vector": [12.0, 13.0], "item": "baz", "price": 40.0}])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_async_tags(mem_db_async: AsyncConnection):
    """Async variant: create, list, update and delete tags, then check out by tag."""
    table = await mem_db_async.create_table(
        "test",
        data=[
            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
        ],
    )

    async def assert_tag_versions(**expected):
        # Each named tag must be listed and must point at the given version.
        listing = await table.tags.list()
        for tag_name, tag_version in expected.items():
            assert tag_name in listing
            assert listing[tag_name]["version"] == tag_version

    await table.tags.create("tag1", 1)
    await assert_tag_versions(tag1=1)

    # Adding a row produces version 2, which gets its own tag.
    await table.add(data=[{"vector": [10.0, 11.0], "item": "baz", "price": 30.0}])
    await table.tags.create("tag2", 2)
    await assert_tag_versions(tag1=1, tag2=2)

    # Drop tag2, then move tag1 forward and back again.
    await table.tags.delete("tag2")
    await table.tags.update("tag1", 2)
    await assert_tag_versions(tag1=2)

    await table.tags.update("tag1", 1)
    await assert_tag_versions(tag1=1)

    # Checking out by tag name selects the tagged version.
    await table.checkout("tag1")
    assert await table.version() == 1
    assert await table.count_rows() == 2

    await table.tags.create("tag2", 2)
    await table.checkout("tag2")
    assert await table.version() == 2
    assert await table.count_rows() == 3

    # Back on the latest version, writes must work again.
    await table.checkout_latest()
    await table.add(data=[{"vector": [12.0, 13.0], "item": "baz", "price": 40.0}])
|
||||
|
||||
|
||||
@patch("lancedb.table.AsyncTable.create_index")
|
||||
def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
@@ -649,6 +769,29 @@ def test_restore(mem_db: DBConnection):
|
||||
table.restore(0)
|
||||
|
||||
|
||||
def test_restore_with_tags(mem_db: DBConnection):
    """``Table.restore`` accepts a tag name, and rejects an unknown one."""
    first_row = {"vector": [1.1, 0.9], "type": "vector"}
    table = mem_db.create_table("my_table", data=[first_row])

    tag = "tag1"
    table.tags.create(tag, 1)
    table.add([{"vector": [0.5, 0.2], "type": "vector"}])

    # Restoring by tag name adds a third version holding only the first row.
    table.restore(tag)
    assert len(table.list_versions()) == 3
    assert len(table) == 1
    snapshot = table.to_arrow()

    # Check out the tag, then restore() with no argument: the checked-out
    # version is restored, giving five versions in total.
    table.add([{"vector": [0.3, 0.3], "type": "vector"}])
    table.checkout(tag)
    table.restore()
    assert len(table.list_versions()) == 5
    assert table.to_arrow() == snapshot

    with pytest.raises(ValueError):
        table.restore("tag_unknown")
|
||||
|
||||
|
||||
def test_merge(tmp_db: DBConnection, tmp_path):
|
||||
pytest.importorskip("lance")
|
||||
import lance
|
||||
@@ -684,11 +827,12 @@ def test_delete(mem_db: DBConnection):
|
||||
)
|
||||
assert len(table) == 2
|
||||
assert len(table.list_versions()) == 1
|
||||
table.delete("id=0")
|
||||
delete_res = table.delete("id=0")
|
||||
assert delete_res.version == 2
|
||||
assert len(table.list_versions()) == 2
|
||||
assert table.version == 2
|
||||
assert len(table) == 1
|
||||
assert table.to_pandas()["id"].tolist() == [1]
|
||||
assert table.to_arrow()["id"].to_pylist() == [1]
|
||||
|
||||
|
||||
def test_update(mem_db: DBConnection):
|
||||
@@ -698,7 +842,9 @@ def test_update(mem_db: DBConnection):
|
||||
)
|
||||
assert len(table) == 2
|
||||
assert len(table.list_versions()) == 1
|
||||
table.update(where="id=0", values={"vector": [1.1, 1.1]})
|
||||
update_res = table.update(where="id=0", values={"vector": [1.1, 1.1]})
|
||||
assert update_res.version == 2
|
||||
assert update_res.rows_updated == 1
|
||||
assert len(table.list_versions()) == 2
|
||||
assert table.version == 2
|
||||
assert len(table) == 2
|
||||
@@ -787,9 +933,16 @@ def test_merge_insert(mem_db: DBConnection):
|
||||
new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
|
||||
|
||||
# upsert
|
||||
table.merge_insert(
|
||||
"a"
|
||||
).when_matched_update_all().when_not_matched_insert_all().execute(new_data)
|
||||
merge_insert_res = (
|
||||
table.merge_insert("a")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.execute(new_data, timeout=timedelta(seconds=10))
|
||||
)
|
||||
assert merge_insert_res.version == 2
|
||||
assert merge_insert_res.num_inserted_rows == 1
|
||||
assert merge_insert_res.num_updated_rows == 2
|
||||
assert merge_insert_res.num_deleted_rows == 0
|
||||
|
||||
expected = pa.table({"a": [1, 2, 3, 4], "b": ["a", "x", "y", "z"]})
|
||||
assert table.to_arrow().sort_by("a") == expected
|
||||
@@ -797,17 +950,28 @@ def test_merge_insert(mem_db: DBConnection):
|
||||
table.restore(version)
|
||||
|
||||
# conditional update
|
||||
table.merge_insert("a").when_matched_update_all(where="target.b = 'b'").execute(
|
||||
new_data
|
||||
merge_insert_res = (
|
||||
table.merge_insert("a")
|
||||
.when_matched_update_all(where="target.b = 'b'")
|
||||
.execute(new_data)
|
||||
)
|
||||
assert merge_insert_res.version == 4
|
||||
assert merge_insert_res.num_inserted_rows == 0
|
||||
assert merge_insert_res.num_updated_rows == 1
|
||||
assert merge_insert_res.num_deleted_rows == 0
|
||||
expected = pa.table({"a": [1, 2, 3], "b": ["a", "x", "c"]})
|
||||
assert table.to_arrow().sort_by("a") == expected
|
||||
|
||||
table.restore(version)
|
||||
|
||||
# insert-if-not-exists
|
||||
table.merge_insert("a").when_not_matched_insert_all().execute(new_data)
|
||||
|
||||
merge_insert_res = (
|
||||
table.merge_insert("a").when_not_matched_insert_all().execute(new_data)
|
||||
)
|
||||
assert merge_insert_res.version == 6
|
||||
assert merge_insert_res.num_inserted_rows == 1
|
||||
assert merge_insert_res.num_updated_rows == 0
|
||||
assert merge_insert_res.num_deleted_rows == 0
|
||||
expected = pa.table({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "z"]})
|
||||
assert table.to_arrow().sort_by("a") == expected
|
||||
|
||||
@@ -816,13 +980,17 @@ def test_merge_insert(mem_db: DBConnection):
|
||||
new_data = pa.table({"a": [2, 4], "b": ["x", "z"]})
|
||||
|
||||
# replace-range
|
||||
(
|
||||
merge_insert_res = (
|
||||
table.merge_insert("a")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.when_not_matched_by_source_delete("a > 2")
|
||||
.execute(new_data)
|
||||
)
|
||||
assert merge_insert_res.version == 8
|
||||
assert merge_insert_res.num_inserted_rows == 1
|
||||
assert merge_insert_res.num_updated_rows == 1
|
||||
assert merge_insert_res.num_deleted_rows == 1
|
||||
|
||||
expected = pa.table({"a": [1, 2, 4], "b": ["a", "x", "z"]})
|
||||
assert table.to_arrow().sort_by("a") == expected
|
||||
@@ -830,15 +998,27 @@ def test_merge_insert(mem_db: DBConnection):
|
||||
table.restore(version)
|
||||
|
||||
# replace-range no condition
|
||||
table.merge_insert(
|
||||
"a"
|
||||
).when_matched_update_all().when_not_matched_insert_all().when_not_matched_by_source_delete().execute(
|
||||
new_data
|
||||
merge_insert_res = (
|
||||
table.merge_insert("a")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.when_not_matched_by_source_delete()
|
||||
.execute(new_data)
|
||||
)
|
||||
assert merge_insert_res.version == 10
|
||||
assert merge_insert_res.num_inserted_rows == 1
|
||||
assert merge_insert_res.num_updated_rows == 1
|
||||
assert merge_insert_res.num_deleted_rows == 2
|
||||
|
||||
expected = pa.table({"a": [2, 4], "b": ["x", "z"]})
|
||||
assert table.to_arrow().sort_by("a") == expected
|
||||
|
||||
# timeout
|
||||
with pytest.raises(Exception, match="merge insert timed out"):
|
||||
table.merge_insert("a").when_matched_update_all().execute(
|
||||
new_data, timeout=timedelta(0)
|
||||
)
|
||||
|
||||
|
||||
# We vary the data format because there are slight differences in how
|
||||
# subschemas are handled in different formats
|
||||
@@ -852,6 +1032,7 @@ def test_merge_insert(mem_db: DBConnection):
|
||||
ids=["pa.Table", "pd.DataFrame", "rows"],
|
||||
)
|
||||
def test_merge_insert_subschema(mem_db: DBConnection, data_format):
|
||||
pytest.importorskip("pandas")
|
||||
initial_data = pa.table(
|
||||
{"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]}
|
||||
)
|
||||
@@ -948,7 +1129,7 @@ def test_create_with_embedding_function(mem_db: DBConnection):
|
||||
|
||||
func = MockTextEmbeddingFunction.create()
|
||||
texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
|
||||
df = pd.DataFrame({"text": texts, "vector": func.compute_source_embeddings(texts)})
|
||||
df = pa.table({"text": texts, "vector": func.compute_source_embeddings(texts)})
|
||||
|
||||
conf = EmbeddingFunctionConfig(
|
||||
source_column="text", vector_column="vector", function=func
|
||||
@@ -973,7 +1154,7 @@ def test_create_f16_table(mem_db: DBConnection):
|
||||
text: str
|
||||
vector: Vector(32, value_type=pa.float16())
|
||||
|
||||
df = pd.DataFrame(
|
||||
df = pa.table(
|
||||
{
|
||||
"text": [f"s-{i}" for i in range(512)],
|
||||
"vector": [np.random.randn(32).astype(np.float16) for _ in range(512)],
|
||||
@@ -986,7 +1167,7 @@ def test_create_f16_table(mem_db: DBConnection):
|
||||
table.add(df)
|
||||
table.create_index(num_partitions=2, num_sub_vectors=2)
|
||||
|
||||
query = df.vector.iloc[2]
|
||||
query = df["vector"][2].as_py()
|
||||
expected = table.search(query).limit(2).to_arrow()
|
||||
|
||||
assert "s-2" in expected["text"].to_pylist()
|
||||
@@ -1002,7 +1183,7 @@ def test_add_with_embedding_function(mem_db: DBConnection):
|
||||
table = mem_db.create_table("my_table", schema=MyTable)
|
||||
|
||||
texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
|
||||
df = pd.DataFrame({"text": texts})
|
||||
df = pa.table({"text": texts})
|
||||
table.add(df)
|
||||
|
||||
texts = ["the quick brown fox", "jumped over the lazy dog"]
|
||||
@@ -1033,14 +1214,14 @@ def test_multiple_vector_columns(mem_db: DBConnection):
|
||||
{"vector1": v1, "vector2": v2, "text": "foo"},
|
||||
{"vector1": v2, "vector2": v1, "text": "bar"},
|
||||
]
|
||||
df = pd.DataFrame(data)
|
||||
df = pa.Table.from_pylist(data)
|
||||
table.add(df)
|
||||
|
||||
q = np.random.randn(10)
|
||||
result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas()
|
||||
result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas()
|
||||
result1 = table.search(q, vector_column_name="vector1").limit(1).to_arrow()
|
||||
result2 = table.search(q, vector_column_name="vector2").limit(1).to_arrow()
|
||||
|
||||
assert result1["text"].iloc[0] != result2["text"].iloc[0]
|
||||
assert result1["text"][0] != result2["text"][0]
|
||||
|
||||
|
||||
def test_create_scalar_index(mem_db: DBConnection):
|
||||
@@ -1078,22 +1259,22 @@ def test_empty_query(mem_db: DBConnection):
|
||||
"my_table",
|
||||
data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
|
||||
)
|
||||
df = table.search().select(["id"]).where("text='bar'").limit(1).to_pandas()
|
||||
val = df.id.iloc[0]
|
||||
df = table.search().select(["id"]).where("text='bar'").limit(1).to_arrow()
|
||||
val = df["id"][0].as_py()
|
||||
assert val == 1
|
||||
|
||||
table = mem_db.create_table("my_table2", data=[{"id": i} for i in range(100)])
|
||||
df = table.search().select(["id"]).to_pandas()
|
||||
assert len(df) == 100
|
||||
df = table.search().select(["id"]).to_arrow()
|
||||
assert df.num_rows == 100
|
||||
# None is the same as default
|
||||
df = table.search().select(["id"]).limit(None).to_pandas()
|
||||
assert len(df) == 100
|
||||
df = table.search().select(["id"]).limit(None).to_arrow()
|
||||
assert df.num_rows == 100
|
||||
# an invalid limit is the same as None, which is the same as the default
|
||||
df = table.search().select(["id"]).limit(-1).to_pandas()
|
||||
assert len(df) == 100
|
||||
df = table.search().select(["id"]).limit(-1).to_arrow()
|
||||
assert df.num_rows == 100
|
||||
# valid limit should work
|
||||
df = table.search().select(["id"]).limit(42).to_pandas()
|
||||
assert len(df) == 42
|
||||
df = table.search().select(["id"]).limit(42).to_arrow()
|
||||
assert df.num_rows == 42
|
||||
|
||||
|
||||
def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
|
||||
@@ -1112,14 +1293,14 @@ def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
|
||||
{"vector_col": v1, "text": "foo"},
|
||||
{"vector_col": v2, "text": "bar"},
|
||||
]
|
||||
df = pd.DataFrame(data)
|
||||
df = pa.Table.from_pylist(data)
|
||||
table.add(df)
|
||||
|
||||
q = np.random.randn(10)
|
||||
result1 = table.search(q, vector_column_name="vector_col").limit(1).to_pandas()
|
||||
result2 = table.search(q).limit(1).to_pandas()
|
||||
result1 = table.search(q, vector_column_name="vector_col").limit(1).to_arrow()
|
||||
result2 = table.search(q).limit(1).to_arrow()
|
||||
|
||||
assert result1["text"].iloc[0] == result2["text"].iloc[0]
|
||||
assert result1["text"][0].as_py() == result2["text"][0].as_py()
|
||||
|
||||
|
||||
def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
|
||||
@@ -1139,12 +1320,12 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
|
||||
{"vector1": v1, "vector2": v2, "text": "foo"},
|
||||
{"vector1": v2, "vector2": v1, "text": "bar"},
|
||||
]
|
||||
df = pd.DataFrame(data)
|
||||
df = pa.Table.from_pylist(data)
|
||||
table.add(df)
|
||||
|
||||
q = np.random.randn(10)
|
||||
with pytest.raises(ValueError):
|
||||
table.search(q).limit(1).to_pandas()
|
||||
table.search(q).limit(1).to_arrow()
|
||||
|
||||
|
||||
def test_compact_cleanup(tmp_db: DBConnection):
|
||||
@@ -1366,11 +1547,13 @@ def test_restore_consistency(tmp_path):
|
||||
def test_add_columns(mem_db: DBConnection):
|
||||
data = pa.table({"id": [0, 1]})
|
||||
table = LanceTable.create(mem_db, "my_table", data=data)
|
||||
table.add_columns({"new_col": "id + 2"})
|
||||
add_columns_res = table.add_columns({"new_col": "id + 2"})
|
||||
assert add_columns_res.version == 2
|
||||
assert table.to_arrow().column_names == ["id", "new_col"]
|
||||
assert table.to_arrow()["new_col"].to_pylist() == [2, 3]
|
||||
|
||||
table.add_columns({"null_int": "cast(null as bigint)"})
|
||||
add_columns_res = table.add_columns({"null_int": "cast(null as bigint)"})
|
||||
assert add_columns_res.version == 3
|
||||
assert table.schema.field("null_int").type == pa.int64()
|
||||
|
||||
|
||||
@@ -1378,7 +1561,8 @@ def test_add_columns(mem_db: DBConnection):
|
||||
async def test_add_columns_async(mem_db_async: AsyncConnection):
|
||||
data = pa.table({"id": [0, 1]})
|
||||
table = await mem_db_async.create_table("my_table", data=data)
|
||||
await table.add_columns({"new_col": "id + 2"})
|
||||
add_columns_res = await table.add_columns({"new_col": "id + 2"})
|
||||
assert add_columns_res.version == 2
|
||||
data = await table.to_arrow()
|
||||
assert data.column_names == ["id", "new_col"]
|
||||
assert data["new_col"].to_pylist() == [2, 3]
|
||||
@@ -1388,9 +1572,10 @@ async def test_add_columns_async(mem_db_async: AsyncConnection):
|
||||
async def test_add_columns_with_schema(mem_db_async: AsyncConnection):
|
||||
data = pa.table({"id": [0, 1]})
|
||||
table = await mem_db_async.create_table("my_table", data=data)
|
||||
await table.add_columns(
|
||||
add_columns_res = await table.add_columns(
|
||||
[pa.field("x", pa.int64()), pa.field("vector", pa.list_(pa.float32(), 8))]
|
||||
)
|
||||
assert add_columns_res.version == 2
|
||||
|
||||
assert await table.schema() == pa.schema(
|
||||
[
|
||||
@@ -1401,11 +1586,12 @@ async def test_add_columns_with_schema(mem_db_async: AsyncConnection):
|
||||
)
|
||||
|
||||
table = await mem_db_async.create_table("table2", data=data)
|
||||
await table.add_columns(
|
||||
add_columns_res = await table.add_columns(
|
||||
pa.schema(
|
||||
[pa.field("y", pa.int64()), pa.field("emb", pa.list_(pa.float32(), 8))]
|
||||
)
|
||||
)
|
||||
assert add_columns_res.version == 2
|
||||
assert await table.schema() == pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int64()),
|
||||
@@ -1418,7 +1604,8 @@ async def test_add_columns_with_schema(mem_db_async: AsyncConnection):
|
||||
def test_alter_columns(mem_db: DBConnection):
|
||||
data = pa.table({"id": [0, 1]})
|
||||
table = mem_db.create_table("my_table", data=data)
|
||||
table.alter_columns({"path": "id", "rename": "new_id"})
|
||||
alter_columns_res = table.alter_columns({"path": "id", "rename": "new_id"})
|
||||
assert alter_columns_res.version == 2
|
||||
assert table.to_arrow().column_names == ["new_id"]
|
||||
|
||||
|
||||
@@ -1426,9 +1613,13 @@ def test_alter_columns(mem_db: DBConnection):
|
||||
async def test_alter_columns_async(mem_db_async: AsyncConnection):
|
||||
data = pa.table({"id": [0, 1]})
|
||||
table = await mem_db_async.create_table("my_table", data=data)
|
||||
await table.alter_columns({"path": "id", "rename": "new_id"})
|
||||
alter_columns_res = await table.alter_columns({"path": "id", "rename": "new_id"})
|
||||
assert alter_columns_res.version == 2
|
||||
assert (await table.to_arrow()).column_names == ["new_id"]
|
||||
await table.alter_columns(dict(path="new_id", data_type=pa.int16(), nullable=True))
|
||||
alter_columns_res = await table.alter_columns(
|
||||
dict(path="new_id", data_type=pa.int16(), nullable=True)
|
||||
)
|
||||
assert alter_columns_res.version == 3
|
||||
data = await table.to_arrow()
|
||||
assert data.column(0).type == pa.int16()
|
||||
assert data.schema.field(0).nullable
|
||||
@@ -1437,7 +1628,8 @@ async def test_alter_columns_async(mem_db_async: AsyncConnection):
|
||||
def test_drop_columns(mem_db: DBConnection):
|
||||
data = pa.table({"id": [0, 1], "category": ["a", "b"]})
|
||||
table = mem_db.create_table("my_table", data=data)
|
||||
table.drop_columns(["category"])
|
||||
drop_columns_res = table.drop_columns(["category"])
|
||||
assert drop_columns_res.version == 2
|
||||
assert table.to_arrow().column_names == ["id"]
|
||||
|
||||
|
||||
@@ -1445,7 +1637,8 @@ def test_drop_columns(mem_db: DBConnection):
|
||||
async def test_drop_columns_async(mem_db_async: AsyncConnection):
|
||||
data = pa.table({"id": [0, 1], "category": ["a", "b"]})
|
||||
table = await mem_db_async.create_table("my_table", data=data)
|
||||
await table.drop_columns(["category"])
|
||||
drop_columns_res = await table.drop_columns(["category"])
|
||||
assert drop_columns_res.version == 2
|
||||
assert (await table.to_arrow()).column_names == ["id"]
|
||||
|
||||
|
||||
@@ -1583,3 +1776,31 @@ def test_replace_field_metadata(tmp_path):
|
||||
schema = table.schema
|
||||
field = schema[0].metadata
|
||||
assert field == {b"foo": b"bar"}
|
||||
|
||||
|
||||
def test_stats(mem_db: DBConnection):
    """Check that ``table.stats()`` reports the expected summary for a two-row table."""
    rows = [{"text": "foo", "id": 0}, {"text": "bar", "id": 1}]
    table = mem_db.create_table("my_table", data=rows)
    assert len(table) == 2

    # Both rows sit in one fragment, so every length statistic equals 2.
    expected_lengths = {
        key: 2 for key in ("min", "max", "mean", "p25", "p50", "p75", "p99")
    }
    expected = {
        "total_bytes": 38,
        "num_rows": 2,
        "num_indices": 0,
        "fragment_stats": {
            "num_fragments": 1,
            "num_small_fragments": 1,
            "lengths": expected_lengths,
        },
    }

    stats = table.stats()
    print(f"{stats=}")
    assert stats == expected
|
||||
|
||||
@@ -11,7 +11,10 @@ use pyo3::{
|
||||
wrap_pyfunction, Bound, PyResult, Python,
|
||||
};
|
||||
use query::{FTSQuery, HybridQuery, Query, VectorQuery};
|
||||
use table::Table;
|
||||
use table::{
|
||||
AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, MergeResult,
|
||||
Table, UpdateResult,
|
||||
};
|
||||
|
||||
pub mod arrow;
|
||||
pub mod connection;
|
||||
@@ -35,6 +38,13 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
m.add_class::<HybridQuery>()?;
|
||||
m.add_class::<VectorQuery>()?;
|
||||
m.add_class::<RecordBatchStream>()?;
|
||||
m.add_class::<AddColumnsResult>()?;
|
||||
m.add_class::<AlterColumnsResult>()?;
|
||||
m.add_class::<AddResult>()?;
|
||||
m.add_class::<MergeResult>()?;
|
||||
m.add_class::<DeleteResult>()?;
|
||||
m.add_class::<DropColumnsResult>()?;
|
||||
m.add_class::<UpdateResult>()?;
|
||||
m.add_function(wrap_pyfunction!(connect, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
|
||||
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
|
||||
|
||||
@@ -652,6 +652,11 @@ impl HybridQuery {
|
||||
self.inner_vec.bypass_vector_index();
|
||||
}
|
||||
|
||||
#[pyo3(signature = (lower_bound=None, upper_bound=None))]
|
||||
pub fn distance_range(&mut self, lower_bound: Option<f32>, upper_bound: Option<f32>) {
|
||||
self.inner_vec.distance_range(lower_bound, upper_bound);
|
||||
}
|
||||
|
||||
pub fn to_vector_query(&mut self) -> PyResult<VectorQuery> {
|
||||
Ok(VectorQuery {
|
||||
inner: self.inner_vec.inner.clone(),
|
||||
|
||||
@@ -2,6 +2,11 @@
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use crate::{
|
||||
error::PythonErrorExt,
|
||||
index::{extract_index_params, IndexConfig},
|
||||
query::Query,
|
||||
};
|
||||
use arrow::{
|
||||
datatypes::{DataType, Schema},
|
||||
ffi_stream::ArrowArrayStreamReader,
|
||||
@@ -19,12 +24,6 @@ use pyo3::{
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::{
|
||||
error::PythonErrorExt,
|
||||
index::{extract_index_params, IndexConfig},
|
||||
query::Query,
|
||||
};
|
||||
|
||||
/// Statistics about a compaction operation.
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -59,6 +58,170 @@ pub struct OptimizeStats {
|
||||
pub prune: RemovalStats,
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct UpdateResult {
|
||||
pub rows_updated: u64,
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl UpdateResult {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!(
|
||||
"UpdateResult(rows_updated={}, version={})",
|
||||
self.rows_updated, self.version
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::table::UpdateResult> for UpdateResult {
|
||||
fn from(result: lancedb::table::UpdateResult) -> Self {
|
||||
Self {
|
||||
rows_updated: result.rows_updated,
|
||||
version: result.version,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AddResult {
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl AddResult {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!("AddResult(version={})", self.version)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::table::AddResult> for AddResult {
|
||||
fn from(result: lancedb::table::AddResult) -> Self {
|
||||
Self {
|
||||
version: result.version,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DeleteResult {
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl DeleteResult {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!("DeleteResult(version={})", self.version)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::table::DeleteResult> for DeleteResult {
|
||||
fn from(result: lancedb::table::DeleteResult) -> Self {
|
||||
Self {
|
||||
version: result.version,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct MergeResult {
|
||||
pub version: u64,
|
||||
pub num_updated_rows: u64,
|
||||
pub num_inserted_rows: u64,
|
||||
pub num_deleted_rows: u64,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl MergeResult {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!(
|
||||
"MergeResult(version={}, num_updated_rows={}, num_inserted_rows={}, num_deleted_rows={})",
|
||||
self.version,
|
||||
self.num_updated_rows,
|
||||
self.num_inserted_rows,
|
||||
self.num_deleted_rows
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::table::MergeResult> for MergeResult {
|
||||
fn from(result: lancedb::table::MergeResult) -> Self {
|
||||
Self {
|
||||
version: result.version,
|
||||
num_updated_rows: result.num_updated_rows,
|
||||
num_inserted_rows: result.num_inserted_rows,
|
||||
num_deleted_rows: result.num_deleted_rows,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AddColumnsResult {
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl AddColumnsResult {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!("AddColumnsResult(version={})", self.version)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::table::AddColumnsResult> for AddColumnsResult {
|
||||
fn from(result: lancedb::table::AddColumnsResult) -> Self {
|
||||
Self {
|
||||
version: result.version,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AlterColumnsResult {
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl AlterColumnsResult {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!("AlterColumnsResult(version={})", self.version)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::table::AlterColumnsResult> for AlterColumnsResult {
|
||||
fn from(result: lancedb::table::AlterColumnsResult) -> Self {
|
||||
Self {
|
||||
version: result.version,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DropColumnsResult {
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl DropColumnsResult {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!("DropColumnsResult(version={})", self.version)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::table::DropColumnsResult> for DropColumnsResult {
|
||||
fn from(result: lancedb::table::DropColumnsResult) -> Self {
|
||||
Self {
|
||||
version: result.version,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct Table {
|
||||
// We keep a copy of the name to use if the inner table is dropped
|
||||
@@ -133,15 +296,16 @@ impl Table {
|
||||
}
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
op.execute().await.infer_error()?;
|
||||
Ok(())
|
||||
let result = op.execute().await.infer_error()?;
|
||||
Ok(AddResult::from(result))
|
||||
})
|
||||
}
|
||||
|
||||
pub fn delete(self_: PyRef<'_, Self>, condition: String) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.delete(&condition).await.infer_error()
|
||||
let result = inner.delete(&condition).await.infer_error()?;
|
||||
Ok(DeleteResult::from(result))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -161,8 +325,8 @@ impl Table {
|
||||
op = op.column(column_name, value);
|
||||
}
|
||||
future_into_py(self_.py(), async move {
|
||||
op.execute().await.infer_error()?;
|
||||
Ok(())
|
||||
let result = op.execute().await.infer_error()?;
|
||||
Ok(UpdateResult::from(result))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -177,15 +341,19 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (column, index=None, replace=None))]
|
||||
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None))]
|
||||
pub fn create_index<'a>(
|
||||
self_: PyRef<'a, Self>,
|
||||
column: String,
|
||||
index: Option<Bound<'_, PyAny>>,
|
||||
replace: Option<bool>,
|
||||
wait_timeout: Option<Bound<'_, PyAny>>,
|
||||
) -> PyResult<Bound<'a, PyAny>> {
|
||||
let index = extract_index_params(&index)?;
|
||||
let mut op = self_.inner_ref()?.create_index(&[column], index);
|
||||
let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
|
||||
let mut op = self_
|
||||
.inner_ref()?
|
||||
.create_index_with_timeout(&[column], index, timeout);
|
||||
if let Some(replace) = replace {
|
||||
op = op.replace(replace);
|
||||
}
|
||||
@@ -204,6 +372,34 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn wait_for_index<'a>(
|
||||
self_: PyRef<'a, Self>,
|
||||
index_names: Vec<String>,
|
||||
timeout: Bound<'_, PyAny>,
|
||||
) -> PyResult<Bound<'a, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
let timeout = timeout.extract::<std::time::Duration>()?;
|
||||
future_into_py(self_.py(), async move {
|
||||
let index_refs = index_names
|
||||
.iter()
|
||||
.map(String::as_str)
|
||||
.collect::<Vec<&str>>();
|
||||
inner
|
||||
.wait_for_index(&index_refs, timeout)
|
||||
.await
|
||||
.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn prewarm_index(self_: PyRef<'_, Self>, index_name: String) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.prewarm_index(&index_name).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
@@ -248,6 +444,40 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn stats(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let stats = inner.stats().await.infer_error()?;
|
||||
Python::with_gil(|py| {
|
||||
let dict = PyDict::new(py);
|
||||
dict.set_item("total_bytes", stats.total_bytes)?;
|
||||
dict.set_item("num_rows", stats.num_rows)?;
|
||||
dict.set_item("num_indices", stats.num_indices)?;
|
||||
|
||||
let fragment_stats = PyDict::new(py);
|
||||
fragment_stats.set_item("num_fragments", stats.fragment_stats.num_fragments)?;
|
||||
fragment_stats.set_item(
|
||||
"num_small_fragments",
|
||||
stats.fragment_stats.num_small_fragments,
|
||||
)?;
|
||||
|
||||
let fragment_lengths = PyDict::new(py);
|
||||
fragment_lengths.set_item("min", stats.fragment_stats.lengths.min)?;
|
||||
fragment_lengths.set_item("max", stats.fragment_stats.lengths.max)?;
|
||||
fragment_lengths.set_item("mean", stats.fragment_stats.lengths.mean)?;
|
||||
fragment_lengths.set_item("p25", stats.fragment_stats.lengths.p25)?;
|
||||
fragment_lengths.set_item("p50", stats.fragment_stats.lengths.p50)?;
|
||||
fragment_lengths.set_item("p75", stats.fragment_stats.lengths.p75)?;
|
||||
fragment_lengths.set_item("p99", stats.fragment_stats.lengths.p99)?;
|
||||
|
||||
fragment_stats.set_item("lengths", fragment_lengths)?;
|
||||
dict.set_item("fragment_stats", fragment_stats)?;
|
||||
|
||||
Ok(Some(dict.unbind()))
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
pub fn __repr__(&self) -> String {
|
||||
match &self.inner {
|
||||
None => format!("ClosedTable({})", self.name),
|
||||
@@ -290,10 +520,16 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
|
||||
pub fn checkout(self_: PyRef<'_, Self>, version: LanceVersion) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.checkout(version).await.infer_error()
|
||||
let py = self_.py();
|
||||
future_into_py(py, async move {
|
||||
match version {
|
||||
LanceVersion::Version(version_num) => {
|
||||
inner.checkout(version_num).await.infer_error()
|
||||
}
|
||||
LanceVersion::Tag(tag) => inner.checkout_tag(&tag).await.infer_error(),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -305,12 +541,19 @@ impl Table {
|
||||
}
|
||||
|
||||
#[pyo3(signature = (version=None))]
|
||||
pub fn restore(self_: PyRef<'_, Self>, version: Option<u64>) -> PyResult<Bound<'_, PyAny>> {
|
||||
pub fn restore(
|
||||
self_: PyRef<'_, Self>,
|
||||
version: Option<LanceVersion>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
let py = self_.py();
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
future_into_py(py, async move {
|
||||
if let Some(version) = version {
|
||||
inner.checkout(version).await.infer_error()?;
|
||||
match version {
|
||||
LanceVersion::Version(num) => inner.checkout(num).await.infer_error()?,
|
||||
LanceVersion::Tag(tag) => inner.checkout_tag(&tag).await.infer_error()?,
|
||||
}
|
||||
}
|
||||
inner.restore().await.infer_error()
|
||||
})
|
||||
@@ -320,6 +563,11 @@ impl Table {
|
||||
Query::new(self.inner_ref().unwrap().query())
|
||||
}
|
||||
|
||||
#[getter]
|
||||
pub fn tags(&self) -> PyResult<Tags> {
|
||||
Ok(Tags::new(self.inner_ref()?.clone()))
|
||||
}
|
||||
|
||||
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
|
||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
|
||||
pub fn optimize(
|
||||
@@ -401,10 +649,13 @@ impl Table {
|
||||
builder
|
||||
.when_not_matched_by_source_delete(parameters.when_not_matched_by_source_condition);
|
||||
}
|
||||
if let Some(timeout) = parameters.timeout {
|
||||
builder.timeout(timeout);
|
||||
}
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
builder.execute(Box::new(batches)).await.infer_error()?;
|
||||
Ok(())
|
||||
let res = builder.execute(Box::new(batches)).await.infer_error()?;
|
||||
Ok(MergeResult::from(res))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -440,8 +691,8 @@ impl Table {
|
||||
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.add_columns(definitions, None).await.infer_error()?;
|
||||
Ok(())
|
||||
let result = inner.add_columns(definitions, None).await.infer_error()?;
|
||||
Ok(AddColumnsResult::from(result))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -454,8 +705,8 @@ impl Table {
|
||||
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.add_columns(transform, None).await.infer_error()?;
|
||||
Ok(())
|
||||
let result = inner.add_columns(transform, None).await.infer_error()?;
|
||||
Ok(AddColumnsResult::from(result))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -498,8 +749,8 @@ impl Table {
|
||||
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.alter_columns(&alterations).await.infer_error()?;
|
||||
Ok(())
|
||||
let result = inner.alter_columns(&alterations).await.infer_error()?;
|
||||
Ok(AlterColumnsResult::from(result))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -507,8 +758,8 @@ impl Table {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let column_refs = columns.iter().map(String::as_str).collect::<Vec<&str>>();
|
||||
inner.drop_columns(&column_refs).await.infer_error()?;
|
||||
Ok(())
|
||||
let result = inner.drop_columns(&column_refs).await.infer_error()?;
|
||||
Ok(DropColumnsResult::from(result))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -544,6 +795,12 @@ impl Table {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
pub enum LanceVersion {
|
||||
Version(u64),
|
||||
Tag(String),
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
#[pyo3(from_item_all)]
|
||||
pub struct MergeInsertParams {
|
||||
@@ -553,4 +810,74 @@ pub struct MergeInsertParams {
|
||||
when_not_matched_insert_all: bool,
|
||||
when_not_matched_by_source_delete: bool,
|
||||
when_not_matched_by_source_condition: Option<String>,
|
||||
timeout: Option<std::time::Duration>,
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct Tags {
|
||||
inner: LanceDbTable,
|
||||
}
|
||||
|
||||
impl Tags {
|
||||
pub fn new(table: LanceDbTable) -> Self {
|
||||
Self { inner: table }
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Tags {
|
||||
pub fn list(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let tags = inner.tags().await.infer_error()?;
|
||||
let res = tags.list().await.infer_error()?;
|
||||
|
||||
Python::with_gil(|py| {
|
||||
let py_dict = PyDict::new(py);
|
||||
for (key, contents) in res {
|
||||
let value_dict = PyDict::new(py);
|
||||
value_dict.set_item("version", contents.version)?;
|
||||
value_dict.set_item("manifest_size", contents.manifest_size)?;
|
||||
py_dict.set_item(key, value_dict)?;
|
||||
}
|
||||
Ok(py_dict.unbind())
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_version(self_: PyRef<'_, Self>, tag: String) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let tags = inner.tags().await.infer_error()?;
|
||||
let res = tags.get_version(tag.as_str()).await.infer_error()?;
|
||||
Ok(res)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn create(self_: PyRef<Self>, tag: String, version: u64) -> PyResult<Bound<PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let mut tags = inner.tags().await.infer_error()?;
|
||||
tags.create(tag.as_str(), version).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn delete(self_: PyRef<Self>, tag: String) -> PyResult<Bound<PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let mut tags = inner.tags().await.infer_error()?;
|
||||
tags.delete(tag.as_str()).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn update(self_: PyRef<Self>, tag: String, version: u64) -> PyResult<Bound<PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let mut tags = inner.tags().await.infer_error()?;
|
||||
tags.update(tag.as_str(), version).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -163,8 +163,9 @@ pub fn parse_fts_query(query: &Bound<'_, PyDict>) -> PyResult<FtsQuery> {
|
||||
.ok_or(PyValueError::new_err("boost not found"))?
|
||||
.extract::<Vec<f32>>()?;
|
||||
|
||||
let query =
|
||||
MultiMatchQuery::try_new_with_boosts(query, columns, boost).map_err(|e| {
|
||||
let query = MultiMatchQuery::try_new(query, columns)
|
||||
.and_then(|q| q.try_with_boosts(boost))
|
||||
.map_err(|e| {
|
||||
PyValueError::new_err(format!("Error creating MultiMatchQuery: {}", e))
|
||||
})?;
|
||||
Ok(query.into())
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-node"
|
||||
version = "0.19.0-beta.6"
|
||||
version = "0.19.1-beta.3"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.19.0-beta.6"
|
||||
version = "0.19.1-beta.3"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
@@ -81,7 +81,7 @@ impl ListingCatalogOptionsBuilder {
|
||||
/// [`crate::database::listing::ListingDatabase`]
|
||||
#[derive(Debug)]
|
||||
pub struct ListingCatalog {
|
||||
object_store: ObjectStore,
|
||||
object_store: Arc<ObjectStore>,
|
||||
|
||||
uri: String,
|
||||
|
||||
@@ -105,7 +105,7 @@ impl ListingCatalog {
|
||||
}
|
||||
|
||||
async fn open_path(path: &str) -> Result<Self> {
|
||||
let (object_store, base_path) = ObjectStore::from_path(path).unwrap();
|
||||
let (object_store, base_path) = ObjectStore::from_uri(path).await.unwrap();
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(path).context(CreateDirSnafu { path })?;
|
||||
}
|
||||
|
||||
@@ -201,7 +201,7 @@ impl ListingDatabaseOptionsBuilder {
|
||||
/// We will have two tables named `table1` and `table2`.
|
||||
#[derive(Debug)]
|
||||
pub struct ListingDatabase {
|
||||
object_store: ObjectStore,
|
||||
object_store: Arc<ObjectStore>,
|
||||
query_string: Option<String>,
|
||||
|
||||
pub(crate) uri: String,
|
||||
|
||||
@@ -35,6 +35,8 @@ pub enum Error {
|
||||
Schema { message: String },
|
||||
#[snafu(display("Runtime error: {message}"))]
|
||||
Runtime { message: String },
|
||||
#[snafu(display("Timeout error: {message}"))]
|
||||
Timeout { message: String },
|
||||
|
||||
// 3rd party / external errors
|
||||
#[snafu(display("object_store error: {source}"))]
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use scalar::FtsIndexBuilder;
|
||||
use serde::Deserialize;
|
||||
use serde_with::skip_serializing_none;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use vector::IvfFlatIndexBuilder;
|
||||
|
||||
use crate::{table::BaseTable, DistanceType, Error, Result};
|
||||
@@ -17,6 +17,7 @@ use self::{
|
||||
|
||||
pub mod scalar;
|
||||
pub mod vector;
|
||||
pub mod waiter;
|
||||
|
||||
/// Supported index types.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -69,6 +70,7 @@ pub struct IndexBuilder {
|
||||
pub(crate) index: Index,
|
||||
pub(crate) columns: Vec<String>,
|
||||
pub(crate) replace: bool,
|
||||
pub(crate) wait_timeout: Option<Duration>,
|
||||
}
|
||||
|
||||
impl IndexBuilder {
|
||||
@@ -78,6 +80,7 @@ impl IndexBuilder {
|
||||
index,
|
||||
columns,
|
||||
replace: true,
|
||||
wait_timeout: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,6 +94,15 @@ impl IndexBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Duration of time to wait for asynchronous indexing to complete. If not set,
|
||||
/// `create_index()` will not wait.
|
||||
///
|
||||
/// This is not supported for `NativeTable` since indexing is synchronous.
|
||||
pub fn wait_timeout(mut self, d: Duration) -> Self {
|
||||
self.wait_timeout = Some(d);
|
||||
self
|
||||
}
|
||||
|
||||
/// Consume the builder and hand it to the parent table's `create_index`.
pub async fn execute(self) -> Result<()> {
    let table = self.parent.clone();
    table.create_index(self).await
}
|
||||
|
||||
89
rust/lancedb/src/index/waiter.rs
Normal file
89
rust/lancedb/src/index/waiter.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::table::BaseTable;
|
||||
use crate::Error;
|
||||
use log::debug;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::time::sleep;
|
||||
|
||||
/// Pause between two polling rounds in `wait_for_index`, in milliseconds.
const DEFAULT_SLEEP_MS: u64 = 1000;
|
||||
/// Upper bound (2 hours) on the timeout `wait_for_index` accepts; larger values are
/// rejected with `Error::InvalidInput`.
const MAX_WAIT: Duration = Duration::from_secs(2 * 60 * 60);
|
||||
|
||||
/// Poll the table using list_indices() and index_stats() until all of the indices have 0 un-indexed rows.
|
||||
/// Will return Error::Timeout if the columns are not fully indexed within the timeout.
|
||||
pub async fn wait_for_index(
|
||||
table: &dyn BaseTable,
|
||||
index_names: &[&str],
|
||||
timeout: Duration,
|
||||
) -> Result<()> {
|
||||
if timeout > MAX_WAIT {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!("timeout must be less than {:?}", MAX_WAIT),
|
||||
});
|
||||
}
|
||||
let start = Instant::now();
|
||||
let mut remaining = index_names.to_vec();
|
||||
|
||||
// poll via list_indices() and index_stats() until all indices are created and fully indexed
|
||||
while start.elapsed() < timeout {
|
||||
let mut completed = vec![];
|
||||
let indices = table.list_indices().await?;
|
||||
|
||||
for &idx in &remaining {
|
||||
if !indices.iter().any(|i| i.name == *idx) {
|
||||
debug!("still waiting for new index '{}'", idx);
|
||||
continue;
|
||||
}
|
||||
|
||||
let stats = table.index_stats(idx.as_ref()).await?;
|
||||
match stats {
|
||||
None => {
|
||||
debug!("still waiting for new index '{}'", idx);
|
||||
continue;
|
||||
}
|
||||
Some(s) => {
|
||||
if s.num_unindexed_rows == 0 {
|
||||
// note: this may never stabilize under constant writes.
|
||||
// we should later replace this with a status/job model
|
||||
completed.push(idx);
|
||||
debug!(
|
||||
"fully indexed '{}'. indexed rows: {}",
|
||||
idx, s.num_indexed_rows
|
||||
);
|
||||
} else {
|
||||
debug!(
|
||||
"still waiting for index '{}'. unindexed rows: {}",
|
||||
idx, s.num_unindexed_rows
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
remaining.retain(|idx| !completed.contains(idx));
|
||||
if remaining.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
sleep(Duration::from_millis(DEFAULT_SLEEP_MS)).await;
|
||||
}
|
||||
|
||||
// debug log index diagnostics
|
||||
for &r in &remaining {
|
||||
let stats = table.index_stats(r.as_ref()).await?;
|
||||
match stats {
|
||||
Some(s) => debug!(
|
||||
"index '{}' not fully indexed after {:?}. stats: {:?}",
|
||||
r, timeout, s
|
||||
),
|
||||
None => debug!("index '{}' not found after {:?}", r, timeout),
|
||||
}
|
||||
}
|
||||
|
||||
Err(Error::Timeout {
|
||||
message: format!(
|
||||
"timed out waiting for indices: {:?} after {:?}",
|
||||
remaining, timeout
|
||||
),
|
||||
})
|
||||
}
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
pub(crate) mod client;
|
||||
pub(crate) mod db;
|
||||
mod retry;
|
||||
pub(crate) mod table;
|
||||
pub(crate) mod util;
|
||||
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::{collections::HashMap, future::Future, str::FromStr, time::Duration};
|
||||
|
||||
use http::HeaderName;
|
||||
use log::debug;
|
||||
use reqwest::{
|
||||
header::{HeaderMap, HeaderValue},
|
||||
Request, RequestBuilder, Response,
|
||||
Body, Request, RequestBuilder, Response,
|
||||
};
|
||||
use std::{collections::HashMap, future::Future, str::FromStr, time::Duration};
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
use crate::remote::db::RemoteOptions;
|
||||
use crate::remote::retry::{ResolvedRetryConfig, RetryCounter};
|
||||
|
||||
const REQUEST_ID_HEADER: HeaderName = HeaderName::from_static("x-request-id");
|
||||
|
||||
@@ -118,41 +118,14 @@ pub struct RetryConfig {
|
||||
/// You can also set the `LANCE_CLIENT_RETRY_STATUSES` environment variable
|
||||
/// to set this value. Use a comma-separated list of integer values.
|
||||
///
|
||||
/// The default is 429, 500, 502, 503.
|
||||
/// Note that write operations will never be retried on 5xx errors as this may
|
||||
/// result in duplicated writes.
|
||||
///
|
||||
/// The default is 409, 429, 500, 502, 503, 504.
|
||||
pub statuses: Option<Vec<u16>>,
|
||||
// TODO: should we allow customizing methods?
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ResolvedRetryConfig {
|
||||
retries: u8,
|
||||
connect_retries: u8,
|
||||
read_retries: u8,
|
||||
backoff_factor: f32,
|
||||
backoff_jitter: f32,
|
||||
statuses: Vec<reqwest::StatusCode>,
|
||||
}
|
||||
|
||||
impl TryFrom<RetryConfig> for ResolvedRetryConfig {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(retry_config: RetryConfig) -> Result<Self> {
|
||||
Ok(Self {
|
||||
retries: retry_config.retries.unwrap_or(3),
|
||||
connect_retries: retry_config.connect_retries.unwrap_or(3),
|
||||
read_retries: retry_config.read_retries.unwrap_or(3),
|
||||
backoff_factor: retry_config.backoff_factor.unwrap_or(0.25),
|
||||
backoff_jitter: retry_config.backoff_jitter.unwrap_or(0.25),
|
||||
statuses: retry_config
|
||||
.statuses
|
||||
.unwrap_or_else(|| vec![429, 500, 502, 503])
|
||||
.into_iter()
|
||||
.map(|status| reqwest::StatusCode::from_u16(status).unwrap())
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// We use the `HttpSend` trait to abstract over the `reqwest::Client` so that
|
||||
// we can mock responses in tests. Based on the patterns from this blog post:
|
||||
// https://write.as/balrogboogie/testing-reqwest-based-clients
|
||||
@@ -160,8 +133,8 @@ impl TryFrom<RetryConfig> for ResolvedRetryConfig {
|
||||
pub struct RestfulLanceDbClient<S: HttpSend = Sender> {
|
||||
client: reqwest::Client,
|
||||
host: String,
|
||||
retry_config: ResolvedRetryConfig,
|
||||
sender: S,
|
||||
pub(crate) retry_config: ResolvedRetryConfig,
|
||||
pub(crate) sender: S,
|
||||
}
|
||||
|
||||
pub trait HttpSend: Clone + Send + Sync + std::fmt::Debug + 'static {
|
||||
@@ -375,74 +348,69 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
self.client.post(full_uri)
|
||||
}
|
||||
|
||||
pub async fn send(&self, req: RequestBuilder, with_retry: bool) -> Result<(String, Response)> {
|
||||
pub async fn send(&self, req: RequestBuilder) -> Result<(String, Response)> {
|
||||
let (client, request) = req.build_split();
|
||||
let mut request = request.unwrap();
|
||||
let request_id = self.extract_request_id(&mut request);
|
||||
self.log_request(&request, &request_id);
|
||||
|
||||
// Set a request id.
|
||||
// TODO: allow the user to supply this, through middleware?
|
||||
let request_id = if let Some(request_id) = request.headers().get(REQUEST_ID_HEADER) {
|
||||
request_id.to_str().unwrap().to_string()
|
||||
} else {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
let header = HeaderValue::from_str(&request_id).unwrap();
|
||||
request.headers_mut().insert(REQUEST_ID_HEADER, header);
|
||||
request_id
|
||||
};
|
||||
|
||||
if log::log_enabled!(log::Level::Debug) {
|
||||
let content_type = request
|
||||
.headers()
|
||||
.get("content-type")
|
||||
.map(|v| v.to_str().unwrap());
|
||||
if content_type == Some("application/json") {
|
||||
let body = request.body().as_ref().unwrap().as_bytes().unwrap();
|
||||
let body = String::from_utf8_lossy(body);
|
||||
debug!(
|
||||
"Sending request_id={}: {:?} with body {}",
|
||||
request_id, request, body
|
||||
);
|
||||
} else {
|
||||
debug!("Sending request_id={}: {:?}", request_id, request);
|
||||
}
|
||||
}
|
||||
|
||||
if with_retry {
|
||||
self.send_with_retry_impl(client, request, request_id).await
|
||||
} else {
|
||||
let response = self
|
||||
.sender
|
||||
.send(&client, request)
|
||||
.await
|
||||
.err_to_http(request_id.clone())?;
|
||||
debug!(
|
||||
"Received response for request_id={}: {:?}",
|
||||
request_id, &response
|
||||
);
|
||||
Ok((request_id, response))
|
||||
}
|
||||
let response = self
|
||||
.sender
|
||||
.send(&client, request)
|
||||
.await
|
||||
.err_to_http(request_id.clone())?;
|
||||
debug!(
|
||||
"Received response for request_id={}: {:?}",
|
||||
request_id, &response
|
||||
);
|
||||
Ok((request_id, response))
|
||||
}
|
||||
|
||||
async fn send_with_retry_impl(
|
||||
/// Send the request using retries configured in the RetryConfig.
|
||||
/// If retry_5xx is false, 5xx requests will not be retried regardless of the statuses configured
|
||||
/// in the RetryConfig.
|
||||
/// Since this requires arrow serialization, this is implemented here instead of in RestfulLanceDbClient
|
||||
pub async fn send_with_retry(
|
||||
&self,
|
||||
client: reqwest::Client,
|
||||
req: Request,
|
||||
request_id: String,
|
||||
req_builder: RequestBuilder,
|
||||
mut make_body: Option<Box<dyn FnMut() -> Result<Body> + Send + 'static>>,
|
||||
retry_5xx: bool,
|
||||
) -> Result<(String, Response)> {
|
||||
let mut retry_counter = RetryCounter::new(&self.retry_config, request_id);
|
||||
let retry_config = &self.retry_config;
|
||||
let non_5xx_statuses = retry_config
|
||||
.statuses
|
||||
.iter()
|
||||
.filter(|s| !s.is_server_error())
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// clone and build the request to extract the request id
|
||||
let tmp_req = req_builder.try_clone().ok_or_else(|| Error::Runtime {
|
||||
message: "Attempted to retry a request that cannot be cloned".to_string(),
|
||||
})?;
|
||||
let (_, r) = tmp_req.build_split();
|
||||
let mut r = r.unwrap();
|
||||
let request_id = self.extract_request_id(&mut r);
|
||||
let mut retry_counter = RetryCounter::new(retry_config, request_id.clone());
|
||||
|
||||
loop {
|
||||
// This only works if the request body is not a stream. If it is
|
||||
// a stream, we can't use the retry path. We would need to implement
|
||||
// an outer retry.
|
||||
let request = req.try_clone().ok_or_else(|| Error::Runtime {
|
||||
let mut req_builder = req_builder.try_clone().ok_or_else(|| Error::Runtime {
|
||||
message: "Attempted to retry a request that cannot be cloned".to_string(),
|
||||
})?;
|
||||
let response = self
|
||||
.sender
|
||||
.send(&client, request)
|
||||
.await
|
||||
.map(|r| (r.status(), r));
|
||||
|
||||
// set the streaming body on the request builder after clone
|
||||
if let Some(body_gen) = make_body.as_mut() {
|
||||
let body = body_gen()?;
|
||||
req_builder = req_builder.body(body);
|
||||
}
|
||||
|
||||
let (c, request) = req_builder.build_split();
|
||||
let mut request = request.unwrap();
|
||||
self.set_request_id(&mut request, &request_id.clone());
|
||||
self.log_request(&request, &request_id);
|
||||
|
||||
let response = self.sender.send(&c, request).await.map(|r| (r.status(), r));
|
||||
|
||||
match response {
|
||||
Ok((status, response)) if status.is_success() => {
|
||||
debug!(
|
||||
@@ -451,7 +419,10 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
);
|
||||
return Ok((retry_counter.request_id, response));
|
||||
}
|
||||
Ok((status, response)) if self.retry_config.statuses.contains(&status) => {
|
||||
Ok((status, response))
|
||||
if (retry_5xx && retry_config.statuses.contains(&status))
|
||||
|| non_5xx_statuses.contains(&status) =>
|
||||
{
|
||||
let source = self
|
||||
.check_response(&retry_counter.request_id, response)
|
||||
.await
|
||||
@@ -480,6 +451,47 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
}
|
||||
}
|
||||
|
||||
fn log_request(&self, request: &Request, request_id: &String) {
|
||||
if log::log_enabled!(log::Level::Debug) {
|
||||
let content_type = request
|
||||
.headers()
|
||||
.get("content-type")
|
||||
.map(|v| v.to_str().unwrap());
|
||||
if content_type == Some("application/json") {
|
||||
let body = request.body().as_ref().unwrap().as_bytes().unwrap();
|
||||
let body = String::from_utf8_lossy(body);
|
||||
debug!(
|
||||
"Sending request_id={}: {:?} with body {}",
|
||||
request_id, request, body
|
||||
);
|
||||
} else {
|
||||
debug!("Sending request_id={}: {:?}", request_id, request);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the request ID from the request headers.
|
||||
/// If the request ID header is not set, this will generate a new one and set
|
||||
/// it on the request headers
|
||||
pub fn extract_request_id(&self, request: &mut Request) -> String {
|
||||
// Set a request id.
|
||||
// TODO: allow the user to supply this, through middleware?
|
||||
let request_id = if let Some(request_id) = request.headers().get(REQUEST_ID_HEADER) {
|
||||
request_id.to_str().unwrap().to_string()
|
||||
} else {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
self.set_request_id(request, &request_id);
|
||||
request_id
|
||||
};
|
||||
request_id
|
||||
}
|
||||
|
||||
/// Set the request ID header
|
||||
pub fn set_request_id(&self, request: &mut Request, request_id: &str) {
|
||||
let header = HeaderValue::from_str(request_id).unwrap();
|
||||
request.headers_mut().insert(REQUEST_ID_HEADER, header);
|
||||
}
|
||||
|
||||
pub async fn check_response(&self, request_id: &str, response: Response) -> Result<Response> {
|
||||
// Try to get the response text, but if that fails, just return the status code
|
||||
let status = response.status();
|
||||
@@ -501,91 +513,6 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
}
|
||||
}
|
||||
|
||||
struct RetryCounter<'a> {
|
||||
request_failures: u8,
|
||||
connect_failures: u8,
|
||||
read_failures: u8,
|
||||
config: &'a ResolvedRetryConfig,
|
||||
request_id: String,
|
||||
}
|
||||
|
||||
impl<'a> RetryCounter<'a> {
|
||||
fn new(config: &'a ResolvedRetryConfig, request_id: String) -> Self {
|
||||
Self {
|
||||
request_failures: 0,
|
||||
connect_failures: 0,
|
||||
read_failures: 0,
|
||||
config,
|
||||
request_id,
|
||||
}
|
||||
}
|
||||
|
||||
fn check_out_of_retries(
|
||||
&self,
|
||||
source: Box<dyn std::error::Error + Send + Sync>,
|
||||
status_code: Option<reqwest::StatusCode>,
|
||||
) -> Result<()> {
|
||||
if self.request_failures >= self.config.retries
|
||||
|| self.connect_failures >= self.config.connect_retries
|
||||
|| self.read_failures >= self.config.read_retries
|
||||
{
|
||||
Err(Error::Retry {
|
||||
request_id: self.request_id.clone(),
|
||||
request_failures: self.request_failures,
|
||||
max_request_failures: self.config.retries,
|
||||
connect_failures: self.connect_failures,
|
||||
max_connect_failures: self.config.connect_retries,
|
||||
read_failures: self.read_failures,
|
||||
max_read_failures: self.config.read_retries,
|
||||
source,
|
||||
status_code,
|
||||
})
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn increment_request_failures(&mut self, source: crate::Error) -> Result<()> {
|
||||
self.request_failures += 1;
|
||||
let status_code = if let crate::Error::Http { status_code, .. } = &source {
|
||||
*status_code
|
||||
} else {
|
||||
None
|
||||
};
|
||||
self.check_out_of_retries(Box::new(source), status_code)
|
||||
}
|
||||
|
||||
fn increment_connect_failures(&mut self, source: reqwest::Error) -> Result<()> {
|
||||
self.connect_failures += 1;
|
||||
let status_code = source.status();
|
||||
self.check_out_of_retries(Box::new(source), status_code)
|
||||
}
|
||||
|
||||
fn increment_read_failures(&mut self, source: reqwest::Error) -> Result<()> {
|
||||
self.read_failures += 1;
|
||||
let status_code = source.status();
|
||||
self.check_out_of_retries(Box::new(source), status_code)
|
||||
}
|
||||
|
||||
fn next_sleep_time(&self) -> Duration {
|
||||
let backoff = self.config.backoff_factor * (2.0f32.powi(self.request_failures as i32));
|
||||
let jitter = rand::random::<f32>() * self.config.backoff_jitter;
|
||||
let sleep_time = Duration::from_secs_f32(backoff + jitter);
|
||||
debug!(
|
||||
"Retrying request {:?} ({}/{} connect, {}/{} read, {}/{} read) in {:?}",
|
||||
self.request_id,
|
||||
self.connect_failures,
|
||||
self.config.connect_retries,
|
||||
self.request_failures,
|
||||
self.config.retries,
|
||||
self.read_failures,
|
||||
self.config.read_retries,
|
||||
sleep_time
|
||||
);
|
||||
sleep_time
|
||||
}
|
||||
}
|
||||
|
||||
pub trait RequestResultExt {
|
||||
type Output;
|
||||
fn err_to_http(self, request_id: String) -> Result<Self::Output>;
|
||||
|
||||
@@ -255,7 +255,7 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
|
||||
if let Some(start_after) = request.start_after {
|
||||
req = req.query(&[("page_token", start_after)]);
|
||||
}
|
||||
let (request_id, rsp) = self.client.send(req, true).await?;
|
||||
let (request_id, rsp) = self.client.send_with_retry(req, None, true).await?;
|
||||
let rsp = self.client.check_response(&request_id, rsp).await?;
|
||||
let version = parse_server_version(&request_id, &rsp)?;
|
||||
let tables = rsp
|
||||
@@ -302,7 +302,7 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
|
||||
.body(data_buffer)
|
||||
.header(CONTENT_TYPE, ARROW_STREAM_CONTENT_TYPE);
|
||||
|
||||
let (request_id, rsp) = self.client.send(req, false).await?;
|
||||
let (request_id, rsp) = self.client.send(req).await?;
|
||||
|
||||
if rsp.status() == StatusCode::BAD_REQUEST {
|
||||
let body = rsp.text().await.err_to_http(request_id.clone())?;
|
||||
@@ -362,7 +362,7 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
|
||||
let req = self
|
||||
.client
|
||||
.post(&format!("/v1/table/{}/describe/", request.name));
|
||||
let (request_id, rsp) = self.client.send(req, true).await?;
|
||||
let (request_id, rsp) = self.client.send_with_retry(req, None, true).await?;
|
||||
if rsp.status() == StatusCode::NOT_FOUND {
|
||||
return Err(crate::Error::TableNotFound { name: request.name });
|
||||
}
|
||||
@@ -383,7 +383,7 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
|
||||
.client
|
||||
.post(&format!("/v1/table/{}/rename/", current_name));
|
||||
let req = req.json(&serde_json::json!({ "new_table_name": new_name }));
|
||||
let (request_id, resp) = self.client.send(req, false).await?;
|
||||
let (request_id, resp) = self.client.send(req).await?;
|
||||
self.client.check_response(&request_id, resp).await?;
|
||||
let table = self.table_cache.remove(current_name).await;
|
||||
if let Some(table) = table {
|
||||
@@ -394,7 +394,7 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
|
||||
|
||||
async fn drop_table(&self, name: &str) -> Result<()> {
|
||||
let req = self.client.post(&format!("/v1/table/{}/drop/", name));
|
||||
let (request_id, resp) = self.client.send(req, true).await?;
|
||||
let (request_id, resp) = self.client.send(req).await?;
|
||||
self.client.check_response(&request_id, resp).await?;
|
||||
self.table_cache.remove(name).await;
|
||||
Ok(())
|
||||
|
||||
122
rust/lancedb/src/remote/retry.rs
Normal file
122
rust/lancedb/src/remote/retry.rs
Normal file
@@ -0,0 +1,122 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use crate::remote::RetryConfig;
|
||||
use crate::Error;
|
||||
use log::debug;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Tracks how many times a single request has failed, per failure category,
/// and compares those counts against the limits in a `ResolvedRetryConfig`.
pub struct RetryCounter<'a> {
|
||||
/// Failures recorded through `increment_request_failures`.
pub request_failures: u8,
|
||||
/// Failures recorded through `increment_connect_failures`.
pub connect_failures: u8,
|
||||
/// Failures recorded through `increment_read_failures`.
pub read_failures: u8,
|
||||
/// Retry limits and backoff settings the counters are checked against.
pub config: &'a ResolvedRetryConfig,
|
||||
/// ID of the request being retried; copied into `Error::Retry` and log messages.
pub request_id: String,
|
||||
}
|
||||
|
||||
impl<'a> RetryCounter<'a> {
|
||||
pub(crate) fn new(config: &'a ResolvedRetryConfig, request_id: String) -> Self {
|
||||
Self {
|
||||
request_failures: 0,
|
||||
connect_failures: 0,
|
||||
read_failures: 0,
|
||||
config,
|
||||
request_id,
|
||||
}
|
||||
}
|
||||
|
||||
fn check_out_of_retries(
|
||||
&self,
|
||||
source: Box<dyn std::error::Error + Send + Sync>,
|
||||
status_code: Option<reqwest::StatusCode>,
|
||||
) -> crate::Result<()> {
|
||||
if self.request_failures >= self.config.retries
|
||||
|| self.connect_failures >= self.config.connect_retries
|
||||
|| self.read_failures >= self.config.read_retries
|
||||
{
|
||||
Err(Error::Retry {
|
||||
request_id: self.request_id.clone(),
|
||||
request_failures: self.request_failures,
|
||||
max_request_failures: self.config.retries,
|
||||
connect_failures: self.connect_failures,
|
||||
max_connect_failures: self.config.connect_retries,
|
||||
read_failures: self.read_failures,
|
||||
max_read_failures: self.config.read_retries,
|
||||
source,
|
||||
status_code,
|
||||
})
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn increment_request_failures(&mut self, source: crate::Error) -> crate::Result<()> {
|
||||
self.request_failures += 1;
|
||||
let status_code = if let crate::Error::Http { status_code, .. } = &source {
|
||||
*status_code
|
||||
} else {
|
||||
None
|
||||
};
|
||||
self.check_out_of_retries(Box::new(source), status_code)
|
||||
}
|
||||
|
||||
pub fn increment_connect_failures(&mut self, source: reqwest::Error) -> crate::Result<()> {
|
||||
self.connect_failures += 1;
|
||||
let status_code = source.status();
|
||||
self.check_out_of_retries(Box::new(source), status_code)
|
||||
}
|
||||
|
||||
pub fn increment_read_failures(&mut self, source: reqwest::Error) -> crate::Result<()> {
|
||||
self.read_failures += 1;
|
||||
let status_code = source.status();
|
||||
self.check_out_of_retries(Box::new(source), status_code)
|
||||
}
|
||||
|
||||
pub fn next_sleep_time(&self) -> Duration {
|
||||
let backoff = self.config.backoff_factor * (2.0f32.powi(self.request_failures as i32));
|
||||
let jitter = rand::random::<f32>() * self.config.backoff_jitter;
|
||||
let sleep_time = Duration::from_secs_f32(backoff + jitter);
|
||||
debug!(
|
||||
"Retrying request {:?} ({}/{} connect, {}/{} read, {}/{} read) in {:?}",
|
||||
self.request_id,
|
||||
self.connect_failures,
|
||||
self.config.connect_retries,
|
||||
self.request_failures,
|
||||
self.config.retries,
|
||||
self.read_failures,
|
||||
self.config.read_retries,
|
||||
sleep_time
|
||||
);
|
||||
sleep_time
|
||||
}
|
||||
}
|
||||
|
||||
/// A `RetryConfig` with every optional setting replaced by a concrete value
/// (see the `TryFrom<RetryConfig>` implementation for the defaults).
#[derive(Debug, Clone)]
|
||||
pub struct ResolvedRetryConfig {
|
||||
/// Request-failure limit checked by `RetryCounter`; defaults to 3.
pub retries: u8,
|
||||
/// Connect-failure limit checked by `RetryCounter`; defaults to 3.
pub connect_retries: u8,
|
||||
/// Read-failure limit checked by `RetryCounter`; defaults to 3.
pub read_retries: u8,
|
||||
/// Multiplier, in seconds, applied to `2^request_failures` when computing the
/// retry delay; defaults to 0.25.
pub backoff_factor: f32,
|
||||
/// Scale, in seconds, of the random component added to the retry delay;
/// defaults to 0.25.
pub backoff_jitter: f32,
|
||||
/// HTTP status codes configured for retry; defaults to
/// 409, 429, 500, 502, 503, 504.
pub statuses: Vec<reqwest::StatusCode>,
|
||||
}
|
||||
|
||||
impl TryFrom<RetryConfig> for ResolvedRetryConfig {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(retry_config: RetryConfig) -> crate::Result<Self> {
|
||||
Ok(Self {
|
||||
retries: retry_config.retries.unwrap_or(3),
|
||||
connect_retries: retry_config.connect_retries.unwrap_or(3),
|
||||
read_retries: retry_config.read_retries.unwrap_or(3),
|
||||
backoff_factor: retry_config.backoff_factor.unwrap_or(0.25),
|
||||
backoff_jitter: retry_config.backoff_jitter.unwrap_or(0.25),
|
||||
statuses: retry_config
|
||||
.statuses
|
||||
.unwrap_or_else(|| vec![409, 429, 500, 502, 503, 504])
|
||||
.into_iter()
|
||||
.map(|status| reqwest::StatusCode::from_u16(status).unwrap())
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -3,10 +3,6 @@
|
||||
|
||||
//! LanceDB Table APIs
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{AsArray, FixedSizeListBuilder, Float32Builder};
|
||||
use arrow::datatypes::{Float32Type, UInt8Type};
|
||||
use arrow_array::{RecordBatchIterator, RecordBatchReader};
|
||||
@@ -18,7 +14,7 @@ use datafusion_physical_plan::projection::ProjectionExec;
|
||||
use datafusion_physical_plan::repartition::RepartitionExec;
|
||||
use datafusion_physical_plan::union::UnionExec;
|
||||
use datafusion_physical_plan::ExecutionPlan;
|
||||
use futures::{StreamExt, TryStreamExt};
|
||||
use futures::{FutureExt, StreamExt, TryFutureExt, TryStreamExt};
|
||||
use lance::dataset::builder::DatasetBuilder;
|
||||
use lance::dataset::cleanup::RemovalStats;
|
||||
use lance::dataset::optimize::{compact_files, CompactionMetrics, IndexRemapperOptions};
|
||||
@@ -45,6 +41,10 @@ use lance_table::format::Manifest;
|
||||
use lance_table::io::commit::ManifestNamingScheme;
|
||||
use log::info;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::format;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::arrow::IntoArrow;
|
||||
use crate::connection::NoData;
|
||||
@@ -78,10 +78,15 @@ pub mod datafusion;
|
||||
pub(crate) mod dataset;
|
||||
pub mod merge;
|
||||
|
||||
use crate::index::waiter::wait_for_index;
|
||||
pub use chrono::Duration;
|
||||
use futures::future::{join_all, Either};
|
||||
pub use lance::dataset::optimize::CompactionOptions;
|
||||
pub use lance::dataset::refs::{TagContents, Tags as LanceTags};
|
||||
pub use lance::dataset::scanner::DatasetRecordBatchStream;
|
||||
use lance::dataset::statistics::DatasetStatisticsExt;
|
||||
pub use lance_index::optimize::OptimizeOptions;
|
||||
use serde_with::skip_serializing_none;
|
||||
|
||||
/// Defines the type of column
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -306,7 +311,7 @@ impl<T: IntoArrow> AddDataBuilder<T> {
|
||||
self
|
||||
}
|
||||
|
||||
pub async fn execute(self) -> Result<()> {
|
||||
pub async fn execute(self) -> Result<AddResult> {
|
||||
let parent = self.parent.clone();
|
||||
let data = self.data.into_arrow()?;
|
||||
let without_data = AddDataBuilder::<NoData> {
|
||||
@@ -374,8 +379,8 @@ impl UpdateBuilder {
|
||||
}
|
||||
|
||||
/// Executes the update operation.
|
||||
/// Returns the number of rows that were updated.
|
||||
pub async fn execute(self) -> Result<u64> {
|
||||
/// Returns the update result
|
||||
pub async fn execute(self) -> Result<UpdateResult> {
|
||||
if self.columns.is_empty() {
|
||||
Err(Error::InvalidInput {
|
||||
message: "at least one column must be specified in an update operation".to_string(),
|
||||
@@ -400,6 +405,100 @@ pub enum AnyQuery {
|
||||
VectorQuery(VectorQueryRequest),
|
||||
}
|
||||
|
||||
/// Asynchronous operations on the named tags of a table, where each tag
/// refers to a table version.
#[async_trait]
|
||||
pub trait Tags: Send + Sync {
|
||||
/// List the tags of the table.
|
||||
async fn list(&self) -> Result<HashMap<String, TagContents>>;
|
||||
|
||||
/// Get the version of the table referenced by a tag.
|
||||
async fn get_version(&self, tag: &str) -> Result<u64>;
|
||||
|
||||
/// Create a new tag for the given version of the table.
|
||||
async fn create(&mut self, tag: &str, version: u64) -> Result<()>;
|
||||
|
||||
/// Delete a tag from the table.
|
||||
async fn delete(&mut self, tag: &str) -> Result<()>;
|
||||
|
||||
/// Update an existing tag to point to a new version of the table.
|
||||
async fn update(&mut self, tag: &str, version: u64) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Outcome of an update operation, as returned by `UpdateBuilder::execute`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct UpdateResult {
|
||||
/// Number of rows that were updated; 0 when the field is missing on deserialization.
#[serde(default)]
|
||||
pub rows_updated: u64,
|
||||
/// The commit version associated with the operation.
|
||||
/// A version of `0` indicates compatibility with legacy servers that do not return
|
||||
/// a commit version.
|
||||
#[serde(default)]
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
/// Outcome of an add operation, as returned by `AddDataBuilder::execute`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct AddResult {
|
||||
/// The commit version associated with the operation.
|
||||
/// A version of `0` indicates compatibility with legacy servers that do not return
|
||||
/// a commit version.
|
||||
#[serde(default)]
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
/// Outcome of a delete operation, as returned by `BaseTable::delete`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct DeleteResult {
|
||||
/// The commit version associated with the operation.
|
||||
/// A version of `0` indicates compatibility with legacy servers that do not return
|
||||
/// a commit version.
|
||||
#[serde(default)]
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
/// Outcome of a merge operation: the commit version plus per-category row counts.
// NOTE(review): presumably returned by the merge-insert API; the producer is not visible here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct MergeResult {
|
||||
/// The commit version associated with the operation.
|
||||
/// A version of `0` indicates compatibility with legacy servers that do not return
|
||||
/// a commit version.
|
||||
#[serde(default)]
|
||||
pub version: u64,
|
||||
/// Number of inserted rows (for user statistics)
|
||||
#[serde(default)]
|
||||
pub num_inserted_rows: u64,
|
||||
/// Number of updated rows (for user statistics)
|
||||
#[serde(default)]
|
||||
pub num_updated_rows: u64,
|
||||
/// Number of deleted rows (for user statistics)
|
||||
/// Note: This is different from internal references to 'deleted_rows', since we technically "delete" updated rows during processing.
|
||||
/// However those rows are not shared with the user.
|
||||
#[serde(default)]
|
||||
pub num_deleted_rows: u64,
|
||||
}
|
||||
|
||||
/// Outcome of an add-columns operation.
// NOTE(review): presumably returned by the table's add-columns API; the producer is not visible here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct AddColumnsResult {
|
||||
/// The commit version associated with the operation.
|
||||
/// A version of `0` indicates compatibility with legacy servers that do not return
|
||||
/// a commit version.
|
||||
#[serde(default)]
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
/// Result returned when columns of a table are altered.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct AlterColumnsResult {
|
||||
/// The commit version associated with the operation.
|
||||
/// A version of `0` indicates compatibility with legacy servers that do not return
|
||||
/// a commit version.
|
||||
#[serde(default)]
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
/// Result returned when columns are dropped from a table.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct DropColumnsResult {
|
||||
/// The commit version associated with the operation.
|
||||
/// A version of `0` indicates compatibility with legacy servers that do not return
|
||||
/// a commit version.
|
||||
#[serde(default)]
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
/// A trait for anything "table-like". This is used for both native tables (which target
|
||||
/// Lance datasets) and remote tables (which target LanceDB cloud)
|
||||
///
|
||||
@@ -444,17 +543,19 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
|
||||
&self,
|
||||
add: AddDataBuilder<NoData>,
|
||||
data: Box<dyn arrow_array::RecordBatchReader + Send>,
|
||||
) -> Result<()>;
|
||||
) -> Result<AddResult>;
|
||||
/// Delete rows from the table.
|
||||
async fn delete(&self, predicate: &str) -> Result<()>;
|
||||
async fn delete(&self, predicate: &str) -> Result<DeleteResult>;
|
||||
/// Update rows in the table.
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<u64>;
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<UpdateResult>;
|
||||
/// Create an index on the provided column(s).
|
||||
async fn create_index(&self, index: IndexBuilder) -> Result<()>;
|
||||
/// List the indices on the table.
|
||||
async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
|
||||
/// Drop an index from the table.
|
||||
async fn drop_index(&self, name: &str) -> Result<()>;
|
||||
/// Prewarm an index in the table
|
||||
async fn prewarm_index(&self, name: &str) -> Result<()>;
|
||||
/// Get statistics about the index.
|
||||
async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>>;
|
||||
/// Merge insert new records into the table.
|
||||
@@ -462,7 +563,9 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
|
||||
&self,
|
||||
params: MergeInsertBuilder,
|
||||
new_data: Box<dyn RecordBatchReader + Send>,
|
||||
) -> Result<()>;
|
||||
) -> Result<MergeResult>;
|
||||
/// Gets the table tag manager.
|
||||
async fn tags(&self) -> Result<Box<dyn Tags + '_>>;
|
||||
/// Optimize the dataset.
|
||||
async fn optimize(&self, action: OptimizeAction) -> Result<OptimizeStats>;
|
||||
/// Add columns to the table.
|
||||
@@ -470,15 +573,18 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
|
||||
&self,
|
||||
transforms: NewColumnTransform,
|
||||
read_columns: Option<Vec<String>>,
|
||||
) -> Result<()>;
|
||||
) -> Result<AddColumnsResult>;
|
||||
/// Alter columns in the table.
|
||||
async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()>;
|
||||
async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<AlterColumnsResult>;
|
||||
/// Drop columns from the table.
|
||||
async fn drop_columns(&self, columns: &[&str]) -> Result<()>;
|
||||
async fn drop_columns(&self, columns: &[&str]) -> Result<DropColumnsResult>;
|
||||
/// Get the version of the table.
|
||||
async fn version(&self) -> Result<u64>;
|
||||
/// Checkout a specific version of the table.
|
||||
async fn checkout(&self, version: u64) -> Result<()>;
|
||||
/// Checkout a table version referenced by a tag.
|
||||
/// Tags provide a human-readable way to reference specific versions of the table.
|
||||
async fn checkout_tag(&self, tag: &str) -> Result<()>;
|
||||
/// Checkout the latest version of the table.
|
||||
async fn checkout_latest(&self) -> Result<()>;
|
||||
/// Restore the table to the currently checked out version.
|
||||
@@ -489,6 +595,15 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
|
||||
async fn table_definition(&self) -> Result<TableDefinition>;
|
||||
/// Get the table URI
|
||||
fn dataset_uri(&self) -> &str;
|
||||
/// Poll until the columns are fully indexed. Will return Error::Timeout if the columns
|
||||
/// are not fully indexed within the timeout.
|
||||
async fn wait_for_index(
|
||||
&self,
|
||||
index_names: &[&str],
|
||||
timeout: std::time::Duration,
|
||||
) -> Result<()>;
|
||||
/// Get statistics on the table
|
||||
async fn stats(&self) -> Result<TableStatistics>;
|
||||
}
|
||||
|
||||
/// A Table is a collection of strong typed Rows.
|
||||
@@ -691,7 +806,7 @@ impl Table {
|
||||
/// tbl.delete("id > 5").await.unwrap();
|
||||
/// # });
|
||||
/// ```
|
||||
pub async fn delete(&self, predicate: &str) -> Result<()> {
|
||||
pub async fn delete(&self, predicate: &str) -> Result<DeleteResult> {
|
||||
self.inner.delete(predicate).await
|
||||
}
|
||||
|
||||
@@ -767,6 +882,28 @@ impl Table {
|
||||
)
|
||||
}
|
||||
|
||||
/// See [Table::create_index]
|
||||
/// For remote tables, this allows an optional wait_timeout to poll until asynchronous indexing is complete
|
||||
pub fn create_index_with_timeout(
|
||||
&self,
|
||||
columns: &[impl AsRef<str>],
|
||||
index: Index,
|
||||
wait_timeout: Option<std::time::Duration>,
|
||||
) -> IndexBuilder {
|
||||
let mut builder = IndexBuilder::new(
|
||||
self.inner.clone(),
|
||||
columns
|
||||
.iter()
|
||||
.map(|val| val.as_ref().to_string())
|
||||
.collect::<Vec<_>>(),
|
||||
index,
|
||||
);
|
||||
if let Some(timeout) = wait_timeout {
|
||||
builder = builder.wait_timeout(timeout);
|
||||
}
|
||||
builder
|
||||
}
|
||||
|
||||
/// Create a builder for a merge insert operation
|
||||
///
|
||||
/// This operation can add rows, update rows, and remove rows all in a single
|
||||
@@ -984,17 +1121,20 @@ impl Table {
|
||||
&self,
|
||||
transforms: NewColumnTransform,
|
||||
read_columns: Option<Vec<String>>,
|
||||
) -> Result<()> {
|
||||
) -> Result<AddColumnsResult> {
|
||||
self.inner.add_columns(transforms, read_columns).await
|
||||
}
|
||||
|
||||
/// Change a column's name or nullability.
|
||||
pub async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()> {
|
||||
pub async fn alter_columns(
|
||||
&self,
|
||||
alterations: &[ColumnAlteration],
|
||||
) -> Result<AlterColumnsResult> {
|
||||
self.inner.alter_columns(alterations).await
|
||||
}
|
||||
|
||||
/// Remove columns from the table.
|
||||
pub async fn drop_columns(&self, columns: &[&str]) -> Result<()> {
|
||||
pub async fn drop_columns(&self, columns: &[&str]) -> Result<DropColumnsResult> {
|
||||
self.inner.drop_columns(columns).await
|
||||
}
|
||||
|
||||
@@ -1026,6 +1166,24 @@ impl Table {
|
||||
self.inner.checkout(version).await
|
||||
}
|
||||
|
||||
/// Checks out a specific version of the Table by tag
|
||||
///
|
||||
/// Any read operation on the table will now access the data at the version referenced by the tag.
|
||||
/// As a consequence, calling this method will disable any read consistency interval
|
||||
/// that was previously set.
|
||||
///
|
||||
/// This is a read-only operation that turns the table into a sort of "view"
|
||||
/// or "detached head". Other table instances will not be affected. To make the change
|
||||
/// permanent you can use the [`Self::restore`] method.
|
||||
///
|
||||
/// Any operation that modifies the table will fail while the table is in a checked
|
||||
/// out state.
|
||||
///
|
||||
/// To return the table to a normal state use [`Self::checkout_latest`]
|
||||
pub async fn checkout_tag(&self, tag: &str) -> Result<()> {
|
||||
self.inner.checkout_tag(tag).await
|
||||
}
|
||||
|
||||
/// Ensures the table is pointing at the latest version
|
||||
///
|
||||
/// This can be used to manually update a table when the read_consistency_interval is None
|
||||
@@ -1086,6 +1244,37 @@ impl Table {
|
||||
self.inner.drop_index(name).await
|
||||
}
|
||||
|
||||
/// Prewarm an index in the table
|
||||
///
|
||||
/// This is a hint to fully load the index into memory. It can be used to
|
||||
/// avoid cold starts
|
||||
///
|
||||
/// It is generally wasteful to call this if the index does not fit into the
|
||||
/// available cache.
|
||||
///
|
||||
/// Note: This function is not yet supported on all indices, in which case it
|
||||
/// may do nothing.
|
||||
///
|
||||
/// Use [`Self::list_indices()`] to find the names of the indices.
|
||||
pub async fn prewarm_index(&self, name: &str) -> Result<()> {
|
||||
self.inner.prewarm_index(name).await
|
||||
}
|
||||
|
||||
/// Poll until the indices named in `index_names` are fully indexed. Will return Error::Timeout
|
||||
/// if they are not fully indexed within `timeout`.
///
/// This forwards directly to the underlying table implementation.
pub async fn wait_for_index(
|
||||
&self,
|
||||
index_names: &[&str],
|
||||
timeout: std::time::Duration,
|
||||
) -> Result<()> {
|
||||
self.inner.wait_for_index(index_names, timeout).await
|
||||
}
|
||||
|
||||
/// Get the tags manager.
|
||||
pub async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
|
||||
self.inner.tags().await
|
||||
}
|
||||
|
||||
// Take many execution plans and map them into a single plan that adds
|
||||
// a query_index column and unions them.
|
||||
pub(crate) fn multi_vector_plan(
|
||||
@@ -1136,6 +1325,40 @@ impl Table {
|
||||
.unwrap();
|
||||
Ok(Arc::new(repartitioned))
|
||||
}
|
||||
|
||||
/// Retrieve statistics on the table
|
||||
pub async fn stats(&self) -> Result<TableStatistics> {
|
||||
self.inner.stats().await
|
||||
}
|
||||
}
|
||||
|
||||
/// Tag manager for native tables; a thin wrapper around a `LanceTags` handle.
pub struct NativeTags {
|
||||
// The wrapped Lance tag handle that every `Tags` method forwards to.
inner: LanceTags,
|
||||
}
|
||||
/// `Tags` implementation that forwards each call to the wrapped `LanceTags`
/// and propagates any error with `?`.
#[async_trait]
|
||||
impl Tags for NativeTags {
|
||||
/// List all tags, keyed by tag name.
async fn list(&self) -> Result<HashMap<String, TagContents>> {
|
||||
Ok(self.inner.list().await?)
|
||||
}
|
||||
|
||||
/// Look up the version that `tag` refers to.
async fn get_version(&self, tag: &str) -> Result<u64> {
|
||||
Ok(self.inner.get_version(tag).await?)
|
||||
}
|
||||
|
||||
/// Create `tag` pointing at `version`.
async fn create(&mut self, tag: &str, version: u64) -> Result<()> {
|
||||
self.inner.create(tag, version).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete `tag`.
async fn delete(&mut self, tag: &str) -> Result<()> {
|
||||
self.inner.delete(tag).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Re-point the existing `tag` at `version`.
async fn update(&mut self, tag: &str, version: u64) -> Result<()> {
|
||||
self.inner.update(tag, version).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NativeTable> for Table {
|
||||
@@ -1791,7 +2014,7 @@ impl NativeTable {
|
||||
/// more information.
|
||||
pub async fn uses_v2_manifest_paths(&self) -> Result<bool> {
|
||||
let dataset = self.dataset.get().await?;
|
||||
Ok(dataset.manifest_naming_scheme == ManifestNamingScheme::V2)
|
||||
Ok(dataset.manifest_location().naming_scheme == ManifestNamingScheme::V2)
|
||||
}
|
||||
|
||||
/// Migrate the table to use the new manifest path scheme.
|
||||
@@ -1882,6 +2105,10 @@ impl BaseTable for NativeTable {
|
||||
self.dataset.as_time_travel(version).await
|
||||
}
|
||||
|
||||
async fn checkout_tag(&self, tag: &str) -> Result<()> {
|
||||
self.dataset.as_time_travel(tag).await
|
||||
}
|
||||
|
||||
async fn checkout_latest(&self) -> Result<()> {
|
||||
self.dataset
|
||||
.as_latest(self.read_consistency_interval)
|
||||
@@ -1940,7 +2167,7 @@ impl BaseTable for NativeTable {
|
||||
&self,
|
||||
add: AddDataBuilder<NoData>,
|
||||
data: Box<dyn RecordBatchReader + Send>,
|
||||
) -> Result<()> {
|
||||
) -> Result<AddResult> {
|
||||
let data = Box::new(MaybeEmbedded::try_new(
|
||||
data,
|
||||
self.table_definition().await?,
|
||||
@@ -1963,9 +2190,9 @@ impl BaseTable for NativeTable {
|
||||
.execute_stream(data)
|
||||
.await?
|
||||
};
|
||||
|
||||
let version = dataset.manifest().version;
|
||||
self.dataset.set_latest(dataset).await;
|
||||
Ok(())
|
||||
Ok(AddResult { version })
|
||||
}
|
||||
|
||||
async fn create_index(&self, opts: IndexBuilder) -> Result<()> {
|
||||
@@ -2006,7 +2233,12 @@ impl BaseTable for NativeTable {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<u64> {
|
||||
async fn prewarm_index(&self, index_name: &str) -> Result<()> {
|
||||
let dataset = self.dataset.get().await?;
|
||||
Ok(dataset.prewarm_index(index_name).await?)
|
||||
}
|
||||
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<UpdateResult> {
|
||||
let dataset = self.dataset.get().await?.clone();
|
||||
let mut builder = LanceUpdateBuilder::new(Arc::new(dataset));
|
||||
if let Some(predicate) = update.filter {
|
||||
@@ -2022,7 +2254,10 @@ impl BaseTable for NativeTable {
|
||||
self.dataset
|
||||
.set_latest(res.new_dataset.as_ref().clone())
|
||||
.await;
|
||||
Ok(res.rows_updated)
|
||||
Ok(UpdateResult {
|
||||
rows_updated: res.rows_updated,
|
||||
version: res.new_dataset.version().version,
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_plan(
|
||||
@@ -2214,7 +2449,7 @@ impl BaseTable for NativeTable {
|
||||
&self,
|
||||
params: MergeInsertBuilder,
|
||||
new_data: Box<dyn RecordBatchReader + Send>,
|
||||
) -> Result<()> {
|
||||
) -> Result<MergeResult> {
|
||||
let dataset = Arc::new(self.dataset.get().await?.clone());
|
||||
let mut builder = LanceMergeInsertBuilder::try_new(dataset.clone(), params.on)?;
|
||||
match (
|
||||
@@ -2240,16 +2475,51 @@ impl BaseTable for NativeTable {
|
||||
} else {
|
||||
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
|
||||
}
|
||||
let job = builder.try_build()?;
|
||||
let (new_dataset, _stats) = job.execute_reader(new_data).await?;
|
||||
|
||||
let future = if let Some(timeout) = params.timeout {
|
||||
// The default retry timeout is 30s, so we pass the full timeout down
|
||||
// as well in case it is longer than that.
|
||||
let future = builder
|
||||
.retry_timeout(timeout)
|
||||
.try_build()?
|
||||
.execute_reader(new_data);
|
||||
Either::Left(tokio::time::timeout(timeout, future).map(|res| match res {
|
||||
Ok(Ok((new_dataset, stats))) => Ok((new_dataset, stats)),
|
||||
Ok(Err(e)) => Err(e.into()),
|
||||
Err(_) => Err(Error::Runtime {
|
||||
message: "merge insert timed out".to_string(),
|
||||
}),
|
||||
}))
|
||||
} else {
|
||||
let job = builder.try_build()?;
|
||||
Either::Right(job.execute_reader(new_data).map_err(|e| e.into()))
|
||||
};
|
||||
let (new_dataset, stats) = future.await?;
|
||||
let version = new_dataset.manifest().version;
|
||||
self.dataset.set_latest(new_dataset.as_ref().clone()).await;
|
||||
Ok(())
|
||||
Ok(MergeResult {
|
||||
version,
|
||||
num_updated_rows: stats.num_updated_rows,
|
||||
num_inserted_rows: stats.num_inserted_rows,
|
||||
num_deleted_rows: stats.num_deleted_rows,
|
||||
})
|
||||
}
|
||||
|
||||
/// Delete rows from the table
|
||||
async fn delete(&self, predicate: &str) -> Result<()> {
|
||||
self.dataset.get_mut().await?.delete(predicate).await?;
|
||||
Ok(())
|
||||
async fn delete(&self, predicate: &str) -> Result<DeleteResult> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
dataset.delete(predicate).await?;
|
||||
Ok(DeleteResult {
|
||||
version: dataset.version().version,
|
||||
})
|
||||
}
|
||||
|
||||
async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
|
||||
let dataset = self.dataset.get().await?;
|
||||
|
||||
Ok(Box::new(NativeTags {
|
||||
inner: dataset.tags.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
async fn optimize(&self, action: OptimizeAction) -> Result<OptimizeStats> {
|
||||
@@ -2308,27 +2578,28 @@ impl BaseTable for NativeTable {
|
||||
&self,
|
||||
transforms: NewColumnTransform,
|
||||
read_columns: Option<Vec<String>>,
|
||||
) -> Result<()> {
|
||||
self.dataset
|
||||
.get_mut()
|
||||
.await?
|
||||
.add_columns(transforms, read_columns, None)
|
||||
.await?;
|
||||
Ok(())
|
||||
) -> Result<AddColumnsResult> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
dataset.add_columns(transforms, read_columns, None).await?;
|
||||
Ok(AddColumnsResult {
|
||||
version: dataset.version().version,
|
||||
})
|
||||
}
|
||||
|
||||
async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()> {
|
||||
self.dataset
|
||||
.get_mut()
|
||||
.await?
|
||||
.alter_columns(alterations)
|
||||
.await?;
|
||||
Ok(())
|
||||
async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<AlterColumnsResult> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
dataset.alter_columns(alterations).await?;
|
||||
Ok(AlterColumnsResult {
|
||||
version: dataset.version().version,
|
||||
})
|
||||
}
|
||||
|
||||
async fn drop_columns(&self, columns: &[&str]) -> Result<()> {
|
||||
self.dataset.get_mut().await?.drop_columns(columns).await?;
|
||||
Ok(())
|
||||
async fn drop_columns(&self, columns: &[&str]) -> Result<DropColumnsResult> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
dataset.drop_columns(columns).await?;
|
||||
Ok(DropColumnsResult {
|
||||
version: dataset.version().version,
|
||||
})
|
||||
}
|
||||
|
||||
async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
|
||||
@@ -2407,6 +2678,118 @@ impl BaseTable for NativeTable {
|
||||
loss,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Poll until the columns are fully indexed. Will return Error::Timeout if the columns
|
||||
/// are not fully indexed within the timeout.
|
||||
async fn wait_for_index(
|
||||
&self,
|
||||
index_names: &[&str],
|
||||
timeout: std::time::Duration,
|
||||
) -> Result<()> {
|
||||
wait_for_index(self, index_names, timeout).await
|
||||
}
|
||||
|
||||
/// Gather table-level statistics: row count, index count, total bytes on disk,
/// and a summary of per-fragment sizes.
///
/// Returns an error if counting rows, listing indices, loading the dataset, or
/// computing the data statistics fails.
async fn stats(&self) -> Result<TableStatistics> {
|
||||
let num_rows = self.count_rows(None).await?;
|
||||
let num_indices = self.list_indices().await?.len();
|
||||
let ds = self.dataset.get().await?;
|
||||
// `calculate_data_stats` is called on an `Arc`, so the dataset is cloned into one.
let ds_clone = (*ds).clone();
|
||||
let ds_stats = Arc::new(ds_clone).calculate_data_stats().await?;
|
||||
// Sum of per-field `bytes_on_disk`. The `as usize` cast truncates silently if the
// u64 total does not fit in `usize` (e.g. on 32-bit targets).
let total_bytes = ds_stats.fields.iter().map(|f| f.bytes_on_disk).sum::<u64>() as usize;
|
||||
|
||||
let frags = ds.get_fragments();
|
||||
// Fetch every fragment's `physical_rows()` concurrently. A fragment whose lookup
// fails is recorded with size 0 instead of failing the whole call.
let mut sorted_sizes = join_all(
|
||||
frags
|
||||
.iter()
|
||||
.map(|frag| async move { frag.physical_rows().await.unwrap_or(0) }),
|
||||
)
|
||||
.await;
|
||||
// Ascending order, so the percentile lookups below can index directly.
sorted_sizes.sort();
|
||||
|
||||
// Fragments with fewer than this many rows are counted as "small".
let small_frag_threshold = 100000;
|
||||
let num_fragments = sorted_sizes.len();
|
||||
let num_small_fragments = sorted_sizes
|
||||
.iter()
|
||||
.filter(|&&size| size < small_frag_threshold)
|
||||
.count();
|
||||
|
||||
// Percentiles are read at index `n * k / 100` (integer division) of the sorted
// sizes; `get` only returns `None` for an empty table, which yields 0.
let p25 = *sorted_sizes.get(num_fragments / 4).unwrap_or(&0);
|
||||
let p50 = *sorted_sizes.get(num_fragments / 2).unwrap_or(&0);
|
||||
let p75 = *sorted_sizes.get(num_fragments * 3 / 4).unwrap_or(&0);
|
||||
let p99 = *sorted_sizes.get(num_fragments * 99 / 100).unwrap_or(&0);
|
||||
let min = sorted_sizes.first().copied().unwrap_or(0);
|
||||
let max = sorted_sizes.last().copied().unwrap_or(0);
|
||||
// Integer mean (rounded down); guarded so an empty table does not divide by zero.
let mean = if num_fragments == 0 {
|
||||
0
|
||||
} else {
|
||||
sorted_sizes.iter().copied().sum::<usize>() / num_fragments
|
||||
};
|
||||
|
||||
let frag_stats = FragmentStatistics {
|
||||
num_fragments,
|
||||
num_small_fragments,
|
||||
lengths: FragmentSummaryStats {
|
||||
min,
|
||||
max,
|
||||
mean,
|
||||
p25,
|
||||
p50,
|
||||
p75,
|
||||
p99,
|
||||
},
|
||||
};
|
||||
let stats = TableStatistics {
|
||||
total_bytes,
|
||||
num_rows,
|
||||
num_indices,
|
||||
fragment_stats: frag_stats,
|
||||
};
|
||||
Ok(stats)
|
||||
}
|
||||
}
|
||||
|
||||
/// Summary statistics about a table: its size, row count, index count and fragments.
// NOTE(review): no `Serialize` derive or `Option` field is visible on this type, so
// `skip_serializing_none` appears to have no effect here; confirm before relying on it.
#[skip_serializing_none]
|
||||
#[derive(Debug, Deserialize, PartialEq)]
|
||||
pub struct TableStatistics {
|
||||
/// The total number of bytes in the table
|
||||
pub total_bytes: usize,
|
||||
|
||||
/// The number of rows in the table
|
||||
pub num_rows: usize,
|
||||
|
||||
/// The number of indices in the table
|
||||
pub num_indices: usize,
|
||||
|
||||
/// Statistics on table fragments
|
||||
pub fragment_stats: FragmentStatistics,
|
||||
}
|
||||
|
||||
/// Statistics about the fragments that make up a table.
// NOTE(review): no `Serialize` derive or `Option` field is visible on this type, so
// `skip_serializing_none` appears to have no effect here; confirm before relying on it.
#[skip_serializing_none]
|
||||
#[derive(Debug, Deserialize, PartialEq)]
|
||||
pub struct FragmentStatistics {
|
||||
/// The number of fragments in the table
|
||||
pub num_fragments: usize,
|
||||
|
||||
/// The number of uncompacted fragments in the table.
/// The native table implementation counts fragments with fewer than 100000 rows.
pub num_small_fragments: usize,
|
||||
|
||||
/// Statistics on the number of rows in the table fragments
|
||||
pub lengths: FragmentSummaryStats,
|
||||
// todo: add size statistics
|
||||
// /// Statistics on the number of bytes in the table fragments
|
||||
// sizes: FragmentStats,
|
||||
}
|
||||
|
||||
/// Distribution summary of a per-fragment quantity. `FragmentStatistics::lengths`
/// uses it for the number of rows in each fragment.
// NOTE(review): no `Serialize` derive or `Option` field is visible on this type, so
// `skip_serializing_none` appears to have no effect here; confirm before relying on it.
#[skip_serializing_none]
|
||||
#[derive(Debug, Deserialize, PartialEq)]
|
||||
pub struct FragmentSummaryStats {
|
||||
/// Smallest value across fragments (0 when there are no fragments)
pub min: usize,
|
||||
/// Largest value across fragments (0 when there are no fragments)
pub max: usize,
|
||||
/// Integer mean across fragments, rounded down
pub mean: usize,
|
||||
/// 25th percentile
pub p25: usize,
|
||||
/// 50th percentile (median)
pub p50: usize,
|
||||
/// 75th percentile
pub p75: usize,
|
||||
/// 99th percentile
pub p99: usize,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -3008,6 +3391,60 @@ mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_tags() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
|
||||
let conn = ConnectBuilder::new(uri)
|
||||
.read_consistency_interval(Duration::from_secs(0))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
let table = conn
|
||||
.create_table("my_table", some_sample_data())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(table.version().await.unwrap(), 1);
|
||||
table.add(some_sample_data()).execute().await.unwrap();
|
||||
assert_eq!(table.version().await.unwrap(), 2);
|
||||
let mut tags_manager = table.tags().await.unwrap();
|
||||
let tags = tags_manager.list().await.unwrap();
|
||||
assert!(tags.is_empty(), "Tags should be empty initially");
|
||||
let tag1 = "tag1";
|
||||
tags_manager.create(tag1, 1).await.unwrap();
|
||||
assert_eq!(tags_manager.get_version(tag1).await.unwrap(), 1);
|
||||
let tags = tags_manager.list().await.unwrap();
|
||||
assert_eq!(tags.len(), 1);
|
||||
assert!(tags.contains_key(tag1));
|
||||
assert_eq!(tags.get(tag1).unwrap().version, 1);
|
||||
tags_manager.create("tag2", 2).await.unwrap();
|
||||
assert_eq!(tags_manager.get_version("tag2").await.unwrap(), 2);
|
||||
let tags = tags_manager.list().await.unwrap();
|
||||
assert_eq!(tags.len(), 2);
|
||||
assert!(tags.contains_key(tag1));
|
||||
assert_eq!(tags.get(tag1).unwrap().version, 1);
|
||||
assert!(tags.contains_key("tag2"));
|
||||
assert_eq!(tags.get("tag2").unwrap().version, 2);
|
||||
// Test update and delete
|
||||
table.add(some_sample_data()).execute().await.unwrap();
|
||||
tags_manager.update(tag1, 3).await.unwrap();
|
||||
assert_eq!(tags_manager.get_version(tag1).await.unwrap(), 3);
|
||||
tags_manager.delete("tag2").await.unwrap();
|
||||
let tags = tags_manager.list().await.unwrap();
|
||||
assert_eq!(tags.len(), 1);
|
||||
assert!(tags.contains_key(tag1));
|
||||
assert_eq!(tags.get(tag1).unwrap().version, 3);
|
||||
// Test checkout tag
|
||||
table.add(some_sample_data()).execute().await.unwrap();
|
||||
assert_eq!(table.version().await.unwrap(), 4);
|
||||
table.checkout_tag(tag1).await.unwrap();
|
||||
assert_eq!(table.version().await.unwrap(), 3);
|
||||
table.checkout_latest().await.unwrap();
|
||||
assert_eq!(table.version().await.unwrap(), 4);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_index() {
|
||||
use arrow_array::RecordBatch;
|
||||
@@ -3190,7 +3627,10 @@ mod tests {
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
table
|
||||
.wait_for_index(&["embeddings_idx"], Duration::from_millis(10))
|
||||
.await
|
||||
.unwrap();
|
||||
let index_configs = table.list_indices().await.unwrap();
|
||||
assert_eq!(index_configs.len(), 1);
|
||||
let index = index_configs.into_iter().next().unwrap();
|
||||
@@ -3258,7 +3698,10 @@ mod tests {
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
table
|
||||
.wait_for_index(&["i_idx"], Duration::from_millis(10))
|
||||
.await
|
||||
.unwrap();
|
||||
let index_configs = table.list_indices().await.unwrap();
|
||||
assert_eq!(index_configs.len(), 1);
|
||||
let index = index_configs.into_iter().next().unwrap();
|
||||
@@ -3455,6 +3898,9 @@ mod tests {
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
assert_eq!(stats.index_type, crate::index::IndexType::FTS);
|
||||
assert_eq!(stats.distance_type, None);
|
||||
|
||||
// Make sure we can call prewarm without error
|
||||
table.prewarm_index("text_idx").await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -3550,7 +3996,7 @@ mod tests {
|
||||
let native_tbl = table.as_native().unwrap();
|
||||
|
||||
let manifest = native_tbl.manifest().await.unwrap();
|
||||
assert_eq!(manifest.config.len(), 0);
|
||||
let base_config_len = manifest.config.len();
|
||||
|
||||
native_tbl
|
||||
.update_config(vec![("test_key1".to_string(), "test_val1".to_string())])
|
||||
@@ -3558,7 +4004,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let manifest = native_tbl.manifest().await.unwrap();
|
||||
assert_eq!(manifest.config.len(), 1);
|
||||
assert_eq!(manifest.config.len(), 1 + base_config_len);
|
||||
assert_eq!(
|
||||
manifest.config.get("test_key1"),
|
||||
Some(&"test_val1".to_string())
|
||||
@@ -3569,7 +4015,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
let manifest = native_tbl.manifest().await.unwrap();
|
||||
assert_eq!(manifest.config.len(), 2);
|
||||
assert_eq!(manifest.config.len(), 2 + base_config_len);
|
||||
assert_eq!(
|
||||
manifest.config.get("test_key1"),
|
||||
Some(&"test_val1".to_string())
|
||||
@@ -3587,7 +4033,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
let manifest = native_tbl.manifest().await.unwrap();
|
||||
assert_eq!(manifest.config.len(), 2);
|
||||
assert_eq!(manifest.config.len(), 2 + base_config_len);
|
||||
assert_eq!(
|
||||
manifest.config.get("test_key1"),
|
||||
Some(&"test_val1".to_string())
|
||||
@@ -3599,7 +4045,7 @@ mod tests {
|
||||
|
||||
native_tbl.delete_config_keys(&["test_key1"]).await.unwrap();
|
||||
let manifest = native_tbl.manifest().await.unwrap();
|
||||
assert_eq!(manifest.config.len(), 1);
|
||||
assert_eq!(manifest.config.len(), 1 + base_config_len);
|
||||
assert_eq!(
|
||||
manifest.config.get("test_key2"),
|
||||
Some(&"test_val2_update".to_string())
|
||||
@@ -3721,4 +4167,108 @@ mod tests {
|
||||
Some(&"test_field_val1".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
pub async fn test_stats() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
|
||||
let conn = ConnectBuilder::new(uri).execute().await.unwrap();
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("foo", DataType::Int32, true),
|
||||
]));
|
||||
let batch = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from_iter_values(0..100)),
|
||||
Arc::new(Int32Array::from_iter_values(0..100)),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let table = conn
|
||||
.create_table(
|
||||
"test_stats",
|
||||
RecordBatchIterator::new(vec![Ok(batch.clone())], batch.schema()),
|
||||
)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
for _ in 0..10 {
|
||||
let batch = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from_iter_values(0..15)),
|
||||
Arc::new(Int32Array::from_iter_values(0..15)),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
table
|
||||
.add(RecordBatchIterator::new(
|
||||
vec![Ok(batch.clone())],
|
||||
batch.schema(),
|
||||
))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let empty_table = conn
|
||||
.create_table(
|
||||
"test_stats_empty",
|
||||
RecordBatchIterator::new(vec![], batch.schema()),
|
||||
)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let res = table.stats().await.unwrap();
|
||||
println!("{:#?}", res);
|
||||
assert_eq!(
|
||||
res,
|
||||
TableStatistics {
|
||||
num_rows: 250,
|
||||
num_indices: 0,
|
||||
total_bytes: 2000,
|
||||
fragment_stats: FragmentStatistics {
|
||||
num_fragments: 11,
|
||||
num_small_fragments: 11,
|
||||
lengths: FragmentSummaryStats {
|
||||
min: 15,
|
||||
max: 100,
|
||||
mean: 22,
|
||||
p25: 15,
|
||||
p50: 15,
|
||||
p75: 15,
|
||||
p99: 100,
|
||||
},
|
||||
},
|
||||
}
|
||||
);
|
||||
let res = empty_table.stats().await.unwrap();
|
||||
println!("{:#?}", res);
|
||||
assert_eq!(
|
||||
res,
|
||||
TableStatistics {
|
||||
num_rows: 0,
|
||||
num_indices: 0,
|
||||
total_bytes: 0,
|
||||
fragment_stats: FragmentStatistics {
|
||||
num_fragments: 0,
|
||||
num_small_fragments: 0,
|
||||
lengths: FragmentSummaryStats {
|
||||
min: 0,
|
||||
max: 0,
|
||||
mean: 0,
|
||||
p25: 0,
|
||||
p50: 0,
|
||||
p75: 0,
|
||||
p99: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::{
|
||||
time::{self, Duration, Instant},
|
||||
};
|
||||
|
||||
use lance::Dataset;
|
||||
use lance::{dataset::refs, Dataset};
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
use crate::error::Result;
|
||||
@@ -83,19 +83,32 @@ impl DatasetRef {
|
||||
}
|
||||
}
|
||||
|
||||
async fn as_time_travel(&mut self, target_version: u64) -> Result<()> {
|
||||
async fn as_time_travel(&mut self, target_version: impl Into<refs::Ref>) -> Result<()> {
|
||||
let target_ref = target_version.into();
|
||||
|
||||
match self {
|
||||
Self::Latest { dataset, .. } => {
|
||||
let new_dataset = dataset.checkout_version(target_ref.clone()).await?;
|
||||
let version_value = new_dataset.version().version;
|
||||
|
||||
*self = Self::TimeTravel {
|
||||
dataset: dataset.checkout_version(target_version).await?,
|
||||
version: target_version,
|
||||
dataset: new_dataset,
|
||||
version: version_value,
|
||||
};
|
||||
}
|
||||
Self::TimeTravel { dataset, version } => {
|
||||
if *version != target_version {
|
||||
let should_checkout = match &target_ref {
|
||||
refs::Ref::Version(target_ver) => version != target_ver,
|
||||
refs::Ref::Tag(_) => true, // Always checkout for tags
|
||||
};
|
||||
|
||||
if should_checkout {
|
||||
let new_dataset = dataset.checkout_version(target_ref).await?;
|
||||
let version_value = new_dataset.version().version;
|
||||
|
||||
*self = Self::TimeTravel {
|
||||
dataset: dataset.checkout_version(target_version).await?,
|
||||
version: target_version,
|
||||
dataset: new_dataset,
|
||||
version: version_value,
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -175,7 +188,7 @@ impl DatasetConsistencyWrapper {
|
||||
write_guard.as_latest(read_consistency_interval).await
|
||||
}
|
||||
|
||||
pub async fn as_time_travel(&self, target_version: u64) -> Result<()> {
|
||||
pub async fn as_time_travel(&self, target_version: impl Into<refs::Ref>) -> Result<()> {
|
||||
self.0.write().await.as_time_travel(target_version).await
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use arrow_array::RecordBatchReader;
|
||||
|
||||
use crate::Result;
|
||||
|
||||
use super::BaseTable;
|
||||
use super::{BaseTable, MergeResult};
|
||||
|
||||
/// A builder used to create and run a merge insert operation
|
||||
///
|
||||
@@ -21,6 +21,7 @@ pub struct MergeInsertBuilder {
|
||||
pub(crate) when_not_matched_insert_all: bool,
|
||||
pub(crate) when_not_matched_by_source_delete: bool,
|
||||
pub(crate) when_not_matched_by_source_delete_filt: Option<String>,
|
||||
pub(crate) timeout: Option<Duration>,
|
||||
}
|
||||
|
||||
impl MergeInsertBuilder {
|
||||
@@ -33,6 +34,7 @@ impl MergeInsertBuilder {
|
||||
when_not_matched_insert_all: false,
|
||||
when_not_matched_by_source_delete: false,
|
||||
when_not_matched_by_source_delete_filt: None,
|
||||
timeout: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,10 +86,26 @@ impl MergeInsertBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Maximum time to run the operation before cancelling it.
|
||||
///
|
||||
/// By default, there is a 30-second timeout that is only enforced after the
|
||||
/// first attempt. This is to prevent spending too long retrying to resolve
|
||||
/// conflicts. For example, if a write attempt takes 20 seconds and fails,
|
||||
/// the second attempt will be cancelled after 10 seconds, hitting the
|
||||
/// 30-second timeout. However, a write that takes one hour and succeeds on the
|
||||
/// first attempt will not be cancelled.
|
||||
///
|
||||
/// When this is set, the timeout is enforced on all attempts, including the first.
|
||||
pub fn timeout(&mut self, timeout: Duration) -> &mut Self {
|
||||
self.timeout = Some(timeout);
|
||||
self
|
||||
}
|
||||
|
||||
/// Executes the merge insert operation
|
||||
///
|
||||
/// Nothing is returned but the [`super::Table`] is updated
|
||||
pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<()> {
|
||||
/// Returns version and statistics about the merge operation including the number of rows
|
||||
/// inserted, updated, and deleted.
|
||||
pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<MergeResult> {
|
||||
self.table.clone().merge_insert(self, new_data).await
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user