mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-12 16:50:41 +00:00
Compare commits
28 Commits
feature/re
...
feat/remot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ea2ede4754 | ||
|
|
dfbe5becaa | ||
|
|
49815da933 | ||
|
|
f8caef3aca | ||
|
|
40f3e22600 | ||
|
|
04480c274a | ||
|
|
ae7f2cbfe8 | ||
|
|
4fb7c92e86 | ||
|
|
f03abc27e3 | ||
|
|
85d9c1ce63 | ||
|
|
d786e39fdc | ||
|
|
8373318e89 | ||
|
|
8308cca05e | ||
|
|
566b67a634 | ||
|
|
9c12fb6437 | ||
|
|
f260d3bf12 | ||
|
|
d9018067b3 | ||
|
|
53517b3aaa | ||
|
|
3e25f584eb | ||
|
|
59fbfd4158 | ||
|
|
f37e698e2f | ||
|
|
09b1bbc12a | ||
|
|
c484b24e51 | ||
|
|
3868965413 | ||
|
|
c13ebc6796 | ||
|
|
4b287fd9c4 | ||
|
|
64194ea8ad | ||
|
|
e6c5de1a58 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -27,6 +27,7 @@ python/dist
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
*.pdb
|
||||
|
||||
## Javascript
|
||||
*.node
|
||||
|
||||
704
Cargo.lock
generated
704
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
28
Cargo.toml
28
Cargo.toml
@@ -13,20 +13,20 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.91.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=7.2.0-beta.3", default-features = false, "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=7.2.0-beta.3", default-features = false, "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=7.2.0-beta.3", default-features = false, "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance = { "version" = "=8.0.0-beta.12", default-features = false, "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=8.0.0-beta.12", default-features = false, "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=8.0.0-beta.12", default-features = false, "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=8.0.0-beta.12", "tag" = "v8.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "58.0.0", optional = false }
|
||||
|
||||
@@ -147,6 +147,14 @@ allow = [
|
||||
"CDLA-Permissive-2.0",
|
||||
]
|
||||
confidence-threshold = 0.8
|
||||
# Per-crate license exceptions: allow a license for a specific crate only,
|
||||
# rather than globally via the `allow` list above.
|
||||
exceptions = [
|
||||
# CDDL-1.0 (copyleft) is pulled in only as a dev/profiling dependency via
|
||||
# `inferno` -> `pprof` -> `lance-testing`; it is a test dependency that we
|
||||
# do not distribute, so scope the allowance to `inferno` alone.
|
||||
{ allow = ["CDDL-1.0"], crate = "inferno" },
|
||||
]
|
||||
# Crates whose license cannot be determined from Cargo metadata but whose
|
||||
# license we've manually confirmed from upstream. Keep this list minimal.
|
||||
[[licenses.clarify]]
|
||||
|
||||
43
docs/src/js/classes/BranchContents.md
Normal file
43
docs/src/js/classes/BranchContents.md
Normal file
@@ -0,0 +1,43 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / BranchContents
|
||||
|
||||
# Class: BranchContents
|
||||
|
||||
## Constructors
|
||||
|
||||
### new BranchContents()
|
||||
|
||||
```ts
|
||||
new BranchContents(): BranchContents
|
||||
```
|
||||
|
||||
#### Returns
|
||||
|
||||
[`BranchContents`](BranchContents.md)
|
||||
|
||||
## Properties
|
||||
|
||||
### manifestSize
|
||||
|
||||
```ts
|
||||
manifestSize: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### parentBranch?
|
||||
|
||||
```ts
|
||||
optional parentBranch: string;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### parentVersion
|
||||
|
||||
```ts
|
||||
parentVersion: number;
|
||||
```
|
||||
96
docs/src/js/classes/Branches.md
Normal file
96
docs/src/js/classes/Branches.md
Normal file
@@ -0,0 +1,96 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / Branches
|
||||
|
||||
# Class: Branches
|
||||
|
||||
Branch manager for a [Table](Table.md).
|
||||
|
||||
Unlike tags, `create` and `checkout` return a new [Table](Table.md) handle scoped
|
||||
to the branch; writes on it do not affect `main`.
|
||||
|
||||
## Methods
|
||||
|
||||
### checkout()
|
||||
|
||||
```ts
|
||||
checkout(name, version?): Promise<Table>
|
||||
```
|
||||
|
||||
Check out an existing branch and return a handle scoped to it.
|
||||
|
||||
With `version` set, the returned handle is pinned to that version of the
|
||||
branch (a read-only, detached view); otherwise it tracks the branch's
|
||||
latest and stays writable.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
|
||||
* **version?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Table`](Table.md)>
|
||||
|
||||
***
|
||||
|
||||
### create()
|
||||
|
||||
```ts
|
||||
create(
|
||||
name,
|
||||
fromRef?,
|
||||
fromVersion?): Promise<Table>
|
||||
```
|
||||
|
||||
Create a branch and return a handle scoped to it.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
Name of the new branch.
|
||||
|
||||
* **fromRef?**: `string`
|
||||
Source branch to fork from. Defaults to `main`.
|
||||
|
||||
* **fromVersion?**: `number`
|
||||
A specific version on `fromRef`. Defaults to latest.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Table`](Table.md)>
|
||||
|
||||
***
|
||||
|
||||
### delete()
|
||||
|
||||
```ts
|
||||
delete(name): Promise<void>
|
||||
```
|
||||
|
||||
Delete a branch.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
***
|
||||
|
||||
### list()
|
||||
|
||||
```ts
|
||||
list(): Promise<Record<string, BranchContents>>
|
||||
```
|
||||
|
||||
List all branches, mapping name to branch metadata.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`Record`<`string`, [`BranchContents`](BranchContents.md)>>
|
||||
@@ -57,6 +57,24 @@ block size may be added in the future.
|
||||
|
||||
***
|
||||
|
||||
### fm()
|
||||
|
||||
```ts
|
||||
static fm(): Index
|
||||
```
|
||||
|
||||
Create an FM-Index.
|
||||
|
||||
An FM-Index is a scalar index on string or binary columns that accelerates
|
||||
substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized
|
||||
full-text-search index, it matches arbitrary substrings of the raw bytes.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`Index`](Index.md)
|
||||
|
||||
***
|
||||
|
||||
### fts()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -110,6 +110,23 @@ containing the new version number of the table after altering the columns.
|
||||
|
||||
***
|
||||
|
||||
### branches()
|
||||
|
||||
```ts
|
||||
abstract branches(): Promise<Branches>
|
||||
```
|
||||
|
||||
Get the branch manager for this table.
|
||||
|
||||
Branches are isolated, writable lines of history forked from another
|
||||
branch (or version). Writes on a branch do not affect `main`.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Branches`](Branches.md)>
|
||||
|
||||
***
|
||||
|
||||
### checkout()
|
||||
|
||||
```ts
|
||||
@@ -278,6 +295,23 @@ await table.createIndex("my_float_col");
|
||||
|
||||
***
|
||||
|
||||
### currentBranch()
|
||||
|
||||
```ts
|
||||
abstract currentBranch(): null | string
|
||||
```
|
||||
|
||||
The branch this table handle is scoped to, or `null` for the main branch.
|
||||
|
||||
A handle returned by [Branches.create](Branches.md#create) or [Branches.checkout](Branches.md#checkout)
|
||||
reports the branch it targets; a handle opened normally reports `null`.
|
||||
|
||||
#### Returns
|
||||
|
||||
`null` \| `string`
|
||||
|
||||
***
|
||||
|
||||
### delete()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
|
||||
- [BooleanQuery](classes/BooleanQuery.md)
|
||||
- [BoostQuery](classes/BoostQuery.md)
|
||||
- [BranchContents](classes/BranchContents.md)
|
||||
- [Branches](classes/Branches.md)
|
||||
- [Connection](classes/Connection.md)
|
||||
- [HeaderProvider](classes/HeaderProvider.md)
|
||||
- [Index](classes/Index.md)
|
||||
|
||||
@@ -23,6 +23,31 @@ be more columns to represent composite indices.
|
||||
|
||||
***
|
||||
|
||||
### createdAt?
|
||||
|
||||
```ts
|
||||
optional createdAt: Date;
|
||||
```
|
||||
|
||||
When the index was created.
|
||||
|
||||
`undefined` for remote tables or indices created before timestamps were tracked.
|
||||
|
||||
***
|
||||
|
||||
### indexDetails?
|
||||
|
||||
```ts
|
||||
optional indexDetails: any;
|
||||
```
|
||||
|
||||
Index-type-specific details parsed as a JavaScript object.
|
||||
|
||||
Falls back to a raw string if JSON parsing fails. `undefined` for
|
||||
remote tables or when details are unavailable.
|
||||
|
||||
***
|
||||
|
||||
### indexType
|
||||
|
||||
```ts
|
||||
@@ -33,6 +58,30 @@ The type of the index
|
||||
|
||||
***
|
||||
|
||||
### indexUuid?
|
||||
|
||||
```ts
|
||||
optional indexUuid: string;
|
||||
```
|
||||
|
||||
The UUID of the first segment of the index.
|
||||
|
||||
`undefined` for remote tables, which do not yet surface this.
|
||||
|
||||
***
|
||||
|
||||
### indexVersion?
|
||||
|
||||
```ts
|
||||
optional indexVersion: number;
|
||||
```
|
||||
|
||||
The on-disk index format version.
|
||||
|
||||
`undefined` for remote tables.
|
||||
|
||||
***
|
||||
|
||||
### name
|
||||
|
||||
```ts
|
||||
@@ -40,3 +89,63 @@ name: string;
|
||||
```
|
||||
|
||||
The name of the index
|
||||
|
||||
***
|
||||
|
||||
### numIndexedRows?
|
||||
|
||||
```ts
|
||||
optional numIndexedRows: number;
|
||||
```
|
||||
|
||||
The number of rows indexed, across all segments.
|
||||
|
||||
`undefined` for remote tables.
|
||||
|
||||
***
|
||||
|
||||
### numSegments?
|
||||
|
||||
```ts
|
||||
optional numSegments: number;
|
||||
```
|
||||
|
||||
The number of segments that make up the index.
|
||||
|
||||
`undefined` for remote tables.
|
||||
|
||||
***
|
||||
|
||||
### numUnindexedRows?
|
||||
|
||||
```ts
|
||||
optional numUnindexedRows: number;
|
||||
```
|
||||
|
||||
The number of rows not yet covered by this index.
|
||||
|
||||
`undefined` for remote tables.
|
||||
|
||||
***
|
||||
|
||||
### sizeBytes?
|
||||
|
||||
```ts
|
||||
optional sizeBytes: number;
|
||||
```
|
||||
|
||||
The total size in bytes of all index files across all segments.
|
||||
|
||||
`undefined` for remote tables or indices without size tracking.
|
||||
|
||||
***
|
||||
|
||||
### typeUrl?
|
||||
|
||||
```ts
|
||||
optional typeUrl: string;
|
||||
```
|
||||
|
||||
The protobuf type URL, a precise type identifier for the index.
|
||||
|
||||
`undefined` for remote tables.
|
||||
|
||||
@@ -30,17 +30,6 @@ The type of the index
|
||||
|
||||
***
|
||||
|
||||
### loss?
|
||||
|
||||
```ts
|
||||
optional loss: number;
|
||||
```
|
||||
|
||||
The KMeans loss value of the index,
|
||||
it is only present for vector indices.
|
||||
|
||||
***
|
||||
|
||||
### numIndexedRows
|
||||
|
||||
```ts
|
||||
|
||||
@@ -8,6 +8,18 @@
|
||||
|
||||
## Properties
|
||||
|
||||
### branch?
|
||||
|
||||
```ts
|
||||
optional branch: string;
|
||||
```
|
||||
|
||||
Open the table scoped to this branch instead of the default branch.
|
||||
|
||||
Reads and writes on the returned table operate in the branch's context.
|
||||
|
||||
***
|
||||
|
||||
### ~~indexCacheSize?~~
|
||||
|
||||
```ts
|
||||
@@ -43,3 +55,17 @@ Options already set on the connection will be inherited by the table,
|
||||
but can be overridden here.
|
||||
|
||||
The available options are described at https://docs.lancedb.com/storage/
|
||||
|
||||
***
|
||||
|
||||
### version?
|
||||
|
||||
```ts
|
||||
optional version: number;
|
||||
```
|
||||
|
||||
Open the table pinned to this version, producing a read-only view.
|
||||
|
||||
Composes with [OpenTableOptions.branch](OpenTableOptions.md#branch): when both are set, opens
|
||||
that branch at the version; otherwise opens `main` at the version. Call
|
||||
`checkoutLatest` to return to a writable state.
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-core.version>7.2.0-beta.1</lance-core.version>
|
||||
<lance-core.version>8.0.0-beta.12</lance-core.version>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
|
||||
@@ -25,8 +25,12 @@ lancedb = { path = "../rust/lancedb", default-features = false }
|
||||
lance-namespace.workspace = true
|
||||
napi = { version = "3.8.3", default-features = false, features = [
|
||||
"napi9",
|
||||
"async"
|
||||
"async",
|
||||
"chrono_date",
|
||||
"serde-json",
|
||||
] }
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
serde_json = "1"
|
||||
napi-derive = "3.5.2"
|
||||
# Prevent dynamic linking of lzma, which comes from datafusion
|
||||
lzma-sys = { version = "0.1", features = ["static"] }
|
||||
|
||||
@@ -191,6 +191,40 @@ describe("remote connection", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("supports version time-travel and branches on remote", async () => {
|
||||
await withMockDatabase(
|
||||
(req, res) => {
|
||||
const body = req.url?.includes("/branches/list")
|
||||
? JSON.stringify({
|
||||
branches: {
|
||||
exp: { parentVersion: 1, createAt: 1, manifestSize: 1 },
|
||||
},
|
||||
})
|
||||
: JSON.stringify({ name: "t", version: 2, schema: { fields: [] } });
|
||||
res.writeHead(200, { "Content-Type": "application/json" }).end(body);
|
||||
},
|
||||
async (db) => {
|
||||
// version-only (and "main" + version) time-travel the main chain
|
||||
const v2 = await db.openTable("t", undefined, { version: 2 });
|
||||
expect(v2.currentBranch()).toBeNull();
|
||||
const mainV2 = await db.openTable("t", undefined, {
|
||||
branch: "main",
|
||||
version: 2,
|
||||
});
|
||||
expect(mainV2.currentBranch()).toBeNull();
|
||||
|
||||
// a non-main branch opens a handle scoped to that branch
|
||||
const exp = await db.openTable("t", undefined, { branch: "exp" });
|
||||
expect(exp.currentBranch()).toBe("exp");
|
||||
const expV2 = await db.openTable("t", undefined, {
|
||||
branch: "exp",
|
||||
version: 2,
|
||||
});
|
||||
expect(expV2.currentBranch()).toBe("exp");
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
describe("TlsConfig", () => {
|
||||
it("should create TlsConfig with all fields", () => {
|
||||
const tlsConfig: TlsConfig = {
|
||||
|
||||
@@ -85,6 +85,140 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
await expect(table.countRows()).resolves.toBe(3);
|
||||
});
|
||||
|
||||
it("should support branches", async () => {
|
||||
await table.add([{ id: 1 }]);
|
||||
expect(await table.countRows()).toBe(1);
|
||||
|
||||
expect(table.currentBranch()).toBeNull();
|
||||
|
||||
// fork an isolated, writable branch from main
|
||||
const branch = await (await table.branches()).create("exp");
|
||||
expect(branch.currentBranch()).toBe("exp");
|
||||
expect(await branch.countRows()).toBe(1);
|
||||
await branch.add([{ id: 2 }]);
|
||||
expect(await branch.countRows()).toBe(2);
|
||||
// main is untouched by branch writes
|
||||
expect(await table.countRows()).toBe(1);
|
||||
|
||||
// listed, with main (null) as the parent
|
||||
const list = await (await table.branches()).list();
|
||||
expect(Object.keys(list)).toContain("exp");
|
||||
expect(list["exp"].parentBranch).toBeNull();
|
||||
|
||||
// fromRef="main" is equivalent to the default
|
||||
await (await table.branches()).create("exp2", "main");
|
||||
const list2 = await (await table.branches()).list();
|
||||
expect(list2["exp2"].parentBranch).toBeNull();
|
||||
|
||||
// checkout returns a handle scoped to the branch's latest
|
||||
const checkedOut = await (await table.branches()).checkout("exp");
|
||||
expect(checkedOut.currentBranch()).toBe("exp");
|
||||
expect(await checkedOut.countRows()).toBe(2);
|
||||
|
||||
// delete removes it
|
||||
await (await table.branches()).delete("exp");
|
||||
await (await table.branches()).delete("exp2");
|
||||
const after = await (await table.branches()).list();
|
||||
expect(Object.keys(after)).not.toContain("exp");
|
||||
});
|
||||
|
||||
it("should open a branch via open_table", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
await table.add([{ id: 1 }]);
|
||||
const branch = await (await table.branches()).create("exp");
|
||||
await branch.add([{ id: 2 }]);
|
||||
|
||||
// open_table(..., { branch }) returns a handle scoped to the branch
|
||||
const opened = await db.openTable("some_table", undefined, {
|
||||
branch: "exp",
|
||||
});
|
||||
expect(await opened.countRows()).toBe(2);
|
||||
// opening without branch still tracks main
|
||||
expect(await (await db.openTable("some_table")).countRows()).toBe(1);
|
||||
});
|
||||
|
||||
it("should open a branch at a version isolated from main and HEAD", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
// main: a single fork-point row
|
||||
const t = await db.createTable("bv_table", [{ id: 0 }]);
|
||||
const mainV1 = await t.version();
|
||||
|
||||
// fork "exp", then advance exp AND main independently past the fork so
|
||||
// they diverge while sharing version numbers
|
||||
const exp = await (await t.branches()).create("exp");
|
||||
await exp.add([{ id: 1 }]); // exp: {0, 1}
|
||||
const expV2 = await exp.version();
|
||||
await exp.add([{ id: 2 }]); // exp HEAD: {0, 1, 2}
|
||||
await t.add([{ id: 100 }, { id: 101 }, { id: 102 }]); // main HEAD: {0,100,101,102}
|
||||
expect(await t.version()).toBe(expV2);
|
||||
|
||||
// open exp at the shared version: the data must be exp's, not main's.
|
||||
// count alone cannot prove this (main@v2 also exists), so assert
|
||||
// provenance by content.
|
||||
const pinned = await db.openTable("bv_table", undefined, {
|
||||
branch: "exp",
|
||||
version: expV2,
|
||||
});
|
||||
expect(await pinned.countRows()).toBe(2); // not exp HEAD (3), not main@v2 (4)
|
||||
expect(await pinned.countRows("id = 1")).toBe(1); // exp's post-fork row
|
||||
expect(await pinned.countRows("id = 100")).toBe(0); // main's rows invisible
|
||||
|
||||
// the same coordinate is reachable directly via branches().checkout(name, version)
|
||||
const pinnedDirect = await (await t.branches()).checkout("exp", expV2);
|
||||
expect(await pinnedDirect.countRows()).toBe(2);
|
||||
|
||||
// the HEADs are unaffected
|
||||
expect(
|
||||
await (
|
||||
await db.openTable("bv_table", undefined, { branch: "exp" })
|
||||
).countRows(),
|
||||
).toBe(3);
|
||||
expect(await (await db.openTable("bv_table")).countRows()).toBe(4);
|
||||
|
||||
// version-only (no branch) time-travels main itself: its fork-point
|
||||
// version holds only main's first row, and the shared version number
|
||||
// resolves to main's data, not the branch's ("opens main at the version")
|
||||
const oldMain = await db.openTable("bv_table", undefined, {
|
||||
version: mainV1,
|
||||
});
|
||||
expect(await oldMain.countRows()).toBe(1);
|
||||
const sharedOnMain = await db.openTable("bv_table", undefined, {
|
||||
version: expV2,
|
||||
});
|
||||
expect(await sharedOnMain.countRows()).toBe(4); // main@v2, not exp@v2 (2)
|
||||
|
||||
// detached head: writing to a pinned version is rejected
|
||||
await expect(pinned.add([{ id: 9 }])).rejects.toThrow(
|
||||
/cannot be modified/,
|
||||
);
|
||||
|
||||
// a nonexistent version is rejected -- on main, and on a branch (a
|
||||
// distinct resolution path, on the branch's manifests)
|
||||
await expect(
|
||||
db.openTable("bv_table", undefined, { version: 9999 }),
|
||||
).rejects.toThrow();
|
||||
await expect(
|
||||
db.openTable("bv_table", undefined, { branch: "exp", version: 9999 }),
|
||||
).rejects.toThrow();
|
||||
|
||||
// checkoutLatest re-attaches the pinned handle to the BRANCH's HEAD
|
||||
// (writable again), not main's HEAD (4), and not staying pinned (2)
|
||||
await pinned.checkoutLatest();
|
||||
expect(await pinned.countRows()).toBe(3); // exp HEAD
|
||||
await pinned.add([{ id: 3 }]);
|
||||
expect(await pinned.countRows()).toBe(4); // writable again
|
||||
});
|
||||
|
||||
it("rejects invalid branch inputs", async () => {
|
||||
const branches = await table.branches();
|
||||
await expect(branches.create("")).rejects.toThrow("non-empty");
|
||||
await expect(branches.checkout("")).rejects.toThrow("non-empty");
|
||||
await expect(branches.delete("")).rejects.toThrow("non-empty");
|
||||
await expect(branches.create("bad", "main", -1)).rejects.toThrow(
|
||||
"non-negative",
|
||||
);
|
||||
});
|
||||
|
||||
it("should show table stats", async () => {
|
||||
await table.add([{ id: 1 }, { id: 2 }]);
|
||||
await table.add([{ id: 1 }]);
|
||||
@@ -715,13 +849,15 @@ describe("When creating an index", () => {
|
||||
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
||||
const indices = await tbl.listIndices();
|
||||
expect(indices.length).toBe(1);
|
||||
expect(indices[0]).toEqual({
|
||||
name: "vec_idx",
|
||||
indexType: "IvfPq",
|
||||
columns: ["vec"],
|
||||
});
|
||||
expect(indices[0]).toEqual(
|
||||
expect.objectContaining({
|
||||
name: "vec_idx",
|
||||
indexType: "IvfPq",
|
||||
columns: ["vec"],
|
||||
}),
|
||||
);
|
||||
const stats = await tbl.indexStats("vec_idx");
|
||||
expect(stats?.loss).toBeDefined();
|
||||
expect(stats).toBeDefined();
|
||||
|
||||
// Search without specifying the column
|
||||
let rst = await tbl
|
||||
@@ -781,10 +917,22 @@ describe("When creating an index", () => {
|
||||
expect(indices2.length).toBe(0);
|
||||
});
|
||||
|
||||
it("should create and search a nested vector index", async () => {
|
||||
it("should preserve canonical nested field paths across index lifecycle", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const nestedSchema = new Schema([
|
||||
new Field("id", new Int32(), true),
|
||||
new Field("rowId", new Int32(), true),
|
||||
new Field("row-id", new Int32(), true),
|
||||
new Field("userId", new Int32(), true),
|
||||
new Field(
|
||||
"metadata",
|
||||
new Struct([new Field("user_id", new Int32(), true)]),
|
||||
true,
|
||||
),
|
||||
new Field(
|
||||
"MetaData",
|
||||
new Struct([new Field("userId", new Int32(), true)]),
|
||||
true,
|
||||
),
|
||||
new Field(
|
||||
"image",
|
||||
new Struct([
|
||||
@@ -796,28 +944,147 @@ describe("When creating an index", () => {
|
||||
]),
|
||||
true,
|
||||
),
|
||||
new Field(
|
||||
"payload",
|
||||
new Struct([new Field("text", new Utf8(), true)]),
|
||||
true,
|
||||
),
|
||||
new Field(
|
||||
"meta-data",
|
||||
new Struct([new Field("user-id", new Int32(), true)]),
|
||||
true,
|
||||
),
|
||||
new Field(
|
||||
"literal",
|
||||
new Struct([new Field("a.b", new Int32(), true)]),
|
||||
true,
|
||||
),
|
||||
]);
|
||||
const nestedTable = await db.createTable(
|
||||
"nested_vector",
|
||||
"nested_field_index_lifecycle",
|
||||
makeArrowTable(
|
||||
Array.from({ length: 300 }, (_, id) => ({
|
||||
id,
|
||||
image: { embedding: [id, id + 1] },
|
||||
Array.from({ length: 300 }, (_, rowId) => ({
|
||||
rowId,
|
||||
"row-id": rowId,
|
||||
userId: rowId,
|
||||
metadata: { ["user_id"]: rowId },
|
||||
["MetaData"]: { userId: rowId },
|
||||
image: { embedding: [rowId, rowId + 1] },
|
||||
payload: { text: `document ${rowId}` },
|
||||
"meta-data": { "user-id": rowId },
|
||||
literal: { "a.b": rowId },
|
||||
})),
|
||||
{ schema: nestedSchema },
|
||||
),
|
||||
);
|
||||
|
||||
await nestedTable.createIndex("rowId", {
|
||||
config: Index.btree(),
|
||||
name: "row_id_idx",
|
||||
});
|
||||
await nestedTable.createIndex("`row-id`", {
|
||||
config: Index.btree(),
|
||||
name: "row_dash_id_idx",
|
||||
});
|
||||
await nestedTable.createIndex("userId", {
|
||||
config: Index.btree(),
|
||||
name: "top_user_id_idx",
|
||||
});
|
||||
await nestedTable.createIndex("metadata.user_id", {
|
||||
config: Index.btree(),
|
||||
name: "nested_user_id_idx",
|
||||
});
|
||||
await nestedTable.createIndex("MetaData.userId", {
|
||||
config: Index.btree(),
|
||||
name: "mixed_case_metadata_user_id_idx",
|
||||
});
|
||||
await nestedTable.createIndex("`meta-data`.`user-id`", {
|
||||
config: Index.btree(),
|
||||
name: "escaped_names_idx",
|
||||
});
|
||||
await nestedTable.createIndex("literal.`a.b`", {
|
||||
config: Index.btree(),
|
||||
name: "literal_dot_idx",
|
||||
});
|
||||
await nestedTable.createIndex("image.embedding", {
|
||||
name: "image_embedding_idx",
|
||||
});
|
||||
const indices = await nestedTable.listIndices();
|
||||
expect(indices).toContainEqual({
|
||||
name: "image_embedding_idx",
|
||||
indexType: "IvfPq",
|
||||
columns: ["image.embedding"],
|
||||
await nestedTable.createIndex("payload.text", {
|
||||
config: Index.fts({ withPosition: false }),
|
||||
name: "payload_text_idx",
|
||||
});
|
||||
|
||||
const indices = await nestedTable.listIndices();
|
||||
expect(indices).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
name: "row_id_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["rowId"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "row_dash_id_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["`row-id`"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "top_user_id_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["userId"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "nested_user_id_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["metadata.user_id"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "mixed_case_metadata_user_id_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["MetaData.userId"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "escaped_names_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["`meta-data`.`user-id`"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "literal_dot_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["literal.`a.b`"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "image_embedding_idx",
|
||||
indexType: "IvfPq",
|
||||
columns: ["image.embedding"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "payload_text_idx",
|
||||
indexType: "FTS",
|
||||
columns: ["payload.text"],
|
||||
}),
|
||||
]),
|
||||
);
|
||||
|
||||
const stats = await nestedTable.indexStats(
|
||||
"mixed_case_metadata_user_id_idx",
|
||||
);
|
||||
expect(stats?.numIndexedRows).toEqual(300);
|
||||
expect(stats?.indexType).toEqual("BTREE");
|
||||
|
||||
const filtered = await nestedTable
|
||||
.query()
|
||||
.where("MetaData.userId = 42")
|
||||
.limit(1)
|
||||
.toArray();
|
||||
expect(filtered[0].MetaData.userId).toEqual(42);
|
||||
|
||||
const escapedFiltered = await nestedTable
|
||||
.query()
|
||||
.where("`row-id` = 43")
|
||||
.limit(1)
|
||||
.toArray();
|
||||
expect(escapedFiltered[0]["row-id"]).toEqual(43);
|
||||
|
||||
const explicit = await nestedTable
|
||||
.query()
|
||||
.nearestTo([0.0, 1.0])
|
||||
@@ -829,7 +1096,37 @@ describe("When creating an index", () => {
|
||||
.nearestTo([0.0, 1.0])
|
||||
.limit(1)
|
||||
.toArray();
|
||||
expect(inferred[0].id).toEqual(explicit[0].id);
|
||||
expect(inferred[0].rowId).toEqual(explicit[0].rowId);
|
||||
|
||||
await nestedTable.add([
|
||||
{
|
||||
rowId: 300,
|
||||
"row-id": 300,
|
||||
userId: 300,
|
||||
metadata: { ["user_id"]: 300 },
|
||||
["MetaData"]: { userId: 300 },
|
||||
image: { embedding: [300.0, 301.0] },
|
||||
payload: { text: "document 300" },
|
||||
"meta-data": { "user-id": 300 },
|
||||
literal: { "a.b": 300 },
|
||||
},
|
||||
]);
|
||||
await nestedTable.optimize();
|
||||
const indicesAfterOptimize = await nestedTable.listIndices();
|
||||
expect(indicesAfterOptimize).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
name: "mixed_case_metadata_user_id_idx",
|
||||
indexType: "BTree",
|
||||
columns: ["MetaData.userId"],
|
||||
}),
|
||||
expect.objectContaining({
|
||||
name: "image_embedding_idx",
|
||||
indexType: "IvfPq",
|
||||
columns: ["image.embedding"],
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it("should report multiple nested vector candidates", async () => {
|
||||
@@ -963,11 +1260,13 @@ describe("When creating an index", () => {
|
||||
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
||||
const indices = await tbl.listIndices();
|
||||
expect(indices.length).toBe(1);
|
||||
expect(indices[0]).toEqual({
|
||||
name: "vec_idx",
|
||||
indexType: "IvfHnswSq",
|
||||
columns: ["vec"],
|
||||
});
|
||||
expect(indices[0]).toEqual(
|
||||
expect.objectContaining({
|
||||
name: "vec_idx",
|
||||
indexType: "IvfHnswSq",
|
||||
columns: ["vec"],
|
||||
}),
|
||||
);
|
||||
|
||||
// Search without specifying the column
|
||||
let rst = await tbl
|
||||
@@ -1140,6 +1439,20 @@ describe("When creating an index", () => {
|
||||
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
||||
});
|
||||
|
||||
test("create an FM index", async () => {
|
||||
// FM-Index accelerates substring search on a string/binary column.
|
||||
const db = await connect(tmpDir.name);
|
||||
const fmTbl = await db.createTable("fm_table", [
|
||||
{ id: 0, text: "hello world" },
|
||||
{ id: 1, text: "foo bar" },
|
||||
]);
|
||||
await fmTbl.createIndex("text", {
|
||||
config: Index.fm(),
|
||||
});
|
||||
const indexDir = path.join(tmpDir.name, "fm_table.lance", "_indices");
|
||||
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
||||
});
|
||||
|
||||
test("should be able to get index stats", async () => {
|
||||
await tbl.createIndex("id");
|
||||
|
||||
@@ -1150,7 +1463,6 @@ describe("When creating an index", () => {
|
||||
expect(stats?.distanceType).toBeUndefined();
|
||||
expect(stats?.indexType).toEqual("BTREE");
|
||||
expect(stats?.numIndices).toEqual(1);
|
||||
expect(stats?.loss).toBeUndefined();
|
||||
});
|
||||
|
||||
test("when getting stats on non-existent index", async () => {
|
||||
@@ -1300,6 +1612,35 @@ describe("When creating an index", () => {
|
||||
expect(rst64Query.toString()).toEqual(rst64Search.toString());
|
||||
expect(rst64Query.numRows).toBe(2);
|
||||
});
|
||||
|
||||
it("should expose rich metadata fields on IndexConfig", async () => {
|
||||
await tbl.createIndex("id", { config: Index.btree() });
|
||||
await tbl.createIndex("vec");
|
||||
|
||||
const indicesByName = Object.fromEntries(
|
||||
(await tbl.listIndices()).map((idx) => [idx.name, idx]),
|
||||
);
|
||||
|
||||
const scalarIdx = indicesByName["id_idx"];
|
||||
expect(scalarIdx).toBeDefined();
|
||||
expect(typeof scalarIdx.indexUuid).toBe("string");
|
||||
expect(scalarIdx.numIndexedRows).toBe(300);
|
||||
expect(scalarIdx.numUnindexedRows).toBe(0);
|
||||
expect(scalarIdx.numSegments).toBeGreaterThanOrEqual(1);
|
||||
expect(scalarIdx.sizeBytes).toBeGreaterThan(0);
|
||||
// Use toString check to avoid cross-realm instanceof failures with native Date objects
|
||||
expect(Object.prototype.toString.call(scalarIdx.createdAt)).toBe(
|
||||
"[object Date]",
|
||||
);
|
||||
expect((scalarIdx.createdAt as Date).getTime()).toBeGreaterThan(0);
|
||||
expect(typeof scalarIdx.indexDetails).toBe("object");
|
||||
|
||||
const vectorIdx = indicesByName["vec_idx"];
|
||||
expect(vectorIdx).toBeDefined();
|
||||
expect(typeof vectorIdx.indexUuid).toBe("string");
|
||||
expect(vectorIdx.numIndexedRows).toBe(300);
|
||||
expect(typeof vectorIdx.indexDetails).toBe("object");
|
||||
});
|
||||
});
|
||||
|
||||
describe("When querying a table", () => {
|
||||
|
||||
@@ -84,6 +84,20 @@ export interface CreateTableOptions {
|
||||
}
|
||||
|
||||
export interface OpenTableOptions {
|
||||
/**
|
||||
* Open the table scoped to this branch instead of the default branch.
|
||||
*
|
||||
* Reads and writes on the returned table operate in the branch's context.
|
||||
*/
|
||||
branch?: string;
|
||||
/**
|
||||
* Open the table pinned to this version, producing a read-only view.
|
||||
*
|
||||
* Composes with {@link OpenTableOptions.branch}: when both are set, opens
|
||||
* that branch at the version; otherwise opens `main` at the version. Call
|
||||
* `checkoutLatest` to return to a writable state.
|
||||
*/
|
||||
version?: number;
|
||||
/**
|
||||
* Configuration for object storage.
|
||||
*
|
||||
@@ -483,7 +497,20 @@ export class LocalConnection extends Connection {
|
||||
options?.indexCacheSize,
|
||||
);
|
||||
|
||||
return new LocalTable(innerTable);
|
||||
let table: Table = new LocalTable(innerTable);
|
||||
// "main" is the default branch, so treat it as no branch. On a real branch,
|
||||
// scope and pin in one step (yielding "version V of branch B"); otherwise
|
||||
// pin the version, if any, against main.
|
||||
const branch =
|
||||
options?.branch != null && options.branch !== "main"
|
||||
? options.branch
|
||||
: undefined;
|
||||
if (branch != null) {
|
||||
table = await (await table.branches()).checkout(branch, options?.version);
|
||||
} else if (options?.version != null) {
|
||||
await table.checkout(options.version);
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
async cloneTable(
|
||||
|
||||
@@ -38,6 +38,7 @@ export {
|
||||
FragmentSummaryStats,
|
||||
Tags,
|
||||
TagContents,
|
||||
BranchContents,
|
||||
MergeResult,
|
||||
AddResult,
|
||||
AddColumnsResult,
|
||||
@@ -111,6 +112,7 @@ export {
|
||||
|
||||
export {
|
||||
Table,
|
||||
Branches,
|
||||
AddDataOptions,
|
||||
UpdateOptions,
|
||||
OptimizeOptions,
|
||||
|
||||
@@ -702,6 +702,17 @@ export class Index {
|
||||
return new Index(LanceDbIndex.labelList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an FM-Index.
|
||||
*
|
||||
* An FM-Index is a scalar index on string or binary columns that accelerates
|
||||
* substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized
|
||||
* full-text-search index, it matches arbitrary substrings of the raw bytes.
|
||||
*/
|
||||
static fm() {
|
||||
return new Index(LanceDbIndex.fm());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a full text search index
|
||||
*
|
||||
|
||||
@@ -25,10 +25,12 @@ import {
|
||||
AddColumnsSql,
|
||||
AddResult,
|
||||
AlterColumnsResult,
|
||||
BranchContents,
|
||||
DeleteResult,
|
||||
DropColumnsResult,
|
||||
IndexConfig,
|
||||
IndexStatistics,
|
||||
Branches as NativeBranches,
|
||||
OptimizeStats,
|
||||
TableStatistics,
|
||||
Tags,
|
||||
@@ -653,6 +655,22 @@ export abstract class Table {
|
||||
*/
|
||||
abstract tags(): Promise<Tags>;
|
||||
|
||||
/**
|
||||
* Get the branch manager for this table.
|
||||
*
|
||||
* Branches are isolated, writable lines of history forked from another
|
||||
* branch (or version). Writes on a branch do not affect `main`.
|
||||
*/
|
||||
abstract branches(): Promise<Branches>;
|
||||
|
||||
/**
|
||||
* The branch this table handle is scoped to, or `null` for the main branch.
|
||||
*
|
||||
* A handle returned by {@link Branches.create} or {@link Branches.checkout}
|
||||
* reports the branch it targets; a handle opened normally reports `null`.
|
||||
*/
|
||||
abstract currentBranch(): string | null;
|
||||
|
||||
/**
|
||||
* Restore the table to the currently checked out version
|
||||
*
|
||||
@@ -1108,6 +1126,14 @@ export class LocalTable extends Table {
|
||||
return await this.inner.tags();
|
||||
}
|
||||
|
||||
async branches(): Promise<Branches> {
|
||||
return new Branches(await this.inner.branches());
|
||||
}
|
||||
|
||||
currentBranch(): string | null {
|
||||
return this.inner.currentBranch() ?? null;
|
||||
}
|
||||
|
||||
async optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats> {
|
||||
let cleanupOlderThanMs;
|
||||
if (
|
||||
@@ -1238,3 +1264,57 @@ export interface FieldMetadataUpdate {
|
||||
/** If true, replace the field's entire metadata map instead of merging. */
|
||||
replace?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Branch manager for a {@link Table}.
|
||||
*
|
||||
* Unlike tags, `create` and `checkout` return a new {@link Table} handle scoped
|
||||
* to the branch; writes on it do not affect `main`.
|
||||
*/
|
||||
export class Branches {
|
||||
#inner: NativeBranches;
|
||||
|
||||
/**
|
||||
* Construct a Branches manager. Internal use only.
|
||||
* @hidden
|
||||
*/
|
||||
constructor(inner: NativeBranches) {
|
||||
this.#inner = inner;
|
||||
}
|
||||
|
||||
/** List all branches, mapping name to branch metadata. */
|
||||
async list(): Promise<Record<string, BranchContents>> {
|
||||
return await this.#inner.list();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a branch and return a handle scoped to it.
|
||||
*
|
||||
* @param name Name of the new branch.
|
||||
* @param fromRef Source branch to fork from. Defaults to `main`.
|
||||
* @param fromVersion A specific version on `fromRef`. Defaults to latest.
|
||||
*/
|
||||
async create(
|
||||
name: string,
|
||||
fromRef?: string,
|
||||
fromVersion?: number,
|
||||
): Promise<Table> {
|
||||
return new LocalTable(await this.#inner.create(name, fromRef, fromVersion));
|
||||
}
|
||||
|
||||
/**
|
||||
* Check out an existing branch and return a handle scoped to it.
|
||||
*
|
||||
* With `version` set, the returned handle is pinned to that version of the
|
||||
* branch (a read-only, detached view); otherwise it tracks the branch's
|
||||
* latest and stays writable.
|
||||
*/
|
||||
async checkout(name: string, version?: number): Promise<Table> {
|
||||
return new LocalTable(await this.#inner.checkout(name, version));
|
||||
}
|
||||
|
||||
/** Delete a branch. */
|
||||
async delete(name: string): Promise<void> {
|
||||
return await this.#inner.delete(name);
|
||||
}
|
||||
}
|
||||
|
||||
10
nodejs/package-lock.json
generated
10
nodejs/package-lock.json
generated
@@ -26,7 +26,7 @@
|
||||
"@aws-sdk/client-s3": "3.1003.0",
|
||||
"@biomejs/biome": "^1.7.3",
|
||||
"@jest/globals": "^29.7.0",
|
||||
"@napi-rs/cli": "3.5.1",
|
||||
"@napi-rs/cli": "3.7.0",
|
||||
"@types/axios": "^0.14.0",
|
||||
"@types/jest": "^29.1.2",
|
||||
"@types/node": "22.7.4",
|
||||
@@ -2942,9 +2942,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/cli": {
|
||||
"version": "3.5.1",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-3.5.1.tgz",
|
||||
"integrity": "sha512-XBfLQRDcB3qhu6bazdMJsecWW55kR85l5/k0af9BIBELXQSsCFU0fzug7PX8eQp6vVdm7W/U3z6uP5WmITB2Gw==",
|
||||
"version": "3.7.0",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-3.7.0.tgz",
|
||||
"integrity": "sha512-3d3+rmxlOIV/G1zPWeX4PCxuYnhcCQM2BvY9rtimC8RO0dFR9gtYP+Grov+WoduZtfWRj5N1XvytWeRxxCk5zw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -2954,7 +2954,7 @@
|
||||
"@octokit/rest": "^22.0.1",
|
||||
"clipanion": "^4.0.0-rc.4",
|
||||
"colorette": "^2.0.20",
|
||||
"emnapi": "^1.7.1",
|
||||
"emnapi": "^1.10.0",
|
||||
"es-toolkit": "^1.41.0",
|
||||
"js-yaml": "^4.1.0",
|
||||
"obug": "^2.0.0",
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
"@aws-sdk/client-s3": "3.1003.0",
|
||||
"@biomejs/biome": "^1.7.3",
|
||||
"@jest/globals": "^29.7.0",
|
||||
"@napi-rs/cli": "3.5.1",
|
||||
"@napi-rs/cli": "3.7.0",
|
||||
"@types/axios": "^0.14.0",
|
||||
"@types/jest": "^29.1.2",
|
||||
"@types/node": "22.7.4",
|
||||
|
||||
10
nodejs/pnpm-lock.yaml
generated
10
nodejs/pnpm-lock.yaml
generated
@@ -31,8 +31,8 @@ importers:
|
||||
specifier: ^29.7.0
|
||||
version: 29.7.0
|
||||
'@napi-rs/cli':
|
||||
specifier: 3.5.1
|
||||
version: 3.5.1(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)(@types/node@22.7.4)
|
||||
specifier: 3.7.0
|
||||
version: 3.7.0(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)(@types/node@22.7.4)
|
||||
'@types/axios':
|
||||
specifier: ^0.14.0
|
||||
version: 0.14.4
|
||||
@@ -887,8 +887,8 @@ packages:
|
||||
'@jridgewell/trace-mapping@0.3.31':
|
||||
resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==}
|
||||
|
||||
'@napi-rs/cli@3.5.1':
|
||||
resolution: {integrity: sha512-XBfLQRDcB3qhu6bazdMJsecWW55kR85l5/k0af9BIBELXQSsCFU0fzug7PX8eQp6vVdm7W/U3z6uP5WmITB2Gw==}
|
||||
'@napi-rs/cli@3.7.0':
|
||||
resolution: {integrity: sha512-3d3+rmxlOIV/G1zPWeX4PCxuYnhcCQM2BvY9rtimC8RO0dFR9gtYP+Grov+WoduZtfWRj5N1XvytWeRxxCk5zw==}
|
||||
engines: {node: '>= 16'}
|
||||
hasBin: true
|
||||
peerDependencies:
|
||||
@@ -4582,7 +4582,7 @@ snapshots:
|
||||
'@jridgewell/resolve-uri': 3.1.2
|
||||
'@jridgewell/sourcemap-codec': 1.5.5
|
||||
|
||||
'@napi-rs/cli@3.5.1(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)(@types/node@22.7.4)':
|
||||
'@napi-rs/cli@3.7.0(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)(@types/node@22.7.4)':
|
||||
dependencies:
|
||||
'@inquirer/prompts': 8.4.3(@types/node@22.7.4)
|
||||
'@napi-rs/cross-toolchain': 1.0.3(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use std::sync::Mutex;
|
||||
|
||||
use lancedb::index::Index as LanceDbIndex;
|
||||
use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder};
|
||||
use lancedb::index::scalar::{BTreeIndexBuilder, FmIndexBuilder, FtsIndexBuilder};
|
||||
use lancedb::index::vector::{
|
||||
IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
|
||||
IvfRqIndexBuilder,
|
||||
@@ -143,6 +143,13 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn fm() -> Self {
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::Fm(FmIndexBuilder::default()))),
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn fts(
|
||||
|
||||
@@ -3,11 +3,13 @@
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
|
||||
use lancedb::ipc::{ipc_file_to_batches, ipc_file_to_schema};
|
||||
use lancedb::table::{
|
||||
AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration,
|
||||
FieldMetadataUpdate as LanceFieldMetadataUpdate, NewColumnTransform, OptimizeAction,
|
||||
OptimizeOptions, Table as LanceDbTable,
|
||||
OptimizeOptions, Ref, Table as LanceDbTable,
|
||||
};
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi::threadsafe_function::{ThreadsafeFunction, ThreadsafeFunctionCallMode};
|
||||
@@ -478,6 +480,19 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn branches(&self) -> napi::Result<Branches> {
|
||||
Ok(Branches {
|
||||
inner: self.inner_ref()?.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// The branch this handle is scoped to, or `null` for the main branch.
|
||||
#[napi]
|
||||
pub fn current_branch(&self) -> napi::Result<Option<String>> {
|
||||
Ok(self.inner_ref()?.current_branch())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn optimize(
|
||||
&self,
|
||||
@@ -595,6 +610,43 @@ pub struct IndexConfig {
|
||||
/// Currently this is always an array of size 1. In the future there may
|
||||
/// be more columns to represent composite indices.
|
||||
pub columns: Vec<String>,
|
||||
/// The UUID of the first segment of the index.
|
||||
///
|
||||
/// `undefined` for remote tables, which do not yet surface this.
|
||||
pub index_uuid: Option<String>,
|
||||
/// The protobuf type URL, a precise type identifier for the index.
|
||||
///
|
||||
/// `undefined` for remote tables.
|
||||
pub type_url: Option<String>,
|
||||
/// When the index was created.
|
||||
///
|
||||
/// `undefined` for remote tables or indices created before timestamps were tracked.
|
||||
pub created_at: Option<DateTime<Utc>>,
|
||||
/// The number of rows indexed, across all segments.
|
||||
///
|
||||
/// `undefined` for remote tables.
|
||||
pub num_indexed_rows: Option<i64>,
|
||||
/// The number of rows not yet covered by this index.
|
||||
///
|
||||
/// `undefined` for remote tables.
|
||||
pub num_unindexed_rows: Option<i64>,
|
||||
/// The total size in bytes of all index files across all segments.
|
||||
///
|
||||
/// `undefined` for remote tables or indices without size tracking.
|
||||
pub size_bytes: Option<i64>,
|
||||
/// The number of segments that make up the index.
|
||||
///
|
||||
/// `undefined` for remote tables.
|
||||
pub num_segments: Option<i32>,
|
||||
/// The on-disk index format version.
|
||||
///
|
||||
/// `undefined` for remote tables.
|
||||
pub index_version: Option<i32>,
|
||||
/// Index-type-specific details parsed as a JavaScript object.
|
||||
///
|
||||
/// Falls back to a raw string if JSON parsing fails. `undefined` for
|
||||
/// remote tables or when details are unavailable.
|
||||
pub index_details: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
impl From<lancedb::index::IndexConfig> for IndexConfig {
|
||||
@@ -604,6 +656,17 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
|
||||
index_type,
|
||||
columns: value.columns,
|
||||
name: value.name,
|
||||
index_uuid: value.index_uuid,
|
||||
type_url: value.type_url,
|
||||
created_at: value.created_at,
|
||||
num_indexed_rows: value.num_indexed_rows.map(|n| n as i64),
|
||||
num_unindexed_rows: value.num_unindexed_rows.map(|n| n as i64),
|
||||
size_bytes: value.size_bytes.map(|n| n as i64),
|
||||
num_segments: value.num_segments.map(|n| n as i32),
|
||||
index_version: value.index_version,
|
||||
index_details: value
|
||||
.index_details
|
||||
.and_then(|s| serde_json::from_str(&s).ok()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -838,9 +901,6 @@ pub struct IndexStatistics {
|
||||
pub distance_type: Option<String>,
|
||||
/// The number of parts this index is split into.
|
||||
pub num_indices: Option<u32>,
|
||||
/// The KMeans loss value of the index,
|
||||
/// it is only present for vector indices.
|
||||
pub loss: Option<f64>,
|
||||
}
|
||||
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
||||
fn from(value: lancedb::index::IndexStatistics) -> Self {
|
||||
@@ -850,7 +910,6 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
||||
index_type: value.index_type.to_string(),
|
||||
distance_type: value.distance_type.map(|d| d.to_string()),
|
||||
num_indices: value.num_indices,
|
||||
loss: value.loss,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1060,6 +1119,13 @@ pub struct TagContents {
|
||||
pub manifest_size: i64,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub struct BranchContents {
|
||||
pub parent_branch: Option<String>,
|
||||
pub parent_version: i64,
|
||||
pub manifest_size: i64,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub struct Tags {
|
||||
inner: LanceDbTable,
|
||||
@@ -1128,3 +1194,75 @@ impl Tags {
|
||||
.default_error()
|
||||
}
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub struct Branches {
|
||||
inner: LanceDbTable,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
impl Branches {
|
||||
#[napi]
|
||||
pub async fn list(&self) -> napi::Result<HashMap<String, BranchContents>> {
|
||||
let branches = self.inner.list_branches().await.default_error()?;
|
||||
let result = branches
|
||||
.into_iter()
|
||||
.map(|(k, v)| {
|
||||
(
|
||||
k,
|
||||
BranchContents {
|
||||
parent_branch: v.parent_branch,
|
||||
parent_version: v.parent_version as i64,
|
||||
manifest_size: v.manifest_size as i64,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async fn create(
|
||||
&self,
|
||||
name: String,
|
||||
from_ref: Option<String>,
|
||||
from_version: Option<i64>,
|
||||
) -> napi::Result<Table> {
|
||||
let from_ref = from_ref.filter(|b| b != "main");
|
||||
let from_version = from_version
|
||||
.map(|v| {
|
||||
u64::try_from(v).map_err(|_| {
|
||||
napi::Error::from_reason("from_version must be a non-negative integer")
|
||||
})
|
||||
})
|
||||
.transpose()?;
|
||||
let from = Ref::Version(from_ref, from_version);
|
||||
let table = self
|
||||
.inner
|
||||
.create_branch(&name, from)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(Table::new(table))
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async fn checkout(&self, name: String, version: Option<i64>) -> napi::Result<Table> {
|
||||
let version = version
|
||||
.map(|v| {
|
||||
u64::try_from(v)
|
||||
.map_err(|_| napi::Error::from_reason("version must be a non-negative integer"))
|
||||
})
|
||||
.transpose()?;
|
||||
let table = self
|
||||
.inner
|
||||
.checkout_branch(&name, version)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(Table::new(table))
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async fn delete(&self, name: String) -> napi::Result<()> {
|
||||
self.inner.delete_branch(&name).await.default_error()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,7 +26,8 @@ lance-namespace-impls.workspace = true
|
||||
lance-io.workspace = true
|
||||
env_logger.workspace = true
|
||||
log.workspace = true
|
||||
pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
|
||||
pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39", "chrono"] }
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
pyo3-async-runtimes = { version = "0.28", features = [
|
||||
"attributes",
|
||||
"tokio-runtime",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from datetime import timedelta
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional, Tuple, Any, TypedDict, Union, Literal
|
||||
|
||||
import pyarrow as pa
|
||||
@@ -10,6 +10,7 @@ from .index import (
|
||||
IvfSq,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
@@ -47,6 +48,7 @@ class PyExpr:
|
||||
def lower(self) -> "PyExpr": ...
|
||||
def upper(self) -> "PyExpr": ...
|
||||
def contains(self, substr: "PyExpr") -> "PyExpr": ...
|
||||
def isin(self, values: List["PyExpr"]) -> "PyExpr": ...
|
||||
def cast(self, data_type: pa.DataType) -> "PyExpr": ...
|
||||
def to_sql(self) -> str: ...
|
||||
|
||||
@@ -186,6 +188,7 @@ class Table:
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
],
|
||||
replace: Optional[bool],
|
||||
@@ -202,7 +205,7 @@ class Table:
|
||||
async def prewarm_index(self, index_name: str) -> None: ...
|
||||
async def prewarm_data(self, columns: Optional[List[str]] = None) -> None: ...
|
||||
async def list_indices(self) -> list[IndexConfig]: ...
|
||||
async def delete(self, filter: str) -> DeleteResult: ...
|
||||
async def delete(self, filter: Union[str, PyExpr]) -> DeleteResult: ...
|
||||
async def add_columns(self, columns: list[tuple[str, str]]) -> AddColumnsResult: ...
|
||||
async def add_columns_with_schema(self, schema: pa.Schema) -> AddColumnsResult: ...
|
||||
async def alter_columns(
|
||||
@@ -226,6 +229,9 @@ class Table:
|
||||
async def close_lsm_writers(self) -> None: ...
|
||||
@property
|
||||
def tags(self) -> Tags: ...
|
||||
@property
|
||||
def branches(self) -> Branches: ...
|
||||
def current_branch(self) -> Optional[str]: ...
|
||||
def query(self) -> Query: ...
|
||||
def take_offsets(self, offsets: list[int]) -> TakeQuery: ...
|
||||
def take_row_ids(self, row_ids: list[int]) -> TakeQuery: ...
|
||||
@@ -238,10 +244,30 @@ class Tags:
|
||||
async def delete(self, tag: str): ...
|
||||
async def update(self, tag: str, version: int): ...
|
||||
|
||||
class Branches:
|
||||
async def list(self) -> Dict[str, Any]: ...
|
||||
async def create(
|
||||
self,
|
||||
name: str,
|
||||
from_ref: Optional[str] = None,
|
||||
from_version: Optional[int] = None,
|
||||
) -> Table: ...
|
||||
async def checkout(self, name: str, version: Optional[int] = None) -> Table: ...
|
||||
async def delete(self, name: str) -> None: ...
|
||||
|
||||
class IndexConfig:
|
||||
name: str
|
||||
index_type: str
|
||||
columns: List[str]
|
||||
index_uuid: Optional[str]
|
||||
type_url: Optional[str]
|
||||
created_at: Optional[datetime]
|
||||
num_indexed_rows: Optional[int]
|
||||
num_unindexed_rows: Optional[int]
|
||||
size_bytes: Optional[int]
|
||||
num_segments: Optional[int]
|
||||
index_version: Optional[int]
|
||||
index_details: Optional[Any]
|
||||
|
||||
async def connect(
|
||||
uri: str,
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import os
|
||||
import threading
|
||||
import warnings
|
||||
@@ -37,6 +38,24 @@ class BackgroundEventLoop:
|
||||
|
||||
LOOP = BackgroundEventLoop()
|
||||
|
||||
|
||||
def _new_embedding_executor() -> concurrent.futures.ThreadPoolExecutor:
|
||||
return concurrent.futures.ThreadPoolExecutor(thread_name_prefix="lancedb-embedding")
|
||||
|
||||
|
||||
# Embedding functions can block for a long time -- a heavy local model or an
|
||||
# HTTP request to a remote embeddings API. Running them on asyncio's default
|
||||
# executor lets them starve the unrelated blocking I/O that shares that pool,
|
||||
# so they get a dedicated one. See
|
||||
# https://github.com/lancedb/lancedb/issues/3310.
|
||||
_EMBEDDING_EXECUTOR = _new_embedding_executor()
|
||||
|
||||
|
||||
def embedding_executor() -> concurrent.futures.ThreadPoolExecutor:
|
||||
"""Return the executor dedicated to running blocking embedding calls."""
|
||||
return _EMBEDDING_EXECUTOR
|
||||
|
||||
|
||||
_FORK_WARNED = False
|
||||
|
||||
|
||||
@@ -47,6 +66,12 @@ def _reset_after_fork():
|
||||
# the new state. The Rust-side tokio runtime is reset analogously by a
|
||||
# pthread_atfork hook installed in the _lancedb extension.
|
||||
LOOP._start()
|
||||
# The embedding executor's worker threads are dead in the child as well.
|
||||
# Replace it with a fresh pool (threads are spawned lazily, so this is
|
||||
# cheap); we don't shut down the old one, since joining its dead workers
|
||||
# could hang.
|
||||
global _EMBEDDING_EXECUTOR
|
||||
_EMBEDDING_EXECUTOR = _new_embedding_executor()
|
||||
global _FORK_WARNED
|
||||
if not _FORK_WARNED:
|
||||
_FORK_WARNED = True
|
||||
|
||||
@@ -416,6 +416,8 @@ class DBConnection(EnforceOverrides):
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
branch: Optional[str] = None,
|
||||
version: Optional[int] = None,
|
||||
) -> Table:
|
||||
"""Open a Lance Table in the database.
|
||||
|
||||
@@ -444,6 +446,14 @@ class DBConnection(EnforceOverrides):
|
||||
connection will be inherited by the table, but can be overridden here.
|
||||
See available options at
|
||||
<https://docs.lancedb.com/storage/>
|
||||
branch: str, optional
|
||||
If provided, open a handle scoped to this branch instead of the
|
||||
default branch. Reads and writes operate in the branch's context.
|
||||
version: int, optional
|
||||
If provided, open the table pinned to this version, producing a
|
||||
read-only handle. Composes with ``branch``: when both are given,
|
||||
opens that branch at the version; otherwise opens ``main`` at the
|
||||
version. Call ``checkout_latest`` to return to a writable state.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -958,6 +968,8 @@ class LanceDBConnection(DBConnection):
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
branch: Optional[str] = None,
|
||||
version: Optional[int] = None,
|
||||
) -> LanceTable:
|
||||
"""Open a table in the database.
|
||||
|
||||
@@ -968,6 +980,14 @@ class LanceDBConnection(DBConnection):
|
||||
namespace_path: List[str], optional
|
||||
The namespace to open the table from. When non-empty, the
|
||||
table is resolved through the directory namespace client.
|
||||
branch: str, optional
|
||||
If provided, open a handle scoped to this branch instead of the
|
||||
default branch. Reads and writes operate in the branch's context.
|
||||
version: int, optional
|
||||
If provided, open the table pinned to this version, producing a
|
||||
read-only handle. Composes with ``branch``: when both are given,
|
||||
opens that branch at the version; otherwise opens ``main`` at the
|
||||
version. Call ``checkout_latest`` to return to a writable state.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -987,20 +1007,26 @@ class LanceDBConnection(DBConnection):
|
||||
)
|
||||
|
||||
if namespace_path:
|
||||
return self._namespace_conn().open_table(
|
||||
tbl = self._namespace_conn().open_table(
|
||||
name,
|
||||
namespace_path=namespace_path,
|
||||
storage_options=storage_options,
|
||||
index_cache_size=index_cache_size,
|
||||
)
|
||||
else:
|
||||
tbl = LanceTable.open(
|
||||
self,
|
||||
name,
|
||||
namespace_path=namespace_path,
|
||||
storage_options=storage_options,
|
||||
index_cache_size=index_cache_size,
|
||||
)
|
||||
|
||||
return LanceTable.open(
|
||||
self,
|
||||
name,
|
||||
namespace_path=namespace_path,
|
||||
storage_options=storage_options,
|
||||
index_cache_size=index_cache_size,
|
||||
)
|
||||
if branch is not None:
|
||||
tbl = tbl.branches.checkout(branch, version)
|
||||
elif version is not None:
|
||||
tbl.checkout(version)
|
||||
return tbl
|
||||
|
||||
def clone_table(
|
||||
self,
|
||||
@@ -1641,6 +1667,8 @@ class AsyncConnection(object):
|
||||
location: Optional[str] = None,
|
||||
namespace_client: Optional[Any] = None,
|
||||
managed_versioning: Optional[bool] = None,
|
||||
branch: Optional[str] = None,
|
||||
version: Optional[int] = None,
|
||||
) -> AsyncTable:
|
||||
"""Open a Lance Table in the database.
|
||||
|
||||
@@ -1676,6 +1704,14 @@ class AsyncConnection(object):
|
||||
managed_versioning: bool, optional
|
||||
Whether managed versioning is enabled for this table. If provided,
|
||||
avoids a redundant describe_table call when namespace_client is set.
|
||||
branch: str, optional
|
||||
If provided, open a handle scoped to this branch instead of the
|
||||
default branch. Reads and writes operate in the branch's context.
|
||||
version: int, optional
|
||||
If provided, open the table pinned to this version, producing a
|
||||
read-only handle. Composes with ``branch``: when both are given,
|
||||
opens that branch at the version; otherwise opens ``main`` at the
|
||||
version. Call ``checkout_latest`` to return to a writable state.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -1692,7 +1728,14 @@ class AsyncConnection(object):
|
||||
namespace_client=namespace_client,
|
||||
managed_versioning=managed_versioning,
|
||||
)
|
||||
return AsyncTable(table)
|
||||
tbl = AsyncTable(table)
|
||||
# "main" is the default branch, so treat it as no branch: remote rejects
|
||||
# every branch checkout (even "main"), and the version still applies.
|
||||
if branch is not None and branch != "main":
|
||||
tbl = await tbl.branches.checkout(branch, version)
|
||||
elif version is not None:
|
||||
await tbl.checkout(version)
|
||||
return tbl
|
||||
|
||||
async def clone_table(
|
||||
self,
|
||||
|
||||
@@ -19,7 +19,7 @@ operators::
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Union
|
||||
from typing import Iterable, Union
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -174,6 +174,11 @@ class Expr:
|
||||
"""Return True where the string contains *substr*."""
|
||||
return Expr(self._inner.contains(_coerce(substr)._inner))
|
||||
|
||||
def isin(self, values: "Iterable[ExprLike]") -> "Expr":
|
||||
"""Return True where the value is one of *values* (SQL ``IN``)."""
|
||||
inner = [_coerce(v)._inner for v in values]
|
||||
return Expr(self._inner.isin(inner))
|
||||
|
||||
# ── type cast ────────────────────────────────────────────────────────────
|
||||
|
||||
def cast(self, data_type: Union[str, "pa.DataType"]) -> "Expr":
|
||||
|
||||
@@ -93,6 +93,20 @@ class LabelList:
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Fm:
|
||||
"""Describe an FM-Index configuration.
|
||||
|
||||
`Fm` is a scalar index on string or binary columns that accelerates
|
||||
substring search, i.e. `contains(col, 'needle')`. Unlike the tokenized
|
||||
`FTS` index, it matches arbitrary substrings of the raw bytes.
|
||||
|
||||
For example, it works with `url`, `path`, `content`, etc.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class FTS:
|
||||
"""Describe a FTS index configuration.
|
||||
@@ -828,4 +842,5 @@ __all__ = [
|
||||
"FTS",
|
||||
"Bitmap",
|
||||
"LabelList",
|
||||
"Fm",
|
||||
]
|
||||
|
||||
@@ -5,7 +5,9 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
from .expr import Expr
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .common import DATA
|
||||
@@ -32,6 +34,7 @@ class LanceMergeInsertBuilder(object):
|
||||
self._when_not_matched_insert_all = False
|
||||
self._when_not_matched_by_source_delete = False
|
||||
self._when_not_matched_by_source_condition = None
|
||||
self._when_not_matched_by_source_condition_expr = None
|
||||
self._timeout = None
|
||||
self._use_index = True
|
||||
self._use_lsm_write = None
|
||||
@@ -62,7 +65,7 @@ class LanceMergeInsertBuilder(object):
|
||||
return self
|
||||
|
||||
def when_not_matched_by_source_delete(
|
||||
self, condition: Optional[str] = None
|
||||
self, condition: Union[str, Expr, None] = None
|
||||
) -> LanceMergeInsertBuilder:
|
||||
"""
|
||||
Rows that exist only in the target table (old data) will be
|
||||
@@ -71,13 +74,16 @@ class LanceMergeInsertBuilder(object):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
condition: Optional[str], default None
|
||||
condition: str or :class:`~lancedb.expr.Expr` or None, default None
|
||||
If None then all such rows will be deleted. Otherwise the
|
||||
condition will be used as an SQL filter to limit what rows
|
||||
are deleted.
|
||||
condition will be used as a filter to limit what rows are deleted.
|
||||
Can be a SQL string or a type-safe :class:`~lancedb.expr.Expr`
|
||||
built with :func:`~lancedb.expr.col` and :func:`~lancedb.expr.lit`.
|
||||
"""
|
||||
self._when_not_matched_by_source_delete = True
|
||||
if condition is not None:
|
||||
if isinstance(condition, Expr):
|
||||
self._when_not_matched_by_source_condition_expr = condition._inner
|
||||
elif condition is not None:
|
||||
self._when_not_matched_by_source_condition = condition
|
||||
return self
|
||||
|
||||
|
||||
@@ -58,6 +58,7 @@ from lance_namespace import (
|
||||
ListTablesRequest,
|
||||
DescribeNamespaceRequest,
|
||||
DropTableRequest,
|
||||
RenameTableRequest,
|
||||
ListNamespacesRequest,
|
||||
CreateNamespaceRequest,
|
||||
DropNamespaceRequest,
|
||||
@@ -144,7 +145,12 @@ def _query_to_namespace_request(
|
||||
if query.postfilter is not None:
|
||||
prefilter = not query.postfilter
|
||||
|
||||
k = query.limit if query.limit is not None else 10
|
||||
if query.limit is not None:
|
||||
k = query.limit
|
||||
elif query.vector is None and query.full_text_query is None:
|
||||
k = sys.maxsize
|
||||
else:
|
||||
k = 10
|
||||
|
||||
# Build request kwargs, only including non-None values for optional fields
|
||||
# that Pydantic doesn't accept as None
|
||||
@@ -544,6 +550,8 @@ class LanceNamespaceDBConnection(DBConnection):
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
branch: Optional[str] = None,
|
||||
version: Optional[int] = None,
|
||||
) -> Table:
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
@@ -562,7 +570,7 @@ class LanceNamespaceDBConnection(DBConnection):
|
||||
raise TableNotFoundError(f"Table not found: {'$'.join(table_id)}")
|
||||
raise
|
||||
|
||||
return LanceTable(
|
||||
tbl = LanceTable(
|
||||
self,
|
||||
name,
|
||||
namespace_path=namespace_path,
|
||||
@@ -570,6 +578,11 @@ class LanceNamespaceDBConnection(DBConnection):
|
||||
pushdown_operations=self._namespace_client_pushdown_operations,
|
||||
_async=async_table,
|
||||
)
|
||||
if branch is not None:
|
||||
tbl = tbl.branches.checkout(branch, version)
|
||||
elif version is not None:
|
||||
tbl.checkout(version)
|
||||
return tbl
|
||||
|
||||
@override
|
||||
def drop_table(self, name: str, namespace_path: Optional[List[str]] = None):
|
||||
@@ -592,9 +605,14 @@ class LanceNamespaceDBConnection(DBConnection):
|
||||
cur_namespace_path = []
|
||||
if new_namespace_path is None:
|
||||
new_namespace_path = []
|
||||
raise NotImplementedError(
|
||||
"rename_table is not supported for namespace connections"
|
||||
cur_table_id = cur_namespace_path + [cur_name]
|
||||
new_namespace_id = new_namespace_path if new_namespace_path else None
|
||||
request = RenameTableRequest(
|
||||
id=cur_table_id,
|
||||
new_table_name=new_name,
|
||||
new_namespace_id=new_namespace_id,
|
||||
)
|
||||
self._namespace_client.rename_table(request)
|
||||
|
||||
@override
|
||||
def drop_database(self):
|
||||
@@ -954,7 +972,7 @@ class AsyncLanceNamespaceDBConnection:
|
||||
if mode.lower() not in ["create", "overwrite"]:
|
||||
raise ValueError("mode must be either 'create' or 'overwrite'")
|
||||
validate_table_name(name)
|
||||
return await self._inner.create_table(
|
||||
table = await self._inner.create_table(
|
||||
name,
|
||||
data,
|
||||
schema=schema,
|
||||
@@ -966,6 +984,11 @@ class AsyncLanceNamespaceDBConnection:
|
||||
embedding_functions=embedding_functions,
|
||||
storage_options=storage_options,
|
||||
)
|
||||
return table._set_namespace_context(
|
||||
namespace_path=namespace_path,
|
||||
namespace_client=self._namespace_client,
|
||||
pushdown_operations=self._namespace_client_pushdown_operations,
|
||||
)
|
||||
|
||||
async def open_table(
|
||||
self,
|
||||
@@ -974,12 +997,14 @@ class AsyncLanceNamespaceDBConnection:
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
branch: Optional[str] = None,
|
||||
version: Optional[int] = None,
|
||||
) -> AsyncTable:
|
||||
"""Open an existing table from the namespace."""
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
try:
|
||||
return await self._inner.open_table(
|
||||
table = await self._inner.open_table(
|
||||
name,
|
||||
namespace_path=namespace_path,
|
||||
storage_options=storage_options,
|
||||
@@ -990,6 +1015,17 @@ class AsyncLanceNamespaceDBConnection:
|
||||
table_id = namespace_path + [name]
|
||||
raise TableNotFoundError(f"Table not found: {'$'.join(table_id)}")
|
||||
raise
|
||||
# "main" is the default branch, so treat it as no branch (mirrors the
|
||||
# sync remote path); the version still applies.
|
||||
if branch is not None and branch != "main":
|
||||
table = await table.branches.checkout(branch, version)
|
||||
elif version is not None:
|
||||
await table.checkout(version)
|
||||
return table._set_namespace_context(
|
||||
namespace_path=namespace_path,
|
||||
namespace_client=self._namespace_client,
|
||||
pushdown_operations=self._namespace_client_pushdown_operations,
|
||||
)
|
||||
|
||||
async def drop_table(self, name: str, namespace_path: Optional[List[str]] = None):
|
||||
"""Drop a table from the namespace."""
|
||||
@@ -1006,14 +1042,19 @@ class AsyncLanceNamespaceDBConnection:
|
||||
cur_namespace_path: Optional[List[str]] = None,
|
||||
new_namespace_path: Optional[List[str]] = None,
|
||||
):
|
||||
"""Rename is not supported for namespace connections."""
|
||||
"""Rename a table in the namespace."""
|
||||
if cur_namespace_path is None:
|
||||
cur_namespace_path = []
|
||||
if new_namespace_path is None:
|
||||
new_namespace_path = []
|
||||
raise NotImplementedError(
|
||||
"rename_table is not supported for namespace connections"
|
||||
cur_table_id = cur_namespace_path + [cur_name]
|
||||
new_namespace_id = new_namespace_path if new_namespace_path else None
|
||||
request = RenameTableRequest(
|
||||
id=cur_table_id,
|
||||
new_table_name=new_name,
|
||||
new_namespace_id=new_namespace_id,
|
||||
)
|
||||
self._namespace_client.rename_table(request)
|
||||
|
||||
async def drop_database(self):
|
||||
"""Deprecated method."""
|
||||
|
||||
@@ -383,6 +383,8 @@ class RemoteDBConnection(DBConnection):
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
branch: Optional[str] = None,
|
||||
version: Optional[int] = None,
|
||||
) -> Table:
|
||||
"""Open a Lance Table in the database.
|
||||
|
||||
@@ -393,6 +395,14 @@ class RemoteDBConnection(DBConnection):
|
||||
namespace_path: List[str], optional
|
||||
The namespace to open the table from.
|
||||
None or empty list represents root namespace.
|
||||
branch: str, optional
|
||||
If provided, open a handle scoped to this branch instead of the
|
||||
default branch. Reads and writes operate in the branch's context.
|
||||
version: int, optional
|
||||
If provided, open the table pinned to this version, producing a
|
||||
read-only handle. Composes with ``branch``: when both are given,
|
||||
opens that branch at the version; otherwise opens ``main`` at the
|
||||
version. Call ``checkout_latest`` to return to a writable state.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -409,12 +419,17 @@ class RemoteDBConnection(DBConnection):
|
||||
)
|
||||
|
||||
table = LOOP.run(self._conn.open_table(name, namespace_path=namespace_path))
|
||||
return RemoteTable(
|
||||
tbl = RemoteTable(
|
||||
table,
|
||||
self.db_name,
|
||||
connection_state=self.serialize,
|
||||
namespace_path=namespace_path,
|
||||
)
|
||||
if branch is not None:
|
||||
tbl = tbl.branches.checkout(branch, version)
|
||||
elif version is not None:
|
||||
tbl.checkout(version)
|
||||
return tbl
|
||||
|
||||
def clone_table(
|
||||
self,
|
||||
|
||||
@@ -27,6 +27,9 @@ class LanceDBClientError(RuntimeError):
|
||||
self.request_id = request_id
|
||||
self.status_code = status_code
|
||||
|
||||
def __reduce__(self) -> tuple[type, tuple]:
|
||||
return (self.__class__, (str(self), self.request_id, self.status_code))
|
||||
|
||||
|
||||
class HttpError(LanceDBClientError):
|
||||
"""An error that occurred during an HTTP request.
|
||||
@@ -101,3 +104,19 @@ class RetryError(LanceDBClientError):
|
||||
self.max_request_failures = max_request_failures
|
||||
self.max_connect_failures = max_connect_failures
|
||||
self.max_read_failures = max_read_failures
|
||||
|
||||
def __reduce__(self) -> tuple[type, tuple]:
|
||||
return (
|
||||
self.__class__,
|
||||
(
|
||||
str(self),
|
||||
self.request_id,
|
||||
self.request_failures,
|
||||
self.connect_failures,
|
||||
self.read_failures,
|
||||
self.max_request_failures,
|
||||
self.max_connect_failures,
|
||||
self.max_read_failures,
|
||||
self.status_code,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -56,7 +56,7 @@ from lancedb.embeddings import EmbeddingFunctionRegistry
|
||||
from lancedb.table import _normalize_progress
|
||||
|
||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
||||
from ..table import AsyncTable, BlobMode, IndexStatistics, Query, Table, Tags
|
||||
from ..table import AsyncTable, BlobMode, Branches, IndexStatistics, Query, Table, Tags
|
||||
from ..types import BaseTokenizerType
|
||||
|
||||
|
||||
@@ -75,6 +75,9 @@ class RemoteTable(Table):
|
||||
self._connection_state = connection_state
|
||||
self._namespace_path = list(namespace_path or [])
|
||||
self._checkout_version: Optional[int] = None
|
||||
# The branch this handle is scoped to (None == main). Persisted so a
|
||||
# fork/pickle reopen restores the branch instead of reverting to main.
|
||||
self._branch: Optional[str] = None
|
||||
self._pid = os.getpid()
|
||||
|
||||
def _serialized_connection_state(self) -> str:
|
||||
@@ -109,9 +112,14 @@ class RemoteTable(Table):
|
||||
from lancedb import deserialize_conn
|
||||
|
||||
db = deserialize_conn(self._serialized_connection_state(), for_worker=True)
|
||||
table = db.open_table(self._name, namespace_path=self._namespace_path)
|
||||
if self._checkout_version is not None:
|
||||
table.checkout(self._checkout_version)
|
||||
# Reopen on the same branch and pinned version (branch=None / version=None
|
||||
# reproduce the plain main-latest open).
|
||||
table = db.open_table(
|
||||
self._name,
|
||||
namespace_path=self._namespace_path,
|
||||
branch=self._branch,
|
||||
version=self._checkout_version,
|
||||
)
|
||||
|
||||
self._table_handle = table._table
|
||||
self.db_name = table.db_name
|
||||
@@ -124,6 +132,7 @@ class RemoteTable(Table):
|
||||
"name": self.name,
|
||||
"namespace_path": self._namespace_path,
|
||||
"checkout_version": self._checkout_version,
|
||||
"branch": self._branch,
|
||||
}
|
||||
|
||||
def __setstate__(self, state: dict) -> None:
|
||||
@@ -133,6 +142,7 @@ class RemoteTable(Table):
|
||||
self._connection_state = state["connection_state"]
|
||||
self._namespace_path = state["namespace_path"]
|
||||
self._checkout_version = state["checkout_version"]
|
||||
self._branch = state.get("branch")
|
||||
self._pid = None
|
||||
|
||||
@property
|
||||
@@ -160,6 +170,34 @@ class RemoteTable(Table):
|
||||
def tags(self) -> Tags:
|
||||
return Tags(self._table)
|
||||
|
||||
@property
|
||||
def branches(self) -> Branches:
|
||||
"""Branch management for the table.
|
||||
|
||||
``create``/``checkout`` return a new table handle scoped to the branch;
|
||||
writes on it do not affect ``main``.
|
||||
"""
|
||||
return Branches(self)
|
||||
|
||||
def current_branch(self) -> Optional[str]:
|
||||
"""The branch this table handle is scoped to, or ``None`` for ``main``."""
|
||||
return self._table.current_branch()
|
||||
|
||||
def _wrap_branch_handle(
|
||||
self, async_table: AsyncTable, version: Optional[int] = None
|
||||
) -> "RemoteTable":
|
||||
# A branch handle stays a RemoteTable with the same connection context.
|
||||
# Record the branch and version pin so a fork/pickle reopen restores both.
|
||||
handle = RemoteTable(
|
||||
async_table,
|
||||
self.db_name,
|
||||
connection_state=self._connection_state,
|
||||
namespace_path=self._namespace_path,
|
||||
)
|
||||
handle._branch = async_table.current_branch()
|
||||
handle._checkout_version = version
|
||||
return handle
|
||||
|
||||
@cached_property
|
||||
def embedding_functions(self) -> Dict[str, EmbeddingFunctionConfig]:
|
||||
"""
|
||||
|
||||
@@ -30,7 +30,7 @@ from lancedb.scannable import _register_optional_converters, to_scannable
|
||||
|
||||
from . import __version__
|
||||
from lancedb.arrow import peek_reader
|
||||
from lancedb.background_loop import LOOP
|
||||
from lancedb.background_loop import LOOP, embedding_executor
|
||||
from .dependencies import (
|
||||
_check_for_hugging_face,
|
||||
_check_for_lance,
|
||||
@@ -55,11 +55,13 @@ from .index import (
|
||||
Bitmap,
|
||||
IvfRq,
|
||||
LabelList,
|
||||
Fm,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from .expr import Expr
|
||||
from .merge import LanceMergeInsertBuilder
|
||||
from .pydantic import LanceModel, model_to_dict
|
||||
from .query import (
|
||||
@@ -92,6 +94,12 @@ BlobMode = Literal["lazy", "bytes", "descriptions"]
|
||||
_VALID_BLOB_MODES = ("lazy", "bytes", "descriptions")
|
||||
|
||||
|
||||
def _should_push_down_query_table(
|
||||
namespace_client: Optional[Any], pushdown_operations: set
|
||||
) -> bool:
|
||||
return namespace_client is not None and "QueryTable" in pushdown_operations
|
||||
|
||||
|
||||
def _validate_blob_mode(blob_mode: BlobMode) -> None:
|
||||
if blob_mode not in _VALID_BLOB_MODES:
|
||||
modes = ", ".join(repr(mode) for mode in _VALID_BLOB_MODES)
|
||||
@@ -207,6 +215,7 @@ IndexConfigType = Union[
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
]
|
||||
|
||||
@@ -778,6 +787,19 @@ class Table(ABC):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def branches(self) -> "Branches":
|
||||
"""Branch management for the table.
|
||||
|
||||
Branches are isolated, writable lines of history forked from another
|
||||
branch (or version). Writes on a branch do not affect ``main``.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def current_branch(self) -> Optional[str]:
|
||||
"""The branch this table handle is scoped to, or ``None`` for ``main``."""
|
||||
raise NotImplementedError
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""The number of rows in this Table"""
|
||||
return self.count_rows(None)
|
||||
@@ -923,7 +945,7 @@ class Table(ABC):
|
||||
config : IndexConfigType, optional
|
||||
The index configuration object. If provided, uses the new unified API.
|
||||
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
|
||||
BTree, Bitmap, LabelList, FTS.
|
||||
BTree, Bitmap, LabelList, Fm, FTS.
|
||||
replace : bool, default True
|
||||
Whether to replace an existing index on this column.
|
||||
wait_timeout : timedelta, optional
|
||||
@@ -1516,7 +1538,7 @@ class Table(ABC):
|
||||
) -> MergeResult: ...
|
||||
|
||||
@abstractmethod
|
||||
def delete(self, where: str) -> DeleteResult:
|
||||
def delete(self, where: Union[str, Expr]) -> DeleteResult:
|
||||
"""Delete rows from the table.
|
||||
|
||||
This can be used to delete a single row, many rows, all rows, or
|
||||
@@ -1524,10 +1546,10 @@ class Table(ABC):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
where: str
|
||||
The SQL where clause to use when deleting rows.
|
||||
|
||||
- For example, 'x = 2' or 'x IN (1, 2, 3)'.
|
||||
where: str or :class:`~lancedb.expr.Expr`
|
||||
The filter condition. Can be a SQL string or a type-safe
|
||||
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
|
||||
and :func:`~lancedb.expr.lit`.
|
||||
|
||||
The filter must not be empty, or it will error.
|
||||
|
||||
@@ -2106,22 +2128,27 @@ class LanceTable(Table):
|
||||
"Please install with `pip install pylance`."
|
||||
)
|
||||
|
||||
branch = self.current_branch()
|
||||
version = None if branch is not None else self.version
|
||||
if self._namespace_client is not None:
|
||||
table_id = self._namespace_path + [self.name]
|
||||
return lance.dataset(
|
||||
version=self.version,
|
||||
ds = lance.dataset(
|
||||
version=version,
|
||||
storage_options=self._conn.storage_options,
|
||||
namespace_client=self._namespace_client,
|
||||
table_id=table_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return lance.dataset(
|
||||
self._dataset_path,
|
||||
version=self.version,
|
||||
storage_options=self._conn.storage_options,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
ds = lance.dataset(
|
||||
self._dataset_path,
|
||||
version=version,
|
||||
storage_options=self._conn.storage_options,
|
||||
**kwargs,
|
||||
)
|
||||
if branch is not None:
|
||||
ds = ds.checkout_version((branch, self.version))
|
||||
return ds
|
||||
|
||||
@property
|
||||
def schema(self) -> pa.Schema:
|
||||
@@ -2187,6 +2214,34 @@ class LanceTable(Table):
|
||||
"""
|
||||
return Tags(self._table)
|
||||
|
||||
@property
|
||||
def branches(self) -> "Branches":
|
||||
"""Branch management for the table.
|
||||
|
||||
``create``/``checkout`` return a new table handle scoped to the branch;
|
||||
writes on it do not affect ``main``.
|
||||
"""
|
||||
return Branches(self)
|
||||
|
||||
def current_branch(self) -> Optional[str]:
|
||||
"""The branch this table handle is scoped to, or ``None`` for ``main``."""
|
||||
return self._table.current_branch()
|
||||
|
||||
def _wrap_branch_handle(
|
||||
self, async_table: "AsyncTable", version: Optional[int] = None
|
||||
) -> "LanceTable":
|
||||
# version is unused locally: the pin already lives on async_table and a
|
||||
# local handle is not reopened via a serialized connection.
|
||||
return LanceTable(
|
||||
self._conn,
|
||||
async_table.name,
|
||||
namespace_path=self._namespace_path,
|
||||
namespace_client=self._namespace_client,
|
||||
pushdown_operations=self._pushdown_operations,
|
||||
location=self._location,
|
||||
_async=async_table,
|
||||
)
|
||||
|
||||
def checkout(self, version: Union[int, str]):
|
||||
"""Checkout a version of the table. This is an in-place operation.
|
||||
|
||||
@@ -2333,6 +2388,11 @@ class LanceTable(Table):
|
||||
Returns
|
||||
-------
|
||||
pa.Table"""
|
||||
if _should_push_down_query_table(
|
||||
self._namespace_client, self._pushdown_operations
|
||||
):
|
||||
return self._execute_query(Query()).read_all()
|
||||
|
||||
return LOOP.run(self._table.to_arrow())
|
||||
|
||||
def to_polars(self, batch_size=None) -> "pl.LazyFrame":
|
||||
@@ -2449,7 +2509,7 @@ class LanceTable(Table):
|
||||
config : IndexConfigType, optional
|
||||
The index configuration object. If provided, uses the new unified API.
|
||||
Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq,
|
||||
BTree, Bitmap, LabelList, FTS.
|
||||
BTree, Bitmap, LabelList, Fm, FTS.
|
||||
replace : bool, default True
|
||||
Whether to replace an existing index on this column.
|
||||
wait_timeout : timedelta, optional
|
||||
@@ -3383,8 +3443,9 @@ class LanceTable(Table):
|
||||
)
|
||||
return self
|
||||
|
||||
def delete(self, where: str) -> DeleteResult:
|
||||
return LOOP.run(self._table.delete(where))
|
||||
def delete(self, where: Union[str, Expr]) -> DeleteResult:
|
||||
predicate = where._inner if isinstance(where, Expr) else where
|
||||
return LOOP.run(self._table.delete(predicate))
|
||||
|
||||
def update(
|
||||
self,
|
||||
@@ -3446,9 +3507,14 @@ class LanceTable(Table):
|
||||
batch_size: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
# Branch queries run locally: the server-side query protocol can't
|
||||
# carry a branch yet.
|
||||
# TODO: push down server-side once it can (with remote table support).
|
||||
if (
|
||||
"QueryTable" in self._pushdown_operations
|
||||
and self._namespace_client is not None
|
||||
_should_push_down_query_table(
|
||||
self._namespace_client, self._pushdown_operations
|
||||
)
|
||||
and self.current_branch() is None
|
||||
):
|
||||
from lancedb.namespace import _execute_server_side_query
|
||||
|
||||
@@ -4182,7 +4248,14 @@ class AsyncTable:
|
||||
[AsyncTable.create_index][lancedb.table.AsyncTable.create_index].
|
||||
"""
|
||||
|
||||
def __init__(self, table: LanceDBTable):
|
||||
def __init__(
|
||||
self,
|
||||
table: LanceDBTable,
|
||||
*,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
namespace_client: Optional[Any] = None,
|
||||
pushdown_operations: Optional[set] = None,
|
||||
):
|
||||
"""Create a new AsyncTable object.
|
||||
|
||||
You should not create AsyncTable objects directly.
|
||||
@@ -4191,6 +4264,21 @@ class AsyncTable:
|
||||
[AsyncConnection.open_table][lancedb.AsyncConnection.open_table] to obtain
|
||||
Table objects."""
|
||||
self._inner = table
|
||||
self._namespace_path = namespace_path or []
|
||||
self._namespace_client = namespace_client
|
||||
self._pushdown_operations = pushdown_operations or set()
|
||||
|
||||
def _set_namespace_context(
|
||||
self,
|
||||
*,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
namespace_client: Optional[Any] = None,
|
||||
pushdown_operations: Optional[set] = None,
|
||||
) -> "AsyncTable":
|
||||
self._namespace_path = namespace_path or []
|
||||
self._namespace_client = namespace_client
|
||||
self._pushdown_operations = pushdown_operations or set()
|
||||
return self
|
||||
|
||||
def __repr__(self):
|
||||
return self._inner.__repr__()
|
||||
@@ -4353,12 +4441,20 @@ class AsyncTable:
|
||||
"Please install with `pip install pylance`."
|
||||
)
|
||||
|
||||
return lance.dataset(
|
||||
# lance.dataset() can't open a branch directly, so open the base table
|
||||
# and check out the branch ref (a None branch resolves to main).
|
||||
branch = self.current_branch()
|
||||
table_version = await self.version()
|
||||
version = None if branch is not None else table_version
|
||||
ds = lance.dataset(
|
||||
await self.uri(),
|
||||
version=await self.version(),
|
||||
version=version,
|
||||
storage_options=await self.latest_storage_options(),
|
||||
**kwargs,
|
||||
)
|
||||
if branch is not None:
|
||||
ds = ds.checkout_version((branch, table_version))
|
||||
return ds
|
||||
|
||||
async def to_pandas(self, blob_mode: BlobMode = "lazy", **kwargs) -> "pd.DataFrame":
|
||||
"""Return the table as a pandas DataFrame.
|
||||
@@ -4391,6 +4487,11 @@ class AsyncTable:
|
||||
-------
|
||||
pa.Table
|
||||
"""
|
||||
if _should_push_down_query_table(
|
||||
self._namespace_client, self._pushdown_operations
|
||||
):
|
||||
return (await self._execute_query(Query())).read_all()
|
||||
|
||||
return await self.query().to_arrow()
|
||||
|
||||
async def create_index(
|
||||
@@ -4409,6 +4510,7 @@ class AsyncTable:
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
]
|
||||
] = None,
|
||||
@@ -4461,12 +4563,14 @@ class AsyncTable:
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
FTS,
|
||||
),
|
||||
):
|
||||
raise TypeError(
|
||||
"config must be an instance of IvfSq, IvfPq, IvfRq, HnswPq, HnswSq,"
|
||||
" BTree, Bitmap, LabelList, or FTS, but got " + str(type(config))
|
||||
" BTree, Bitmap, LabelList, Fm, or FTS, but got "
|
||||
+ str(type(config))
|
||||
)
|
||||
try:
|
||||
await self._inner.create_index(
|
||||
@@ -4908,10 +5012,13 @@ class AsyncTable:
|
||||
if embedding is not None:
|
||||
loop = asyncio.get_running_loop()
|
||||
# This function is likely to block, since it either calls an expensive
|
||||
# function or makes an HTTP request to an embeddings REST API.
|
||||
# function or makes an HTTP request to an embeddings REST API. Run it
|
||||
# on a dedicated executor so it can't starve the default executor that
|
||||
# other blocking I/O shares. See
|
||||
# https://github.com/lancedb/lancedb/issues/3310.
|
||||
return (
|
||||
await loop.run_in_executor(
|
||||
None,
|
||||
embedding_executor(),
|
||||
embedding.function.compute_query_embeddings_with_retry,
|
||||
query,
|
||||
)
|
||||
@@ -5065,6 +5172,14 @@ class AsyncTable:
|
||||
batch_size: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
if _should_push_down_query_table(
|
||||
self._namespace_client, self._pushdown_operations
|
||||
):
|
||||
from lancedb.namespace import _execute_server_side_query
|
||||
|
||||
table_id = self._namespace_path + [self.name]
|
||||
return _execute_server_side_query(self._namespace_client, table_id, query)
|
||||
|
||||
# The sync table calls into this method, so we need to map the
|
||||
# query to the async version of the query and run that here. This is only
|
||||
# used for that code path right now.
|
||||
@@ -5120,6 +5235,7 @@ class AsyncTable:
|
||||
when_not_matched_insert_all=merge._when_not_matched_insert_all,
|
||||
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
|
||||
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
|
||||
when_not_matched_by_source_condition_expr=merge._when_not_matched_by_source_condition_expr,
|
||||
timeout=merge._timeout,
|
||||
use_index=merge._use_index,
|
||||
use_lsm_write=merge._use_lsm_write,
|
||||
@@ -5127,7 +5243,7 @@ class AsyncTable:
|
||||
),
|
||||
)
|
||||
|
||||
async def delete(self, where: str) -> DeleteResult:
|
||||
async def delete(self, where: Union[str, Expr]) -> DeleteResult:
|
||||
"""Delete rows from the table.
|
||||
|
||||
This can be used to delete a single row, many rows, all rows, or
|
||||
@@ -5135,10 +5251,10 @@ class AsyncTable:
|
||||
|
||||
Parameters
|
||||
----------
|
||||
where: str
|
||||
The SQL where clause to use when deleting rows.
|
||||
|
||||
- For example, 'x = 2' or 'x IN (1, 2, 3)'.
|
||||
where: str or :class:`~lancedb.expr.Expr`
|
||||
The filter condition. Can be a SQL string or a type-safe
|
||||
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
|
||||
and :func:`~lancedb.expr.lit`.
|
||||
|
||||
The filter must not be empty, or it will error.
|
||||
|
||||
@@ -5177,7 +5293,8 @@ class AsyncTable:
|
||||
x vector
|
||||
0 3 [5.0, 6.0]
|
||||
"""
|
||||
return await self._inner.delete(where)
|
||||
predicate = where._inner if isinstance(where, Expr) else where
|
||||
return await self._inner.delete(predicate)
|
||||
|
||||
async def update(
|
||||
self,
|
||||
@@ -5473,6 +5590,19 @@ class AsyncTable:
|
||||
"""
|
||||
return AsyncTags(self._inner)
|
||||
|
||||
@property
|
||||
def branches(self) -> AsyncBranches:
|
||||
"""Branch management for the table.
|
||||
|
||||
Branches are isolated, writable lines of history forked from another
|
||||
branch (or version). Writes on a branch do not affect ``main``.
|
||||
"""
|
||||
return AsyncBranches(self._inner)
|
||||
|
||||
def current_branch(self) -> Optional[str]:
|
||||
"""The branch this table handle is scoped to, or ``None`` for ``main``."""
|
||||
return self._inner.current_branch()
|
||||
|
||||
async def optimize(
|
||||
self,
|
||||
*,
|
||||
@@ -5634,8 +5764,6 @@ class IndexStatistics:
|
||||
The distance type used by the index.
|
||||
num_indices: Optional[int]
|
||||
The number of parts the index is split into.
|
||||
loss: Optional[float]
|
||||
The KMeans loss for the index, for only vector indices.
|
||||
"""
|
||||
|
||||
num_indexed_rows: int
|
||||
@@ -5655,7 +5783,6 @@ class IndexStatistics:
|
||||
]
|
||||
distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
|
||||
num_indices: Optional[int] = None
|
||||
loss: Optional[float] = None
|
||||
|
||||
# This exists for backwards compatibility with an older API, which returned
|
||||
# a dictionary instead of a class.
|
||||
@@ -5808,6 +5935,69 @@ class Tags:
|
||||
LOOP.run(self._table.tags.update(tag, version))
|
||||
|
||||
|
||||
class Branches:
|
||||
"""
|
||||
Table branch manager.
|
||||
"""
|
||||
|
||||
def __init__(self, parent: "LanceTable"):
|
||||
self._parent = parent
|
||||
self._table = parent._table
|
||||
|
||||
def list(self) -> Dict[str, Any]:
|
||||
"""List all branches, mapping name to branch metadata."""
|
||||
return LOOP.run(self._table.branches.list())
|
||||
|
||||
def create(
|
||||
self,
|
||||
name: str,
|
||||
from_ref: Optional[str] = None,
|
||||
from_version: Optional[int] = None,
|
||||
) -> "Table":
|
||||
"""Create a branch and return a handle scoped to it.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
Name of the new branch.
|
||||
from_ref: str, optional
|
||||
Source branch to fork from. Defaults to ``main``.
|
||||
from_version: int, optional
|
||||
A specific version on ``from_ref`` to fork from. Defaults to latest.
|
||||
"""
|
||||
async_table = LOOP.run(
|
||||
self._table.branches.create(name, from_ref, from_version)
|
||||
)
|
||||
return self._wrap(async_table)
|
||||
|
||||
def checkout(self, name: str, version: Optional[int] = None) -> "Table":
|
||||
"""Check out an existing branch and return a handle scoped to it.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
Name of the branch to check out.
|
||||
version: int, optional
|
||||
A specific version on the branch to pin. When set, the returned
|
||||
handle is a read-only view of that version; when omitted it tracks
|
||||
the branch's latest and stays writable.
|
||||
"""
|
||||
async_table = LOOP.run(self._table.branches.checkout(name, version))
|
||||
return self._wrap(async_table, version)
|
||||
|
||||
def delete(self, name: str) -> None:
|
||||
"""Delete a branch."""
|
||||
LOOP.run(self._table.branches.delete(name))
|
||||
|
||||
def _wrap(
|
||||
self, async_table: "AsyncTable", version: Optional[int] = None
|
||||
) -> "Table":
|
||||
# Delegate to the parent so the branch handle keeps its concrete type
|
||||
# (LanceTable / RemoteTable) and connection context; `version` is the
|
||||
# explicit pin so a remote handle can restore branch+version on reopen.
|
||||
return self._parent._wrap_branch_handle(async_table, version)
|
||||
|
||||
|
||||
class AsyncTags:
|
||||
"""
|
||||
Async table tag manager.
|
||||
@@ -5875,3 +6065,56 @@ class AsyncTags:
|
||||
The new table version to tag.
|
||||
"""
|
||||
await self._table.tags.update(tag, version)
|
||||
|
||||
|
||||
class AsyncBranches:
|
||||
"""Async table branch manager."""
|
||||
|
||||
def __init__(self, table):
|
||||
self._table = table
|
||||
|
||||
async def list(self) -> Dict[str, Any]:
|
||||
"""List all branches, mapping name to branch metadata."""
|
||||
return await self._table.branches.list()
|
||||
|
||||
async def create(
|
||||
self,
|
||||
name: str,
|
||||
from_ref: Optional[str] = None,
|
||||
from_version: Optional[int] = None,
|
||||
) -> "AsyncTable":
|
||||
"""Create a branch and return a handle scoped to it.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
Name of the new branch.
|
||||
from_ref: str, optional
|
||||
Source branch to fork from. Defaults to ``main``.
|
||||
from_version: int, optional
|
||||
A specific version on ``from_ref`` to fork from. Defaults to latest.
|
||||
"""
|
||||
# "main" and None are two spellings of the root branch in lance; normalize
|
||||
# so from_ref="main" behaves identically to the default.
|
||||
if from_ref == "main":
|
||||
from_ref = None
|
||||
inner = await self._table.branches.create(name, from_ref, from_version)
|
||||
return AsyncTable(inner)
|
||||
|
||||
async def checkout(self, name: str, version: Optional[int] = None) -> "AsyncTable":
|
||||
"""Check out an existing branch and return a handle scoped to it.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
Name of the branch to check out.
|
||||
version: int, optional
|
||||
A specific version on the branch to pin. When set, the returned
|
||||
handle is a read-only view of that version; when omitted it tracks
|
||||
the branch's latest and stays writable.
|
||||
"""
|
||||
return AsyncTable(await self._table.branches.checkout(name, version))
|
||||
|
||||
async def delete(self, name: str) -> None:
|
||||
"""Delete a branch."""
|
||||
await self._table.branches.delete(name)
|
||||
|
||||
@@ -385,6 +385,21 @@ def _(value: np.ndarray):
|
||||
return value_to_sql(value.tolist())
|
||||
|
||||
|
||||
@value_to_sql.register(np.bool_)
|
||||
def _(value: np.bool_):
|
||||
return value_to_sql(bool(value))
|
||||
|
||||
|
||||
@value_to_sql.register(np.integer)
|
||||
def _(value: np.integer):
|
||||
return value_to_sql(int(value))
|
||||
|
||||
|
||||
@value_to_sql.register(np.floating)
|
||||
def _(value: np.floating):
|
||||
return value_to_sql(float(value))
|
||||
|
||||
|
||||
def deprecated(func):
|
||||
"""This is a decorator which can be used to mark functions
|
||||
as deprecated. It will result in a warning being emitted
|
||||
|
||||
56
python/python/tests/test_errors.py
Normal file
56
python/python/tests/test_errors.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import pickle
|
||||
|
||||
from lancedb.remote.errors import HttpError, LanceDBClientError, RetryError
|
||||
|
||||
|
||||
def test_pickle_lancedb_client_error():
|
||||
err = LanceDBClientError("something went wrong", "req-123", 400)
|
||||
restored = pickle.loads(pickle.dumps(err))
|
||||
assert str(restored) == "something went wrong"
|
||||
assert restored.request_id == "req-123"
|
||||
assert restored.status_code == 400
|
||||
|
||||
|
||||
def test_pickle_lancedb_client_error_no_status_code():
|
||||
err = LanceDBClientError("fail", "req-456")
|
||||
restored = pickle.loads(pickle.dumps(err))
|
||||
assert str(restored) == "fail"
|
||||
assert restored.request_id == "req-456"
|
||||
assert restored.status_code is None
|
||||
|
||||
|
||||
def test_pickle_http_error():
|
||||
err = HttpError("not found", "req-789", 404)
|
||||
restored = pickle.loads(pickle.dumps(err))
|
||||
assert isinstance(restored, HttpError)
|
||||
assert str(restored) == "not found"
|
||||
assert restored.request_id == "req-789"
|
||||
assert restored.status_code == 404
|
||||
|
||||
|
||||
def test_pickle_retry_error():
|
||||
err = RetryError(
|
||||
"max retries exceeded",
|
||||
"req-abc",
|
||||
request_failures=3,
|
||||
connect_failures=1,
|
||||
read_failures=2,
|
||||
max_request_failures=5,
|
||||
max_connect_failures=3,
|
||||
max_read_failures=3,
|
||||
status_code=503,
|
||||
)
|
||||
restored = pickle.loads(pickle.dumps(err))
|
||||
assert isinstance(restored, RetryError)
|
||||
assert str(restored) == "max retries exceeded"
|
||||
assert restored.request_id == "req-abc"
|
||||
assert restored.request_failures == 3
|
||||
assert restored.connect_failures == 1
|
||||
assert restored.read_failures == 2
|
||||
assert restored.max_request_failures == 5
|
||||
assert restored.max_connect_failures == 3
|
||||
assert restored.max_read_failures == 3
|
||||
assert restored.status_code == 503
|
||||
@@ -20,6 +20,7 @@ from lancedb.index import (
|
||||
IvfRq,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
Fm,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
@@ -113,8 +114,14 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
|
||||
pa.field("user.id", pa.int32()),
|
||||
]
|
||||
)
|
||||
mixed_case_metadata_type = pa.struct([pa.field("userId", pa.int32())])
|
||||
escaped_metadata_type = pa.struct([pa.field("user-id", pa.int32())])
|
||||
literal_type = pa.struct([pa.field("a.b", pa.int32())])
|
||||
data = pa.Table.from_arrays(
|
||||
[
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array(
|
||||
[
|
||||
@@ -124,25 +131,67 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
|
||||
],
|
||||
type=metadata_type,
|
||||
),
|
||||
pa.array(
|
||||
[{"userId": 10}, {"userId": 20}, {"userId": 30}],
|
||||
type=mixed_case_metadata_type,
|
||||
),
|
||||
pa.array(
|
||||
[{"user-id": 10}, {"user-id": 20}, {"user-id": 30}],
|
||||
type=escaped_metadata_type,
|
||||
),
|
||||
pa.array(
|
||||
[{"a.b": 10}, {"a.b": 20}, {"a.b": 30}],
|
||||
type=literal_type,
|
||||
),
|
||||
],
|
||||
names=[
|
||||
"rowId",
|
||||
"row-id",
|
||||
"userId",
|
||||
"user_id",
|
||||
"metadata",
|
||||
"MetaData",
|
||||
"meta-data",
|
||||
"literal",
|
||||
],
|
||||
names=["user_id", "metadata"],
|
||||
)
|
||||
table = await db_async.create_table("nested_scalar_index", data)
|
||||
|
||||
await table.create_index("user_id", config=BTree(), name="top_user_id_idx")
|
||||
await table.create_index("rowId", config=BTree(), name="row_id_idx")
|
||||
await table.create_index("`row-id`", config=BTree(), name="row_dash_id_idx")
|
||||
await table.create_index("userId", config=BTree(), name="top_user_id_idx")
|
||||
await table.create_index("user_id", config=BTree(), name="top_snake_user_id_idx")
|
||||
await table.create_index(
|
||||
"metadata.user_id", config=BTree(), name="nested_user_id_idx"
|
||||
)
|
||||
await table.create_index(
|
||||
"metadata.`user.id`", config=BTree(), name="escaped_user_id_idx"
|
||||
)
|
||||
await table.create_index(
|
||||
"MetaData.userId", config=BTree(), name="mixed_case_metadata_user_id_idx"
|
||||
)
|
||||
await table.create_index(
|
||||
"`meta-data`.`user-id`", config=BTree(), name="escaped_names_idx"
|
||||
)
|
||||
await table.create_index("literal.`a.b`", config=BTree(), name="literal_dot_idx")
|
||||
|
||||
columns_by_name = {
|
||||
index.name: index.columns for index in await table.list_indices()
|
||||
}
|
||||
assert columns_by_name["top_user_id_idx"] == ["user_id"]
|
||||
assert columns_by_name["row_id_idx"] == ["rowId"]
|
||||
assert columns_by_name["row_dash_id_idx"] == ["`row-id`"]
|
||||
assert columns_by_name["top_user_id_idx"] == ["userId"]
|
||||
assert columns_by_name["top_snake_user_id_idx"] == ["user_id"]
|
||||
assert columns_by_name["nested_user_id_idx"] == ["metadata.user_id"]
|
||||
assert columns_by_name["escaped_user_id_idx"] == ["metadata.`user.id`"]
|
||||
assert columns_by_name["mixed_case_metadata_user_id_idx"] == ["MetaData.userId"]
|
||||
assert columns_by_name["escaped_names_idx"] == ["`meta-data`.`user-id`"]
|
||||
assert columns_by_name["literal_dot_idx"] == ["literal.`a.b`"]
|
||||
|
||||
for index_name in columns_by_name:
|
||||
stats = await table.index_stats(index_name)
|
||||
assert stats is not None
|
||||
assert stats.num_indexed_rows == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -155,6 +204,16 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
||||
assert indices[0].columns == ["fsb"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_fm_index(some_table: AsyncTable):
|
||||
# FM-Index accelerates substring search on string/binary columns.
|
||||
await some_table.create_index("data", config=Fm())
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "Fm"
|
||||
assert indices[0].columns == ["data"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
await some_table.create_index("id", config=Bitmap())
|
||||
@@ -189,6 +248,51 @@ async def test_create_label_list_index(some_table: AsyncTable):
|
||||
await some_table.create_index("tags", config=LabelList())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
|
||||
plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan()
|
||||
assert "ScalarIndexQuery" in plan
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_large_list_label_list_index(db_async):
|
||||
data = pa.Table.from_pydict(
|
||||
{"tags": [[f"tag{i % 2}", "shared"] for i in range(16)]},
|
||||
schema=pa.schema([pa.field("tags", pa.large_list(pa.string()))]),
|
||||
)
|
||||
table = await db_async.create_table("large_list_label_list_index", data)
|
||||
|
||||
await table.create_index("tags", config=LabelList())
|
||||
indices = await table.list_indices()
|
||||
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
|
||||
plan = await table.query().where("array_has(tags, 'shared')").explain_plan()
|
||||
assert "ScalarIndexQuery" in plan
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_label_list_index_rejects_list_struct(db_async):
|
||||
item_type = pa.struct(
|
||||
[
|
||||
pa.field("tag", pa.string()),
|
||||
pa.field(
|
||||
"metadata",
|
||||
pa.struct([pa.field("userId", pa.string())]),
|
||||
),
|
||||
]
|
||||
)
|
||||
data = pa.Table.from_pylist(
|
||||
[
|
||||
{
|
||||
"items": [
|
||||
{"tag": "tag0", "metadata": {"userId": "user0"}},
|
||||
{"tag": "shared", "metadata": {"userId": "user1"}},
|
||||
]
|
||||
}
|
||||
],
|
||||
schema=pa.schema([pa.field("items", pa.list_(item_type))]),
|
||||
)
|
||||
table = await db_async.create_table("list_struct_label_list_index", data)
|
||||
|
||||
with pytest.raises(Exception, match="LabelList index cannot be created"):
|
||||
await table.create_index("items", config=LabelList())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -226,7 +330,6 @@ async def test_create_vector_index(some_table: AsyncTable):
|
||||
assert stats.num_indexed_rows == await some_table.count_rows()
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
assert stats.loss >= 0.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -250,7 +353,6 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
|
||||
assert stats.num_indexed_rows == await some_table.count_rows()
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
assert stats.loss >= 0.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -5,10 +5,67 @@
|
||||
|
||||
import tempfile
|
||||
import shutil
|
||||
import sys
|
||||
import pytest
|
||||
import pyarrow as pa
|
||||
import lancedb
|
||||
from lance_namespace.errors import NamespaceNotEmptyError, TableNotFoundError
|
||||
from lancedb.table import AsyncTable, LanceTable
|
||||
|
||||
|
||||
PUSHDOWN_DATA = pa.table(
|
||||
{"id": list(range(12)), "text": [f"row-{idx}" for idx in range(12)]}
|
||||
)
|
||||
|
||||
|
||||
def _ipc_file(table: pa.Table = PUSHDOWN_DATA) -> bytes:
|
||||
sink = pa.BufferOutputStream()
|
||||
with pa.ipc.new_file(sink, table.schema) as writer:
|
||||
writer.write_table(table)
|
||||
return sink.getvalue().to_pybytes()
|
||||
|
||||
|
||||
class _FailingSyncInner:
|
||||
name = "hist"
|
||||
|
||||
def current_branch(self):
|
||||
# The pushdown gate only routes server-side when on the default branch.
|
||||
return None
|
||||
|
||||
async def schema(self):
|
||||
return PUSHDOWN_DATA.schema
|
||||
|
||||
async def to_arrow(self):
|
||||
raise RuntimeError("direct table to_arrow should not be used")
|
||||
|
||||
|
||||
class _FailingAsyncInner:
|
||||
def name(self):
|
||||
return "hist"
|
||||
|
||||
async def schema(self):
|
||||
return PUSHDOWN_DATA.schema
|
||||
|
||||
def query(self):
|
||||
raise AssertionError("direct async query should not be used")
|
||||
|
||||
|
||||
class _NamespaceClient:
|
||||
def __init__(self):
|
||||
self.requests = []
|
||||
|
||||
def query_table(self, request):
|
||||
self.requests.append(request)
|
||||
return _ipc_file()
|
||||
|
||||
|
||||
def _namespace_lance_table(namespace_client: _NamespaceClient) -> LanceTable:
|
||||
table = LanceTable.__new__(LanceTable)
|
||||
table._table = _FailingSyncInner()
|
||||
table._namespace_path = ["geneva"]
|
||||
table._namespace_client = namespace_client
|
||||
table._pushdown_operations = {"QueryTable"}
|
||||
return table
|
||||
|
||||
|
||||
class TestNamespaceConnection:
|
||||
@@ -200,8 +257,15 @@ class TestNamespaceConnection:
|
||||
assert table_schema.field("id").type == pa.int64()
|
||||
assert table_schema.field("text").type == pa.string()
|
||||
|
||||
def test_rename_table_not_supported(self):
|
||||
"""Test that rename_table raises NotImplementedError."""
|
||||
def test_rename_table(self):
|
||||
"""Test that rename_table renames a table in the namespace.
|
||||
|
||||
The `dir` namespace implementation in lance-namespace-impls does not
|
||||
implement `rename_table` yet (only the `rest` backend does), so it
|
||||
currently falls back to the default trait method which raises
|
||||
NotSupported. This is expected to start passing once the `dir`
|
||||
backend gains rename_table support upstream.
|
||||
"""
|
||||
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||
|
||||
# Create a child namespace first
|
||||
@@ -216,9 +280,14 @@ class TestNamespaceConnection:
|
||||
)
|
||||
db.create_table("old_name", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Rename should raise NotImplementedError
|
||||
with pytest.raises(NotImplementedError, match="rename_table is not supported"):
|
||||
db.rename_table("old_name", "new_name")
|
||||
# Rename the table within the same namespace
|
||||
with pytest.raises(NotImplementedError, match="rename_table not implemented"):
|
||||
db.rename_table(
|
||||
"old_name",
|
||||
"new_name",
|
||||
cur_namespace_path=["test_ns"],
|
||||
new_namespace_path=["test_ns"],
|
||||
)
|
||||
|
||||
def test_drop_all_tables(self):
|
||||
"""Test dropping all tables through namespace."""
|
||||
@@ -736,6 +805,22 @@ class TestPushdownOperations:
|
||||
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||
assert len(db._namespace_client_pushdown_operations) == 0
|
||||
|
||||
def test_lance_table_to_arrow_uses_query_pushdown(self):
|
||||
namespace_client = _NamespaceClient()
|
||||
table = _namespace_lance_table(namespace_client)
|
||||
|
||||
assert table.to_arrow().equals(PUSHDOWN_DATA)
|
||||
assert table.to_pandas()["id"].tolist() == list(range(12))
|
||||
assert len(namespace_client.requests) == 2
|
||||
assert [request.id for request in namespace_client.requests] == [
|
||||
["geneva", "hist"],
|
||||
["geneva", "hist"],
|
||||
]
|
||||
assert [request.k for request in namespace_client.requests] == [
|
||||
sys.maxsize,
|
||||
sys.maxsize,
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
class TestAsyncPushdownOperations:
|
||||
@@ -771,3 +856,39 @@ class TestAsyncPushdownOperations:
|
||||
"""Test that pushdown operations default to empty on async connection."""
|
||||
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
|
||||
assert len(db._namespace_client_pushdown_operations) == 0
|
||||
|
||||
async def test_async_table_to_arrow_uses_query_pushdown(self):
|
||||
namespace_client = _NamespaceClient()
|
||||
|
||||
table = AsyncTable(
|
||||
_FailingAsyncInner(),
|
||||
namespace_path=["geneva"],
|
||||
namespace_client=namespace_client,
|
||||
pushdown_operations={"QueryTable"},
|
||||
)
|
||||
|
||||
assert (await table.to_arrow()).equals(PUSHDOWN_DATA)
|
||||
assert (await table.to_pandas())["id"].tolist() == list(range(12))
|
||||
assert len(namespace_client.requests) == 2
|
||||
assert [request.id for request in namespace_client.requests] == [
|
||||
["geneva", "hist"],
|
||||
["geneva", "hist"],
|
||||
]
|
||||
assert [request.k for request in namespace_client.requests] == [
|
||||
sys.maxsize,
|
||||
sys.maxsize,
|
||||
]
|
||||
|
||||
|
||||
def test_local_table_to_arrow_and_to_pandas_are_unchanged(tmp_path):
|
||||
db = lancedb.connect(str(tmp_path / "db"))
|
||||
table = db.create_table(
|
||||
"local",
|
||||
data=[
|
||||
{"id": 1, "vector": [1.0, 2.0]},
|
||||
{"id": 2, "vector": [3.0, 4.0]},
|
||||
],
|
||||
)
|
||||
|
||||
assert table.to_arrow().column("id").to_pylist() == [1, 2]
|
||||
assert table.to_pandas()["id"].tolist() == [1, 2]
|
||||
|
||||
686
python/python/tests/test_nested_fields.py
Normal file
686
python/python/tests/test_nested_fields.py
Normal file
@@ -0,0 +1,686 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
"""Regression matrix for nested field support across LanceDB Python APIs.
|
||||
|
||||
Covers the lifecycle described in lancedb/lancedb#3406:
|
||||
- Nested scalar, vector, and FTS index creation with full dotted paths
|
||||
- list_indices / index_stats return canonical full paths (not leaf names)
|
||||
- search, filter, append, optimize behaviour
|
||||
- Field-name edge cases: mixed case, literal-dot field names, same-name leaves
|
||||
- Both sync and async Python table APIs
|
||||
|
||||
The matrix uses the following field-name variants from the acceptance criteria:
|
||||
- rowId (camelCase top-level)
|
||||
- `row-id` (hyphenated top-level, escaped)
|
||||
- parent.`leaf.name` (struct leaf whose name contains a literal dot)
|
||||
- MetaData.userId (mixed-case nested path)
|
||||
- `meta-data`.`user-id` (hyphenated struct with hyphenated leaf)
|
||||
|
||||
Note: Lance forbids top-level field names that contain a '.', so the literal-dot
|
||||
edge case is exercised via a struct leaf field (parent.`leaf.name`) instead.
|
||||
"""
|
||||
|
||||
from datetime import timedelta
|
||||
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
import lancedb
|
||||
from lancedb.db import AsyncConnection, DBConnection
|
||||
from lancedb.index import BTree, FTS, IvfPq
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DIM = 8
|
||||
# IvfPq requires at least num_partitions * 256 rows by default; keeping rows
|
||||
# small means we must drop num_sub_vectors and num_partitions very low.
|
||||
NROWS = 256
|
||||
|
||||
|
||||
def _vec(row: int) -> list:
|
||||
return [float((row * DIM + i) % 256) for i in range(DIM)]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sync_db(tmp_path) -> DBConnection:
|
||||
return lancedb.connect(tmp_path)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_db(tmp_path) -> AsyncConnection:
|
||||
return await lancedb.connect_async(
|
||||
tmp_path, read_consistency_interval=timedelta(seconds=0)
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema / data builders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _nested_scalar_schema() -> pa.Schema:
|
||||
"""Schema with nested scalar fields covering the acceptance-criteria names.
|
||||
|
||||
Top-level columns:
|
||||
- rowId int32 (camelCase top-level)
|
||||
- row-id int32 (hyphenated top-level name)
|
||||
- MetaData struct{userId int32} (mixed-case nested path)
|
||||
- meta-data struct{user-id int32} (hyphenated struct + hyphenated leaf)
|
||||
|
||||
Lance disallows top-level field names that contain '.' (e.g. a field
|
||||
literally named 'a.b'), so that edge case is tested separately using
|
||||
_literal_dot_schema() below.
|
||||
"""
|
||||
return pa.schema(
|
||||
[
|
||||
pa.field("rowId", pa.int32()),
|
||||
pa.field("row-id", pa.int32()),
|
||||
pa.field(
|
||||
"MetaData",
|
||||
pa.struct([pa.field("userId", pa.int32())]),
|
||||
),
|
||||
pa.field(
|
||||
"meta-data",
|
||||
pa.struct([pa.field("user-id", pa.int32())]),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _nested_scalar_data(nrows: int = NROWS) -> pa.Table:
|
||||
schema = _nested_scalar_schema()
|
||||
return pa.table(
|
||||
{
|
||||
"rowId": pa.array(list(range(nrows)), pa.int32()),
|
||||
"row-id": pa.array(list(range(nrows)), pa.int32()),
|
||||
"MetaData": pa.array(
|
||||
[{"userId": i} for i in range(nrows)],
|
||||
type=pa.struct([pa.field("userId", pa.int32())]),
|
||||
),
|
||||
"meta-data": pa.array(
|
||||
[{"user-id": i} for i in range(nrows)],
|
||||
type=pa.struct([pa.field("user-id", pa.int32())]),
|
||||
),
|
||||
},
|
||||
schema=schema,
|
||||
)
|
||||
|
||||
|
||||
def _literal_dot_schema() -> pa.Schema:
|
||||
"""Schema where a struct *leaf* field is named with a literal dot.
|
||||
|
||||
The path used in the index API is ``parent.`leaf.name` ``.
|
||||
"""
|
||||
return pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int32()),
|
||||
pa.field(
|
||||
"parent",
|
||||
pa.struct([pa.field("leaf.name", pa.int32())]),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _literal_dot_data(nrows: int = NROWS) -> pa.Table:
|
||||
parent_type = pa.struct([pa.field("leaf.name", pa.int32())])
|
||||
return pa.table(
|
||||
{
|
||||
"id": pa.array(list(range(nrows)), pa.int32()),
|
||||
"parent": pa.array(
|
||||
[{"leaf.name": i} for i in range(nrows)],
|
||||
type=parent_type,
|
||||
),
|
||||
},
|
||||
schema=_literal_dot_schema(),
|
||||
)
|
||||
|
||||
|
||||
def _same_leaf_schema() -> pa.Schema:
|
||||
return pa.schema(
|
||||
[
|
||||
pa.field("StructA", pa.struct([pa.field("userId", pa.int32())])),
|
||||
pa.field("StructB", pa.struct([pa.field("userId", pa.int32())])),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _same_leaf_data(nrows: int = NROWS) -> pa.Table:
|
||||
t = pa.struct([pa.field("userId", pa.int32())])
|
||||
return pa.table(
|
||||
{
|
||||
"StructA": pa.array([{"userId": i} for i in range(nrows)], type=t),
|
||||
"StructB": pa.array([{"userId": i * 10} for i in range(nrows)], type=t),
|
||||
},
|
||||
schema=_same_leaf_schema(),
|
||||
)
|
||||
|
||||
|
||||
def _nested_vector_schema() -> pa.Schema:
|
||||
return pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int32()),
|
||||
pa.field(
|
||||
"image",
|
||||
pa.struct([pa.field("embedding", pa.list_(pa.float32(), DIM))]),
|
||||
),
|
||||
pa.field(
|
||||
"MetaData",
|
||||
pa.struct([pa.field("userId", pa.int32())]),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _nested_vector_data(nrows: int = NROWS) -> pa.Table:
|
||||
embedding_type = pa.list_(pa.float32(), DIM)
|
||||
image_type = pa.struct([pa.field("embedding", embedding_type)])
|
||||
meta_type = pa.struct([pa.field("userId", pa.int32())])
|
||||
return pa.table(
|
||||
{
|
||||
"id": pa.array(list(range(nrows)), pa.int32()),
|
||||
"image": pa.array(
|
||||
[{"embedding": _vec(i)} for i in range(nrows)],
|
||||
type=image_type,
|
||||
),
|
||||
"MetaData": pa.array(
|
||||
[{"userId": i} for i in range(nrows)],
|
||||
type=meta_type,
|
||||
),
|
||||
},
|
||||
schema=_nested_vector_schema(),
|
||||
)
|
||||
|
||||
|
||||
def _nested_fts_schema() -> pa.Schema:
|
||||
return pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int32()),
|
||||
pa.field(
|
||||
"payload",
|
||||
pa.struct([pa.field("text", pa.utf8())]),
|
||||
),
|
||||
pa.field(
|
||||
"MetaData",
|
||||
pa.struct([pa.field("userId", pa.int32())]),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _nested_fts_data(nrows: int = NROWS) -> pa.Table:
|
||||
words = ["alpha", "bravo", "charlie", "delta", "echo"]
|
||||
payload_type = pa.struct([pa.field("text", pa.utf8())])
|
||||
meta_type = pa.struct([pa.field("userId", pa.int32())])
|
||||
return pa.table(
|
||||
{
|
||||
"id": pa.array(list(range(nrows)), pa.int32()),
|
||||
"payload": pa.array(
|
||||
[{"text": words[i % len(words)]} for i in range(nrows)],
|
||||
type=payload_type,
|
||||
),
|
||||
"MetaData": pa.array(
|
||||
[{"userId": i} for i in range(nrows)],
|
||||
type=meta_type,
|
||||
),
|
||||
},
|
||||
schema=_nested_fts_schema(),
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _columns_by_name_sync(tbl) -> dict:
|
||||
return {idx.name: idx.columns for idx in tbl.list_indices()}
|
||||
|
||||
|
||||
async def _columns_by_name_async(tbl) -> dict:
|
||||
return {idx.name: idx.columns for idx in await tbl.list_indices()}
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# SYNC TESTS
|
||||
# ===========================================================================
|
||||
#
|
||||
# The sync LanceTable API uses:
|
||||
# - create_scalar_index(column, ...) for scalar (BTree/Bitmap/LabelList) indices
|
||||
# - create_fts_index(column, ...) for full-text-search indices
|
||||
# - create_index(...) for vector indices (older positional API)
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
class TestNestedScalarIndexSync:
|
||||
"""Sync regression matrix for nested scalar (BTree) indices."""
|
||||
|
||||
def test_top_level_camelcase_field(self, sync_db):
|
||||
"""list_indices must return the full camelCase field name."""
|
||||
tbl = sync_db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index("rowId", index_type="BTREE", name="rowid_idx")
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["rowid_idx"] == ["rowId"], (
|
||||
"list_indices must return 'rowId', not a truncated leaf name"
|
||||
)
|
||||
|
||||
def test_top_level_hyphenated_field_escaped(self, sync_db):
|
||||
"""Top-level field 'row-id' (hyphenated) accessed via escaped path."""
|
||||
tbl = sync_db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index("`row-id`", index_type="BTREE", name="rowid_hyph_idx")
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["rowid_hyph_idx"] == ["`row-id`"], (
|
||||
"list_indices must return escaped path '`row-id`'"
|
||||
)
|
||||
|
||||
def test_struct_leaf_literal_dot_field_escaped(self, sync_db):
|
||||
"""Struct leaf with a literal-dot name: parent.`leaf.name`.
|
||||
|
||||
The index listing must use the full escaped path, not just the leaf.
|
||||
"""
|
||||
tbl = sync_db.create_table("t", _literal_dot_data())
|
||||
tbl.create_scalar_index(
|
||||
"parent.`leaf.name`", index_type="BTREE", name="leaf_dot_idx"
|
||||
)
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["leaf_dot_idx"] == ["parent.`leaf.name`"], (
|
||||
"list_indices must return 'parent.`leaf.name`', not just '`leaf.name`'"
|
||||
)
|
||||
|
||||
def test_nested_mixed_case_path(self, sync_db):
|
||||
"""Nested path MetaData.userId (mixed case) must appear as full path."""
|
||||
tbl = sync_db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index(
|
||||
"MetaData.userId", index_type="BTREE", name="metadata_userid_idx"
|
||||
)
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["metadata_userid_idx"] == ["MetaData.userId"], (
|
||||
"list_indices must return 'MetaData.userId', not leaf 'userId'"
|
||||
)
|
||||
|
||||
def test_nested_hyphenated_path_escaped(self, sync_db):
|
||||
"""`meta-data`.`user-id` path with both parts escaped."""
|
||||
tbl = sync_db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index(
|
||||
"`meta-data`.`user-id`", index_type="BTREE", name="metauid_idx"
|
||||
)
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["metauid_idx"] == ["`meta-data`.`user-id`"], (
|
||||
"list_indices must return '`meta-data`.`user-id`', not 'user-id'"
|
||||
)
|
||||
|
||||
def test_filter_on_nested_mixed_case(self, sync_db):
|
||||
"""WHERE filter on a nested dotted path works after index creation."""
|
||||
tbl = sync_db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index(
|
||||
"MetaData.userId", index_type="BTREE", name="metadata_userid_idx"
|
||||
)
|
||||
rows = tbl.search().where("MetaData.userId = 5").to_list()
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["MetaData"]["userId"] == 5
|
||||
|
||||
def test_append_and_list_indices_stable(self, sync_db):
|
||||
"""After appending rows the index listing must remain unchanged."""
|
||||
tbl = sync_db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index(
|
||||
"MetaData.userId", index_type="BTREE", name="meta_uid_idx"
|
||||
)
|
||||
tbl.add(_nested_scalar_data(nrows=4))
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["meta_uid_idx"] == ["MetaData.userId"]
|
||||
|
||||
def test_optimize_and_list_indices_stable(self, tmp_path):
|
||||
"""After optimize the index listing must still show full paths."""
|
||||
db = lancedb.connect(tmp_path / "opt_db")
|
||||
tbl = db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index(
|
||||
"MetaData.userId", index_type="BTREE", name="meta_uid_idx"
|
||||
)
|
||||
tbl.add(_nested_scalar_data(nrows=4))
|
||||
tbl.optimize()
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["meta_uid_idx"] == ["MetaData.userId"]
|
||||
|
||||
def test_same_name_leaves_are_distinct(self, sync_db):
|
||||
"""Two structs sharing a leaf name must produce distinct index paths."""
|
||||
tbl = sync_db.create_table("same_leaf", _same_leaf_data())
|
||||
tbl.create_scalar_index(
|
||||
"StructA.userId", index_type="BTREE", name="a_userid_idx"
|
||||
)
|
||||
tbl.create_scalar_index(
|
||||
"StructB.userId", index_type="BTREE", name="b_userid_idx"
|
||||
)
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["a_userid_idx"] == ["StructA.userId"]
|
||||
assert col_map["b_userid_idx"] == ["StructB.userId"]
|
||||
|
||||
def test_index_stats_canonical_path(self, sync_db):
|
||||
"""index_stats round-trip: create on nested field, verify row count."""
|
||||
tbl = sync_db.create_table("t", _nested_scalar_data())
|
||||
tbl.create_scalar_index(
|
||||
"MetaData.userId", index_type="BTREE", name="meta_uid_idx"
|
||||
)
|
||||
stats = tbl.index_stats("meta_uid_idx")
|
||||
assert stats is not None
|
||||
assert stats.index_type == "BTREE"
|
||||
assert stats.num_indexed_rows == NROWS
|
||||
|
||||
|
||||
class TestNestedVectorIndexSync:
|
||||
"""Sync regression matrix for nested vector (IvfPq) indices."""
|
||||
|
||||
def test_nested_vector_index_full_path(self, sync_db):
|
||||
"""Listing after vector index creation must use the full dotted path."""
|
||||
tbl = sync_db.create_table("vt", _nested_vector_data())
|
||||
tbl.create_index(
|
||||
num_partitions=2,
|
||||
num_sub_vectors=2,
|
||||
vector_column_name="image.embedding",
|
||||
name="image_emb_idx",
|
||||
)
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["image_emb_idx"] == ["image.embedding"], (
|
||||
"list_indices must return 'image.embedding', not leaf 'embedding'"
|
||||
)
|
||||
|
||||
def test_nested_vector_search(self, sync_db):
|
||||
"""Vector search on nested embedding field must return results."""
|
||||
tbl = sync_db.create_table("vt", _nested_vector_data())
|
||||
tbl.create_index(
|
||||
num_partitions=2,
|
||||
num_sub_vectors=2,
|
||||
vector_column_name="image.embedding",
|
||||
name="image_emb_idx",
|
||||
)
|
||||
results = (
|
||||
tbl.search(_vec(0), vector_column_name="image.embedding").limit(5).to_list()
|
||||
)
|
||||
assert len(results) > 0
|
||||
|
||||
def test_nested_vector_index_stats(self, sync_db):
|
||||
"""index_stats for a nested vector index must reflect correct row count."""
|
||||
tbl = sync_db.create_table("vt", _nested_vector_data())
|
||||
tbl.create_index(
|
||||
num_partitions=2,
|
||||
num_sub_vectors=2,
|
||||
vector_column_name="image.embedding",
|
||||
name="image_emb_idx",
|
||||
)
|
||||
stats = tbl.index_stats("image_emb_idx")
|
||||
assert stats is not None
|
||||
assert stats.num_indexed_rows == NROWS
|
||||
|
||||
def test_nested_vector_append_optimize(self, tmp_path):
|
||||
"""After append and optimize the vector index listing must be stable."""
|
||||
db = lancedb.connect(tmp_path / "vec_opt_db")
|
||||
tbl = db.create_table("vt", _nested_vector_data())
|
||||
tbl.create_index(
|
||||
num_partitions=2,
|
||||
num_sub_vectors=2,
|
||||
vector_column_name="image.embedding",
|
||||
name="image_emb_idx",
|
||||
)
|
||||
tbl.add(_nested_vector_data(nrows=4))
|
||||
tbl.optimize()
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["image_emb_idx"] == ["image.embedding"]
|
||||
|
||||
|
||||
class TestNestedFTSIndexSync:
|
||||
"""Sync regression matrix for nested FTS indices."""
|
||||
|
||||
def test_nested_fts_index_full_path(self, sync_db):
|
||||
"""FTS index on payload.text must be listed with the full path."""
|
||||
tbl = sync_db.create_table("ft", _nested_fts_data())
|
||||
tbl.create_fts_index("payload.text", name="payload_text_idx")
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["payload_text_idx"] == ["payload.text"], (
|
||||
"list_indices must return 'payload.text', not leaf 'text'"
|
||||
)
|
||||
|
||||
def test_nested_fts_search(self, sync_db):
|
||||
"""FTS search on a nested text field must return correct results."""
|
||||
tbl = sync_db.create_table("ft", _nested_fts_data())
|
||||
tbl.create_fts_index("payload.text", name="payload_text_idx")
|
||||
results = (
|
||||
tbl.search("alpha", query_type="fts", fts_columns="payload.text")
|
||||
.limit(10)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) > 0
|
||||
assert all(row["payload"]["text"] == "alpha" for row in results)
|
||||
|
||||
def test_nested_fts_append_optimize(self, tmp_path):
|
||||
"""After append and optimize the FTS index listing must be stable."""
|
||||
db = lancedb.connect(tmp_path / "fts_opt_db")
|
||||
tbl = db.create_table("ft", _nested_fts_data())
|
||||
tbl.create_fts_index("payload.text", name="payload_text_idx")
|
||||
tbl.add(_nested_fts_data(nrows=4))
|
||||
tbl.optimize()
|
||||
col_map = _columns_by_name_sync(tbl)
|
||||
assert col_map["payload_text_idx"] == ["payload.text"]
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# ASYNC TESTS
|
||||
# ===========================================================================
|
||||
#
|
||||
# The async AsyncTable API uses create_index(column, config=...) uniformly
|
||||
# for scalar, vector, and FTS indices.
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
class TestNestedScalarIndexAsync:
|
||||
"""Async regression matrix for nested scalar (BTree) indices."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_top_level_camelcase_field(self, async_db):
|
||||
"""list_indices must return the full camelCase field name."""
|
||||
tbl = await async_db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index("rowId", config=BTree(), name="rowid_idx")
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["rowid_idx"] == ["rowId"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_top_level_hyphenated_field_escaped(self, async_db):
|
||||
"""Hyphenated top-level field accessed via escaped path."""
|
||||
tbl = await async_db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index("`row-id`", config=BTree(), name="rowid_hyph_idx")
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["rowid_hyph_idx"] == ["`row-id`"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_struct_leaf_literal_dot_field_escaped(self, async_db):
|
||||
"""Struct leaf with a literal-dot name: parent.`leaf.name`."""
|
||||
tbl = await async_db.create_table("t", _literal_dot_data())
|
||||
await tbl.create_index(
|
||||
"parent.`leaf.name`", config=BTree(), name="leaf_dot_idx"
|
||||
)
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["leaf_dot_idx"] == ["parent.`leaf.name`"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_mixed_case_path(self, async_db):
|
||||
"""Mixed-case nested path MetaData.userId must appear as full path."""
|
||||
tbl = await async_db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index(
|
||||
"MetaData.userId", config=BTree(), name="metadata_userid_idx"
|
||||
)
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["metadata_userid_idx"] == ["MetaData.userId"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_hyphenated_path_escaped(self, async_db):
|
||||
"""`meta-data`.`user-id` path with both parts escaped."""
|
||||
tbl = await async_db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index(
|
||||
"`meta-data`.`user-id`", config=BTree(), name="metauid_idx"
|
||||
)
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["metauid_idx"] == ["`meta-data`.`user-id`"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_filter_on_nested_mixed_case(self, async_db):
|
||||
"""WHERE filter on a nested dotted path works after index creation."""
|
||||
tbl = await async_db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index(
|
||||
"MetaData.userId", config=BTree(), name="metadata_userid_idx"
|
||||
)
|
||||
rows = await tbl.query().where("MetaData.userId = 5").to_list()
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["MetaData"]["userId"] == 5
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_index_stats_canonical_path(self, async_db):
|
||||
"""index_stats round-trip: create on nested field, verify stats."""
|
||||
tbl = await async_db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index("MetaData.userId", config=BTree(), name="meta_uid_idx")
|
||||
stats = await tbl.index_stats("meta_uid_idx")
|
||||
assert stats is not None
|
||||
assert stats.index_type == "BTREE"
|
||||
assert stats.num_indexed_rows == NROWS
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_append_and_list_indices_stable(self, async_db):
|
||||
"""After appending rows the index listing must remain unchanged."""
|
||||
tbl = await async_db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index("MetaData.userId", config=BTree(), name="meta_uid_idx")
|
||||
await tbl.add(_nested_scalar_data(nrows=4))
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["meta_uid_idx"] == ["MetaData.userId"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_optimize_and_list_indices_stable(self, tmp_path):
|
||||
"""After optimize the index listing must still show full paths."""
|
||||
db = await lancedb.connect_async(
|
||||
tmp_path / "opt_db", read_consistency_interval=timedelta(seconds=0)
|
||||
)
|
||||
tbl = await db.create_table("t", _nested_scalar_data())
|
||||
await tbl.create_index("MetaData.userId", config=BTree(), name="meta_uid_idx")
|
||||
await tbl.add(_nested_scalar_data(nrows=4))
|
||||
await tbl.optimize()
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["meta_uid_idx"] == ["MetaData.userId"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_same_name_leaves_are_distinct(self, async_db):
|
||||
"""Two structs sharing a leaf name must produce distinct index paths."""
|
||||
tbl = await async_db.create_table("same_leaf", _same_leaf_data())
|
||||
await tbl.create_index("StructA.userId", config=BTree(), name="a_userid_idx")
|
||||
await tbl.create_index("StructB.userId", config=BTree(), name="b_userid_idx")
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["a_userid_idx"] == ["StructA.userId"]
|
||||
assert col_map["b_userid_idx"] == ["StructB.userId"]
|
||||
|
||||
|
||||
class TestNestedVectorIndexAsync:
|
||||
"""Async regression matrix for nested vector (IvfPq) indices."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_vector_index_full_path(self, async_db):
|
||||
"""Listing after vector index creation must use the full dotted path."""
|
||||
tbl = await async_db.create_table("vt", _nested_vector_data())
|
||||
await tbl.create_index(
|
||||
"image.embedding",
|
||||
config=IvfPq(num_partitions=2, num_sub_vectors=2),
|
||||
name="image_emb_idx",
|
||||
)
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["image_emb_idx"] == ["image.embedding"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_vector_search(self, async_db):
|
||||
"""Vector search on nested embedding field must return results."""
|
||||
tbl = await async_db.create_table("vt", _nested_vector_data())
|
||||
await tbl.create_index(
|
||||
"image.embedding",
|
||||
config=IvfPq(num_partitions=2, num_sub_vectors=2),
|
||||
name="image_emb_idx",
|
||||
)
|
||||
results = (
|
||||
await tbl.query()
|
||||
.nearest_to(_vec(0))
|
||||
.column("image.embedding")
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) > 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_vector_index_stats(self, async_db):
|
||||
"""index_stats for a nested vector index must reflect correct row count."""
|
||||
tbl = await async_db.create_table("vt", _nested_vector_data())
|
||||
await tbl.create_index(
|
||||
"image.embedding",
|
||||
config=IvfPq(num_partitions=2, num_sub_vectors=2),
|
||||
name="image_emb_idx",
|
||||
)
|
||||
stats = await tbl.index_stats("image_emb_idx")
|
||||
assert stats is not None
|
||||
assert stats.num_indexed_rows == NROWS
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_vector_append_optimize(self, tmp_path):
|
||||
"""After append and optimize the vector index listing must be stable."""
|
||||
db = await lancedb.connect_async(
|
||||
tmp_path / "vec_opt_db", read_consistency_interval=timedelta(seconds=0)
|
||||
)
|
||||
tbl = await db.create_table("vt", _nested_vector_data())
|
||||
await tbl.create_index(
|
||||
"image.embedding",
|
||||
config=IvfPq(num_partitions=2, num_sub_vectors=2),
|
||||
name="image_emb_idx",
|
||||
)
|
||||
await tbl.add(_nested_vector_data(nrows=4))
|
||||
await tbl.optimize()
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["image_emb_idx"] == ["image.embedding"]
|
||||
|
||||
|
||||
class TestNestedFTSIndexAsync:
|
||||
"""Async regression matrix for nested FTS indices."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_fts_index_full_path(self, async_db):
|
||||
"""FTS index on payload.text must be listed with the full path."""
|
||||
tbl = await async_db.create_table("ft", _nested_fts_data())
|
||||
await tbl.create_index("payload.text", config=FTS(), name="payload_text_idx")
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["payload_text_idx"] == ["payload.text"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_fts_search(self, async_db):
|
||||
"""FTS search on a nested text field must return correct results."""
|
||||
tbl = await async_db.create_table("ft", _nested_fts_data())
|
||||
await tbl.create_index("payload.text", config=FTS(), name="payload_text_idx")
|
||||
results = (
|
||||
await tbl.query()
|
||||
.nearest_to_text("alpha", columns="payload.text")
|
||||
.limit(10)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) > 0
|
||||
assert all(row["payload"]["text"] == "alpha" for row in results)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_nested_fts_append_optimize(self, tmp_path):
|
||||
"""After append and optimize the FTS index listing must be stable."""
|
||||
db = await lancedb.connect_async(
|
||||
tmp_path / "fts_opt_db", read_consistency_interval=timedelta(seconds=0)
|
||||
)
|
||||
tbl = await db.create_table("ft", _nested_fts_data())
|
||||
await tbl.create_index("payload.text", config=FTS(), name="payload_text_idx")
|
||||
await tbl.add(_nested_fts_data(nrows=4))
|
||||
await tbl.optimize()
|
||||
col_map = await _columns_by_name_async(tbl)
|
||||
assert col_map["payload_text_idx"] == ["payload.text"]
|
||||
@@ -154,6 +154,118 @@ async def test_async_checkout():
|
||||
assert await table.count_rows() == 300
|
||||
|
||||
|
||||
def _branch_open_handler(request):
|
||||
if "/branches/list" in request.path:
|
||||
body = json.dumps(
|
||||
{
|
||||
"branches": {
|
||||
"exp": {
|
||||
"parentBranch": None,
|
||||
"parentVersion": 1,
|
||||
"createAt": 1,
|
||||
"manifestSize": 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
).encode()
|
||||
else:
|
||||
# describe (table open + version/branch validation)
|
||||
body = json.dumps({"version": 2, "schema": {"fields": []}}).encode()
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(body)
|
||||
|
||||
|
||||
def test_remote_open_table_branch_and_version():
|
||||
with mock_lancedb_connection(_branch_open_handler) as db:
|
||||
# version-only (and "main" + version) time-travels the main chain
|
||||
assert db.open_table("test", version=2) is not None
|
||||
assert db.open_table("test", branch="main", version=2).current_branch() is None
|
||||
|
||||
# a non-main branch opens a handle scoped to that branch, with or
|
||||
# without a version
|
||||
assert db.open_table("test", branch="exp").current_branch() == "exp"
|
||||
assert db.open_table("test", branch="exp", version=2).current_branch() == "exp"
|
||||
|
||||
|
||||
def test_remote_table_branches_sync():
|
||||
# Branch CRUD + current_branch on the sync RemoteTable. The handle returned
|
||||
# by create/checkout must stay a RemoteTable scoped to the branch.
|
||||
from lancedb.remote.table import RemoteTable
|
||||
|
||||
def handler(request):
|
||||
if "/branches/list" in request.path:
|
||||
body = json.dumps(
|
||||
{
|
||||
"branches": {
|
||||
"exp": {
|
||||
"parentBranch": None,
|
||||
"parentVersion": 1,
|
||||
"createAt": 1,
|
||||
"manifestSize": 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
).encode()
|
||||
elif "/branches/create" in request.path or "/branches/delete" in request.path:
|
||||
body = b"{}"
|
||||
else:
|
||||
# describe (table open + checkout validation)
|
||||
body = json.dumps({"version": 1, "schema": {"fields": []}}).encode()
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(body)
|
||||
|
||||
with mock_lancedb_connection(handler) as db:
|
||||
table = db.open_table("test")
|
||||
assert isinstance(table, RemoteTable)
|
||||
assert table.current_branch() is None
|
||||
|
||||
branch = table.branches.create("exp")
|
||||
assert isinstance(branch, RemoteTable)
|
||||
assert branch.current_branch() == "exp"
|
||||
|
||||
# list + checkout round trip; checkout also yields a branch-scoped handle
|
||||
assert "exp" in table.branches.list()
|
||||
checked = table.branches.checkout("exp")
|
||||
assert isinstance(checked, RemoteTable)
|
||||
assert checked.current_branch() == "exp"
|
||||
|
||||
table.branches.delete("exp")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_remote_open_table_branch_and_version():
|
||||
async with mock_lancedb_connection_async(_branch_open_handler) as db:
|
||||
# version-only (and "main" + version) time-travels the main chain
|
||||
assert await db.open_table("test", version=2) is not None
|
||||
main_v2 = await db.open_table("test", branch="main", version=2)
|
||||
assert main_v2.current_branch() is None
|
||||
|
||||
# a non-main branch opens a handle scoped to that branch
|
||||
exp = await db.open_table("test", branch="exp")
|
||||
assert exp.current_branch() == "exp"
|
||||
exp_v2 = await db.open_table("test", branch="exp", version=2)
|
||||
assert exp_v2.current_branch() == "exp"
|
||||
|
||||
|
||||
def test_remote_table_branch_survives_pickle():
|
||||
# Regression: a branch-scoped handle must keep its branch across a
|
||||
# pickle/fork round-trip (it used to reopen on main).
|
||||
with mock_lancedb_connection(_branch_open_handler) as db:
|
||||
branch = db.open_table("test", branch="exp")
|
||||
assert branch.current_branch() == "exp"
|
||||
restored = pickle.loads(pickle.dumps(branch))
|
||||
assert restored.current_branch() == "exp"
|
||||
|
||||
# the pinned version is carried through as well
|
||||
branch_v2 = db.open_table("test", branch="exp", version=2)
|
||||
restored_v2 = pickle.loads(pickle.dumps(branch_v2))
|
||||
assert restored_v2.current_branch() == "exp"
|
||||
|
||||
|
||||
def test_table_len_sync():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/create/?mode=create":
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import warnings
|
||||
from datetime import date, datetime, timedelta
|
||||
from time import sleep
|
||||
@@ -21,6 +22,7 @@ import pytest
|
||||
from lancedb.conftest import MockTextEmbeddingFunction
|
||||
from lancedb.db import AsyncConnection, DBConnection
|
||||
from lancedb.embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
||||
from lancedb.expr import col, lit
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.table import LanceTable
|
||||
from pydantic import BaseModel
|
||||
@@ -927,6 +929,346 @@ async def test_async_tags(mem_db_async: AsyncConnection):
|
||||
)
|
||||
|
||||
|
||||
def test_branches(tmp_path):
|
||||
db = lancedb.connect(tmp_path, read_consistency_interval=timedelta(0))
|
||||
table = db.create_table(
|
||||
"test",
|
||||
data=[
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
||||
],
|
||||
)
|
||||
assert table.count_rows() == 2
|
||||
|
||||
# fork an isolated, writable branch from main
|
||||
branch = table.branches.create("exp")
|
||||
assert branch.count_rows() == 2
|
||||
branch.add(data=[{"vector": [10.0, 11.0], "item": "baz", "price": 30.0}])
|
||||
|
||||
# writes on the branch do not touch main
|
||||
assert branch.count_rows() == 3
|
||||
assert table.count_rows() == 2
|
||||
|
||||
# the branch is listed, with main (None) as its parent
|
||||
branches = table.branches.list()
|
||||
assert "exp" in branches
|
||||
assert branches["exp"]["parent_branch"] is None
|
||||
|
||||
# from_ref="main" is equivalent to the default
|
||||
table.branches.create("exp2", from_ref="main")
|
||||
assert table.branches.list()["exp2"]["parent_branch"] is None
|
||||
|
||||
# checkout returns a handle scoped to the branch's latest
|
||||
checked_out = table.branches.checkout("exp")
|
||||
assert checked_out.count_rows() == 3
|
||||
|
||||
# delete removes it
|
||||
table.branches.delete("exp")
|
||||
table.branches.delete("exp2")
|
||||
assert "exp" not in table.branches.list()
|
||||
|
||||
|
||||
def test_branch_handle_tracks_concurrent_writes(tmp_path):
|
||||
db = lancedb.connect(tmp_path, read_consistency_interval=timedelta(0))
|
||||
table = db.create_table("t", [{"id": 1}])
|
||||
|
||||
# two independent handles on the same branch
|
||||
writer = table.branches.create("exp")
|
||||
reader = db.open_table("t", branch="exp")
|
||||
assert reader.count_rows() == 1
|
||||
|
||||
# a concurrent write on the branch is visible to the other handle
|
||||
writer.add([{"id": 2}])
|
||||
assert reader.count_rows() == 2
|
||||
# main is unaffected
|
||||
assert table.count_rows() == 1
|
||||
|
||||
|
||||
def test_branch_name_validation(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
table = db.create_table("t", [{"id": 1}])
|
||||
|
||||
with pytest.raises(ValueError, match="non-empty"):
|
||||
table.branches.create("")
|
||||
with pytest.raises(ValueError, match="non-empty"):
|
||||
table.branches.checkout("")
|
||||
with pytest.raises(ValueError, match="non-empty"):
|
||||
table.branches.delete("")
|
||||
|
||||
|
||||
def test_branches_preserve_namespace(tmp_path):
|
||||
pytest.importorskip(
|
||||
"lance"
|
||||
) # namespace_path routes through lance's DirectoryNamespace
|
||||
db = lancedb.connect(tmp_path)
|
||||
table = db.create_table("t", [{"id": 1}], namespace_path=["ns1"])
|
||||
assert table.namespace == ["ns1"]
|
||||
|
||||
branch = table.branches.create("exp")
|
||||
assert branch.namespace == ["ns1"]
|
||||
assert branch.id == table.id
|
||||
|
||||
# opening the branch directly also preserves namespace identity
|
||||
opened = db.open_table("t", namespace_path=["ns1"], branch="exp")
|
||||
assert opened.namespace == ["ns1"]
|
||||
|
||||
|
||||
def test_open_table_with_branch(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
table = db.create_table("t", [{"i": 1}])
|
||||
table.branches.create("exp").add([{"i": 2}])
|
||||
|
||||
# open_table(branch=...) returns a handle scoped to the branch
|
||||
assert db.open_table("t", branch="exp").count_rows() == 2
|
||||
# opening without branch still tracks main
|
||||
assert db.open_table("t").count_rows() == 1
|
||||
|
||||
|
||||
def test_open_table_with_branch_version(tmp_path):
|
||||
db = lancedb.connect(tmp_path, read_consistency_interval=timedelta(0))
|
||||
|
||||
# main: a single fork-point row
|
||||
t = db.create_table("t", [{"i": 0}])
|
||||
main_v1 = t.version
|
||||
|
||||
# fork "exp", then advance exp AND main independently past the fork so they
|
||||
# diverge while sharing version numbers
|
||||
exp = t.branches.create("exp")
|
||||
exp.add([{"i": 1}]) # exp: {0, 1}
|
||||
exp_v2 = exp.version
|
||||
exp.add([{"i": 2}]) # exp HEAD: {0, 1, 2}
|
||||
t.add([{"i": 100}, {"i": 101}, {"i": 102}]) # main HEAD: {0, 100, 101, 102}
|
||||
assert exp_v2 == t.version, "branch and main must share the version number"
|
||||
|
||||
# open exp at the shared version: the data must be exp's, not main's. count
|
||||
# alone cannot prove this (main@v2 also exists), so assert provenance by
|
||||
# content.
|
||||
pinned = db.open_table("t", branch="exp", version=exp_v2)
|
||||
assert pinned.current_branch() == "exp"
|
||||
assert pinned.count_rows() == 2 # not exp HEAD (3), not main@v2 (4)
|
||||
assert pinned.count_rows("i = 1") == 1 # exp's post-fork row is visible
|
||||
assert pinned.count_rows("i = 100") == 0 # main's divergent rows are invisible
|
||||
|
||||
# the same coordinate is reachable directly via branches.checkout(name, version)
|
||||
pinned_direct = t.branches.checkout("exp", exp_v2)
|
||||
assert pinned_direct.current_branch() == "exp"
|
||||
assert pinned_direct.count_rows() == 2
|
||||
|
||||
# the HEADs are unaffected
|
||||
assert db.open_table("t", branch="exp").count_rows() == 3
|
||||
assert db.open_table("t").count_rows() == 4
|
||||
|
||||
# version-only (no branch) time-travels main itself: its fork-point version
|
||||
# holds only main's first row, and the shared version number resolves to
|
||||
# main's data, not the branch's ("opens main at the version")
|
||||
old_main = db.open_table("t", version=main_v1)
|
||||
assert old_main.current_branch() is None
|
||||
assert old_main.count_rows() == 1
|
||||
shared_on_main = db.open_table("t", version=exp_v2)
|
||||
assert shared_on_main.current_branch() is None
|
||||
assert shared_on_main.count_rows() == 4
|
||||
|
||||
# detached head: writing to a pinned version is rejected
|
||||
with pytest.raises((ValueError, RuntimeError), match="cannot be modified"):
|
||||
pinned.add([{"i": 9}])
|
||||
|
||||
# a nonexistent version is rejected -- on main, and on a branch (a distinct
|
||||
# resolution path, on the branch's manifests)
|
||||
with pytest.raises((ValueError, RuntimeError)):
|
||||
db.open_table("t", version=9999)
|
||||
with pytest.raises((ValueError, RuntimeError)):
|
||||
db.open_table("t", branch="exp", version=9999)
|
||||
|
||||
# checkout_latest re-attaches the pinned handle to the BRANCH's HEAD
|
||||
# (writable again), not main's HEAD, and not staying pinned
|
||||
pinned.checkout_latest()
|
||||
assert pinned.current_branch() == "exp"
|
||||
assert pinned.count_rows() == 3 # exp HEAD, not main's 4
|
||||
pinned.add([{"i": 3}])
|
||||
assert pinned.count_rows() == 4 # writable again
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_namespace_open_table_with_branch(tmp_path):
|
||||
pytest.importorskip("lance") # "dir" impl is lance.namespace.DirectoryNamespace
|
||||
db = lancedb.connect_namespace_async("dir", {"root": str(tmp_path)})
|
||||
await db.create_namespace(["ns1"])
|
||||
table = await db.create_table("t", [{"id": 1}], namespace_path=["ns1"])
|
||||
branch = await table.branches.create("exp")
|
||||
await branch.add([{"id": 2}])
|
||||
|
||||
# open_table(branch=...) on the async namespace connection must work
|
||||
opened = await db.open_table("t", namespace_path=["ns1"], branch="exp")
|
||||
assert await opened.count_rows() == 2
|
||||
|
||||
|
||||
def test_namespace_open_table_with_branch_version(tmp_path):
|
||||
pytest.importorskip("lance") # "dir" impl is lance.namespace.DirectoryNamespace
|
||||
db = lancedb.connect_namespace("dir", {"root": str(tmp_path)})
|
||||
db.create_namespace(["ns1"])
|
||||
t = db.create_table("t", [{"i": 0}], namespace_path=["ns1"])
|
||||
|
||||
# fork "exp", then advance exp AND main past the fork so they diverge while
|
||||
# sharing version numbers
|
||||
exp = t.branches.create("exp")
|
||||
exp.add([{"i": 1}])
|
||||
exp_v2 = exp.version
|
||||
exp.add([{"i": 2}])
|
||||
t.add([{"i": 100}, {"i": 101}, {"i": 102}])
|
||||
assert exp_v2 == t.version, "branch and main must share the version number"
|
||||
|
||||
# open_table(branch=, version=) on the namespace connection reads the
|
||||
# branch's data at that version, not main's
|
||||
pinned = db.open_table("t", namespace_path=["ns1"], branch="exp", version=exp_v2)
|
||||
assert pinned.current_branch() == "exp"
|
||||
assert pinned.count_rows() == 2 # not exp HEAD (3), not main@v2 (4)
|
||||
assert pinned.count_rows("i = 1") == 1 # exp's post-fork row is visible
|
||||
assert pinned.count_rows("i = 100") == 0 # main's divergent rows are invisible
|
||||
assert db.open_table("t", namespace_path=["ns1"], branch="exp").count_rows() == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_namespace_open_table_with_branch_version(tmp_path):
|
||||
pytest.importorskip("lance") # "dir" impl is lance.namespace.DirectoryNamespace
|
||||
db = lancedb.connect_namespace_async("dir", {"root": str(tmp_path)})
|
||||
await db.create_namespace(["ns1"])
|
||||
t = await db.create_table("t", [{"i": 0}], namespace_path=["ns1"])
|
||||
|
||||
# fork "exp", then advance exp AND main past the fork so they diverge while
|
||||
# sharing version numbers
|
||||
exp = await t.branches.create("exp")
|
||||
await exp.add([{"i": 1}])
|
||||
exp_v2 = await exp.version()
|
||||
await exp.add([{"i": 2}])
|
||||
await t.add([{"i": 100}, {"i": 101}, {"i": 102}])
|
||||
assert exp_v2 == await t.version(), "branch and main must share the version number"
|
||||
|
||||
# open_table(branch=, version=) on the async namespace connection reads the
|
||||
# branch's data at that version, not main's
|
||||
pinned = await db.open_table(
|
||||
"t", namespace_path=["ns1"], branch="exp", version=exp_v2
|
||||
)
|
||||
assert pinned.current_branch() == "exp"
|
||||
assert await pinned.count_rows() == 2 # not exp HEAD (3), not main@v2 (4)
|
||||
assert await pinned.count_rows("i = 1") == 1 # exp's post-fork row is visible
|
||||
assert await pinned.count_rows("i = 100") == 0 # main's rows are invisible
|
||||
assert (
|
||||
await (
|
||||
await db.open_table("t", namespace_path=["ns1"], branch="exp")
|
||||
).count_rows()
|
||||
== 3
|
||||
)
|
||||
|
||||
|
||||
def test_branch_to_lance_targets_branch(tmp_path):
|
||||
pytest.importorskip("lance")
|
||||
db = lancedb.connect(tmp_path)
|
||||
table = db.create_table("t", [{"i": 1}])
|
||||
branch = table.branches.create("exp")
|
||||
branch.add([{"i": 2}]) # branch: 2 rows, main: 1 row
|
||||
|
||||
assert branch.to_lance().count_rows() == 2
|
||||
assert table.to_lance().count_rows() == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_branches(tmp_path):
|
||||
db = await lancedb.connect_async(tmp_path)
|
||||
table = await db.create_table(
|
||||
"test",
|
||||
data=[
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
||||
],
|
||||
)
|
||||
assert await table.count_rows() == 2
|
||||
|
||||
branch = await table.branches.create("exp")
|
||||
assert await branch.count_rows() == 2
|
||||
await branch.add(data=[{"vector": [10.0, 11.0], "item": "baz", "price": 30.0}])
|
||||
|
||||
assert await branch.count_rows() == 3
|
||||
assert await table.count_rows() == 2
|
||||
|
||||
branches = await table.branches.list()
|
||||
assert "exp" in branches
|
||||
assert branches["exp"]["parent_branch"] is None
|
||||
|
||||
await table.branches.create("exp2", from_ref="main")
|
||||
assert (await table.branches.list())["exp2"]["parent_branch"] is None
|
||||
|
||||
checked_out = await table.branches.checkout("exp")
|
||||
assert await checked_out.count_rows() == 3
|
||||
|
||||
await table.branches.delete("exp")
|
||||
await table.branches.delete("exp2")
|
||||
assert "exp" not in await table.branches.list()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_open_table_with_branch_version(tmp_path):
|
||||
db = await lancedb.connect_async(tmp_path, read_consistency_interval=timedelta(0))
|
||||
|
||||
# main: a single fork-point row
|
||||
t = await db.create_table("t", [{"i": 0}])
|
||||
main_v1 = await t.version()
|
||||
|
||||
# fork "exp", then advance exp AND main independently past the fork so they
|
||||
# diverge while sharing version numbers
|
||||
exp = await t.branches.create("exp")
|
||||
await exp.add([{"i": 1}]) # exp: {0, 1}
|
||||
exp_v2 = await exp.version()
|
||||
await exp.add([{"i": 2}]) # exp HEAD: {0, 1, 2}
|
||||
await t.add([{"i": 100}, {"i": 101}, {"i": 102}]) # main HEAD: {0, 100, 101, 102}
|
||||
assert exp_v2 == await t.version(), "branch and main must share the version number"
|
||||
|
||||
# open exp at the shared version: the data must be exp's, not main's. count
|
||||
# alone cannot prove this (main@v2 also exists), so assert provenance by
|
||||
# content.
|
||||
pinned = await db.open_table("t", branch="exp", version=exp_v2)
|
||||
assert pinned.current_branch() == "exp"
|
||||
assert await pinned.count_rows() == 2 # not exp HEAD (3), not main@v2 (4)
|
||||
assert await pinned.count_rows("i = 1") == 1 # exp's post-fork row is visible
|
||||
assert await pinned.count_rows("i = 100") == 0 # main's rows are invisible
|
||||
|
||||
# the same coordinate is reachable directly via branches.checkout(name, version)
|
||||
pinned_direct = await t.branches.checkout("exp", exp_v2)
|
||||
assert pinned_direct.current_branch() == "exp"
|
||||
assert await pinned_direct.count_rows() == 2
|
||||
|
||||
# the HEADs are unaffected
|
||||
assert await (await db.open_table("t", branch="exp")).count_rows() == 3
|
||||
assert await (await db.open_table("t")).count_rows() == 4
|
||||
|
||||
# version-only (no branch) time-travels main itself: its fork-point version
|
||||
# holds only main's first row, and the shared version number resolves to
|
||||
# main's data, not the branch's ("opens main at the version")
|
||||
old_main = await db.open_table("t", version=main_v1)
|
||||
assert old_main.current_branch() is None
|
||||
assert await old_main.count_rows() == 1
|
||||
shared_on_main = await db.open_table("t", version=exp_v2)
|
||||
assert shared_on_main.current_branch() is None
|
||||
assert await shared_on_main.count_rows() == 4
|
||||
|
||||
# detached head: writing to a pinned version is rejected
|
||||
with pytest.raises((ValueError, RuntimeError), match="cannot be modified"):
|
||||
await pinned.add([{"i": 9}])
|
||||
|
||||
# a nonexistent version is rejected -- on main, and on a branch
|
||||
with pytest.raises((ValueError, RuntimeError)):
|
||||
await db.open_table("t", version=9999)
|
||||
with pytest.raises((ValueError, RuntimeError)):
|
||||
await db.open_table("t", branch="exp", version=9999)
|
||||
|
||||
# checkout_latest re-attaches the pinned handle to the BRANCH's HEAD
|
||||
# (writable again), not main's HEAD, and not staying pinned
|
||||
await pinned.checkout_latest()
|
||||
assert pinned.current_branch() == "exp"
|
||||
assert await pinned.count_rows() == 3 # exp HEAD, not main's 4
|
||||
await pinned.add([{"i": 3}])
|
||||
assert await pinned.count_rows() == 4 # writable again
|
||||
|
||||
|
||||
@patch("lancedb.table.AsyncTable.create_index")
|
||||
def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
@@ -1625,6 +1967,38 @@ def test_delete(mem_db: DBConnection):
|
||||
assert table.to_arrow()["id"].to_pylist() == [1]
|
||||
|
||||
|
||||
def test_delete_expr(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"my_table",
|
||||
data=[
|
||||
{"vector": [1.1, 0.9], "id": 0},
|
||||
{"vector": [1.2, 1.9], "id": 1},
|
||||
{"vector": [1.3, 2.9], "id": 2},
|
||||
],
|
||||
)
|
||||
assert len(table) == 3
|
||||
delete_res = table.delete(col("id") == lit(0))
|
||||
assert delete_res.version == 2
|
||||
assert len(table) == 2
|
||||
assert sorted(table.to_arrow()["id"].to_pylist()) == [1, 2]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_delete_expr_async(mem_db_async: AsyncConnection):
|
||||
table = await mem_db_async.create_table(
|
||||
"my_table",
|
||||
data=[
|
||||
{"vector": [1.1, 0.9], "id": 0},
|
||||
{"vector": [1.2, 1.9], "id": 1},
|
||||
{"vector": [1.3, 2.9], "id": 2},
|
||||
],
|
||||
)
|
||||
assert await table.count_rows() == 3
|
||||
await table.delete(col("id") == lit(0))
|
||||
assert await table.count_rows() == 2
|
||||
assert sorted((await table.to_arrow())["id"].to_pylist()) == [1, 2]
|
||||
|
||||
|
||||
def test_update(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"my_table",
|
||||
@@ -1810,6 +2184,50 @@ def test_merge_insert(mem_db: DBConnection):
|
||||
)
|
||||
|
||||
|
||||
def test_merge_insert_by_source_delete_expr(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"my_table",
|
||||
data=pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}),
|
||||
)
|
||||
new_data = pa.table({"a": [2, 4], "b": ["x", "z"]})
|
||||
|
||||
# replace-range, limiting the source-absent delete with an Expr condition
|
||||
merge_insert_res = (
|
||||
table.merge_insert("a")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.when_not_matched_by_source_delete(col("a") > lit(2))
|
||||
.execute(new_data)
|
||||
)
|
||||
assert merge_insert_res.num_inserted_rows == 1
|
||||
assert merge_insert_res.num_updated_rows == 1
|
||||
assert merge_insert_res.num_deleted_rows == 1
|
||||
|
||||
expected = pa.table({"a": [1, 2, 4], "b": ["a", "x", "z"]})
|
||||
assert table.to_arrow().sort_by("a") == expected
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_merge_insert_by_source_delete_expr_async(
|
||||
mem_db_async: AsyncConnection,
|
||||
):
|
||||
data = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
|
||||
table = await mem_db_async.create_table("some_table", data=data)
|
||||
new_data = pa.table({"a": [2, 4], "b": ["x", "z"]})
|
||||
|
||||
# replace-range, limiting the source-absent delete with an Expr condition
|
||||
await (
|
||||
table.merge_insert("a")
|
||||
.when_matched_update_all()
|
||||
.when_not_matched_insert_all()
|
||||
.when_not_matched_by_source_delete(col("a") > lit(2))
|
||||
.execute(new_data)
|
||||
)
|
||||
|
||||
expected = pa.table({"a": [1, 2, 4], "b": ["a", "x", "z"]})
|
||||
assert (await table.to_arrow()).sort_by("a") == expected
|
||||
|
||||
|
||||
# We vary the data format because there are slight differences in how
|
||||
# subschemas are handled in different formats
|
||||
@pytest.mark.parametrize(
|
||||
@@ -2058,18 +2476,32 @@ def test_create_scalar_index(mem_db: DBConnection):
|
||||
def test_create_index_nested_field_paths(mem_db: DBConnection):
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("rowId", pa.int32()),
|
||||
pa.field("row-id", pa.int32()),
|
||||
pa.field("userId", pa.int32()),
|
||||
pa.field("metadata", pa.struct([pa.field("user_id", pa.int32())])),
|
||||
pa.field("MetaData", pa.struct([pa.field("userId", pa.int32())])),
|
||||
pa.field(
|
||||
"image",
|
||||
pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
|
||||
),
|
||||
pa.field("payload", pa.struct([pa.field("text", pa.string())])),
|
||||
pa.field("meta-data", pa.struct([pa.field("user-id", pa.int32())])),
|
||||
pa.field("literal", pa.struct([pa.field("a.b", pa.int32())])),
|
||||
]
|
||||
)
|
||||
data = pa.Table.from_pylist(
|
||||
[
|
||||
{
|
||||
"rowId": i,
|
||||
"row-id": i,
|
||||
"userId": i,
|
||||
"metadata": {"user_id": i},
|
||||
"MetaData": {"userId": i},
|
||||
"image": {"embedding": [float(i), float(i + 1)]},
|
||||
"payload": {"text": f"document {i}"},
|
||||
"meta-data": {"user-id": i},
|
||||
"literal": {"a.b": i},
|
||||
}
|
||||
for i in range(256)
|
||||
],
|
||||
@@ -2077,19 +2509,37 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
|
||||
)
|
||||
table = mem_db.create_table("nested_index_paths", data=data)
|
||||
|
||||
table.create_scalar_index("rowId", name="row_id_idx")
|
||||
table.create_scalar_index("`row-id`", name="row_dash_id_idx")
|
||||
table.create_scalar_index("userId", name="top_user_id_idx")
|
||||
table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx")
|
||||
table.create_scalar_index("MetaData.userId", name="mixed_case_metadata_user_id_idx")
|
||||
table.create_scalar_index("`meta-data`.`user-id`", name="escaped_names_idx")
|
||||
table.create_scalar_index("literal.`a.b`", name="literal_dot_idx")
|
||||
table.create_index(
|
||||
vector_column_name="image.embedding",
|
||||
num_partitions=1,
|
||||
num_sub_vectors=1,
|
||||
name="image_embedding_idx",
|
||||
)
|
||||
table.create_fts_index("payload.text", with_position=False, name="payload_text_idx")
|
||||
|
||||
indices = sorted(table.list_indices(), key=lambda idx: idx.name)
|
||||
assert [(idx.name, idx.index_type, idx.columns) for idx in indices] == [
|
||||
("escaped_names_idx", "BTree", ["`meta-data`.`user-id`"]),
|
||||
("image_embedding_idx", "IvfPq", ["image.embedding"]),
|
||||
("literal_dot_idx", "BTree", ["literal.`a.b`"]),
|
||||
("metadata_user_id_idx", "BTree", ["metadata.user_id"]),
|
||||
("mixed_case_metadata_user_id_idx", "BTree", ["MetaData.userId"]),
|
||||
("payload_text_idx", "FTS", ["payload.text"]),
|
||||
("row_dash_id_idx", "BTree", ["`row-id`"]),
|
||||
("row_id_idx", "BTree", ["rowId"]),
|
||||
("top_user_id_idx", "BTree", ["userId"]),
|
||||
]
|
||||
for index in indices:
|
||||
stats = table.index_stats(index.name)
|
||||
assert stats is not None
|
||||
assert stats.num_indexed_rows == 256
|
||||
|
||||
vector_results = (
|
||||
table.search([0.0, 1.0], vector_column_name="image.embedding")
|
||||
@@ -2107,6 +2557,63 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
|
||||
assert len(filtered_results) == 1
|
||||
assert filtered_results[0]["metadata"]["user_id"] == 42
|
||||
|
||||
escaped_results = table.search().where("`row-id` = 43").limit(1).to_list()
|
||||
assert len(escaped_results) == 1
|
||||
assert escaped_results[0]["row-id"] == 43
|
||||
|
||||
fts_results = table.search("document 44", query_type="fts").limit(1).to_list()
|
||||
assert len(fts_results) == 1
|
||||
assert fts_results[0]["payload"]["text"] == "document 44"
|
||||
|
||||
|
||||
def test_index_config_fields(mem_db: DBConnection):
|
||||
"""Test that IndexConfig exposes the new rich metadata fields."""
|
||||
vec_array = pa.array(
|
||||
[[float(i), float(i + 1)] for i in range(300)], pa.list_(pa.float32(), 2)
|
||||
)
|
||||
data = pa.Table.from_pydict({"x": list(range(300)), "vector": vec_array})
|
||||
table = mem_db.create_table("index_config_fields", data=data)
|
||||
table.create_scalar_index("x", index_type="BTREE")
|
||||
table.create_index(
|
||||
vector_column_name="vector",
|
||||
num_partitions=1,
|
||||
num_sub_vectors=1,
|
||||
)
|
||||
|
||||
indices = {idx.name: idx for idx in table.list_indices()}
|
||||
|
||||
scalar_idx = indices["x_idx"]
|
||||
assert scalar_idx.index_uuid is not None
|
||||
assert isinstance(scalar_idx.index_uuid, str)
|
||||
assert scalar_idx.num_indexed_rows is not None
|
||||
assert scalar_idx.num_indexed_rows == 300
|
||||
assert scalar_idx.num_unindexed_rows is not None
|
||||
assert scalar_idx.num_unindexed_rows == 0
|
||||
assert scalar_idx.num_segments is not None
|
||||
assert scalar_idx.num_segments >= 1
|
||||
assert scalar_idx.size_bytes is not None
|
||||
assert scalar_idx.size_bytes > 0
|
||||
assert scalar_idx.created_at is not None
|
||||
from datetime import datetime, timezone
|
||||
|
||||
assert isinstance(scalar_idx.created_at, datetime)
|
||||
assert scalar_idx.created_at.tzinfo == timezone.utc
|
||||
|
||||
# __getitem__ compatibility
|
||||
assert scalar_idx["index_uuid"] == scalar_idx.index_uuid
|
||||
assert scalar_idx["num_indexed_rows"] == scalar_idx.num_indexed_rows
|
||||
assert scalar_idx["created_at"] == scalar_idx.created_at
|
||||
|
||||
# index_details is parsed from JSON into a Python object
|
||||
assert scalar_idx.index_details is not None
|
||||
assert isinstance(scalar_idx.index_details, dict)
|
||||
assert scalar_idx["index_details"] == scalar_idx.index_details
|
||||
|
||||
vector_idx = indices["vector_idx"]
|
||||
assert vector_idx.index_uuid is not None
|
||||
assert vector_idx.num_indexed_rows == 300
|
||||
assert isinstance(vector_idx.index_details, dict)
|
||||
|
||||
|
||||
def test_empty_query(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
@@ -2837,3 +3344,38 @@ def test_sanitize_data_metadata_not_stripped():
|
||||
assert result_schema.metadata is not None
|
||||
assert result_schema.metadata[b"existing_key"] == b"existing_value"
|
||||
assert result_schema.metadata[b"new_key"] == b"new_value"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_search_runs_embedding_on_dedicated_executor(
|
||||
mem_db_async: AsyncConnection,
|
||||
):
|
||||
# Regression test for #3310: AsyncTable.search() must run the (potentially
|
||||
# blocking) query-embedding call on the dedicated embedding executor, not
|
||||
# asyncio's default executor -- which is shared with other blocking I/O and
|
||||
# can be starved by a slow embedding call under concurrent load.
|
||||
func = MockTextEmbeddingFunction.create()
|
||||
|
||||
class Schema(LanceModel):
|
||||
text: str = func.SourceField()
|
||||
vector: Vector(func.ndims()) = func.VectorField()
|
||||
|
||||
table = await mem_db_async.create_table("embed_executor", schema=Schema)
|
||||
await table.add([{"text": "hello world"}])
|
||||
|
||||
captured_threads: List[str] = []
|
||||
original = MockTextEmbeddingFunction.generate_embeddings
|
||||
|
||||
def record_thread(self, texts):
|
||||
captured_threads.append(threading.current_thread().name)
|
||||
return original(self, texts)
|
||||
|
||||
# Patch only around the search so we capture the query-embedding call, not
|
||||
# the add-time source-embedding call.
|
||||
with patch.object(MockTextEmbeddingFunction, "generate_embeddings", record_thread):
|
||||
await (await table.search("a query string")).limit(1).to_list()
|
||||
|
||||
assert captured_threads, "search did not invoke the embedding function"
|
||||
assert all(name.startswith("lancedb-embedding") for name in captured_threads), (
|
||||
f"embedding ran off the dedicated executor: {captured_threads}"
|
||||
)
|
||||
|
||||
@@ -149,6 +149,21 @@ def test_value_to_sql_dict():
|
||||
assert value_to_sql({}) == "named_struct()"
|
||||
|
||||
|
||||
def test_value_to_sql_numpy_scalars():
|
||||
# numpy scalars (e.g. pulled from an ndarray or a pandas column) must
|
||||
# convert the same way as their native Python counterparts. np.float64
|
||||
# already worked by virtue of subclassing float, but the integer / bool
|
||||
# / float32 scalars previously raised NotImplementedError.
|
||||
import numpy as np
|
||||
|
||||
assert value_to_sql(np.int32(5)) == "5"
|
||||
assert value_to_sql(np.int64(5)) == "5"
|
||||
assert value_to_sql(np.float32(1.5)) == "1.5"
|
||||
assert value_to_sql(np.float64(1.5)) == "1.5"
|
||||
assert value_to_sql(np.bool_(True)) == "TRUE"
|
||||
assert value_to_sql(np.bool_(False)) == "FALSE"
|
||||
|
||||
|
||||
def test_append_vector_columns():
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
registry.register("test")(MockTextEmbeddingFunction)
|
||||
|
||||
@@ -9,7 +9,9 @@
|
||||
|
||||
use arrow::{datatypes::DataType, pyarrow::PyArrowType};
|
||||
use datafusion_common::ScalarValue;
|
||||
use lancedb::expr::{DfExpr, col as ldb_col, contains, expr_cast, lit as df_lit, lower, upper};
|
||||
use lancedb::expr::{
|
||||
DfExpr, col as ldb_col, contains, expr_cast, is_in, lit as df_lit, lower, upper,
|
||||
};
|
||||
use pyo3::types::PyBytes;
|
||||
use pyo3::{Bound, PyAny, PyResult, exceptions::PyValueError, prelude::*, pyfunction};
|
||||
|
||||
@@ -105,6 +107,14 @@ impl PyExpr {
|
||||
Self(contains(self.0.clone(), substr.0.clone()))
|
||||
}
|
||||
|
||||
// ── membership ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Return true where the value is one of the given expressions (SQL ``IN``).
|
||||
fn isin(&self, list: Vec<Self>) -> Self {
|
||||
let items: Vec<DfExpr> = list.into_iter().map(|e| e.0).collect();
|
||||
Self(is_in(self.0.clone(), items))
|
||||
}
|
||||
|
||||
// ── type cast ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Cast the expression to `data_type`.
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use lancedb::index::vector::{
|
||||
IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder,
|
||||
IvfPqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder,
|
||||
};
|
||||
use lancedb::index::{
|
||||
Index as LanceDbIndex,
|
||||
scalar::{BTreeIndexBuilder, FtsIndexBuilder},
|
||||
scalar::{BTreeIndexBuilder, FmIndexBuilder, FtsIndexBuilder},
|
||||
};
|
||||
use pyo3::IntoPyObject;
|
||||
use pyo3::types::PyStringMethods;
|
||||
use pyo3::{
|
||||
Bound, FromPyObject, PyAny, PyResult, Python,
|
||||
Bound, FromPyObject, Py, PyAny, PyResult, Python,
|
||||
exceptions::{PyKeyError, PyValueError},
|
||||
intern, pyclass, pymethods,
|
||||
types::{PyAnyMethods, PyString},
|
||||
@@ -38,6 +39,7 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
"BTree" => Ok(LanceDbIndex::BTree(BTreeIndexBuilder::default())),
|
||||
"Bitmap" => Ok(LanceDbIndex::Bitmap(Default::default())),
|
||||
"LabelList" => Ok(LanceDbIndex::LabelList(Default::default())),
|
||||
"Fm" => Ok(LanceDbIndex::Fm(FmIndexBuilder::default())),
|
||||
"FTS" => {
|
||||
let params = source.extract::<FtsParams>()?;
|
||||
let inner_opts = FtsIndexBuilder::default()
|
||||
@@ -183,7 +185,7 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
Ok(LanceDbIndex::IvfHnswFlat(hnsw_flat_builder))
|
||||
}
|
||||
not_supported => Err(PyValueError::new_err(format!(
|
||||
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
|
||||
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, Fm, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
|
||||
not_supported
|
||||
))),
|
||||
}
|
||||
@@ -293,6 +295,26 @@ pub struct IndexConfig {
|
||||
pub columns: Vec<String>,
|
||||
/// Name of the index.
|
||||
pub name: String,
|
||||
/// The UUID of the first segment of the index.
|
||||
pub index_uuid: Option<String>,
|
||||
/// The protobuf type URL, a precise type identifier for the index.
|
||||
pub type_url: Option<String>,
|
||||
/// When the index was created.
|
||||
pub created_at: Option<DateTime<Utc>>,
|
||||
/// The number of rows indexed, across all segments.
|
||||
pub num_indexed_rows: Option<u64>,
|
||||
/// The number of rows not yet covered by this index.
|
||||
pub num_unindexed_rows: Option<u64>,
|
||||
/// The total size in bytes of all index files across all segments.
|
||||
pub size_bytes: Option<u64>,
|
||||
/// The number of segments that make up the index.
|
||||
pub num_segments: Option<u32>,
|
||||
/// The on-disk index format version.
|
||||
pub index_version: Option<i32>,
|
||||
/// Index-type-specific details parsed as a Python object (dict, list, etc.).
|
||||
///
|
||||
/// Falls back to a raw string if JSON parsing fails. `None` when unavailable.
|
||||
pub index_details: Option<Py<PyAny>>,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
@@ -311,18 +333,49 @@ impl IndexConfig {
|
||||
"index_type" => Ok(self.index_type.clone().into_pyobject(py)?.into_any()),
|
||||
"columns" => Ok(self.columns.clone().into_pyobject(py)?.into_any()),
|
||||
"name" | "index_name" => Ok(self.name.clone().into_pyobject(py)?.into_any()),
|
||||
"index_uuid" => Ok(self.index_uuid.clone().into_pyobject(py)?.into_any()),
|
||||
"type_url" => Ok(self.type_url.clone().into_pyobject(py)?.into_any()),
|
||||
"created_at" => Ok(self.created_at.into_pyobject(py)?.into_any()),
|
||||
"num_indexed_rows" => Ok(self.num_indexed_rows.into_pyobject(py)?.into_any()),
|
||||
"num_unindexed_rows" => Ok(self.num_unindexed_rows.into_pyobject(py)?.into_any()),
|
||||
"size_bytes" => Ok(self.size_bytes.into_pyobject(py)?.into_any()),
|
||||
"num_segments" => Ok(self.num_segments.into_pyobject(py)?.into_any()),
|
||||
"index_version" => Ok(self.index_version.into_pyobject(py)?.into_any()),
|
||||
"index_details" => Ok(self
|
||||
.index_details
|
||||
.as_ref()
|
||||
.map(|obj| obj.clone_ref(py))
|
||||
.into_pyobject(py)?
|
||||
.into_any()),
|
||||
_ => Err(PyKeyError::new_err(format!("Invalid key: {}", key))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<lancedb::index::IndexConfig> for IndexConfig {
|
||||
fn from(value: lancedb::index::IndexConfig) -> Self {
|
||||
fn parse_index_details(py: Python<'_>, s: String) -> Py<PyAny> {
|
||||
let json = py.import("json").expect("json module is always available");
|
||||
match json.call_method1("loads", (s.as_str(),)) {
|
||||
Ok(obj) => obj.into_any().unbind(),
|
||||
Err(_) => s.into_pyobject(py).unwrap().into_any().unbind(),
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexConfig {
|
||||
pub fn from_lancedb(py: Python<'_>, value: lancedb::index::IndexConfig) -> Self {
|
||||
let index_type = format!("{:?}", value.index_type);
|
||||
Self {
|
||||
index_type,
|
||||
columns: value.columns,
|
||||
name: value.name,
|
||||
index_uuid: value.index_uuid,
|
||||
type_url: value.type_url,
|
||||
created_at: value.created_at,
|
||||
num_indexed_rows: value.num_indexed_rows,
|
||||
num_unindexed_rows: value.num_unindexed_rows,
|
||||
size_bytes: value.size_bytes,
|
||||
num_segments: value.num_segments,
|
||||
index_version: value.index_version,
|
||||
index_details: value.index_details.map(|s| parse_index_details(py, s)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ use crate::runtime::future_into_py;
|
||||
use crate::{
|
||||
connection::Connection,
|
||||
error::PythonErrorExt,
|
||||
expr::PyExpr,
|
||||
index::{IndexConfig, extract_index_params},
|
||||
query::{Query, TakeQuery},
|
||||
table::scannable::PyScannable,
|
||||
@@ -17,7 +18,7 @@ use arrow::{
|
||||
};
|
||||
use lancedb::table::{
|
||||
AddDataMode, ColumnAlteration, Duration, FieldMetadataUpdate, NewColumnTransform,
|
||||
OptimizeAction, OptimizeOptions, Table as LanceDbTable,
|
||||
OptimizeAction, OptimizeOptions, Ref, Table as LanceDbTable,
|
||||
};
|
||||
use pyo3::{
|
||||
Bound, FromPyObject, Py, PyAny, PyRef, PyResult, Python,
|
||||
@@ -28,6 +29,12 @@ use pyo3::{
|
||||
|
||||
mod scannable;
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
enum PredicateArg {
|
||||
Expr(PyExpr),
|
||||
Sql(String),
|
||||
}
|
||||
|
||||
/// Statistics about a compaction operation.
|
||||
#[pyclass(get_all, from_py_object)]
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -561,10 +568,15 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn delete(self_: PyRef<'_, Self>, condition: String) -> PyResult<Bound<'_, PyAny>> {
|
||||
#[allow(private_interfaces)]
|
||||
pub fn delete(self_: PyRef<'_, Self>, condition: PredicateArg) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let result = inner.delete(&condition).await.infer_error()?;
|
||||
let result = match &condition {
|
||||
PredicateArg::Expr(e) => inner.delete(&e.0).await,
|
||||
PredicateArg::Sql(s) => inner.delete(s.as_str()).await,
|
||||
}
|
||||
.infer_error()?;
|
||||
Ok(DeleteResult::from(result))
|
||||
})
|
||||
}
|
||||
@@ -682,13 +694,13 @@ impl Table {
|
||||
pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
Ok(inner
|
||||
.list_indices()
|
||||
.await
|
||||
.infer_error()?
|
||||
.into_iter()
|
||||
.map(IndexConfig::from)
|
||||
.collect::<Vec<_>>())
|
||||
let indices = inner.list_indices().await.infer_error()?;
|
||||
Python::attach(|py| {
|
||||
Ok(indices
|
||||
.into_iter()
|
||||
.map(|idx| IndexConfig::from_lancedb(py, idx))
|
||||
.collect::<Vec<_>>())
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
@@ -711,10 +723,6 @@ impl Table {
|
||||
dict.set_item("num_indices", num_indices)?;
|
||||
}
|
||||
|
||||
if let Some(loss) = stats.loss {
|
||||
dict.set_item("loss", loss)?;
|
||||
}
|
||||
|
||||
Ok(Some(dict.unbind()))
|
||||
})
|
||||
} else {
|
||||
@@ -864,6 +872,15 @@ impl Table {
|
||||
Ok(Tags::new(self.inner_ref()?.clone()))
|
||||
}
|
||||
|
||||
pub fn current_branch(&self) -> PyResult<Option<String>> {
|
||||
Ok(self.inner_ref()?.current_branch())
|
||||
}
|
||||
|
||||
#[getter]
|
||||
pub fn branches(&self) -> PyResult<Branches> {
|
||||
Ok(Branches::new(self.inner_ref()?.clone()))
|
||||
}
|
||||
|
||||
#[pyo3(signature = (offsets))]
|
||||
pub fn take_offsets(self_: PyRef<'_, Self>, offsets: Vec<u64>) -> PyResult<TakeQuery> {
|
||||
Ok(TakeQuery::new(
|
||||
@@ -954,8 +971,13 @@ impl Table {
|
||||
builder.when_not_matched_insert_all();
|
||||
}
|
||||
if parameters.when_not_matched_by_source_delete {
|
||||
builder
|
||||
.when_not_matched_by_source_delete(parameters.when_not_matched_by_source_condition);
|
||||
if let Some(e) = parameters.when_not_matched_by_source_condition_expr {
|
||||
builder.when_not_matched_by_source_delete_expr(e.0);
|
||||
} else {
|
||||
builder.when_not_matched_by_source_delete(
|
||||
parameters.when_not_matched_by_source_condition,
|
||||
);
|
||||
}
|
||||
}
|
||||
if let Some(timeout) = parameters.timeout {
|
||||
builder.timeout(timeout);
|
||||
@@ -1191,6 +1213,7 @@ pub struct MergeInsertParams {
|
||||
when_not_matched_insert_all: bool,
|
||||
when_not_matched_by_source_delete: bool,
|
||||
when_not_matched_by_source_condition: Option<String>,
|
||||
when_not_matched_by_source_condition_expr: Option<PyExpr>,
|
||||
timeout: Option<std::time::Duration>,
|
||||
use_index: Option<bool>,
|
||||
use_lsm_write: Option<bool>,
|
||||
@@ -1265,3 +1288,71 @@ impl Tags {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct Branches {
|
||||
inner: LanceDbTable,
|
||||
}
|
||||
|
||||
impl Branches {
|
||||
pub fn new(table: LanceDbTable) -> Self {
|
||||
Self { inner: table }
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Branches {
|
||||
pub fn list(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let res = inner.list_branches().await.infer_error()?;
|
||||
Python::attach(|py| {
|
||||
let py_dict = PyDict::new(py);
|
||||
for (name, contents) in res {
|
||||
let value = PyDict::new(py);
|
||||
value.set_item("parent_branch", contents.parent_branch)?;
|
||||
value.set_item("parent_version", contents.parent_version)?;
|
||||
value.set_item("manifest_size", contents.manifest_size)?;
|
||||
py_dict.set_item(name, value)?;
|
||||
}
|
||||
Ok(py_dict.unbind())
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (name, from_ref=None, from_version=None))]
|
||||
pub fn create(
|
||||
self_: PyRef<'_, Self>,
|
||||
name: String,
|
||||
from_ref: Option<String>,
|
||||
from_version: Option<u64>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let from = Ref::Version(from_ref, from_version);
|
||||
let table = inner.create_branch(&name, from).await.infer_error()?;
|
||||
Ok(Table::new(table))
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (name, version=None))]
|
||||
pub fn checkout(
|
||||
self_: PyRef<'_, Self>,
|
||||
name: String,
|
||||
version: Option<u64>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let table = inner.checkout_branch(&name, version).await.infer_error()?;
|
||||
Ok(Table::new(table))
|
||||
})
|
||||
}
|
||||
|
||||
pub fn delete(self_: PyRef<'_, Self>, name: String) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.delete_branch(&name).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -450,6 +450,27 @@ def binary_table(tmp_path):
|
||||
return db.create_table("binary_test", data)
|
||||
|
||||
|
||||
class TestExprIsin:
|
||||
def test_isin_ints(self):
|
||||
assert col("id").isin([1, 2, 3]).to_sql() == "id IN (1, 2, 3)"
|
||||
|
||||
def test_isin_strs(self):
|
||||
assert (
|
||||
col("status").isin(["active", "pending"]).to_sql()
|
||||
== "status IN ('active', 'pending')"
|
||||
)
|
||||
|
||||
def test_isin_coerces_and_mixes(self):
|
||||
assert col("id").isin([lit(1), 2]).to_sql() == "id IN (1, 2)"
|
||||
|
||||
def test_isin_empty(self):
|
||||
assert col("id").isin([]).to_sql() == "id IN ()"
|
||||
|
||||
def test_isin_filter(self, simple_table):
|
||||
result = simple_table.search().where(col("id").isin([1, 3, 5])).to_arrow()
|
||||
assert result.num_rows == 3
|
||||
|
||||
|
||||
class TestExprBytesIntegration:
|
||||
def test_binary_equality_filter(self, binary_table):
|
||||
result = (
|
||||
|
||||
@@ -9,6 +9,7 @@ use std::sync::Arc;
|
||||
use arrow_array::RecordBatch;
|
||||
use arrow_schema::SchemaRef;
|
||||
use lance::dataset::ReadParams;
|
||||
use lance::dataset::refs::MAIN_BRANCH;
|
||||
use lance_namespace::models::{
|
||||
CreateNamespaceRequest, CreateNamespaceResponse, DescribeNamespaceRequest,
|
||||
DescribeNamespaceResponse, DropNamespaceRequest, DropNamespaceResponse, ListNamespacesRequest,
|
||||
@@ -119,6 +120,8 @@ pub struct OpenTableBuilder {
|
||||
parent: Arc<dyn Database>,
|
||||
request: OpenTableRequest,
|
||||
embedding_registry: Arc<dyn EmbeddingRegistry>,
|
||||
branch: Option<String>,
|
||||
version: Option<u64>,
|
||||
}
|
||||
|
||||
impl OpenTableBuilder {
|
||||
@@ -139,6 +142,8 @@ impl OpenTableBuilder {
|
||||
managed_versioning: None,
|
||||
},
|
||||
embedding_registry,
|
||||
branch: None,
|
||||
version: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -259,14 +264,48 @@ impl OpenTableBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Open the table scoped to the given branch instead of the default branch.
|
||||
///
|
||||
/// Reads and writes on the returned table operate in the branch's context.
|
||||
pub fn branch(mut self, branch: impl Into<String>) -> Self {
|
||||
self.branch = Some(branch.into());
|
||||
self
|
||||
}
|
||||
|
||||
/// Open the table pinned to a specific version, producing a read-only "view".
|
||||
///
|
||||
/// Composes with [`Self::branch`]: when a branch is also set, this opens that
|
||||
/// branch at the given version; otherwise it opens `main` at that version.
|
||||
/// The returned table is a detached head, so operations that modify the table
|
||||
/// will fail until [`Table::checkout_latest`] is called.
|
||||
///
|
||||
/// ```
|
||||
/// # use lancedb::Connection;
|
||||
/// # async fn f(conn: &Connection) -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let table = conn.open_table("t").branch("exp").version(3).execute().await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn version(mut self, version: u64) -> Self {
|
||||
self.version = Some(version);
|
||||
self
|
||||
}
|
||||
|
||||
/// Open the table
|
||||
pub async fn execute(self) -> Result<Table> {
|
||||
let table = self.parent.open_table(self.request).await?;
|
||||
Ok(Table::new_with_embedding_registry(
|
||||
table,
|
||||
self.parent,
|
||||
self.embedding_registry,
|
||||
))
|
||||
let table = Table::new_with_embedding_registry(table, self.parent, self.embedding_registry);
|
||||
// "main" is the default branch, so treat it as no branch.
|
||||
let branch = self.branch.filter(|b| b.as_str() != MAIN_BRANCH);
|
||||
match branch {
|
||||
Some(branch) => table.checkout_branch(&branch, self.version).await,
|
||||
None => {
|
||||
if let Some(version) = self.version {
|
||||
table.checkout(version).await?;
|
||||
}
|
||||
Ok(table)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ use lance_namespace::{
|
||||
CreateNamespaceRequest, CreateNamespaceResponse, DeclareTableRequest,
|
||||
DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest,
|
||||
DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, ListNamespacesRequest,
|
||||
ListNamespacesResponse, ListTablesRequest, ListTablesResponse,
|
||||
ListNamespacesResponse, ListTablesRequest, ListTablesResponse, RenameTableRequest,
|
||||
},
|
||||
};
|
||||
use lance_namespace_impls::ConnectBuilder;
|
||||
@@ -437,8 +437,11 @@ impl Database for LanceNamespaceDatabase {
|
||||
|
||||
// Set up commit handler when managed_versioning is enabled
|
||||
if managed_versioning == Some(true) {
|
||||
let external_store =
|
||||
LanceNamespaceExternalManifestStore::new(self.namespace.clone(), table_id.clone());
|
||||
let external_store = LanceNamespaceExternalManifestStore::for_table_uri(
|
||||
self.namespace.clone(),
|
||||
table_id.clone(),
|
||||
&location,
|
||||
)?;
|
||||
let commit_handler: Arc<dyn CommitHandler> = Arc::new(ExternalManifestCommitHandler {
|
||||
external_manifest_store: Arc::new(external_store),
|
||||
});
|
||||
@@ -488,14 +491,34 @@ impl Database for LanceNamespaceDatabase {
|
||||
|
||||
async fn rename_table(
|
||||
&self,
|
||||
_cur_name: &str,
|
||||
_new_name: &str,
|
||||
_cur_namespace_path: &[String],
|
||||
_new_namespace_path: &[String],
|
||||
cur_name: &str,
|
||||
new_name: &str,
|
||||
cur_namespace_path: &[String],
|
||||
new_namespace_path: &[String],
|
||||
) -> Result<()> {
|
||||
Err(Error::NotSupported {
|
||||
message: "rename_table is not supported for namespace connections".to_string(),
|
||||
})
|
||||
let mut cur_table_id = cur_namespace_path.to_vec();
|
||||
cur_table_id.push(cur_name.to_string());
|
||||
|
||||
let new_namespace_id = if new_namespace_path.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(new_namespace_path.to_vec())
|
||||
};
|
||||
|
||||
let rename_request = RenameTableRequest {
|
||||
id: Some(cur_table_id),
|
||||
new_table_name: new_name.to_string(),
|
||||
new_namespace_id,
|
||||
..Default::default()
|
||||
};
|
||||
self.namespace
|
||||
.rename_table(rename_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to rename table: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn drop_table(&self, name: &str, namespace_path: &[String]) -> Result<()> {
|
||||
@@ -740,6 +763,64 @@ mod tests {
|
||||
assert!(table_names.contains(&"test_table".to_string()));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_branch_query_under_pushdown_stays_local() {
|
||||
// With QueryTable pushdown enabled, a query on the main branch routes to
|
||||
// the namespace server, but a branch handle must run locally: the
|
||||
// server-side request carries no branch and would return main's rows.
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.pushdown_operation(NamespaceClientPushdownOperation::QueryTable)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
conn.create_namespace(CreateNamespaceRequest {
|
||||
id: Some(vec!["test_ns".into()]),
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.expect("Failed to create namespace");
|
||||
|
||||
// main has 5 rows
|
||||
let table = conn
|
||||
.create_table("ref_test", create_test_data())
|
||||
.namespace(vec!["test_ns".into()])
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table");
|
||||
let main_version = table.version().await.unwrap();
|
||||
|
||||
// fork a branch off main, then add 5 more rows so it differs from main
|
||||
let branch = table
|
||||
.create_branch("exp", main_version)
|
||||
.await
|
||||
.expect("Failed to create branch");
|
||||
branch
|
||||
.add(create_test_data())
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to append to branch");
|
||||
|
||||
// the branch query must run locally and see the branch's 10 rows --
|
||||
// not get routed to the server (which carries no branch) and see main's 5
|
||||
let results = branch
|
||||
.query()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to query branch")
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.expect("Failed to collect results");
|
||||
let count: usize = results.iter().map(|b| b.num_rows()).sum();
|
||||
assert_eq!(count, 10);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_describe_table() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
|
||||
@@ -203,11 +203,11 @@ impl Shuffler {
|
||||
|
||||
// Finish writing files
|
||||
for (file_idx, mut writer) in file_writers.into_iter().enumerate() {
|
||||
let num_written = writer.finish().await?;
|
||||
let write_summary = writer.finish().await?;
|
||||
log::debug!(
|
||||
"Shuffle job {}: wrote {} rows to file {}",
|
||||
self.id,
|
||||
num_written,
|
||||
write_summary.num_rows,
|
||||
file_idx
|
||||
);
|
||||
}
|
||||
|
||||
@@ -57,6 +57,10 @@ pub fn expr_cast(expr: Expr, data_type: DataType) -> Expr {
|
||||
cast(expr, data_type)
|
||||
}
|
||||
|
||||
pub fn is_in(expr: Expr, list: Vec<Expr>) -> Expr {
|
||||
expr.in_list(list, false)
|
||||
}
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
static ref FUNC_REGISTRY: std::sync::RwLock<std::collections::HashMap<String, Arc<ScalarUDF>>> = {
|
||||
let mut m = std::collections::HashMap::new();
|
||||
@@ -194,6 +198,13 @@ mod tests {
|
||||
assert_eq!(sql, "NOT (data = X'ABCD')");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_in() {
|
||||
let expr = is_in(col("id"), vec![lit(1i64), lit(2i64), lit(3i64)]);
|
||||
let sql = expr_to_sql_string(&expr).unwrap();
|
||||
assert!(sql.contains("IN"), "expected IN in: {}", sql);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multiple_binary_literals() {
|
||||
use datafusion_common::ScalarValue;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use scalar::FtsIndexBuilder;
|
||||
use serde::Deserialize;
|
||||
use serde_with::skip_serializing_none;
|
||||
@@ -12,7 +13,7 @@ use crate::index::vector::IvfRqIndexBuilder;
|
||||
use crate::{DistanceType, Error, Result, table::BaseTable};
|
||||
|
||||
use self::{
|
||||
scalar::{BTreeIndexBuilder, BitmapIndexBuilder, LabelListIndexBuilder},
|
||||
scalar::{BTreeIndexBuilder, BitmapIndexBuilder, FmIndexBuilder, LabelListIndexBuilder},
|
||||
vector::{
|
||||
IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
|
||||
IvfSqIndexBuilder,
|
||||
@@ -48,6 +49,11 @@ pub enum Index {
|
||||
/// using an underlying bitmap index.
|
||||
LabelList(LabelListIndexBuilder),
|
||||
|
||||
/// An `FM` index is a scalar index on string/binary columns that accelerates
|
||||
/// substring search (`contains(col, 'needle')`). It matches arbitrary
|
||||
/// substrings of the raw bytes, unlike the tokenized [`Index::FTS`] index.
|
||||
Fm(FmIndexBuilder),
|
||||
|
||||
/// Full text search index using bm25.
|
||||
FTS(FtsIndexBuilder),
|
||||
|
||||
@@ -306,6 +312,8 @@ pub enum IndexType {
|
||||
Bitmap,
|
||||
#[serde(alias = "LABEL_LIST")]
|
||||
LabelList,
|
||||
#[serde(alias = "FM", alias = "FMINDEX", alias = "FMIndex")]
|
||||
Fm,
|
||||
// FTS
|
||||
#[serde(alias = "INVERTED", alias = "Inverted")]
|
||||
FTS,
|
||||
@@ -324,6 +332,7 @@ impl std::fmt::Display for IndexType {
|
||||
Self::BTree => write!(f, "BTREE"),
|
||||
Self::Bitmap => write!(f, "BITMAP"),
|
||||
Self::LabelList => write!(f, "LABEL_LIST"),
|
||||
Self::Fm => write!(f, "FM"),
|
||||
Self::FTS => write!(f, "FTS"),
|
||||
}
|
||||
}
|
||||
@@ -337,6 +346,7 @@ impl std::str::FromStr for IndexType {
|
||||
"BTREE" => Ok(Self::BTree),
|
||||
"BITMAP" => Ok(Self::Bitmap),
|
||||
"LABEL_LIST" | "LABELLIST" => Ok(Self::LabelList),
|
||||
"FM" | "FMINDEX" => Ok(Self::Fm),
|
||||
"FTS" | "INVERTED" => Ok(Self::FTS),
|
||||
"IVF_FLAT" => Ok(Self::IvfFlat),
|
||||
"IVF_SQ" => Ok(Self::IvfSq),
|
||||
@@ -364,6 +374,51 @@ pub struct IndexConfig {
|
||||
/// Currently this is always a Vec of size 1. In the future there may
|
||||
/// be more columns to represent composite indices.
|
||||
pub columns: Vec<String>,
|
||||
/// The UUID of the first segment of the index.
|
||||
///
|
||||
/// An index may be made up of multiple segments, each with their own UUID.
|
||||
/// This is the UUID of the first segment. `None` if it could not be
|
||||
/// determined (e.g. for remote tables, which do not yet surface this).
|
||||
pub index_uuid: Option<String>,
|
||||
/// The protobuf type URL, a precise type identifier for the index.
|
||||
///
|
||||
/// `None` if unavailable (e.g. for remote tables).
|
||||
pub type_url: Option<String>,
|
||||
/// When the index was created, taken as the minimum creation time across
|
||||
/// all segments.
|
||||
///
|
||||
/// `None` if unavailable, such as for indices created before creation
|
||||
/// timestamps were tracked, or for remote tables.
|
||||
pub created_at: Option<DateTime<Utc>>,
|
||||
/// The number of rows indexed, across all segments.
|
||||
///
|
||||
/// This is approximate and may include rows that have since been deleted.
|
||||
/// `None` if unavailable (e.g. for remote tables).
|
||||
pub num_indexed_rows: Option<u64>,
|
||||
/// The number of rows in the table that are not yet covered by this index.
|
||||
///
|
||||
/// Computed as the table's total row count minus [`Self::num_indexed_rows`].
|
||||
/// Optimizing the index will fold these rows into it. `None` if unavailable
|
||||
/// (e.g. for remote tables).
|
||||
pub num_unindexed_rows: Option<u64>,
|
||||
/// The total size in bytes of all index files across all segments.
|
||||
///
|
||||
/// `None` if size information is unavailable, such as for indices created
|
||||
/// before file sizes were tracked, or for remote tables.
|
||||
pub size_bytes: Option<u64>,
|
||||
/// The number of segments that make up the index.
|
||||
///
|
||||
/// `None` if unavailable (e.g. for remote tables).
|
||||
pub num_segments: Option<u32>,
|
||||
/// The on-disk index format version, taken from the first segment.
|
||||
///
|
||||
/// `None` if unavailable (e.g. for remote tables).
|
||||
pub index_version: Option<i32>,
|
||||
/// Index-type-specific details, serialized as JSON.
|
||||
///
|
||||
/// The shape of this JSON varies by index type. `None` if the details
|
||||
/// could not be produced (e.g. no plugin available) or for remote tables.
|
||||
pub index_details: Option<String>,
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
@@ -372,7 +427,6 @@ pub(crate) struct IndexMetadata {
|
||||
pub metric_type: Option<DistanceType>,
|
||||
// Sometimes the index type is provided at this level.
|
||||
pub index_type: Option<IndexType>,
|
||||
pub loss: Option<f64>,
|
||||
}
|
||||
|
||||
// This struct is used to deserialize the JSON data returned from the Lance API
|
||||
@@ -404,6 +458,4 @@ pub struct IndexStatistics {
|
||||
pub distance_type: Option<DistanceType>,
|
||||
/// The number of parts this index is split into.
|
||||
pub num_indices: Option<u32>,
|
||||
/// The loss value used by the index.
|
||||
pub loss: Option<f64>,
|
||||
}
|
||||
|
||||
@@ -51,6 +51,15 @@ pub struct BitmapIndexBuilder {}
|
||||
#[derive(Debug, Clone, Default, serde::Serialize)]
|
||||
pub struct LabelListIndexBuilder {}
|
||||
|
||||
/// Builder for an FM-Index.
|
||||
///
|
||||
/// An FM-Index (Ferragina–Manzini) is a scalar index over string/binary columns
|
||||
/// that accelerates substring search, i.e. `contains(col, 'needle')`. Unlike an
|
||||
/// inverted (FTS) index it matches arbitrary substrings of the raw bytes rather
|
||||
/// than tokenized words.
|
||||
#[derive(Debug, Clone, Default, serde::Serialize)]
|
||||
pub struct FmIndexBuilder {}
|
||||
|
||||
pub use lance_index::scalar::FullTextSearchQuery;
|
||||
pub use lance_index::scalar::InvertedIndexParams as FtsIndexBuilder;
|
||||
pub use lance_index::scalar::InvertedIndexParams;
|
||||
|
||||
@@ -983,6 +983,46 @@ mod tests {
|
||||
assert_eq!(table.name(), "table1");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_open_table_branch_and_version() {
|
||||
let conn = Connection::new_with_handler(|request| {
|
||||
let body = if request.url().path() == "/v1/table/t/branches/list/" {
|
||||
// checkout_branch validates the branch exists via list_branches.
|
||||
r#"{"branches":{"exp":{"parentVersion":1,"createAt":1,"manifestSize":1}}}"#
|
||||
} else {
|
||||
// describe (table open + version/branch validation)
|
||||
r#"{"table": "t", "version": 2, "schema": {"fields": [
|
||||
{"name": "a", "type": { "type": "int32" }, "nullable": false}
|
||||
]}}"#
|
||||
};
|
||||
http::Response::builder().status(200).body(body).unwrap()
|
||||
});
|
||||
|
||||
// version-only (and "main" + version) time-travel the main chain
|
||||
let v2 = conn.open_table("t").version(2).execute().await.unwrap();
|
||||
assert_eq!(v2.current_branch(), None);
|
||||
let main_v2 = conn
|
||||
.open_table("t")
|
||||
.branch("main")
|
||||
.version(2)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(main_v2.current_branch(), None);
|
||||
|
||||
// a non-main branch opens a handle scoped to that branch
|
||||
let exp = conn.open_table("t").branch("exp").execute().await.unwrap();
|
||||
assert_eq!(exp.current_branch(), Some("exp".to_string()));
|
||||
let exp_v2 = conn
|
||||
.open_table("t")
|
||||
.branch("exp")
|
||||
.version(2)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(exp_v2.current_branch(), Some("exp".to_string()));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_open_table_not_found() {
|
||||
let conn = Connection::new_with_handler(|_| {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -48,6 +48,8 @@ pub struct RemoteInsertExec<S: HttpSend = Sender> {
|
||||
metrics: ExecutionPlanMetricsSet,
|
||||
upload_id: Option<String>,
|
||||
tracker: Option<Arc<WriteProgressTracker>>,
|
||||
/// Branch to write to via `?branch=`. `None` targets the main branch.
|
||||
branch: Option<String>,
|
||||
}
|
||||
|
||||
impl<S: HttpSend + 'static> RemoteInsertExec<S> {
|
||||
@@ -59,9 +61,10 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
|
||||
input: Arc<dyn ExecutionPlan>,
|
||||
overwrite: bool,
|
||||
tracker: Option<Arc<WriteProgressTracker>>,
|
||||
branch: Option<String>,
|
||||
) -> Self {
|
||||
Self::new_inner(
|
||||
table_name, identifier, client, input, overwrite, None, tracker,
|
||||
table_name, identifier, client, input, overwrite, None, tracker, branch,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -70,6 +73,7 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
|
||||
/// Each partition's insert is staged under the given `upload_id` without
|
||||
/// committing. The caller is responsible for calling the complete (or abort)
|
||||
/// endpoint after all partitions finish.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new_multipart(
|
||||
table_name: String,
|
||||
identifier: String,
|
||||
@@ -78,6 +82,7 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
|
||||
overwrite: bool,
|
||||
upload_id: String,
|
||||
tracker: Option<Arc<WriteProgressTracker>>,
|
||||
branch: Option<String>,
|
||||
) -> Self {
|
||||
Self::new_inner(
|
||||
table_name,
|
||||
@@ -87,9 +92,11 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
|
||||
overwrite,
|
||||
Some(upload_id),
|
||||
tracker,
|
||||
branch,
|
||||
)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn new_inner(
|
||||
table_name: String,
|
||||
identifier: String,
|
||||
@@ -98,6 +105,7 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
|
||||
overwrite: bool,
|
||||
upload_id: Option<String>,
|
||||
tracker: Option<Arc<WriteProgressTracker>>,
|
||||
branch: Option<String>,
|
||||
) -> Self {
|
||||
let num_partitions = if upload_id.is_some() {
|
||||
input.output_partitioning().partition_count()
|
||||
@@ -123,6 +131,7 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
|
||||
metrics: ExecutionPlanMetricsSet::new(),
|
||||
upload_id,
|
||||
tracker,
|
||||
branch,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -273,6 +282,7 @@ impl<S: HttpSend + 'static> ExecutionPlan for RemoteInsertExec<S> {
|
||||
self.overwrite,
|
||||
self.upload_id.clone(),
|
||||
self.tracker.clone(),
|
||||
self.branch.clone(),
|
||||
)))
|
||||
}
|
||||
|
||||
@@ -304,6 +314,7 @@ impl<S: HttpSend + 'static> ExecutionPlan for RemoteInsertExec<S> {
|
||||
let table_name = self.table_name.clone();
|
||||
let upload_id = self.upload_id.clone();
|
||||
let tracker = self.tracker.clone();
|
||||
let branch = self.branch.clone();
|
||||
|
||||
let stream = futures::stream::once(async move {
|
||||
let mut request = client
|
||||
@@ -316,6 +327,9 @@ impl<S: HttpSend + 'static> ExecutionPlan for RemoteInsertExec<S> {
|
||||
if let Some(ref uid) = upload_id {
|
||||
request = request.query(&[("upload_id", uid.as_str())]);
|
||||
}
|
||||
if let Some(ref b) = branch {
|
||||
request = request.query(&[("branch", b.as_str())]);
|
||||
}
|
||||
|
||||
let (error_tx, mut error_rx) = tokio::sync::oneshot::channel();
|
||||
let body = Self::stream_as_http_body(input_stream, error_tx, tracker)?;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
1399
rust/lancedb/src/table/create_index.rs
Normal file
1399
rust/lancedb/src/table/create_index.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -76,6 +76,23 @@ impl DatasetConsistencyWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new wrapper pinned to the dataset's current version.
|
||||
///
|
||||
/// `dataset` must already be checked out at the desired version; this pins
|
||||
/// to `dataset.version()` without re-resolving. The wrapper is read-only
|
||||
/// (time-travel) until [`as_latest`](Self::as_latest) re-attaches it to the
|
||||
/// latest version.
|
||||
pub fn new_time_travel(dataset: Dataset, read_consistency_interval: Option<Duration>) -> Self {
|
||||
let version = dataset.version().version;
|
||||
let wrapper = Self::new_latest(dataset, read_consistency_interval);
|
||||
wrapper
|
||||
.state
|
||||
.lock()
|
||||
.unwrap_or_else(|e| e.into_inner())
|
||||
.pinned_version = Some(version);
|
||||
wrapper
|
||||
}
|
||||
|
||||
/// The MemWAL `ShardWriter` cache co-located with this dataset.
|
||||
pub(crate) fn shard_writer(&self) -> &Arc<ShardWriterCache> {
|
||||
&self.shard_writer
|
||||
@@ -144,8 +161,19 @@ impl DatasetConsistencyWrapper {
|
||||
}
|
||||
|
||||
/// Checkout a branch and track its HEAD for new versions.
|
||||
pub async fn as_branch(&self, _branch: impl Into<String>) -> Result<()> {
|
||||
todo!("Branch support not yet implemented")
|
||||
pub async fn as_branch(&self, branch: impl Into<String>) -> Result<()> {
|
||||
let branch = branch.into();
|
||||
let dataset = { self.state.lock()?.dataset.clone() };
|
||||
let new_dataset = dataset.checkout_branch(&branch).await?;
|
||||
|
||||
let mut state = self.state.lock()?;
|
||||
state.dataset = Arc::new(new_dataset);
|
||||
state.pinned_version = None;
|
||||
drop(state);
|
||||
if let ConsistencyMode::Eventual(bg_cache) = &self.consistency {
|
||||
bg_cache.invalidate();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check that the dataset is in a mutable mode (Latest).
|
||||
@@ -161,6 +189,17 @@ impl DatasetConsistencyWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// The branch this wrapper is currently tracking, or `None` for `main`.
|
||||
pub fn current_branch(&self) -> Option<String> {
|
||||
self.state
|
||||
.lock()
|
||||
.unwrap_or_else(|e| e.into_inner())
|
||||
.dataset
|
||||
.manifest()
|
||||
.branch
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Returns the version, if in time travel mode, or None otherwise.
|
||||
pub fn time_travel_version(&self) -> Option<u64> {
|
||||
self.state
|
||||
@@ -477,10 +516,6 @@ mod tests {
|
||||
let uri = dir.path().to_str().unwrap();
|
||||
let ds = create_test_dataset(uri).await;
|
||||
|
||||
// Other tests use a thread-local mock clock. Simulate leaked state from a
|
||||
// previous test to ensure this wrapper starts from real time.
|
||||
clock::advance_by(Duration::from_secs(60));
|
||||
|
||||
let wrapper = DatasetConsistencyWrapper::new_latest(ds, Some(Duration::from_millis(200)));
|
||||
|
||||
// Populate the cache
|
||||
@@ -490,12 +525,13 @@ mod tests {
|
||||
// External write
|
||||
append_to_dataset(uri).await;
|
||||
|
||||
// Should return cached value immediately (within TTL)
|
||||
// Should return cached value immediately (within TTL), regardless of how
|
||||
// long the external write above took on a slow CI runner.
|
||||
let v_cached = wrapper.get().await.unwrap().version().version;
|
||||
assert_eq!(v_cached, 1);
|
||||
|
||||
// Wait for TTL to expire, then get() should trigger a refresh
|
||||
tokio::time::sleep(Duration::from_millis(300)).await;
|
||||
// Advance the mock clock past the TTL so the next get() triggers a refresh.
|
||||
clock::advance_by(Duration::from_millis(300));
|
||||
let v_after = wrapper.get().await.unwrap().version().version;
|
||||
assert_eq!(v_after, 2);
|
||||
}
|
||||
@@ -737,4 +773,31 @@ mod tests {
|
||||
let result = wrapper.reload().await;
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_as_branch_is_writable_and_tracked() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().to_str().unwrap();
|
||||
|
||||
// v1 on main, then shallow-clone a branch off it
|
||||
let mut ds = create_test_dataset(uri).await;
|
||||
let v1 = ds.version().version;
|
||||
ds.create_branch("exp", v1, None).await.unwrap();
|
||||
|
||||
// wrapper starts on main: latest, writable, no branch
|
||||
let wrapper = DatasetConsistencyWrapper::new_latest(ds, None);
|
||||
assert_eq!(wrapper.current_branch(), None);
|
||||
|
||||
// switch to the branch
|
||||
wrapper.as_branch("exp").await.unwrap();
|
||||
assert_eq!(wrapper.current_branch().as_deref(), Some("exp"));
|
||||
|
||||
// a branch is writable (unlike a pinned/time-travel checkout)
|
||||
wrapper.ensure_mutable().unwrap();
|
||||
assert_eq!(wrapper.time_travel_version(), None);
|
||||
|
||||
// get() returns the branch dataset
|
||||
let on_branch = wrapper.get().await.unwrap();
|
||||
assert_eq!(on_branch.manifest().branch.as_deref(), Some("exp"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,11 +41,14 @@ pub async fn execute_query(
|
||||
query: &AnyQuery,
|
||||
options: QueryExecutionOptions,
|
||||
) -> Result<DatasetRecordBatchStream> {
|
||||
// If QueryTable pushdown is enabled and namespace client is configured, use server-side query execution
|
||||
// QueryTable pushdown runs the query server-side, but only on the main
|
||||
// branch: the namespace request carries no branch yet, so a branch handle
|
||||
// must fall through to local execution.
|
||||
if table
|
||||
.pushdown_operations
|
||||
.contains(&NamespaceClientPushdownOperation::QueryTable)
|
||||
&& let Some(ref namespace_client) = table.namespace_client
|
||||
&& table.dataset.current_branch().is_none()
|
||||
{
|
||||
return execute_namespace_query(table, namespace_client.clone(), query, options).await;
|
||||
}
|
||||
|
||||
@@ -257,7 +257,9 @@ pub fn supported_bitmap_data_type(dtype: &DataType) -> bool {
|
||||
|
||||
pub fn supported_label_list_data_type(dtype: &DataType) -> bool {
|
||||
match dtype {
|
||||
DataType::List(field) => supported_bitmap_data_type(field.data_type()),
|
||||
DataType::List(field) | DataType::LargeList(field) => {
|
||||
supported_bitmap_data_type(field.data_type())
|
||||
}
|
||||
DataType::FixedSizeList(field, _) => supported_bitmap_data_type(field.data_type()),
|
||||
_ => false,
|
||||
}
|
||||
@@ -277,6 +279,15 @@ fn supported_fts_data_type_impl(dtype: &DataType, in_list: bool) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
/// FM-Index accelerates substring (`contains`) search over raw bytes, so it
|
||||
/// applies to string and binary columns.
|
||||
pub fn supported_fm_data_type(dtype: &DataType) -> bool {
|
||||
matches!(
|
||||
dtype,
|
||||
DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary
|
||||
)
|
||||
}
|
||||
|
||||
pub fn supported_vector_data_type(dtype: &DataType) -> bool {
|
||||
match dtype {
|
||||
DataType::FixedSizeList(field, _) => {
|
||||
|
||||
Reference in New Issue
Block a user