mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-21 22:10:40 +00:00
Compare commits
1 Commits
dependabot
...
v0.29.1-be
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6da7ca183d |
5
.github/dependabot.yml
vendored
5
.github/dependabot.yml
vendored
@@ -11,11 +11,6 @@ updates:
|
||||
schedule:
|
||||
interval: weekly
|
||||
open-pull-requests-limit: 10
|
||||
# Only update Cargo.lock, never widen/raise the version requirements in
|
||||
# Cargo.toml. The goal is keeping the lockfile (and the binaries we ship)
|
||||
# current on security fixes, not forcing our library's consumers onto
|
||||
# newer minimum versions.
|
||||
versioning-strategy: lockfile-only
|
||||
groups:
|
||||
rust-minor-patch:
|
||||
update-types:
|
||||
|
||||
5
.github/workflows/nodejs.yml
vendored
5
.github/workflows/nodejs.yml
vendored
@@ -157,10 +157,7 @@ jobs:
|
||||
npx jest --testEnvironment jest-environment-node-single-context --verbose
|
||||
macos:
|
||||
timeout-minutes: 30
|
||||
# macos-15 ships a newer linker; the older macos-14 linker fails to insert
|
||||
# branch islands when the debug cdylib's __text section exceeds the 128 MB
|
||||
# AArch64 B/BL branch range.
|
||||
runs-on: "macos-15"
|
||||
runs-on: "macos-14"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
2
.github/workflows/python.yml
vendored
2
.github/workflows/python.yml
vendored
@@ -205,7 +205,7 @@ jobs:
|
||||
- name: Delete wheels
|
||||
run: rm -rf target/wheels
|
||||
pydantic1x:
|
||||
timeout-minutes: 60
|
||||
timeout-minutes: 30
|
||||
runs-on: "ubuntu-24.04"
|
||||
defaults:
|
||||
run:
|
||||
|
||||
20
.github/workflows/rust.yml
vendored
20
.github/workflows/rust.yml
vendored
@@ -233,26 +233,6 @@ jobs:
|
||||
cargo update -p aws-sdk-sso --precise 1.62.0
|
||||
cargo update -p aws-sdk-ssooidc --precise 1.63.0
|
||||
cargo update -p aws-sdk-sts --precise 1.63.0
|
||||
# aws-runtime/sigv4/credential-types/types and the aws-smithy-*
|
||||
# crates bumped their MSRV to 1.91.1 in late 2026; pin to the last
|
||||
# 1.91.0-compatible versions. The order matters — each downgrade
|
||||
# only succeeds once everything that still pins it at a higher
|
||||
# version has itself been downgraded.
|
||||
cargo update -p aws-runtime --precise 1.5.12
|
||||
cargo update -p aws-types --precise 1.3.9
|
||||
cargo update -p aws-sigv4 --precise 1.3.5
|
||||
cargo update -p aws-credential-types --precise 1.2.8
|
||||
cargo update -p aws-smithy-checksums --precise 0.63.9
|
||||
cargo update -p aws-smithy-runtime --precise 1.9.3
|
||||
cargo update -p aws-smithy-http --precise 0.62.4
|
||||
cargo update -p aws-smithy-eventstream --precise 0.60.12
|
||||
cargo update -p aws-smithy-http-client --precise 1.1.3
|
||||
cargo update -p aws-smithy-observability --precise 0.1.4
|
||||
cargo update -p aws-smithy-query --precise 0.60.8
|
||||
cargo update -p aws-smithy-runtime-api --precise 1.9.1
|
||||
cargo update -p aws-smithy-async --precise 1.2.6
|
||||
cargo update -p aws-smithy-types --precise 1.3.5
|
||||
cargo update -p aws-smithy-xml --precise 0.60.11
|
||||
cargo update -p home --precise 0.5.9
|
||||
- name: cargo +${{ matrix.msrv }} check
|
||||
env:
|
||||
|
||||
25
AGENTS.md
25
AGENTS.md
@@ -17,30 +17,9 @@ Common commands:
|
||||
* Run tests: `cargo test --quiet --features remote --tests`
|
||||
* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
|
||||
* Lint: `cargo clippy --quiet --features remote --tests --examples`
|
||||
* Format Rust: `cargo fmt --all`
|
||||
* Format Python: `ruff format .`
|
||||
* Lint Python: `ruff check .`
|
||||
* Bootstrap Python dev env: `cd python && uv run --extra tests --extra dev maturin develop --extras tests,dev`
|
||||
* Run Python tests: `cd python && uv run --extra tests pytest python/tests -vv --durations=10 -m "not slow and not s3_test"`
|
||||
* Run specific Python test: `cd python && uv run --extra tests pytest python/tests/<test_file>.py::<test_name> -q`
|
||||
* Format: `cargo fmt --all`
|
||||
|
||||
For Python validation, prefer the uv-managed environment declared by `python/uv.lock`.
|
||||
Do not treat system `python`, global `pytest`, or missing editable-install errors as
|
||||
final blockers; bootstrap or enter the uv environment instead. If `lancedb._lancedb`
|
||||
is missing or stale, or if Rust/PyO3 binding code changed, rebuild the Python
|
||||
extension with the bootstrap command above before running tests.
|
||||
|
||||
Before committing changes, run formatting for every language you touched. At minimum:
|
||||
|
||||
* Rust changes: run `cargo fmt --all`.
|
||||
* Python changes: run `ruff format .` and `ruff check .` from the repository root,
|
||||
and run targeted tests through `cd python && uv run ...`.
|
||||
* TypeScript changes: run the relevant `npm`/`pnpm` lint, format, build, and docs commands in `nodejs`.
|
||||
|
||||
Before creating a PR, make sure the PR title follows Conventional Commits, such as
|
||||
`fix: support nested field paths in native index creation` or
|
||||
`feat(python): add dataset multiprocessing support`. The semantic-release check uses the
|
||||
PR title and body as the merge commit message, so a non-conventional PR title will fail CI.
|
||||
Before committing changes, run formatting.
|
||||
|
||||
## Coding tips
|
||||
|
||||
|
||||
1113
Cargo.lock
generated
1113
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -441,28 +441,18 @@ Open a table in the database.
|
||||
|
||||
```ts
|
||||
abstract renameTable(
|
||||
currentName,
|
||||
oldName,
|
||||
newName,
|
||||
options?): Promise<void>
|
||||
namespacePath?): Promise<void>
|
||||
```
|
||||
|
||||
Rename a table.
|
||||
|
||||
Currently only supported by LanceDB Cloud. Local OSS connections and
|
||||
namespace-backed connections (via [connectNamespace](../functions/connectNamespace.md)) reject with
|
||||
a "not supported" error.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **currentName**: `string`
|
||||
The current name of the table.
|
||||
* **oldName**: `string`
|
||||
|
||||
* **newName**: `string`
|
||||
The new name for the table.
|
||||
|
||||
* **options?**: [`RenameTableOptions`](../interfaces/RenameTableOptions.md)
|
||||
Optional namespace paths. When
|
||||
`newNamespacePath` is omitted the table stays in `namespacePath`.
|
||||
* **namespacePath?**: `string`[]
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -87,7 +87,6 @@
|
||||
- [OptimizeStats](interfaces/OptimizeStats.md)
|
||||
- [QueryExecutionOptions](interfaces/QueryExecutionOptions.md)
|
||||
- [RemovalStats](interfaces/RemovalStats.md)
|
||||
- [RenameTableOptions](interfaces/RenameTableOptions.md)
|
||||
- [RestNamespaceConfig](interfaces/RestNamespaceConfig.md)
|
||||
- [RetryConfig](interfaces/RetryConfig.md)
|
||||
- [ScannableOptions](interfaces/ScannableOptions.md)
|
||||
@@ -105,7 +104,6 @@
|
||||
- [UpdateResult](interfaces/UpdateResult.md)
|
||||
- [Version](interfaces/Version.md)
|
||||
- [WriteExecutionOptions](interfaces/WriteExecutionOptions.md)
|
||||
- [WriteProgress](interfaces/WriteProgress.md)
|
||||
|
||||
## Type Aliases
|
||||
|
||||
|
||||
@@ -19,39 +19,3 @@ mode: "append" | "overwrite";
|
||||
If "append" (the default) then the new data will be added to the table
|
||||
|
||||
If "overwrite" then the new data will replace the existing data in the table.
|
||||
|
||||
***
|
||||
|
||||
### progress()
|
||||
|
||||
```ts
|
||||
progress: (progress) => void;
|
||||
```
|
||||
|
||||
Optional callback invoked periodically with write progress.
|
||||
|
||||
The callback is fired once per batch written and once more with
|
||||
`done: true` when the write completes. Calls are dispatched
|
||||
asynchronously to the JS event loop and never block the write — a slow
|
||||
callback will queue events rather than back-pressure the writer.
|
||||
|
||||
Errors thrown from the callback are logged with `console.warn` and
|
||||
swallowed — they do not abort the write.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **progress**: [`WriteProgress`](WriteProgress.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
`void`
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
await table.add(data, {
|
||||
progress: (p) => {
|
||||
console.log(`${p.outputRows}/${p.totalRows ?? "?"} rows`);
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / RenameTableOptions
|
||||
|
||||
# Interface: RenameTableOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### namespacePath?
|
||||
|
||||
```ts
|
||||
optional namespacePath: string[];
|
||||
```
|
||||
|
||||
The namespace path of the table being renamed. Defaults to the root
|
||||
namespace (`[]`) when omitted.
|
||||
|
||||
***
|
||||
|
||||
### newNamespacePath?
|
||||
|
||||
```ts
|
||||
optional newNamespacePath: string[];
|
||||
```
|
||||
|
||||
The namespace path to move the table to as part of the rename. When
|
||||
omitted the table stays in `namespacePath`.
|
||||
@@ -1,84 +0,0 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / WriteProgress
|
||||
|
||||
# Interface: WriteProgress
|
||||
|
||||
Progress snapshot for a write operation, delivered to the `progress`
|
||||
callback passed to [Table.add](../classes/Table.md#add).
|
||||
|
||||
## Properties
|
||||
|
||||
### activeTasks
|
||||
|
||||
```ts
|
||||
activeTasks: number;
|
||||
```
|
||||
|
||||
Number of parallel write tasks currently in flight.
|
||||
|
||||
***
|
||||
|
||||
### done
|
||||
|
||||
```ts
|
||||
done: boolean;
|
||||
```
|
||||
|
||||
`true` for the final callback; `false` otherwise.
|
||||
|
||||
***
|
||||
|
||||
### elapsedSeconds
|
||||
|
||||
```ts
|
||||
elapsedSeconds: number;
|
||||
```
|
||||
|
||||
Wall-clock seconds since the write started.
|
||||
|
||||
***
|
||||
|
||||
### outputBytes
|
||||
|
||||
```ts
|
||||
outputBytes: number;
|
||||
```
|
||||
|
||||
Number of bytes written so far.
|
||||
|
||||
***
|
||||
|
||||
### outputRows
|
||||
|
||||
```ts
|
||||
outputRows: number;
|
||||
```
|
||||
|
||||
Number of rows written so far.
|
||||
|
||||
***
|
||||
|
||||
### totalRows?
|
||||
|
||||
```ts
|
||||
optional totalRows: number;
|
||||
```
|
||||
|
||||
Total rows expected, when the input source reports it.
|
||||
|
||||
Always set on the final callback (the one with `done: true`), falling
|
||||
back to the actual number of rows written when the source could not
|
||||
report a row count up front.
|
||||
|
||||
***
|
||||
|
||||
### totalTasks
|
||||
|
||||
```ts
|
||||
totalTasks: number;
|
||||
```
|
||||
|
||||
Total number of parallel write tasks (the write parallelism).
|
||||
@@ -166,12 +166,6 @@ lists the indices that LanceDb supports.
|
||||
|
||||
::: lancedb.index.IvfFlat
|
||||
|
||||
::: lancedb.index.IvfSq
|
||||
|
||||
::: lancedb.index.IvfRq
|
||||
|
||||
::: lancedb.index.HnswFlat
|
||||
|
||||
::: lancedb.table.IndexStatistics
|
||||
|
||||
## Querying (Asynchronous)
|
||||
|
||||
@@ -47,14 +47,6 @@ describe("given a connection", () => {
|
||||
await db.close();
|
||||
expect(db.isOpen()).toBe(false);
|
||||
await expect(db.tableNames()).rejects.toThrow("Connection is closed");
|
||||
await expect(db.renameTable("a", "b")).rejects.toThrow(
|
||||
"Connection is closed",
|
||||
);
|
||||
});
|
||||
|
||||
it("should report renameTable as unsupported on an OSS connection", async () => {
|
||||
await db.createTable("a", [{ id: 1 }]);
|
||||
await expect(db.renameTable("a", "b")).rejects.toThrow(/not supported/);
|
||||
});
|
||||
it("should be able to create a table from an object arg `createTable(options)`, or args `createTable(name, data, options)`", async () => {
|
||||
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
|
||||
@@ -89,6 +81,16 @@ describe("given a connection", () => {
|
||||
await db.createTable("test4", [{ id: 1 }, { id: 2 }]);
|
||||
});
|
||||
|
||||
it("should expose renameTable and reject on OSS listing DB", async () => {
|
||||
await db.createTable("old_name", [{ id: 1 }]);
|
||||
|
||||
await expect(db.renameTable("old_name", "new_name")).rejects.toThrow(
|
||||
"rename_table is not supported in LanceDB OSS",
|
||||
);
|
||||
|
||||
await expect(db.tableNames()).resolves.toEqual(["old_name"]);
|
||||
});
|
||||
|
||||
it("should fail if creating table twice, unless overwrite is true", async () => {
|
||||
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
|
||||
await expect(tbl.countRows()).resolves.toBe(2);
|
||||
|
||||
@@ -617,68 +617,4 @@ describe("remote connection", () => {
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("renameTable", () => {
|
||||
async function captureRenameRequest(
|
||||
call: (db: Connection) => Promise<void>,
|
||||
): Promise<{ url: string; body: Record<string, unknown> }> {
|
||||
let captured: { url: string; body: Record<string, unknown> } | undefined;
|
||||
await withMockDatabase((req, res) => {
|
||||
let raw = "";
|
||||
req.on("data", (chunk) => {
|
||||
raw += chunk;
|
||||
});
|
||||
req.on("end", () => {
|
||||
captured = {
|
||||
url: req.url ?? "",
|
||||
body: raw ? JSON.parse(raw) : {},
|
||||
};
|
||||
res.writeHead(200, { "Content-Type": "application/json" }).end("");
|
||||
});
|
||||
}, call);
|
||||
if (!captured) {
|
||||
throw new Error("mock server never saw a request");
|
||||
}
|
||||
return captured;
|
||||
}
|
||||
|
||||
it("sends rename request for a table in the root namespace", async () => {
|
||||
const { url, body } = await captureRenameRequest(async (db) => {
|
||||
await db.renameTable("table1", "table2");
|
||||
});
|
||||
expect(url).toBe("/v1/table/table1/rename/");
|
||||
// biome-ignore lint/style/useNamingConvention: snake_case mandated by the server wire format
|
||||
expect(body).toEqual({ new_table_name: "table2" });
|
||||
});
|
||||
|
||||
it("omits new_namespace when only the current namespace is supplied", async () => {
|
||||
// Safe-default check: passing namespacePath alone must not send
|
||||
// `new_namespace`, so the server keeps the table in its current
|
||||
// namespace instead of silently moving it to root.
|
||||
const { url, body } = await captureRenameRequest(async (db) => {
|
||||
await db.renameTable("table1", "table2", {
|
||||
namespacePath: ["ns1"],
|
||||
});
|
||||
});
|
||||
expect(url).toBe("/v1/table/ns1$table1/rename/");
|
||||
// biome-ignore lint/style/useNamingConvention: snake_case mandated by the server wire format
|
||||
expect(body).toEqual({ new_table_name: "table2" });
|
||||
});
|
||||
|
||||
it("includes new_namespace in the body for a cross-namespace rename", async () => {
|
||||
const { url, body } = await captureRenameRequest(async (db) => {
|
||||
await db.renameTable("table1", "table2", {
|
||||
namespacePath: ["ns1"],
|
||||
newNamespacePath: ["ns2"],
|
||||
});
|
||||
});
|
||||
expect(url).toBe("/v1/table/ns1$table1/rename/");
|
||||
expect(body).toEqual({
|
||||
// biome-ignore lint/style/useNamingConvention: snake_case mandated by the server wire format
|
||||
new_table_name: "table2",
|
||||
// biome-ignore lint/style/useNamingConvention: snake_case mandated by the server wire format
|
||||
new_namespace: ["ns2"],
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -115,48 +115,6 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
await expect(table.countRows()).resolves.toBe(1);
|
||||
});
|
||||
|
||||
it("should invoke the progress callback", async () => {
|
||||
const events: import("../lancedb").WriteProgress[] = [];
|
||||
await table.add([{ id: 1 }, { id: 2 }, { id: 3 }], {
|
||||
progress: (p) => events.push(p),
|
||||
});
|
||||
|
||||
expect(events.length).toBeGreaterThan(0);
|
||||
const last = events[events.length - 1];
|
||||
expect(last.done).toBe(true);
|
||||
// Earlier callbacks must have done=false.
|
||||
for (const ev of events.slice(0, -1)) {
|
||||
expect(ev.done).toBe(false);
|
||||
}
|
||||
// outputRows reflects the rows added in this call, not table size.
|
||||
expect(last.outputRows).toBe(3);
|
||||
// The input source (an array) reports a row count, so totalRows is set.
|
||||
expect(last.totalRows).toBe(3);
|
||||
// outputRows is monotonic.
|
||||
for (let i = 1; i < events.length; i++) {
|
||||
expect(events[i].outputRows).toBeGreaterThanOrEqual(
|
||||
events[i - 1].outputRows,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
it("should swallow errors thrown from the progress callback", async () => {
|
||||
const warn = jest
|
||||
.spyOn(console, "warn")
|
||||
.mockImplementation(() => undefined);
|
||||
try {
|
||||
const res = await table.add([{ id: 1 }, { id: 2 }], {
|
||||
progress: () => {
|
||||
throw new Error("callback bomb");
|
||||
},
|
||||
});
|
||||
expect(res.version).toBeGreaterThan(0);
|
||||
expect(warn).toHaveBeenCalled();
|
||||
} finally {
|
||||
warn.mockRestore();
|
||||
}
|
||||
});
|
||||
|
||||
it("should let me close the table", async () => {
|
||||
expect(table.isOpen()).toBe(true);
|
||||
table.close();
|
||||
|
||||
@@ -144,19 +144,6 @@ export interface DropNamespaceOptions {
|
||||
behavior?: "restrict" | "cascade";
|
||||
}
|
||||
|
||||
export interface RenameTableOptions {
|
||||
/**
|
||||
* The namespace path of the table being renamed. Defaults to the root
|
||||
* namespace (`[]`) when omitted.
|
||||
*/
|
||||
namespacePath?: string[];
|
||||
/**
|
||||
* The namespace path to move the table to as part of the rename. When
|
||||
* omitted the table stays in `namespacePath`.
|
||||
*/
|
||||
newNamespacePath?: string[];
|
||||
}
|
||||
|
||||
/**
|
||||
* A LanceDB Connection that allows you to open tables and create new ones.
|
||||
*
|
||||
@@ -309,6 +296,12 @@ export abstract class Connection {
|
||||
*/
|
||||
abstract dropTable(name: string, namespacePath?: string[]): Promise<void>;
|
||||
|
||||
abstract renameTable(
|
||||
oldName: string,
|
||||
newName: string,
|
||||
namespacePath?: string[],
|
||||
): Promise<void>;
|
||||
|
||||
/**
|
||||
* Drop all tables in the database.
|
||||
* @param {string[]} namespacePath The namespace path to drop tables from (defaults to root namespace).
|
||||
@@ -404,24 +397,6 @@ export abstract class Connection {
|
||||
isShallow?: boolean;
|
||||
},
|
||||
): Promise<Table>;
|
||||
|
||||
/**
|
||||
* Rename a table.
|
||||
*
|
||||
* Currently only supported by LanceDB Cloud. Local OSS connections and
|
||||
* namespace-backed connections (via {@link connectNamespace}) reject with
|
||||
* a "not supported" error.
|
||||
*
|
||||
* @param {string} currentName - The current name of the table.
|
||||
* @param {string} newName - The new name for the table.
|
||||
* @param {RenameTableOptions} options - Optional namespace paths. When
|
||||
* `newNamespacePath` is omitted the table stays in `namespacePath`.
|
||||
*/
|
||||
abstract renameTable(
|
||||
currentName: string,
|
||||
newName: string,
|
||||
options?: RenameTableOptions,
|
||||
): Promise<void>;
|
||||
}
|
||||
|
||||
/** @hideconstructor */
|
||||
@@ -640,6 +615,14 @@ export class LocalConnection extends Connection {
|
||||
return this.inner.dropTable(name, namespacePath ?? []);
|
||||
}
|
||||
|
||||
async renameTable(
|
||||
oldName: string,
|
||||
newName: string,
|
||||
namespacePath?: string[],
|
||||
): Promise<void> {
|
||||
return this.inner.renameTable(oldName, newName, namespacePath ?? []);
|
||||
}
|
||||
|
||||
async dropAllTables(namespacePath?: string[]): Promise<void> {
|
||||
return this.inner.dropAllTables(namespacePath ?? []);
|
||||
}
|
||||
@@ -682,19 +665,6 @@ export class LocalConnection extends Connection {
|
||||
options?.behavior,
|
||||
);
|
||||
}
|
||||
|
||||
async renameTable(
|
||||
currentName: string,
|
||||
newName: string,
|
||||
options?: RenameTableOptions,
|
||||
): Promise<void> {
|
||||
return this.inner.renameTable(
|
||||
currentName,
|
||||
newName,
|
||||
options?.namespacePath ?? [],
|
||||
options?.newNamespacePath,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -71,7 +71,6 @@ export {
|
||||
CreateNamespaceResponse,
|
||||
DropNamespaceResponse,
|
||||
DescribeNamespaceResponse,
|
||||
RenameTableOptions,
|
||||
} from "./connection";
|
||||
|
||||
export { Session } from "./native.js";
|
||||
@@ -114,7 +113,6 @@ export {
|
||||
UpdateOptions,
|
||||
OptimizeOptions,
|
||||
Version,
|
||||
WriteProgress,
|
||||
LsmWriteSpec,
|
||||
ColumnAlteration,
|
||||
} from "./table";
|
||||
|
||||
@@ -46,33 +46,6 @@ import { sanitizeType } from "./sanitize";
|
||||
import { IntoSql, toSQL } from "./util";
|
||||
export { IndexConfig } from "./native";
|
||||
|
||||
/**
|
||||
* Progress snapshot for a write operation, delivered to the `progress`
|
||||
* callback passed to {@link Table.add}.
|
||||
*/
|
||||
export interface WriteProgress {
|
||||
/** Number of rows written so far. */
|
||||
outputRows: number;
|
||||
/** Number of bytes written so far. */
|
||||
outputBytes: number;
|
||||
/**
|
||||
* Total rows expected, when the input source reports it.
|
||||
*
|
||||
* Always set on the final callback (the one with `done: true`), falling
|
||||
* back to the actual number of rows written when the source could not
|
||||
* report a row count up front.
|
||||
*/
|
||||
totalRows?: number;
|
||||
/** Wall-clock seconds since the write started. */
|
||||
elapsedSeconds: number;
|
||||
/** Number of parallel write tasks currently in flight. */
|
||||
activeTasks: number;
|
||||
/** Total number of parallel write tasks (the write parallelism). */
|
||||
totalTasks: number;
|
||||
/** `true` for the final callback; `false` otherwise. */
|
||||
done: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options for adding data to a table.
|
||||
*/
|
||||
@@ -83,28 +56,6 @@ export interface AddDataOptions {
|
||||
* If "overwrite" then the new data will replace the existing data in the table.
|
||||
*/
|
||||
mode: "append" | "overwrite";
|
||||
|
||||
/**
|
||||
* Optional callback invoked periodically with write progress.
|
||||
*
|
||||
* The callback is fired once per batch written and once more with
|
||||
* `done: true` when the write completes. Calls are dispatched
|
||||
* asynchronously to the JS event loop and never block the write — a slow
|
||||
* callback will queue events rather than back-pressure the writer.
|
||||
*
|
||||
* Errors thrown from the callback are logged with `console.warn` and
|
||||
* swallowed — they do not abort the write.
|
||||
*
|
||||
* @example
|
||||
* ```ts
|
||||
* await table.add(data, {
|
||||
* progress: (p) => {
|
||||
* console.log(`${p.outputRows}/${p.totalRows ?? "?"} rows`);
|
||||
* },
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
progress: (progress: WriteProgress) => void;
|
||||
}
|
||||
|
||||
export interface UpdateOptions {
|
||||
@@ -754,20 +705,7 @@ export class LocalTable extends Table {
|
||||
const schema = await this.schema();
|
||||
|
||||
const buffer = await fromDataToBuffer(data, undefined, schema);
|
||||
// Wrap the user callback so a thrown error doesn't surface as an
|
||||
// unhandled exception (the callback fires from a napi threadsafe
|
||||
// function — exceptions there crash the process).
|
||||
const userProgress = options?.progress;
|
||||
const progress = userProgress
|
||||
? (p: WriteProgress) => {
|
||||
try {
|
||||
userProgress(p);
|
||||
} catch (e) {
|
||||
console.warn("Table.add progress callback threw:", e);
|
||||
}
|
||||
}
|
||||
: undefined;
|
||||
return await this.inner.add(buffer, mode, progress);
|
||||
return await this.inner.add(buffer, mode);
|
||||
}
|
||||
|
||||
async update(
|
||||
|
||||
11029
nodejs/package-lock.json
generated
11029
nodejs/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -328,6 +328,20 @@ impl Connection {
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn rename_table(
|
||||
&self,
|
||||
old_name: String,
|
||||
new_name: String,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
) -> napi::Result<()> {
|
||||
let ns = namespace_path.unwrap_or_default();
|
||||
self.get_inner()?
|
||||
.rename_table(&old_name, &new_name, &ns, &ns)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn drop_all_tables(&self, namespace_path: Option<Vec<String>>) -> napi::Result<()> {
|
||||
let ns = namespace_path.unwrap_or_default();
|
||||
@@ -459,23 +473,4 @@ impl Connection {
|
||||
transaction_id: resp.transaction_id,
|
||||
})
|
||||
}
|
||||
|
||||
/// Rename a table. `current_namespace_path` and `new_namespace_path` default to
|
||||
/// the root namespace when omitted; the caller is expected to either pass both
|
||||
/// or pass neither.
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn rename_table(
|
||||
&self,
|
||||
current_name: String,
|
||||
new_name: String,
|
||||
current_namespace_path: Option<Vec<String>>,
|
||||
new_namespace_path: Option<Vec<String>>,
|
||||
) -> napi::Result<()> {
|
||||
let cur_ns = current_namespace_path.unwrap_or_default();
|
||||
let new_ns = new_namespace_path.unwrap_or_default();
|
||||
self.get_inner()?
|
||||
.rename_table(¤t_name, &new_name, &cur_ns, &new_ns)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use lancedb::table::{
|
||||
OptimizeAction, OptimizeOptions, Table as LanceDbTable,
|
||||
};
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi::threadsafe_function::{ThreadsafeFunction, ThreadsafeFunctionCallMode};
|
||||
use napi_derive::napi;
|
||||
|
||||
use crate::error::NapiErrorExt;
|
||||
@@ -68,16 +67,8 @@ impl Table {
|
||||
schema_to_buffer(&schema)
|
||||
}
|
||||
|
||||
#[napi(
|
||||
catch_unwind,
|
||||
ts_args_type = "buf: Buffer, mode: string, progressCallback?: (progress: WriteProgressInfo) => void"
|
||||
)]
|
||||
pub async fn add(
|
||||
&self,
|
||||
buf: Buffer,
|
||||
mode: String,
|
||||
progress_callback: Option<ProgressFn>,
|
||||
) -> napi::Result<AddResult> {
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn add(&self, buf: Buffer, mode: String) -> napi::Result<AddResult> {
|
||||
let batches = ipc_file_to_batches(buf.to_vec())
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
|
||||
let batches = batches
|
||||
@@ -101,19 +92,6 @@ impl Table {
|
||||
return Err(napi::Error::from_reason(format!("Invalid mode: {}", mode)));
|
||||
};
|
||||
|
||||
if let Some(tsfn) = progress_callback {
|
||||
op = op.progress(move |p| {
|
||||
// NonBlocking: dispatch onto the JS event loop without
|
||||
// blocking the writer thread. With napi-rs's default
|
||||
// unbounded queue, events are not dropped — a slow JS
|
||||
// callback will just queue them.
|
||||
tsfn.call(
|
||||
WriteProgressInfo::from(p),
|
||||
ThreadsafeFunctionCallMode::NonBlocking,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
let res = op.execute().await.default_error()?;
|
||||
Ok(res.into())
|
||||
}
|
||||
@@ -676,44 +654,6 @@ pub struct OptimizeStats {
|
||||
pub prune: RemovalStats,
|
||||
}
|
||||
|
||||
/// Progress snapshot for a write operation, delivered to the JS callback
|
||||
/// passed to `Table.add`.
|
||||
#[napi(object)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct WriteProgressInfo {
|
||||
/// Number of rows written so far.
|
||||
pub output_rows: i64,
|
||||
/// Number of bytes written so far.
|
||||
pub output_bytes: i64,
|
||||
/// Total rows expected, if the input source reports it.
|
||||
/// Always set on the final callback (where `done` is `true`).
|
||||
pub total_rows: Option<i64>,
|
||||
/// Wall-clock seconds since monitoring started.
|
||||
pub elapsed_seconds: f64,
|
||||
/// Number of parallel write tasks currently in flight.
|
||||
pub active_tasks: i64,
|
||||
/// Total number of parallel write tasks (the write parallelism).
|
||||
pub total_tasks: i64,
|
||||
/// `true` for the final callback; `false` otherwise.
|
||||
pub done: bool,
|
||||
}
|
||||
|
||||
impl From<&lancedb::table::write_progress::WriteProgress> for WriteProgressInfo {
|
||||
fn from(p: &lancedb::table::write_progress::WriteProgress) -> Self {
|
||||
Self {
|
||||
output_rows: p.output_rows() as i64,
|
||||
output_bytes: p.output_bytes() as i64,
|
||||
total_rows: p.total_rows().map(|n| n as i64),
|
||||
elapsed_seconds: p.elapsed().as_secs_f64(),
|
||||
active_tasks: p.active_tasks() as i64,
|
||||
total_tasks: p.total_tasks() as i64,
|
||||
done: p.done(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type ProgressFn = ThreadsafeFunction<WriteProgressInfo, (), WriteProgressInfo, Status, false>;
|
||||
|
||||
/// A definition of a column alteration. The alteration changes the column at
|
||||
/// `path` to have the new name `name`, to be nullable if `nullable` is true,
|
||||
/// and to have the data type `data_type`. At least one of `rename` or `nullable`
|
||||
|
||||
@@ -4,26 +4,16 @@ code is in the `src/` directory and the Python bindings are in the `lancedb/` di
|
||||
|
||||
Common commands:
|
||||
|
||||
* Bootstrap dev env: `uv run --extra tests --extra dev maturin develop --extras tests,dev`
|
||||
* Build: `make develop`
|
||||
* Format: `make format`
|
||||
* Lint: `make check`
|
||||
* Fix lints: `make fix`
|
||||
* Test: `uv run --extra tests pytest python/tests -vv --durations=10 -m "not slow and not s3_test"`
|
||||
* Run specific test: `uv run --extra tests pytest python/tests/<test_file>.py::<test_name> -q`
|
||||
* Doc test: `uv run --extra tests pytest --doctest-modules python/lancedb`
|
||||
|
||||
Use the uv-managed environment declared by `uv.lock` for Python validation. Do
|
||||
not treat system `python`, global `pytest`, or missing editable-install errors
|
||||
as final blockers; bootstrap or enter the uv environment instead. `make test`
|
||||
and `make doctest` assume the development environment is already prepared.
|
||||
* Test: `make test`
|
||||
* Doc test: `make doctest`
|
||||
|
||||
Before committing changes, run lints and then formatting.
|
||||
|
||||
When you change the Rust code, PyO3 binding code, or see a missing/stale
|
||||
`lancedb._lancedb`, recompile the Python bindings with
|
||||
`uv run --extra tests --extra dev maturin develop --extras tests,dev` before
|
||||
running tests.
|
||||
When you change the Rust code, you will need to recompile the Python bindings: `make develop`.
|
||||
|
||||
When you export new types from Rust to Python, you must manually update `python/lancedb/_lancedb.pyi`
|
||||
with the corresponding type hints. You can run `pyright` to check for type errors in the Python code.
|
||||
|
||||
@@ -718,7 +718,6 @@ class LanceQueryBuilder(ABC):
|
||||
flatten: Optional[Union[int, bool]] = None,
|
||||
*,
|
||||
timeout: Optional[timedelta] = None,
|
||||
**kwargs,
|
||||
) -> "pd.DataFrame":
|
||||
"""
|
||||
Execute the query and return the results as a pandas DataFrame.
|
||||
@@ -736,12 +735,9 @@ class LanceQueryBuilder(ABC):
|
||||
timeout: Optional[timedelta]
|
||||
The maximum time to wait for the query to complete.
|
||||
If None, wait indefinitely.
|
||||
**kwargs
|
||||
Forwarded to pyarrow.Table.to_pandas after query execution and
|
||||
optional flattening.
|
||||
"""
|
||||
tbl = flatten_columns(self.to_arrow(timeout=timeout), flatten)
|
||||
return tbl.to_pandas(**kwargs)
|
||||
return tbl.to_pandas()
|
||||
|
||||
@abstractmethod
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
@@ -2356,7 +2352,6 @@ class AsyncQueryBase(object):
|
||||
self,
|
||||
flatten: Optional[Union[int, bool]] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
**kwargs,
|
||||
) -> "pd.DataFrame":
|
||||
"""
|
||||
Execute the query and collect the results into a pandas DataFrame.
|
||||
@@ -2389,13 +2384,10 @@ class AsyncQueryBase(object):
|
||||
The maximum time to wait for the query to complete.
|
||||
If not specified, no timeout is applied. If the query does not
|
||||
complete within the specified time, an error will be raised.
|
||||
**kwargs
|
||||
Forwarded to pyarrow.Table.to_pandas after query execution and
|
||||
optional flattening.
|
||||
"""
|
||||
return (
|
||||
flatten_columns(await self.to_arrow(timeout=timeout), flatten)
|
||||
).to_pandas(**kwargs)
|
||||
).to_pandas()
|
||||
|
||||
async def to_polars(
|
||||
self,
|
||||
@@ -3397,7 +3389,6 @@ class BaseQueryBuilder(object):
|
||||
self,
|
||||
flatten: Optional[Union[int, bool]] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
**kwargs,
|
||||
) -> "pd.DataFrame":
|
||||
"""
|
||||
Execute the query and collect the results into a pandas DataFrame.
|
||||
@@ -3430,11 +3421,8 @@ class BaseQueryBuilder(object):
|
||||
The maximum time to wait for the query to complete.
|
||||
If not specified, no timeout is applied. If the query does not
|
||||
complete within the specified time, an error will be raised.
|
||||
**kwargs
|
||||
Forwarded to pyarrow.Table.to_pandas after query execution and
|
||||
optional flattening.
|
||||
"""
|
||||
return LOOP.run(self._inner.to_pandas(flatten, timeout, **kwargs))
|
||||
return LOOP.run(self._inner.to_pandas(flatten, timeout))
|
||||
|
||||
def to_polars(
|
||||
self,
|
||||
|
||||
@@ -40,7 +40,7 @@ from lancedb.embeddings import EmbeddingFunctionRegistry
|
||||
from lancedb.table import _normalize_progress
|
||||
|
||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
||||
from ..table import AsyncTable, BlobMode, IndexStatistics, Query, Table, Tags
|
||||
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
||||
from ..types import BaseTokenizerType
|
||||
|
||||
|
||||
@@ -101,7 +101,7 @@ class RemoteTable(Table):
|
||||
"""to_arrow() is not yet supported on LanceDB cloud."""
|
||||
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")
|
||||
|
||||
def to_pandas(self, blob_mode: BlobMode = "lazy", **kwargs):
|
||||
def to_pandas(self):
|
||||
"""to_pandas() is not yet supported on LanceDB cloud."""
|
||||
raise NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||
|
||||
|
||||
@@ -87,8 +87,6 @@ from .util import (
|
||||
)
|
||||
from .index import lang_mapping
|
||||
|
||||
BlobMode = Literal["lazy", "bytes", "descriptions"]
|
||||
|
||||
_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera")
|
||||
_MODEL_BACKED_TOKENIZER_ERRORS = (
|
||||
"unknown base tokenizer",
|
||||
@@ -762,22 +760,14 @@ class Table(ABC):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def to_pandas(self, blob_mode: BlobMode = "lazy", **kwargs) -> "pandas.DataFrame":
|
||||
def to_pandas(self) -> "pandas.DataFrame":
|
||||
"""Return the table as a pandas DataFrame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
blob_mode: str, default "lazy"
|
||||
Controls how blob columns are returned for backends that support
|
||||
Lance blob-aware pandas conversion.
|
||||
**kwargs
|
||||
Forwarded to PyArrow / Lance pandas conversion.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame
|
||||
"""
|
||||
return self.to_arrow().to_pandas(**kwargs)
|
||||
return self.to_arrow().to_pandas()
|
||||
|
||||
@abstractmethod
|
||||
def to_arrow(self) -> pa.Table:
|
||||
@@ -2193,27 +2183,14 @@ class LanceTable(Table):
|
||||
"""Return the first n rows of the table."""
|
||||
return LOOP.run(self._table.head(n))
|
||||
|
||||
def to_pandas(self, blob_mode: BlobMode = "lazy", **kwargs) -> "pd.DataFrame":
|
||||
def to_pandas(self) -> "pd.DataFrame":
|
||||
"""Return the table as a pandas DataFrame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
blob_mode: str, default "lazy"
|
||||
Controls how Lance blob columns are returned.
|
||||
**kwargs
|
||||
Forwarded to Lance pandas conversion.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame
|
||||
"""
|
||||
if blob_mode == "lazy" and (
|
||||
self._namespace_client is not None
|
||||
or get_uri_scheme(self._dataset_path) == "memory"
|
||||
):
|
||||
return self.to_arrow().to_pandas(**kwargs)
|
||||
|
||||
return self.to_lance().to_pandas(blob_mode=blob_mode, **kwargs)
|
||||
return self.to_arrow().to_pandas()
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
"""Return the table as a pyarrow Table.
|
||||
@@ -2542,6 +2519,11 @@ class LanceTable(Table):
|
||||
"at a time. To search over multiple text fields, create a "
|
||||
"separate FTS index for each field."
|
||||
)
|
||||
if "." in field_names:
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on top-level fields. "
|
||||
f"Received nested field path: {field_names!r}."
|
||||
)
|
||||
|
||||
if tokenizer_name is None:
|
||||
tokenizer_configs = {
|
||||
@@ -3963,39 +3945,14 @@ class AsyncTable:
|
||||
"""
|
||||
return AsyncQuery(self._inner.query())
|
||||
|
||||
async def _to_lance(self, **kwargs) -> lance.LanceDataset:
|
||||
try:
|
||||
import lance
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The lance library is required to use this function. "
|
||||
"Please install with `pip install pylance`."
|
||||
)
|
||||
|
||||
return lance.dataset(
|
||||
await self.uri(),
|
||||
version=await self.version(),
|
||||
storage_options=await self.latest_storage_options(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def to_pandas(self, blob_mode: BlobMode = "lazy", **kwargs) -> "pd.DataFrame":
|
||||
async def to_pandas(self) -> "pd.DataFrame":
|
||||
"""Return the table as a pandas DataFrame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
blob_mode: str, default "lazy"
|
||||
Controls how Lance blob columns are returned.
|
||||
**kwargs
|
||||
Forwarded to PyArrow / Lance pandas conversion.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame
|
||||
"""
|
||||
if blob_mode == "lazy":
|
||||
return (await self.to_arrow()).to_pandas(**kwargs)
|
||||
return (await self._to_lance()).to_pandas(blob_mode=blob_mode, **kwargs)
|
||||
return (await self.to_arrow()).to_pandas()
|
||||
|
||||
async def to_arrow(self) -> pa.Table:
|
||||
"""Return the table as a pyarrow Table.
|
||||
|
||||
@@ -563,19 +563,8 @@ def test_create_index_multiple_columns(tmp_path, table):
|
||||
|
||||
|
||||
def test_nested_schema(tmp_path, table):
|
||||
table.create_fts_index("nested.text")
|
||||
indices = table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "FTS"
|
||||
assert indices[0].columns == ["nested.text"]
|
||||
|
||||
results = (
|
||||
table.search("puppy", query_type="fts", fts_columns="nested.text")
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) > 0
|
||||
assert all("puppy" in row["nested"]["text"] for row in results)
|
||||
with pytest.raises(ValueError, match="top-level fields"):
|
||||
table.create_fts_index("nested.text")
|
||||
|
||||
|
||||
def test_search_index_with_filter(table):
|
||||
|
||||
@@ -165,22 +165,6 @@ def test_offset(table):
|
||||
assert len(results_with_offset.to_pandas()) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_query_to_pandas_kwargs(table, table_async):
|
||||
sync_df = (
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||
.select(["id"])
|
||||
.limit(1)
|
||||
.to_pandas(split_blocks=True)
|
||||
)
|
||||
assert sync_df["id"].tolist() == [1]
|
||||
|
||||
async_df = await (
|
||||
table_async.query().select(["id"]).limit(2).to_pandas(split_blocks=True)
|
||||
)
|
||||
assert async_df["id"].tolist() == [1, 2]
|
||||
|
||||
|
||||
def test_order_by_plain_query(mem_db):
|
||||
table = mem_db.create_table(
|
||||
"test_order_by",
|
||||
|
||||
@@ -269,25 +269,6 @@ def test_table_unimplemented_functions():
|
||||
table.to_pandas()
|
||||
|
||||
|
||||
def test_table_to_pandas_not_supported():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/create/?mode=create":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(b"{}")
|
||||
else:
|
||||
request.send_response(404)
|
||||
request.end_headers()
|
||||
|
||||
with mock_lancedb_connection(handler) as db:
|
||||
table = db.create_table("test", [{"id": 1}])
|
||||
with pytest.raises(NotImplementedError):
|
||||
table.to_pandas()
|
||||
with pytest.raises(NotImplementedError):
|
||||
table.to_pandas(blob_mode="bytes", split_blocks=True)
|
||||
|
||||
|
||||
def test_table_add_in_threadpool():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/insert/":
|
||||
|
||||
@@ -47,85 +47,6 @@ def test_basic(mem_db: DBConnection):
|
||||
assert table.to_arrow() == expected_data
|
||||
|
||||
|
||||
def test_table_to_pandas_default_matches_arrow(tmp_db: DBConnection):
|
||||
pd = pytest.importorskip("pandas")
|
||||
data = pa.table({"id": [1, 2], "text": ["one", "two"]})
|
||||
table = tmp_db.create_table("test_to_pandas_old_call", data=data)
|
||||
|
||||
expected = data.to_pandas()
|
||||
pd.testing.assert_frame_equal(table.to_pandas(), expected)
|
||||
|
||||
|
||||
def test_table_to_pandas_blob_bytes(tmp_db: DBConnection):
|
||||
pytest.importorskip("lance")
|
||||
data = pa.table(
|
||||
{
|
||||
"id": pa.array([1, 2], pa.int64()),
|
||||
"blob": pa.array([b"hello", b"world"], pa.large_binary()),
|
||||
},
|
||||
schema=pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int64()),
|
||||
pa.field(
|
||||
"blob", pa.large_binary(), metadata={"lance-encoding:blob": "true"}
|
||||
),
|
||||
]
|
||||
),
|
||||
)
|
||||
table = tmp_db.create_table("test_to_pandas_blob_bytes", data=data)
|
||||
|
||||
df = table.to_pandas(blob_mode="bytes")
|
||||
|
||||
assert df["blob"].tolist() == [b"hello", b"world"]
|
||||
|
||||
|
||||
def test_table_to_pandas_kwargs(tmp_db: DBConnection):
|
||||
pd = pytest.importorskip("pandas")
|
||||
data = pa.table({"id": pa.array([1, 2], pa.int64())})
|
||||
table = tmp_db.create_table("test_to_pandas_kwargs", data=data)
|
||||
|
||||
df = table.to_pandas(types_mapper=pd.ArrowDtype)
|
||||
|
||||
assert str(df["id"].dtype) == "int64[pyarrow]"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_table_to_pandas_blob_bytes(tmp_db_async: AsyncConnection):
|
||||
pytest.importorskip("lance")
|
||||
data = pa.table(
|
||||
{
|
||||
"id": pa.array([1, 2], pa.int64()),
|
||||
"blob": pa.array([b"hello", b"world"], pa.large_binary()),
|
||||
},
|
||||
schema=pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int64()),
|
||||
pa.field(
|
||||
"blob", pa.large_binary(), metadata={"lance-encoding:blob": "true"}
|
||||
),
|
||||
]
|
||||
),
|
||||
)
|
||||
table = await tmp_db_async.create_table(
|
||||
"test_async_to_pandas_blob_bytes", data=data
|
||||
)
|
||||
|
||||
df = await table.to_pandas(blob_mode="bytes")
|
||||
|
||||
assert df["blob"].tolist() == [b"hello", b"world"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_table_to_pandas_kwargs(tmp_db_async: AsyncConnection):
|
||||
pd = pytest.importorskip("pandas")
|
||||
data = pa.table({"id": pa.array([1, 2], pa.int64())})
|
||||
table = await tmp_db_async.create_table("test_async_to_pandas_kwargs", data=data)
|
||||
|
||||
df = await table.to_pandas(types_mapper=pd.ArrowDtype)
|
||||
|
||||
assert str(df["id"].dtype) == "int64[pyarrow]"
|
||||
|
||||
|
||||
def test_create_table_infers_large_int_vectors(mem_db: DBConnection):
|
||||
data = [{"vector": [0, 300]}]
|
||||
|
||||
@@ -1890,55 +1811,6 @@ def test_create_scalar_index(mem_db: DBConnection):
|
||||
assert scalar_index.name == "custom_y_index"
|
||||
|
||||
|
||||
def test_create_index_nested_field_paths(mem_db: DBConnection):
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("metadata", pa.struct([pa.field("user_id", pa.int32())])),
|
||||
pa.field(
|
||||
"image",
|
||||
pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
|
||||
),
|
||||
]
|
||||
)
|
||||
data = pa.Table.from_pylist(
|
||||
[
|
||||
{
|
||||
"metadata": {"user_id": i},
|
||||
"image": {"embedding": [float(i), float(i + 1)]},
|
||||
}
|
||||
for i in range(256)
|
||||
],
|
||||
schema=schema,
|
||||
)
|
||||
table = mem_db.create_table("nested_index_paths", data=data)
|
||||
|
||||
table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx")
|
||||
table.create_index(
|
||||
vector_column_name="image.embedding",
|
||||
num_partitions=1,
|
||||
num_sub_vectors=1,
|
||||
name="image_embedding_idx",
|
||||
)
|
||||
|
||||
indices = sorted(table.list_indices(), key=lambda idx: idx.name)
|
||||
assert [(idx.name, idx.index_type, idx.columns) for idx in indices] == [
|
||||
("image_embedding_idx", "IvfPq", ["image.embedding"]),
|
||||
("metadata_user_id_idx", "BTree", ["metadata.user_id"]),
|
||||
]
|
||||
|
||||
vector_results = (
|
||||
table.search([0.0, 1.0], vector_column_name="image.embedding")
|
||||
.limit(1)
|
||||
.to_list()
|
||||
)
|
||||
assert len(vector_results) == 1
|
||||
assert vector_results[0]["metadata"]["user_id"] == 0
|
||||
|
||||
filtered_results = table.search().where("metadata.user_id = 42").limit(1).to_list()
|
||||
assert len(filtered_results) == 1
|
||||
assert filtered_results[0]["metadata"]["user_id"] == 42
|
||||
|
||||
|
||||
def test_empty_query(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"my_table",
|
||||
|
||||
@@ -271,26 +271,15 @@ impl Scannable for WithEmbeddingsScannable {
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Task panicked during embedding computation: {}", e),
|
||||
})??;
|
||||
// Look up columns by name (not position) so the result matches
|
||||
// the output schema even when columns appear in a different
|
||||
// order — e.g. `add_columns` placed a new column after the
|
||||
// embedding column, but the computed batch appends embeddings
|
||||
// at the end. Cast per-column because field metadata (e.g.
|
||||
// nested nullability) may also differ between the embedding
|
||||
// function output and the table.
|
||||
let columns: Vec<ArrayRef> = output_schema
|
||||
.fields()
|
||||
// Cast columns to match the declared output schema. The data is
|
||||
// identical but field metadata (e.g. nested nullability) may
|
||||
// differ between the embedding function output and the table.
|
||||
let columns: Vec<ArrayRef> = result
|
||||
.columns()
|
||||
.iter()
|
||||
.map(|field| {
|
||||
let col = result.column_by_name(field.name()).ok_or_else(|| {
|
||||
Error::InvalidInput {
|
||||
message: format!(
|
||||
"Column '{}' required by the table schema was not present in the input batch",
|
||||
field.name()
|
||||
),
|
||||
}
|
||||
})?;
|
||||
let target_type = field.data_type();
|
||||
.enumerate()
|
||||
.map(|(i, col)| {
|
||||
let target_type = output_schema.field(i).data_type();
|
||||
if col.data_type() == target_type {
|
||||
Ok(col.clone())
|
||||
} else {
|
||||
@@ -975,118 +964,5 @@ mod tests {
|
||||
"Expected EmbeddingFunctionNotFound"
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression test for https://github.com/lancedb/lancedb/issues/3136.
|
||||
///
|
||||
/// When a column is added to the table after the embedding column via
|
||||
/// schema evolution, the table schema becomes
|
||||
/// `[..., embedding, extra]`. The input batch (without the embedding)
|
||||
/// is `[..., extra]`, and `compute_embeddings_for_batch` appends the
|
||||
/// embedding at the end giving `[..., extra, embedding]`. A positional
|
||||
/// cast to the output schema would map `extra` onto `embedding` and
|
||||
/// fail with a CastError. Columns must be matched by name.
|
||||
#[tokio::test]
|
||||
async fn test_with_embeddings_scannable_column_added_after_embedding() {
|
||||
let input_schema = Arc::new(Schema::new(vec![
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new("score", DataType::Float64, true),
|
||||
]));
|
||||
let batch = RecordBatch::try_new(
|
||||
input_schema.clone(),
|
||||
vec![
|
||||
Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef,
|
||||
Arc::new(arrow_array::Float64Array::from(vec![1.0, 2.0])) as ArrayRef,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mock_embedding: Arc<dyn EmbeddingFunction> = Arc::new(MockEmbed::new("mock", 4));
|
||||
let embedding_def = EmbeddingDefinition::new("text", "mock", Some("text_vec"));
|
||||
|
||||
// Table schema: embedding column is BEFORE `score`, as would
|
||||
// happen if `score` was added via `add_columns` after creating
|
||||
// the table with an embedding on `text`.
|
||||
let output_schema = Arc::new(Schema::new(vec![
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new(
|
||||
"text_vec",
|
||||
DataType::FixedSizeList(
|
||||
Arc::new(Field::new("item", DataType::Float32, true)),
|
||||
4,
|
||||
),
|
||||
false,
|
||||
),
|
||||
Field::new("score", DataType::Float64, true),
|
||||
]));
|
||||
|
||||
let mut scannable = WithEmbeddingsScannable::with_schema(
|
||||
Box::new(batch),
|
||||
vec![(embedding_def, mock_embedding)],
|
||||
output_schema.clone(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let stream = scannable.scan_as_stream();
|
||||
let results: Vec<RecordBatch> = stream.try_collect().await.unwrap();
|
||||
assert_eq!(results.len(), 1);
|
||||
|
||||
let result_batch = &results[0];
|
||||
assert_eq!(result_batch.schema(), output_schema);
|
||||
assert_eq!(result_batch.num_rows(), 2);
|
||||
// Position 1 must actually hold the FixedSizeList embedding —
|
||||
// not the score column reinterpreted by a permissive cast.
|
||||
let embedding = result_batch
|
||||
.column(1)
|
||||
.as_any()
|
||||
.downcast_ref::<arrow_array::FixedSizeListArray>()
|
||||
.expect("position 1 should be a FixedSizeList embedding");
|
||||
assert_eq!(embedding.value_length(), 4);
|
||||
assert_eq!(embedding.null_count(), 0);
|
||||
}
|
||||
|
||||
/// If the input batch is missing a non-embedding column required by
|
||||
/// the table schema, we should return a clear error rather than
|
||||
/// silently producing a malformed batch.
|
||||
#[tokio::test]
|
||||
async fn test_with_embeddings_scannable_missing_required_column() {
|
||||
let input_schema =
|
||||
Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)]));
|
||||
let batch = RecordBatch::try_new(
|
||||
input_schema,
|
||||
vec![Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mock_embedding: Arc<dyn EmbeddingFunction> = Arc::new(MockEmbed::new("mock", 4));
|
||||
let embedding_def = EmbeddingDefinition::new("text", "mock", Some("text_vec"));
|
||||
|
||||
let output_schema = Arc::new(Schema::new(vec![
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new(
|
||||
"text_vec",
|
||||
DataType::FixedSizeList(
|
||||
Arc::new(Field::new("item", DataType::Float32, true)),
|
||||
4,
|
||||
),
|
||||
false,
|
||||
),
|
||||
Field::new("score", DataType::Float64, true),
|
||||
]));
|
||||
|
||||
let mut scannable = WithEmbeddingsScannable::with_schema(
|
||||
Box::new(batch),
|
||||
vec![(embedding_def, mock_embedding)],
|
||||
output_schema,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let stream = scannable.scan_as_stream();
|
||||
let results: Result<Vec<RecordBatch>> = stream.try_collect().await;
|
||||
let err = results.expect_err("expected an error");
|
||||
assert!(
|
||||
matches!(&err, Error::InvalidInput { message } if message.contains("score")),
|
||||
"expected InvalidInput about missing 'score' column, got: {err:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2171,33 +2171,6 @@ impl NativeTable {
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_index_field(
|
||||
schema: &lance_core::datatypes::Schema,
|
||||
column: &str,
|
||||
) -> Result<(String, Field)> {
|
||||
lance_core::datatypes::parse_field_path(column).map_err(|e| Error::InvalidInput {
|
||||
message: format!("Invalid field path `{}`: {}", column, e),
|
||||
})?;
|
||||
|
||||
let field_path = schema
|
||||
.resolve_case_insensitive(column)
|
||||
.ok_or_else(|| Error::Schema {
|
||||
message: format!(
|
||||
"Field path `{}` not found in schema. Available field paths: {}",
|
||||
column,
|
||||
schema.field_paths().join(", ")
|
||||
),
|
||||
})?;
|
||||
let field = field_path.last().expect("field path should be non-empty");
|
||||
let path_segments = field_path
|
||||
.iter()
|
||||
.map(|field| field.name.as_str())
|
||||
.collect::<Vec<_>>();
|
||||
let canonical_path = lance_core::datatypes::format_field_path(&path_segments);
|
||||
|
||||
Ok((canonical_path, Field::from(*field)))
|
||||
}
|
||||
|
||||
// Convert LanceDB Index to Lance IndexParams
|
||||
async fn make_index_params(
|
||||
&self,
|
||||
@@ -2688,14 +2661,13 @@ impl BaseTable for NativeTable {
|
||||
message: "Multi-column (composite) indices are not yet supported".to_string(),
|
||||
});
|
||||
}
|
||||
let schema = self.schema().await?;
|
||||
|
||||
let dataset = self.dataset.get().await?;
|
||||
let (column, field) = Self::resolve_index_field(dataset.schema(), &opts.columns[0])?;
|
||||
drop(dataset);
|
||||
let field = schema.field_with_name(&opts.columns[0])?;
|
||||
|
||||
let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?;
|
||||
let index_type = self.get_index_type_for_field(&field, &opts.index);
|
||||
let columns = [column.as_str()];
|
||||
let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?;
|
||||
let index_type = self.get_index_type_for_field(field, &opts.index);
|
||||
let columns = [field.name().as_str()];
|
||||
self.dataset.ensure_mutable()?;
|
||||
let mut dataset = (*self.dataset.get().await?).clone();
|
||||
let mut builder = dataset
|
||||
@@ -2853,20 +2825,11 @@ impl BaseTable for NativeTable {
|
||||
|
||||
let mut columns = Vec::with_capacity(idx.fields.len());
|
||||
for field_id in &idx.fields {
|
||||
let column = match dataset.schema().field_path(*field_id) {
|
||||
Ok(column) => column,
|
||||
Err(e) => {
|
||||
log::warn!(
|
||||
"The index {} ({}) referenced a field with id {} which does not exist in the schema: {}",
|
||||
idx.name,
|
||||
idx.uuid,
|
||||
field_id,
|
||||
e
|
||||
);
|
||||
return None;
|
||||
}
|
||||
let Some(field) = dataset.schema().field_by_id(*field_id) else {
|
||||
log::warn!("The index {} ({}) referenced a field with id {} which does not exist in the schema", idx.name, idx.uuid, field_id);
|
||||
return None;
|
||||
};
|
||||
columns.push(column);
|
||||
columns.push(field.name.clone());
|
||||
}
|
||||
|
||||
let name = idx.name.clone();
|
||||
@@ -3079,8 +3042,8 @@ mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use arrow_array::{
|
||||
Array, ArrayRef, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray,
|
||||
RecordBatch, RecordBatchIterator, RecordBatchReader, StringArray, StructArray,
|
||||
Array, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, RecordBatch,
|
||||
RecordBatchIterator, RecordBatchReader, StringArray,
|
||||
builder::{ListBuilder, StringBuilder},
|
||||
};
|
||||
use arrow_array::{BinaryArray, LargeBinaryArray};
|
||||
@@ -3100,7 +3063,6 @@ mod tests {
|
||||
use crate::query::Select;
|
||||
use crate::query::{ExecutableQuery, QueryBase};
|
||||
use crate::test_utils::connection::new_test_connection;
|
||||
use lance_index::scalar::FullTextSearchQuery;
|
||||
#[tokio::test]
|
||||
async fn test_open() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
@@ -3688,203 +3650,6 @@ mod tests {
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_index_nested_field_paths() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let conn = ConnectBuilder::new(uri).execute().await.unwrap();
|
||||
|
||||
let num_rows = 512;
|
||||
let dimension = 8;
|
||||
|
||||
let metadata = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("user_id", DataType::Int32, false)),
|
||||
Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let vector_values = arrow_array::Float32Array::from_iter_values(
|
||||
(0..num_rows * dimension).map(|v| v as f32),
|
||||
);
|
||||
let embeddings =
|
||||
Arc::new(create_fixed_size_list(vector_values, dimension).unwrap()) as ArrayRef;
|
||||
let image = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new(
|
||||
"embedding",
|
||||
embeddings.data_type().clone(),
|
||||
false,
|
||||
)),
|
||||
embeddings,
|
||||
)]));
|
||||
|
||||
let payload = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("text", DataType::Utf8, false)),
|
||||
Arc::new(StringArray::from_iter_values(
|
||||
(0..num_rows).map(|i| format!("document {}", i)),
|
||||
)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let meta_data = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("user-id", DataType::Int32, false)),
|
||||
Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let literal = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("a.b", DataType::Int32, false)),
|
||||
Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("metadata", metadata.data_type().clone(), false),
|
||||
Field::new("image", image.data_type().clone(), false),
|
||||
Field::new("payload", payload.data_type().clone(), false),
|
||||
Field::new("meta-data", meta_data.data_type().clone(), false),
|
||||
Field::new("literal", literal.data_type().clone(), false),
|
||||
]));
|
||||
let batch =
|
||||
RecordBatch::try_new(schema, vec![metadata, image, payload, meta_data, literal])
|
||||
.unwrap();
|
||||
|
||||
let table = conn
|
||||
.create_table("nested_index_paths", batch)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
table
|
||||
.create_index(
|
||||
&["metadata.user_id"],
|
||||
Index::BTree(BTreeIndexBuilder::default()),
|
||||
)
|
||||
.name("metadata_user_id_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(&["image.embedding"], Index::Auto)
|
||||
.name("image_embedding_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(&["payload.text"], Index::FTS(Default::default()))
|
||||
.name("payload_text_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(
|
||||
&["`meta-data`.`user-id`"],
|
||||
Index::BTree(BTreeIndexBuilder::default()),
|
||||
)
|
||||
.name("escaped_names_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(
|
||||
&["literal.`a.b`"],
|
||||
Index::BTree(BTreeIndexBuilder::default()),
|
||||
)
|
||||
.name("literal_dot_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut index_configs = table.list_indices().await.unwrap();
|
||||
index_configs.sort_by(|left, right| left.name.cmp(&right.name));
|
||||
|
||||
let indexed_columns = index_configs
|
||||
.iter()
|
||||
.map(|index| {
|
||||
(
|
||||
index.name.as_str(),
|
||||
index.columns.as_slice(),
|
||||
index.index_type.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
indexed_columns,
|
||||
vec![
|
||||
(
|
||||
"escaped_names_idx",
|
||||
&["`meta-data`.`user-id`".to_string()][..],
|
||||
crate::index::IndexType::BTree,
|
||||
),
|
||||
(
|
||||
"image_embedding_idx",
|
||||
&["image.embedding".to_string()][..],
|
||||
crate::index::IndexType::IvfPq,
|
||||
),
|
||||
(
|
||||
"literal_dot_idx",
|
||||
&["literal.`a.b`".to_string()][..],
|
||||
crate::index::IndexType::BTree,
|
||||
),
|
||||
(
|
||||
"metadata_user_id_idx",
|
||||
&["metadata.user_id".to_string()][..],
|
||||
crate::index::IndexType::BTree,
|
||||
),
|
||||
(
|
||||
"payload_text_idx",
|
||||
&["payload.text".to_string()][..],
|
||||
crate::index::IndexType::FTS,
|
||||
),
|
||||
]
|
||||
);
|
||||
|
||||
let vector_results = table
|
||||
.query()
|
||||
.nearest_to(&[0.0; 8])
|
||||
.unwrap()
|
||||
.column("image.embedding")
|
||||
.limit(1)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
vector_results
|
||||
.iter()
|
||||
.map(|batch| batch.num_rows())
|
||||
.sum::<usize>(),
|
||||
1
|
||||
);
|
||||
|
||||
let fts_results = table
|
||||
.query()
|
||||
.full_text_search(FullTextSearchQuery::new("document".to_string()))
|
||||
.limit(5)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!fts_results.is_empty());
|
||||
|
||||
let filtered_results = table
|
||||
.query()
|
||||
.only_if("metadata.user_id = 42")
|
||||
.limit(1)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
filtered_results
|
||||
.iter()
|
||||
.map(|batch| batch.num_rows())
|
||||
.sum::<usize>(),
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_bitmap_index() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
|
||||
@@ -268,9 +268,7 @@ mod tests {
|
||||
};
|
||||
use crate::query::{ExecutableQuery, QueryBase, Select};
|
||||
use crate::table::add_data::NaNVectorBehavior;
|
||||
use crate::table::{
|
||||
ColumnDefinition, ColumnKind, NewColumnTransform, Table, TableDefinition, WriteOptions,
|
||||
};
|
||||
use crate::table::{ColumnDefinition, ColumnKind, Table, TableDefinition, WriteOptions};
|
||||
use crate::test_utils::TestCustomError;
|
||||
use crate::test_utils::embeddings::MockEmbed;
|
||||
|
||||
@@ -520,225 +518,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Regression test for https://github.com/lancedb/lancedb/issues/3136.
|
||||
///
|
||||
/// When a column is added via `add_columns` AFTER an embedding column,
|
||||
/// the table schema becomes `[..., embedding, extra]`. Subsequent
|
||||
/// `table.add()` calls used to fail with a CastError because columns
|
||||
/// were matched positionally rather than by name.
|
||||
#[tokio::test]
|
||||
async fn test_add_with_embeddings_after_add_columns() {
|
||||
let registry = Arc::new(MemoryRegistry::new());
|
||||
let mock_embedding: Arc<dyn EmbeddingFunction> = Arc::new(MockEmbed::new("mock", 4));
|
||||
registry.register("mock", mock_embedding).unwrap();
|
||||
|
||||
let conn = connect("memory://")
|
||||
.embedding_registry(registry)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new(
|
||||
"text_vec",
|
||||
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4),
|
||||
false,
|
||||
),
|
||||
]));
|
||||
|
||||
let embedding_def = EmbeddingDefinition::new("text", "mock", Some("text_vec"));
|
||||
let table_def = TableDefinition::new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
ColumnDefinition {
|
||||
kind: ColumnKind::Physical,
|
||||
},
|
||||
ColumnDefinition {
|
||||
kind: ColumnKind::Embedding(embedding_def),
|
||||
},
|
||||
],
|
||||
);
|
||||
let rich_schema = table_def.into_rich_schema();
|
||||
|
||||
let table = conn
|
||||
.create_empty_table("embed_evol_test", rich_schema)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Seed a row so add_columns has data to compute against.
|
||||
let seed_batch = record_batch!(("text", Utf8, ["hello"])).unwrap();
|
||||
table.add(seed_batch).execute().await.unwrap();
|
||||
|
||||
// Add a new physical column AFTER the embedding column.
|
||||
table
|
||||
.add_columns(
|
||||
NewColumnTransform::SqlExpressions(vec![("score".into(), "42.0".into())]),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Now add data including the new column but WITHOUT the embedding.
|
||||
// The input batch column order is [text, score]; after computing the
|
||||
// embedding it becomes [text, score, text_vec], but the table schema
|
||||
// is [text, text_vec, score]. Columns must be matched by name.
|
||||
let new_schema = Arc::new(Schema::new(vec![
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new("score", DataType::Float64, true),
|
||||
]));
|
||||
let new_batch = RecordBatch::try_new(
|
||||
new_schema,
|
||||
vec![
|
||||
Arc::new(arrow_array::StringArray::from(vec!["foo", "bar"])),
|
||||
Arc::new(arrow_array::Float64Array::from(vec![1.0, 2.0])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
table.add(new_batch).execute().await.unwrap();
|
||||
|
||||
assert_eq!(table.count_rows(None).await.unwrap(), 3);
|
||||
|
||||
let results: Vec<RecordBatch> = table
|
||||
.query()
|
||||
.select(Select::columns(&["text", "text_vec", "score"]))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let total_rows: usize = results.iter().map(|b| b.num_rows()).sum();
|
||||
assert_eq!(total_rows, 3);
|
||||
for batch in &results {
|
||||
// text_vec must be populated for the newly added rows too.
|
||||
assert_eq!(batch.column(1).null_count(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
/// Like `test_add_with_embeddings_after_add_columns`, but the column
|
||||
/// added after the embedding is a nested struct rather than a scalar.
|
||||
/// Verifies that name-based column matching also works when the
|
||||
/// post-embedding column has a complex Arrow type.
|
||||
#[tokio::test]
|
||||
async fn test_add_with_embeddings_after_add_nested_columns() {
|
||||
let registry = Arc::new(MemoryRegistry::new());
|
||||
let mock_embedding: Arc<dyn EmbeddingFunction> = Arc::new(MockEmbed::new("mock", 4));
|
||||
registry.register("mock", mock_embedding).unwrap();
|
||||
|
||||
let conn = connect("memory://")
|
||||
.embedding_registry(registry)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new(
|
||||
"text_vec",
|
||||
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4),
|
||||
false,
|
||||
),
|
||||
]));
|
||||
|
||||
let embedding_def = EmbeddingDefinition::new("text", "mock", Some("text_vec"));
|
||||
let table_def = TableDefinition::new(
|
||||
schema,
|
||||
vec![
|
||||
ColumnDefinition {
|
||||
kind: ColumnKind::Physical,
|
||||
},
|
||||
ColumnDefinition {
|
||||
kind: ColumnKind::Embedding(embedding_def),
|
||||
},
|
||||
],
|
||||
);
|
||||
let rich_schema = table_def.into_rich_schema();
|
||||
|
||||
let table = conn
|
||||
.create_empty_table("embed_nested_test", rich_schema)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let seed_batch = record_batch!(("text", Utf8, ["hello"])).unwrap();
|
||||
table.add(seed_batch).execute().await.unwrap();
|
||||
|
||||
// Add a STRUCT column after the embedding column.
|
||||
let meta_struct = DataType::Struct(
|
||||
vec![
|
||||
Field::new("source", DataType::Utf8, true),
|
||||
Field::new("score", DataType::Float64, true),
|
||||
]
|
||||
.into(),
|
||||
);
|
||||
let nested_schema = Arc::new(Schema::new(vec![Field::new(
|
||||
"meta",
|
||||
meta_struct.clone(),
|
||||
true,
|
||||
)]));
|
||||
table
|
||||
.add_columns(NewColumnTransform::AllNulls(nested_schema), None)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Insert with the nested struct present but the embedding column
|
||||
// absent. The computed batch is [text, meta, text_vec], but the
|
||||
// table schema is [text, text_vec, meta] — only name-based matching
|
||||
// can put `meta` (a struct) in the right slot.
|
||||
let source = Arc::new(arrow_array::StringArray::from(vec!["foo", "bar"]));
|
||||
let score = Arc::new(arrow_array::Float64Array::from(vec![1.0, 2.0]));
|
||||
let meta = Arc::new(arrow_array::StructArray::from(vec![
|
||||
(
|
||||
Arc::new(Field::new("source", DataType::Utf8, true)),
|
||||
source as Arc<dyn arrow_array::Array>,
|
||||
),
|
||||
(
|
||||
Arc::new(Field::new("score", DataType::Float64, true)),
|
||||
score as Arc<dyn arrow_array::Array>,
|
||||
),
|
||||
]));
|
||||
let new_schema = Arc::new(Schema::new(vec![
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new("meta", meta_struct, true),
|
||||
]));
|
||||
let new_batch = RecordBatch::try_new(
|
||||
new_schema,
|
||||
vec![
|
||||
Arc::new(arrow_array::StringArray::from(vec!["foo", "bar"])),
|
||||
meta,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
table.add(new_batch).execute().await.unwrap();
|
||||
|
||||
assert_eq!(table.count_rows(None).await.unwrap(), 3);
|
||||
|
||||
let results: Vec<RecordBatch> = table
|
||||
.query()
|
||||
.select(Select::columns(&["text", "text_vec", "meta"]))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let total_rows: usize = results.iter().map(|b| b.num_rows()).sum();
|
||||
assert_eq!(total_rows, 3);
|
||||
for batch in &results {
|
||||
assert_eq!(batch.schema().field(2).name(), "meta");
|
||||
assert!(matches!(
|
||||
batch.schema().field(2).data_type(),
|
||||
DataType::Struct(_)
|
||||
));
|
||||
// text_vec must be populated for the newly added rows too.
|
||||
assert_eq!(batch.column(1).null_count(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_add_casts_to_table_schema() {
|
||||
let table_schema = Arc::new(Schema::new(vec![
|
||||
|
||||
Reference in New Issue
Block a user