mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-05 13:20:39 +00:00
Compare commits
6 Commits
codex/upda
...
codex/upda
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef46e49876 | ||
|
|
d1d720d08a | ||
|
|
c2e543f1b7 | ||
|
|
216c1b5f77 | ||
|
|
fc1867da83 | ||
|
|
f951da2b00 |
64
Cargo.lock
generated
64
Cargo.lock
generated
@@ -3070,8 +3070,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "fsst"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"rand 0.9.2",
|
||||
@@ -4241,8 +4241,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4308,8 +4308,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-arrow"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4329,8 +4329,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-bitpacking"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"paste",
|
||||
@@ -4339,8 +4339,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-core"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4377,8 +4377,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datafusion"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4408,8 +4408,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datagen"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4427,8 +4427,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-encoding"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -4465,8 +4465,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-file"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -4498,8 +4498,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-index"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4562,8 +4562,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-io"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4604,8 +4604,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-linalg"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4621,8 +4621,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-namespace"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -4634,8 +4634,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-namespace-impls"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-ipc",
|
||||
@@ -4679,8 +4679,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-table"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4719,8 +4719,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-testing"
|
||||
version = "4.0.0-beta.8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.8#e7eb014de4a8d9d958ed9eaa8d62eadb9cae3f40"
|
||||
version = "4.0.0-beta.12"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v4.0.0-beta.12#1e7b7252881261f290740d8d5a487c053f05039e"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
|
||||
28
Cargo.toml
28
Cargo.toml
@@ -15,20 +15,20 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.91.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=4.0.0-beta.8", default-features = false, "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=4.0.0-beta.8", default-features = false, "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=4.0.0-beta.8", default-features = false, "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=4.0.0-beta.8", "tag" = "v4.0.0-beta.8", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance = { "version" = "=4.0.0-beta.12", default-features = false, "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=4.0.0-beta.12", default-features = false, "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=4.0.0-beta.12", default-features = false, "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=4.0.0-beta.12", "tag" = "v4.0.0-beta.12", "git" = "https://github.com/lance-format/lance.git" }
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "57.2", optional = false }
|
||||
|
||||
@@ -71,11 +71,12 @@ Add new columns with defined values.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **newColumnTransforms**: [`AddColumnsSql`](../interfaces/AddColumnsSql.md)[]
|
||||
pairs of column names and
|
||||
the SQL expression to use to calculate the value of the new column. These
|
||||
expressions will be evaluated for each row in the table, and can
|
||||
reference existing columns in the table.
|
||||
* **newColumnTransforms**: `Field`<`any`> \| `Field`<`any`>[] \| `Schema`<`any`> \| [`AddColumnsSql`](../interfaces/AddColumnsSql.md)[]
|
||||
Either:
|
||||
- An array of objects with column names and SQL expressions to calculate values
|
||||
- A single Arrow Field defining one column with its data type (column will be initialized with null values)
|
||||
- An array of Arrow Fields defining columns with their data types (columns will be initialized with null values)
|
||||
- An Arrow Schema defining columns with their data types (columns will be initialized with null values)
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-core.version>4.0.0-beta.8</lance-core.version>
|
||||
<lance-core.version>4.0.0-beta.12</lance-core.version>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
|
||||
@@ -1259,6 +1259,98 @@ describe("schema evolution", function () {
|
||||
expect(await table.schema()).toEqual(expectedSchema);
|
||||
});
|
||||
|
||||
it("can add columns with schema for explicit data types", async function () {
|
||||
const con = await connect(tmpDir.name);
|
||||
const table = await con.createTable("vectors", [
|
||||
{ id: 1n, vector: [0.1, 0.2] },
|
||||
]);
|
||||
|
||||
// Define schema for new columns with explicit data types
|
||||
// Note: All columns must be nullable when using addColumns with Schema
|
||||
// because they are initially populated with null values
|
||||
const newColumnsSchema = new Schema([
|
||||
new Field("price", new Float64(), true),
|
||||
new Field("category", new Utf8(), true),
|
||||
new Field("rating", new Int32(), true),
|
||||
]);
|
||||
|
||||
const result = await table.addColumns(newColumnsSchema);
|
||||
expect(result).toHaveProperty("version");
|
||||
expect(result.version).toBe(2);
|
||||
|
||||
const expectedSchema = new Schema([
|
||||
new Field("id", new Int64(), true),
|
||||
new Field(
|
||||
"vector",
|
||||
new FixedSizeList(2, new Field("item", new Float32(), true)),
|
||||
true,
|
||||
),
|
||||
new Field("price", new Float64(), true),
|
||||
new Field("category", new Utf8(), true),
|
||||
new Field("rating", new Int32(), true),
|
||||
]);
|
||||
expect(await table.schema()).toEqual(expectedSchema);
|
||||
|
||||
// Verify that new columns are populated with null values
|
||||
const results = await table.query().toArray();
|
||||
expect(results).toHaveLength(1);
|
||||
expect(results[0].price).toBeNull();
|
||||
expect(results[0].category).toBeNull();
|
||||
expect(results[0].rating).toBeNull();
|
||||
});
|
||||
|
||||
it("can add a single column using Field", async function () {
|
||||
const con = await connect(tmpDir.name);
|
||||
const table = await con.createTable("vectors", [
|
||||
{ id: 1n, vector: [0.1, 0.2] },
|
||||
]);
|
||||
|
||||
// Add a single field
|
||||
const priceField = new Field("price", new Float64(), true);
|
||||
const result = await table.addColumns(priceField);
|
||||
expect(result).toHaveProperty("version");
|
||||
expect(result.version).toBe(2);
|
||||
|
||||
const expectedSchema = new Schema([
|
||||
new Field("id", new Int64(), true),
|
||||
new Field(
|
||||
"vector",
|
||||
new FixedSizeList(2, new Field("item", new Float32(), true)),
|
||||
true,
|
||||
),
|
||||
new Field("price", new Float64(), true),
|
||||
]);
|
||||
expect(await table.schema()).toEqual(expectedSchema);
|
||||
});
|
||||
|
||||
it("can add multiple columns using array of Fields", async function () {
|
||||
const con = await connect(tmpDir.name);
|
||||
const table = await con.createTable("vectors", [
|
||||
{ id: 1n, vector: [0.1, 0.2] },
|
||||
]);
|
||||
|
||||
// Add multiple fields as array
|
||||
const fields = [
|
||||
new Field("price", new Float64(), true),
|
||||
new Field("category", new Utf8(), true),
|
||||
];
|
||||
const result = await table.addColumns(fields);
|
||||
expect(result).toHaveProperty("version");
|
||||
expect(result.version).toBe(2);
|
||||
|
||||
const expectedSchema = new Schema([
|
||||
new Field("id", new Int64(), true),
|
||||
new Field(
|
||||
"vector",
|
||||
new FixedSizeList(2, new Field("item", new Float32(), true)),
|
||||
true,
|
||||
),
|
||||
new Field("price", new Float64(), true),
|
||||
new Field("category", new Utf8(), true),
|
||||
]);
|
||||
expect(await table.schema()).toEqual(expectedSchema);
|
||||
});
|
||||
|
||||
it("can alter the columns in the schema", async function () {
|
||||
const con = await connect(tmpDir.name);
|
||||
const schema = new Schema([
|
||||
|
||||
@@ -5,12 +5,15 @@ import {
|
||||
Table as ArrowTable,
|
||||
Data,
|
||||
DataType,
|
||||
Field,
|
||||
IntoVector,
|
||||
MultiVector,
|
||||
Schema,
|
||||
dataTypeToJson,
|
||||
fromDataToBuffer,
|
||||
fromTableToBuffer,
|
||||
isMultiVector,
|
||||
makeEmptyTable,
|
||||
tableFromIPC,
|
||||
} from "./arrow";
|
||||
|
||||
@@ -84,6 +87,16 @@ export interface OptimizeOptions {
|
||||
* tbl.optimize({cleanupOlderThan: new Date()});
|
||||
*/
|
||||
cleanupOlderThan: Date;
|
||||
/**
|
||||
* Because they may be part of an in-progress transaction, files newer than
|
||||
* 7 days old are not deleted by default. If you are sure that there are no
|
||||
* in-progress transactions, then you can set this to true to delete all
|
||||
* files older than `cleanupOlderThan`.
|
||||
*
|
||||
* **WARNING**: This should only be set to true if you can guarantee that
|
||||
* no other process is currently working on this dataset. Otherwise the
|
||||
* dataset could be put into a corrupted state.
|
||||
*/
|
||||
deleteUnverified: boolean;
|
||||
}
|
||||
|
||||
@@ -381,15 +394,16 @@ export abstract class Table {
|
||||
abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
|
||||
/**
|
||||
* Add new columns with defined values.
|
||||
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
|
||||
* the SQL expression to use to calculate the value of the new column. These
|
||||
* expressions will be evaluated for each row in the table, and can
|
||||
* reference existing columns in the table.
|
||||
* @param {AddColumnsSql[] | Field | Field[] | Schema} newColumnTransforms Either:
|
||||
* - An array of objects with column names and SQL expressions to calculate values
|
||||
* - A single Arrow Field defining one column with its data type (column will be initialized with null values)
|
||||
* - An array of Arrow Fields defining columns with their data types (columns will be initialized with null values)
|
||||
* - An Arrow Schema defining columns with their data types (columns will be initialized with null values)
|
||||
* @returns {Promise<AddColumnsResult>} A promise that resolves to an object
|
||||
* containing the new version number of the table after adding the columns.
|
||||
*/
|
||||
abstract addColumns(
|
||||
newColumnTransforms: AddColumnsSql[],
|
||||
newColumnTransforms: AddColumnsSql[] | Field | Field[] | Schema,
|
||||
): Promise<AddColumnsResult>;
|
||||
|
||||
/**
|
||||
@@ -501,19 +515,7 @@ export abstract class Table {
|
||||
* - Index: Optimizes the indices, adding new data to existing indices
|
||||
*
|
||||
*
|
||||
* Experimental API
|
||||
* ----------------
|
||||
*
|
||||
* The optimization process is undergoing active development and may change.
|
||||
* Our goal with these changes is to improve the performance of optimization and
|
||||
* reduce the complexity.
|
||||
*
|
||||
* That being said, it is essential today to run optimize if you want the best
|
||||
* performance. It should be stable and safe to use in production, but it our
|
||||
* hope that the API may be simplified (or not even need to be called) in the
|
||||
* future.
|
||||
*
|
||||
* The frequency an application shoudl call optimize is based on the frequency of
|
||||
* The frequency an application should call optimize is based on the frequency of
|
||||
* data modifications. If data is frequently added, deleted, or updated then
|
||||
* optimize should be run frequently. A good rule of thumb is to run optimize if
|
||||
* you have added or modified 100,000 or more records or run more than 20 data
|
||||
@@ -806,9 +808,40 @@ export class LocalTable extends Table {
|
||||
// TODO: Support BatchUDF
|
||||
|
||||
async addColumns(
|
||||
newColumnTransforms: AddColumnsSql[],
|
||||
newColumnTransforms: AddColumnsSql[] | Field | Field[] | Schema,
|
||||
): Promise<AddColumnsResult> {
|
||||
return await this.inner.addColumns(newColumnTransforms);
|
||||
// Handle single Field -> convert to array of Fields
|
||||
if (newColumnTransforms instanceof Field) {
|
||||
newColumnTransforms = [newColumnTransforms];
|
||||
}
|
||||
|
||||
// Handle array of Fields -> convert to Schema
|
||||
if (
|
||||
Array.isArray(newColumnTransforms) &&
|
||||
newColumnTransforms.length > 0 &&
|
||||
newColumnTransforms[0] instanceof Field
|
||||
) {
|
||||
const fields = newColumnTransforms as Field[];
|
||||
newColumnTransforms = new Schema(fields);
|
||||
}
|
||||
|
||||
// Handle Schema -> use schema-based approach
|
||||
if (newColumnTransforms instanceof Schema) {
|
||||
const schema = newColumnTransforms;
|
||||
// Convert schema to buffer using Arrow IPC format
|
||||
const emptyTable = makeEmptyTable(schema);
|
||||
const schemaBuf = await fromTableToBuffer(emptyTable);
|
||||
return await this.inner.addColumnsWithSchema(schemaBuf);
|
||||
}
|
||||
|
||||
// Handle SQL expressions (existing functionality)
|
||||
if (Array.isArray(newColumnTransforms)) {
|
||||
return await this.inner.addColumns(
|
||||
newColumnTransforms as AddColumnsSql[],
|
||||
);
|
||||
}
|
||||
|
||||
throw new Error("Invalid input type for addColumns");
|
||||
}
|
||||
|
||||
async alterColumns(
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use lancedb::ipc::ipc_file_to_batches;
|
||||
use lancedb::ipc::{ipc_file_to_batches, ipc_file_to_schema};
|
||||
use lancedb::table::{
|
||||
AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration, NewColumnTransform,
|
||||
OptimizeAction, OptimizeOptions, Table as LanceDbTable,
|
||||
@@ -279,6 +279,23 @@ impl Table {
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn add_columns_with_schema(
|
||||
&self,
|
||||
schema_buf: Buffer,
|
||||
) -> napi::Result<AddColumnsResult> {
|
||||
let schema = ipc_file_to_schema(schema_buf.to_vec())
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC schema: {}", e)))?;
|
||||
|
||||
let transforms = NewColumnTransform::AllNulls(schema);
|
||||
let res = self
|
||||
.inner_ref()?
|
||||
.add_columns(transforms, None)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(res.into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn alter_columns(
|
||||
&self,
|
||||
|
||||
@@ -166,6 +166,8 @@ class Table:
|
||||
async def checkout(self, version: Union[int, str]): ...
|
||||
async def checkout_latest(self): ...
|
||||
async def restore(self, version: Optional[Union[int, str]] = None): ...
|
||||
async def prewarm_index(self, index_name: str) -> None: ...
|
||||
async def prewarm_data(self, columns: Optional[List[str]] = None) -> None: ...
|
||||
async def list_indices(self) -> list[IndexConfig]: ...
|
||||
async def delete(self, filter: str) -> DeleteResult: ...
|
||||
async def add_columns(self, columns: list[tuple[str, str]]) -> AddColumnsResult: ...
|
||||
|
||||
@@ -640,6 +640,45 @@ class RemoteTable(Table):
|
||||
def drop_index(self, index_name: str):
|
||||
return LOOP.run(self._table.drop_index(index_name))
|
||||
|
||||
def prewarm_index(self, name: str) -> None:
|
||||
"""Prewarm an index in the table.
|
||||
|
||||
This is a hint to the database that the index will be accessed in the
|
||||
future and should be loaded into memory if possible. This can reduce
|
||||
cold-start latency for subsequent queries.
|
||||
|
||||
This call initiates prewarming and returns once the request is accepted.
|
||||
It is idempotent and safe to call from multiple clients concurrently.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
The name of the index to prewarm
|
||||
"""
|
||||
return LOOP.run(self._table.prewarm_index(name))
|
||||
|
||||
def prewarm_data(self, columns: Optional[List[str]] = None) -> None:
|
||||
"""Prewarm data for the table.
|
||||
|
||||
This is a hint to the database that the given columns will be accessed
|
||||
in the future and the database should prefetch the data if possible.
|
||||
Currently only supported on remote tables.
|
||||
|
||||
This call initiates prewarming and returns once the request is accepted.
|
||||
It is idempotent and safe to call from multiple clients concurrently.
|
||||
|
||||
This operation has a large upfront cost but can speed up future queries
|
||||
that need to fetch the given columns. Large columns such as embeddings
|
||||
or binary data may not be practical to prewarm. This feature is intended
|
||||
for workloads that issue many queries against the same columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns: list of str, optional
|
||||
The columns to prewarm. If None, all columns are prewarmed.
|
||||
"""
|
||||
return LOOP.run(self._table.prewarm_data(columns))
|
||||
|
||||
def wait_for_index(
|
||||
self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
|
||||
):
|
||||
|
||||
@@ -1506,22 +1506,17 @@ class Table(ABC):
|
||||
in-progress operation (e.g. appending new data) and these files will not
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
|
||||
.. warning::
|
||||
|
||||
This should only be set to True if you can guarantee that no other
|
||||
process is currently working on this dataset. Otherwise the dataset
|
||||
could be put into a corrupted state.
|
||||
|
||||
retrain: bool, default False
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
|
||||
The optimization process is undergoing active development and may change.
|
||||
Our goal with these changes is to improve the performance of optimization and
|
||||
reduce the complexity.
|
||||
|
||||
That being said, it is essential today to run optimize if you want the best
|
||||
performance. It should be stable and safe to use in production, but it our
|
||||
hope that the API may be simplified (or not even need to be called) in the
|
||||
future.
|
||||
|
||||
The frequency an application shoudl call optimize is based on the frequency of
|
||||
The frequency an application should call optimize is based on the frequency of
|
||||
data modifications. If data is frequently added, deleted, or updated then
|
||||
optimize should be run frequently. A good rule of thumb is to run optimize if
|
||||
you have added or modified 100,000 or more records or run more than 20 data
|
||||
@@ -2219,12 +2214,18 @@ class LanceTable(Table):
|
||||
|
||||
def prewarm_index(self, name: str) -> None:
|
||||
"""
|
||||
Prewarms an index in the table
|
||||
Prewarm an index in the table.
|
||||
|
||||
This loads the entire index into memory
|
||||
This is a hint to the database that the index will be accessed in the
|
||||
future and should be loaded into memory if possible. This can reduce
|
||||
cold-start latency for subsequent queries.
|
||||
|
||||
If the index does not fit into the available cache this call
|
||||
may be wasteful
|
||||
This call initiates prewarming and returns once the request is accepted.
|
||||
It is idempotent and safe to call from multiple clients concurrently.
|
||||
|
||||
It is generally wasteful to call this if the index does not fit into the
|
||||
available cache. Not all index types support prewarming; unsupported
|
||||
indices will silently ignore the request.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -2233,6 +2234,29 @@ class LanceTable(Table):
|
||||
"""
|
||||
return LOOP.run(self._table.prewarm_index(name))
|
||||
|
||||
def prewarm_data(self, columns: Optional[List[str]] = None) -> None:
|
||||
"""
|
||||
Prewarm data for the table.
|
||||
|
||||
This is a hint to the database that the given columns will be accessed
|
||||
in the future and the database should prefetch the data if possible.
|
||||
Currently only supported on remote tables.
|
||||
|
||||
This call initiates prewarming and returns once the request is accepted.
|
||||
It is idempotent and safe to call from multiple clients concurrently.
|
||||
|
||||
This operation has a large upfront cost but can speed up future queries
|
||||
that need to fetch the given columns. Large columns such as embeddings
|
||||
or binary data may not be practical to prewarm. This feature is intended
|
||||
for workloads that issue many queries against the same columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns: list of str, optional
|
||||
The columns to prewarm. If None, all columns are prewarmed.
|
||||
"""
|
||||
return LOOP.run(self._table.prewarm_data(columns))
|
||||
|
||||
def wait_for_index(
|
||||
self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
|
||||
) -> None:
|
||||
@@ -3018,22 +3042,17 @@ class LanceTable(Table):
|
||||
in-progress operation (e.g. appending new data) and these files will not
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
|
||||
.. warning::
|
||||
|
||||
This should only be set to True if you can guarantee that no other
|
||||
process is currently working on this dataset. Otherwise the dataset
|
||||
could be put into a corrupted state.
|
||||
|
||||
retrain: bool, default False
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
|
||||
The optimization process is undergoing active development and may change.
|
||||
Our goal with these changes is to improve the performance of optimization and
|
||||
reduce the complexity.
|
||||
|
||||
That being said, it is essential today to run optimize if you want the best
|
||||
performance. It should be stable and safe to use in production, but it our
|
||||
hope that the API may be simplified (or not even need to be called) in the
|
||||
future.
|
||||
|
||||
The frequency an application shoudl call optimize is based on the frequency of
|
||||
The frequency an application should call optimize is based on the frequency of
|
||||
data modifications. If data is frequently added, deleted, or updated then
|
||||
optimize should be run frequently. A good rule of thumb is to run optimize if
|
||||
you have added or modified 100,000 or more records or run more than 20 data
|
||||
@@ -3634,19 +3653,47 @@ class AsyncTable:
|
||||
"""
|
||||
Prewarm an index in the table.
|
||||
|
||||
This is a hint to the database that the index will be accessed in the
|
||||
future and should be loaded into memory if possible. This can reduce
|
||||
cold-start latency for subsequent queries.
|
||||
|
||||
This call initiates prewarming and returns once the request is accepted.
|
||||
It is idempotent and safe to call from multiple clients concurrently.
|
||||
|
||||
It is generally wasteful to call this if the index does not fit into the
|
||||
available cache. Not all index types support prewarming; unsupported
|
||||
indices will silently ignore the request.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
The name of the index to prewarm
|
||||
|
||||
Notes
|
||||
-----
|
||||
This will load the index into memory. This may reduce the cold-start time for
|
||||
future queries. If the index does not fit in the cache then this call may be
|
||||
wasteful.
|
||||
"""
|
||||
await self._inner.prewarm_index(name)
|
||||
|
||||
async def prewarm_data(self, columns: Optional[List[str]] = None) -> None:
|
||||
"""
|
||||
Prewarm data for the table.
|
||||
|
||||
This is a hint to the database that the given columns will be accessed
|
||||
in the future and the database should prefetch the data if possible.
|
||||
Currently only supported on remote tables.
|
||||
|
||||
This call initiates prewarming and returns once the request is accepted.
|
||||
It is idempotent and safe to call from multiple clients concurrently.
|
||||
|
||||
This operation has a large upfront cost but can speed up future queries
|
||||
that need to fetch the given columns. Large columns such as embeddings
|
||||
or binary data may not be practical to prewarm. This feature is intended
|
||||
for workloads that issue many queries against the same columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns: list of str, optional
|
||||
The columns to prewarm. If None, all columns are prewarmed.
|
||||
"""
|
||||
await self._inner.prewarm_data(columns)
|
||||
|
||||
async def wait_for_index(
|
||||
self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
|
||||
) -> None:
|
||||
@@ -4573,22 +4620,17 @@ class AsyncTable:
|
||||
in-progress operation (e.g. appending new data) and these files will not
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
|
||||
.. warning::
|
||||
|
||||
This should only be set to True if you can guarantee that no other
|
||||
process is currently working on this dataset. Otherwise the dataset
|
||||
could be put into a corrupted state.
|
||||
|
||||
retrain: bool, default False
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
|
||||
The optimization process is undergoing active development and may change.
|
||||
Our goal with these changes is to improve the performance of optimization and
|
||||
reduce the complexity.
|
||||
|
||||
That being said, it is essential today to run optimize if you want the best
|
||||
performance. It should be stable and safe to use in production, but it our
|
||||
hope that the API may be simplified (or not even need to be called) in the
|
||||
future.
|
||||
|
||||
The frequency an application shoudl call optimize is based on the frequency of
|
||||
The frequency an application should call optimize is based on the frequency of
|
||||
data modifications. If data is frequently added, deleted, or updated then
|
||||
optimize should be run frequently. A good rule of thumb is to run optimize if
|
||||
you have added or modified 100,000 or more records or run more than 20 data
|
||||
|
||||
@@ -316,6 +316,19 @@ impl<'py> IntoPyObject<'py> for PySelect {
|
||||
Select::All => Ok(py.None().into_bound(py).into_any()),
|
||||
Select::Columns(columns) => Ok(columns.into_pyobject(py)?.into_any()),
|
||||
Select::Dynamic(columns) => Ok(columns.into_pyobject(py)?.into_any()),
|
||||
Select::Expr(pairs) => {
|
||||
// Serialize DataFusion Expr -> SQL string so Python sees the same
|
||||
// format as Select::Dynamic: a list of (name, sql_string) tuples.
|
||||
let sql_pairs: PyResult<Vec<(String, String)>> = pairs
|
||||
.into_iter()
|
||||
.map(|(name, expr)| {
|
||||
lancedb::expr::expr_to_sql_string(&expr)
|
||||
.map(|sql| (name, sql))
|
||||
.map_err(|e| PyRuntimeError::new_err(e.to_string()))
|
||||
})
|
||||
.collect();
|
||||
Ok(sql_pairs?.into_pyobject(py)?.into_any())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -426,6 +426,17 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn prewarm_data(
|
||||
self_: PyRef<'_, Self>,
|
||||
columns: Option<Vec<String>>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.prewarm_data(columns).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
|
||||
@@ -339,6 +339,12 @@ impl PermutationReader {
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
Select::Expr(columns) => {
|
||||
// For Expr projections, we check if any alias is _rowid.
|
||||
// We can't validate the expression itself (it may differ from _rowid)
|
||||
// but we allow it through; the column will be included.
|
||||
Ok(columns.iter().any(|(alias, _)| alias == ROW_ID))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,25 @@ pub enum Select {
|
||||
///
|
||||
/// See [`Query::select`] for more details and examples
|
||||
Dynamic(Vec<(String, String)>),
|
||||
/// Advanced selection using type-safe DataFusion expressions
|
||||
///
|
||||
/// Similar to [`Select::Dynamic`] but uses [`datafusion_expr::Expr`] instead of
|
||||
/// raw SQL strings. Use [`crate::expr`] helpers to build expressions:
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::expr::{col, lit};
|
||||
/// use lancedb::query::Select;
|
||||
///
|
||||
/// // SELECT id, id * 2 AS id2 FROM ...
|
||||
/// let selection = Select::expr_projection(&[
|
||||
/// ("id", col("id")),
|
||||
/// ("id2", col("id") * lit(2)),
|
||||
/// ]);
|
||||
/// ```
|
||||
///
|
||||
/// Note: For remote/server-side queries the expressions are serialized to SQL strings
|
||||
/// automatically (same as [`Select::Dynamic`]).
|
||||
Expr(Vec<(String, datafusion_expr::Expr)>),
|
||||
}
|
||||
|
||||
impl Select {
|
||||
@@ -69,6 +88,29 @@ impl Select {
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
/// Create a typed-expression projection.
|
||||
///
|
||||
/// This is a convenience method for creating a [`Select::Expr`] variant from
|
||||
/// a slice of `(name, expr)` pairs where each `expr` is a [`datafusion_expr::Expr`].
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// use lancedb::expr::{col, lit};
|
||||
/// use lancedb::query::Select;
|
||||
///
|
||||
/// let selection = Select::expr_projection(&[
|
||||
/// ("id", col("id")),
|
||||
/// ("id2", col("id") * lit(2)),
|
||||
/// ]);
|
||||
/// ```
|
||||
pub fn expr_projection(columns: &[(impl AsRef<str>, datafusion_expr::Expr)]) -> Self {
|
||||
Self::Expr(
|
||||
columns
|
||||
.iter()
|
||||
.map(|(name, expr)| (name.as_ref().to_string(), expr.clone()))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait for converting a type to a query vector
|
||||
@@ -1591,6 +1633,58 @@ mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_select_with_expr_projection() {
|
||||
// Mirrors test_select_with_transform but uses Select::Expr instead of Select::Dynamic
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let dataset_path = tmp_dir.path().join("test_expr.lance");
|
||||
let uri = dataset_path.to_str().unwrap();
|
||||
|
||||
let batches = make_non_empty_batches();
|
||||
let conn = connect(uri).execute().await.unwrap();
|
||||
let table = conn
|
||||
.create_table("my_table", batches)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
use crate::expr::{col, lit};
|
||||
let query = table.query().limit(10).select(Select::expr_projection(&[
|
||||
("id2", col("id") * lit(2i32)),
|
||||
("id", col("id")),
|
||||
]));
|
||||
|
||||
let schema = query.output_schema().await.unwrap();
|
||||
assert_eq!(
|
||||
schema,
|
||||
Arc::new(ArrowSchema::new(vec![
|
||||
ArrowField::new("id2", DataType::Int32, true),
|
||||
ArrowField::new("id", DataType::Int32, true),
|
||||
]))
|
||||
);
|
||||
|
||||
let result = query.execute().await;
|
||||
let mut batches = result
|
||||
.expect("should have result")
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(batches.len(), 1);
|
||||
let batch = batches.pop().unwrap();
|
||||
|
||||
// id and id2
|
||||
assert_eq!(batch.num_columns(), 2);
|
||||
|
||||
let id: &Int32Array = batch.column_by_name("id").unwrap().as_primitive();
|
||||
let id2: &Int32Array = batch.column_by_name("id2").unwrap().as_primitive();
|
||||
|
||||
id.iter().zip(id2.iter()).for_each(|(id, id2)| {
|
||||
let id = id.unwrap();
|
||||
let id2 = id2.unwrap();
|
||||
assert_eq!(id * 2, id2);
|
||||
});
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_execute_no_vector() {
|
||||
// TODO: Switch back to memory://foo after https://github.com/lancedb/lancedb/issues/1051
|
||||
|
||||
@@ -426,14 +426,11 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
|
||||
})?,
|
||||
);
|
||||
}
|
||||
if db_prefix.is_some() {
|
||||
if let Some(prefix) = db_prefix {
|
||||
headers.insert(
|
||||
HeaderName::from_static("x-lancedb-database-prefix"),
|
||||
HeaderValue::from_str(db_prefix.unwrap()).map_err(|_| Error::InvalidInput {
|
||||
message: format!(
|
||||
"non-ascii database prefix '{}' provided",
|
||||
db_prefix.unwrap()
|
||||
),
|
||||
HeaderValue::from_str(prefix).map_err(|_| Error::InvalidInput {
|
||||
message: format!("non-ascii database prefix '{}' provided", prefix),
|
||||
})?,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -477,6 +477,16 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
}));
|
||||
body["columns"] = alias_map.into();
|
||||
}
|
||||
Select::Expr(pairs) => {
|
||||
let alias_map: Result<serde_json::Map<String, serde_json::Value>> = pairs
|
||||
.iter()
|
||||
.map(|(name, expr)| {
|
||||
expr_to_sql_string(expr)
|
||||
.map(|sql| (name.clone(), serde_json::Value::String(sql)))
|
||||
})
|
||||
.collect();
|
||||
body["columns"] = alias_map?.into();
|
||||
}
|
||||
}
|
||||
|
||||
if params.fast_search {
|
||||
@@ -1645,10 +1655,33 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn prewarm_index(&self, _index_name: &str) -> Result<()> {
|
||||
Err(Error::NotSupported {
|
||||
message: "prewarm_index is not yet supported on LanceDB cloud.".into(),
|
||||
})
|
||||
async fn prewarm_index(&self, index_name: &str) -> Result<()> {
|
||||
let request = self.client.post(&format!(
|
||||
"/v1/table/{}/index/{}/prewarm/",
|
||||
self.identifier, index_name
|
||||
));
|
||||
let (request_id, response) = self.send(request, true).await?;
|
||||
if response.status() == StatusCode::NOT_FOUND {
|
||||
return Err(Error::IndexNotFound {
|
||||
name: index_name.to_string(),
|
||||
});
|
||||
}
|
||||
self.check_table_response(&request_id, response).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn prewarm_data(&self, columns: Option<Vec<String>>) -> Result<()> {
|
||||
let mut request = self.client.post(&format!(
|
||||
"/v1/table/{}/page_cache/prewarm/",
|
||||
self.identifier
|
||||
));
|
||||
let body = serde_json::json!({
|
||||
"columns": columns.unwrap_or_default(),
|
||||
});
|
||||
request = request.json(&body);
|
||||
let (request_id, response) = self.send(request, true).await?;
|
||||
self.check_table_response(&request_id, response).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn table_definition(&self) -> Result<TableDefinition> {
|
||||
@@ -3529,6 +3562,64 @@ mod tests {
|
||||
assert_eq!(result.version, if old_server { 0 } else { 43 });
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prewarm_index() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(
|
||||
request.url().path(),
|
||||
"/v1/table/my_table/index/my_index/prewarm/"
|
||||
);
|
||||
http::Response::builder().status(200).body("{}").unwrap()
|
||||
});
|
||||
table.prewarm_index("my_index").await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prewarm_index_not_found() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(
|
||||
request.url().path(),
|
||||
"/v1/table/my_table/index/my_index/prewarm/"
|
||||
);
|
||||
http::Response::builder().status(404).body("{}").unwrap()
|
||||
});
|
||||
let e = table.prewarm_index("my_index").await.unwrap_err();
|
||||
assert!(matches!(e, Error::IndexNotFound { .. }));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prewarm_data() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(
|
||||
request.url().path(),
|
||||
"/v1/table/my_table/page_cache/prewarm/"
|
||||
);
|
||||
http::Response::builder().status(200).body("{}").unwrap()
|
||||
});
|
||||
table.prewarm_data(None).await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prewarm_data_with_columns() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(
|
||||
request.url().path(),
|
||||
"/v1/table/my_table/page_cache/prewarm/"
|
||||
);
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
assert_eq!(body["columns"], serde_json::json!(["col_a", "col_b"]));
|
||||
http::Response::builder().status(200).body("{}").unwrap()
|
||||
});
|
||||
table
|
||||
.prewarm_data(Some(vec!["col_a".into(), "col_b".into()]))
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_drop_index() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
|
||||
@@ -277,8 +277,13 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
|
||||
async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
|
||||
/// Drop an index from the table.
|
||||
async fn drop_index(&self, name: &str) -> Result<()>;
|
||||
/// Prewarm an index in the table
|
||||
/// Prewarm an index in the table.
|
||||
async fn prewarm_index(&self, name: &str) -> Result<()>;
|
||||
/// Prewarm data for the table.
|
||||
///
|
||||
/// Currently only supported on remote tables.
|
||||
/// If `columns` is `None`, all columns are prewarmed.
|
||||
async fn prewarm_data(&self, columns: Option<Vec<String>>) -> Result<()>;
|
||||
/// Get statistics about the index.
|
||||
async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>>;
|
||||
/// Merge insert new records into the table.
|
||||
@@ -946,17 +951,7 @@ impl Table {
|
||||
/// * Prune: Removes old versions of the dataset
|
||||
/// * Index: Optimizes the indices, adding new data to existing indices
|
||||
///
|
||||
/// <section class="warning">Experimental API</section>
|
||||
///
|
||||
/// The optimization process is undergoing active development and may change.
|
||||
/// Our goal with these changes is to improve the performance of optimization and
|
||||
/// reduce the complexity.
|
||||
///
|
||||
/// That being said, it is essential today to run optimize if you want the best
|
||||
/// performance. It should be stable and safe to use in production, but it our
|
||||
/// hope that the API may be simplified (or not even need to be called) in the future.
|
||||
///
|
||||
/// The frequency an application shoudl call optimize is based on the frequency of
|
||||
/// The frequency an application should call optimize is based on the frequency of
|
||||
/// data modifications. If data is frequently added, deleted, or updated then
|
||||
/// optimize should be run frequently. A good rule of thumb is to run optimize if
|
||||
/// you have added or modified 100,000 or more records or run more than 20 data
|
||||
@@ -1123,22 +1118,45 @@ impl Table {
|
||||
self.inner.drop_index(name).await
|
||||
}
|
||||
|
||||
/// Prewarm an index in the table
|
||||
/// Prewarm an index in the table.
|
||||
///
|
||||
/// This is a hint to fully load the index into memory. It can be used to
|
||||
/// avoid cold starts
|
||||
/// This is a hint to the database that the index will be accessed in the
|
||||
/// future and should be loaded into memory if possible. This can reduce
|
||||
/// cold-start latency for subsequent queries.
|
||||
///
|
||||
/// This call initiates prewarming and returns once the request is accepted.
|
||||
/// It is idempotent and safe to call from multiple clients concurrently.
|
||||
///
|
||||
/// It is generally wasteful to call this if the index does not fit into the
|
||||
/// available cache.
|
||||
///
|
||||
/// Note: This function is not yet supported on all indices, in which case it
|
||||
/// may do nothing.
|
||||
/// available cache. Not all index types support prewarming; unsupported
|
||||
/// indices will silently ignore the request.
|
||||
///
|
||||
/// Use [`Self::list_indices()`] to find the names of the indices.
|
||||
pub async fn prewarm_index(&self, name: &str) -> Result<()> {
|
||||
self.inner.prewarm_index(name).await
|
||||
}
|
||||
|
||||
/// Prewarm data for the table.
|
||||
///
|
||||
/// This is a hint to the database that the given columns will be accessed in
|
||||
/// the future and the database should prefetch the data if possible. This
|
||||
/// can reduce cold-start latency for subsequent queries. Currently only
|
||||
/// supported on remote tables.
|
||||
///
|
||||
/// This call initiates prewarming and returns once the request is accepted.
|
||||
/// It is idempotent and safe to call from multiple clients concurrently —
|
||||
/// calling it on already-prewarmed columns is a no-op on the server.
|
||||
///
|
||||
/// This operation has a large upfront cost but can speed up future queries
|
||||
/// that need to fetch the given columns. Large columns such as embeddings
|
||||
/// or binary data may not be practical to prewarm. This feature is intended
|
||||
/// for workloads that issue many queries against the same columns.
|
||||
///
|
||||
/// If `columns` is `None`, all columns are prewarmed.
|
||||
pub async fn prewarm_data(&self, columns: Option<Vec<String>>) -> Result<()> {
|
||||
self.inner.prewarm_data(columns).await
|
||||
}
|
||||
|
||||
/// Poll until the columns are fully indexed. Will return Error::Timeout if the columns
|
||||
/// are not fully indexed within the timeout.
|
||||
pub async fn wait_for_index(
|
||||
@@ -2290,6 +2308,12 @@ impl BaseTable for NativeTable {
|
||||
Ok(dataset.prewarm_index(index_name).await?)
|
||||
}
|
||||
|
||||
async fn prewarm_data(&self, _columns: Option<Vec<String>>) -> Result<()> {
|
||||
Err(Error::NotSupported {
|
||||
message: "prewarm_data is currently only supported on remote tables.".into(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<UpdateResult> {
|
||||
// Delegate to the submodule implementation
|
||||
update::execute_update(self, update).await
|
||||
|
||||
@@ -64,6 +64,9 @@ pub enum OptimizeAction {
|
||||
older_than: Option<Duration>,
|
||||
/// Because they may be part of an in-progress transaction, files newer than 7 days old are not deleted by default.
|
||||
/// If you are sure that there are no in-progress transactions, then you can set this to True to delete all files older than `older_than`.
|
||||
///
|
||||
/// **WARNING**: This should only be set to true if you can guarantee that no other process is
|
||||
/// currently working on this dataset. Otherwise the dataset could be put into a corrupted state.
|
||||
delete_unverified: Option<bool>,
|
||||
/// If true, an error will be returned if there are any old versions that are still tagged.
|
||||
error_if_tagged_old_versions: Option<bool>,
|
||||
@@ -117,6 +120,10 @@ pub(crate) async fn optimize_indices(table: &NativeTable, options: &OptimizeOpti
|
||||
/// If you are sure that there are no in-progress transactions, then you
|
||||
/// can set this to True to delete all files older than `older_than`.
|
||||
///
|
||||
/// **WARNING**: This should only be set to true if you can guarantee that
|
||||
/// no other process is currently working on this dataset. Otherwise the
|
||||
/// dataset could be put into a corrupted state.
|
||||
///
|
||||
/// This calls into [lance::dataset::Dataset::cleanup_old_versions] and
|
||||
/// returns the result.
|
||||
pub(crate) async fn cleanup_old_versions(
|
||||
|
||||
@@ -186,6 +186,13 @@ pub async fn create_plan(
|
||||
Select::Dynamic(ref select_with_transform) => {
|
||||
scanner.project_with_transform(select_with_transform.as_slice())?;
|
||||
}
|
||||
Select::Expr(ref expr_pairs) => {
|
||||
let sql_pairs: crate::Result<Vec<(String, String)>> = expr_pairs
|
||||
.iter()
|
||||
.map(|(name, expr)| expr_to_sql_string(expr).map(|sql| (name.clone(), sql)))
|
||||
.collect();
|
||||
scanner.project_with_transform(sql_pairs?.as_slice())?;
|
||||
}
|
||||
Select::All => {}
|
||||
}
|
||||
|
||||
@@ -340,6 +347,17 @@ fn convert_to_namespace_query(query: &AnyQuery) -> Result<NsQueryTableRequest> {
|
||||
.to_string(),
|
||||
});
|
||||
}
|
||||
Select::Expr(pairs) => {
|
||||
let sql_pairs: crate::Result<Vec<(String, String)>> = pairs
|
||||
.iter()
|
||||
.map(|(name, expr)| expr_to_sql_string(expr).map(|sql| (name.clone(), sql)))
|
||||
.collect();
|
||||
let sql_pairs = sql_pairs?;
|
||||
Some(Box::new(QueryTableRequestColumns {
|
||||
column_names: None,
|
||||
column_aliases: Some(sql_pairs.into_iter().collect()),
|
||||
}))
|
||||
}
|
||||
};
|
||||
|
||||
// Check for unsupported features
|
||||
@@ -411,6 +429,17 @@ fn convert_to_namespace_query(query: &AnyQuery) -> Result<NsQueryTableRequest> {
|
||||
.to_string(),
|
||||
});
|
||||
}
|
||||
Select::Expr(pairs) => {
|
||||
let sql_pairs: crate::Result<Vec<(String, String)>> = pairs
|
||||
.iter()
|
||||
.map(|(name, expr)| expr_to_sql_string(expr).map(|sql| (name.clone(), sql)))
|
||||
.collect();
|
||||
let sql_pairs = sql_pairs?;
|
||||
Some(Box::new(QueryTableRequestColumns {
|
||||
column_names: None,
|
||||
column_aliases: Some(sql_pairs.into_iter().collect()),
|
||||
}))
|
||||
}
|
||||
};
|
||||
|
||||
// Handle full text search if present
|
||||
|
||||
Reference in New Issue
Block a user