Compare commits

..

3 Commits

Author SHA1 Message Date
Xuanwo
980536c6ef fix: support nested field paths in native index creation 2026-05-22 18:04:47 +08:00
Daniel Rammer
1700d618e5 test(python): regenerate lindera ipadic fixtures for lindera 3.x
lance v7.0.0-beta.9 bumps lindera 0.44 -> 3.0.7, which changed the
tokenizer config schema (dictionary is now a string path, not a
{ path: ... } map) and the dictionary binary format (now requires
metadata.json). The old fixtures broke test_fts_lindera_tokenizer on
all platforms.

Lift the regenerated config.yml and main.zip from the lance
v7.0.0-beta.9 tag (lance-format/lance#6719) and update the
lindera_ipadic fixture's config writer to the 3.x schema.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 10:46:44 -05:00
lancedb automation
3726491b27 chore: update lance dependency to v7.0.0-beta.9 2026-05-15 14:46:00 +00:00
54 changed files with 162 additions and 2752 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.29.1-beta.0"
current_version = "0.28.0-beta.11"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

77
Cargo.lock generated
View File

@@ -3212,8 +3212,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow-array",
"rand 0.9.4",
@@ -4426,10 +4426,9 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
[[package]]
name = "lance"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arc-swap",
"arrow",
"arrow-arith",
"arrow-array",
@@ -4448,7 +4447,6 @@ dependencies = [
"byteorder",
"bytes",
"chrono",
"crossbeam-queue",
"crossbeam-skiplist",
"dashmap",
"datafusion",
@@ -4481,7 +4479,6 @@ dependencies = [
"prost-build",
"prost-types",
"rand 0.9.4",
"rayon",
"roaring",
"semver",
"serde",
@@ -4497,8 +4494,8 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4518,8 +4515,8 @@ dependencies = [
[[package]]
name = "lance-bitpacking"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrayref",
"paste",
@@ -4528,8 +4525,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4564,8 +4561,8 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow",
"arrow-array",
@@ -4595,8 +4592,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow",
"arrow-array",
@@ -4614,8 +4611,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4650,8 +4647,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4682,8 +4679,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arc-swap",
"arrow",
@@ -4747,8 +4744,8 @@ dependencies = [
[[package]]
name = "lance-io"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow",
"arrow-arith",
@@ -4790,8 +4787,8 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4807,8 +4804,8 @@ dependencies = [
[[package]]
name = "lance-namespace"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow",
"async-trait",
@@ -4820,8 +4817,8 @@ dependencies = [
[[package]]
name = "lance-namespace-impls"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow",
"arrow-ipc",
@@ -4870,8 +4867,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow",
"arrow-array",
@@ -4910,8 +4907,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -4922,8 +4919,8 @@ dependencies = [
[[package]]
name = "lance-tokenizer"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-beta.9"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.9#109ebf71fcfddef8faae1af519df339a330debfc"
dependencies = [
"jieba-rs",
"lindera",
@@ -4934,7 +4931,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.29.0"
version = "0.28.0-beta.11"
dependencies = [
"ahash",
"anyhow",
@@ -5016,7 +5013,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.29.0"
version = "0.28.0-beta.11"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -5039,7 +5036,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.32.0"
version = "0.31.0-beta.11"
dependencies = [
"arrow",
"async-trait",

View File

@@ -13,20 +13,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=7.0.0-beta.9", default-features = false, "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-beta.9", default-features = false, "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-beta.9", default-features = false, "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false }

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId>
<version>0.29.1-beta.0</version>
<version>0.28.0-beta.11</version>
</dependency>
```

View File

@@ -343,30 +343,6 @@ This is useful for pagination.
***
### orderBy()
```ts
orderBy(ordering): this
```
Sort the results by the specified column(s).
#### Parameters
* **ordering**: [`ColumnOrdering`](../interfaces/ColumnOrdering.md) \| [`ColumnOrdering`](../interfaces/ColumnOrdering.md)[]
#### Returns
`this`
This query builder.
#### Inherited from
`StandardQueryBase.orderBy`
***
### outputSchema()
```ts

View File

@@ -690,74 +690,6 @@ of the given query
***
### setLsmWriteSpec()
```ts
abstract setLsmWriteSpec(spec): Promise<void>
```
Install an [LsmWriteSpec](../interfaces/LsmWriteSpec.md) on this table, selecting Lance's MemWAL
LSM-style write path for future `mergeInsert` calls.
`LsmWriteSpec` chooses one of three sharding strategies via `specType`:
- `"bucket"` — hash-bucket writes by the single-column unenforced primary
key (`column` and `numBuckets` required).
- `"identity"` — shard by the raw value of a scalar `column`.
- `"unsharded"` — route every write to a single shard.
All variants require the table to have an unenforced primary key
([Table#setUnenforcedPrimaryKey](Table.md#setunenforcedprimarykey)); bucket sharding additionally
requires it to be the single column being bucketed.
#### Parameters
* **spec**: [`LsmWriteSpec`](../interfaces/LsmWriteSpec.md)
The sharding spec to install.
#### Returns
`Promise`&lt;`void`&gt;
#### Example
```ts
await table.setUnenforcedPrimaryKey("id");
await table.setLsmWriteSpec({
specType: "bucket",
column: "id",
numBuckets: 16,
maintainedIndexes: ["id_idx"],
});
```
***
### setUnenforcedPrimaryKey()
```ts
abstract setUnenforcedPrimaryKey(columns): Promise<void>
```
Set the unenforced primary key for this table to a single column.
"Unenforced" means LanceDB does not check uniqueness on writes; the
column is recorded in the schema as the primary key for use by features
such as `merge_insert`. Only single-column primary keys are supported,
and the key cannot be changed once set.
#### Parameters
* **columns**: `string` \| `string`[]
The primary key column. A one-element
array is also accepted; passing more than one column is rejected.
#### Returns
`Promise`&lt;`void`&gt;
***
### stats()
```ts
@@ -861,23 +793,6 @@ Return the table as an arrow table
***
### unsetLsmWriteSpec()
```ts
abstract unsetLsmWriteSpec(): Promise<void>
```
Remove the [LsmWriteSpec](../interfaces/LsmWriteSpec.md) from this table, reverting to the standard
`mergeInsert` write path.
Errors if no spec is currently set.
#### Returns
`Promise`&lt;`void`&gt;
***
### update()
#### update(opts)

View File

@@ -498,30 +498,6 @@ This is useful for pagination.
***
### orderBy()
```ts
orderBy(ordering): this
```
Sort the results by the specified column(s).
#### Parameters
* **ordering**: [`ColumnOrdering`](../interfaces/ColumnOrdering.md) \| [`ColumnOrdering`](../interfaces/ColumnOrdering.md)[]
#### Returns
`this`
This query builder.
#### Inherited from
`StandardQueryBase.orderBy`
***
### outputSchema()
```ts

View File

@@ -51,7 +51,6 @@
- [AlterColumnsResult](interfaces/AlterColumnsResult.md)
- [ClientConfig](interfaces/ClientConfig.md)
- [ColumnAlteration](interfaces/ColumnAlteration.md)
- [ColumnOrdering](interfaces/ColumnOrdering.md)
- [CompactionStats](interfaces/CompactionStats.md)
- [ConnectNamespaceOptions](interfaces/ConnectNamespaceOptions.md)
- [ConnectionOptions](interfaces/ConnectionOptions.md)
@@ -80,7 +79,6 @@
- [IvfRqOptions](interfaces/IvfRqOptions.md)
- [ListNamespacesOptions](interfaces/ListNamespacesOptions.md)
- [ListNamespacesResponse](interfaces/ListNamespacesResponse.md)
- [LsmWriteSpec](interfaces/LsmWriteSpec.md)
- [MergeResult](interfaces/MergeResult.md)
- [OpenTableOptions](interfaces/OpenTableOptions.md)
- [OptimizeOptions](interfaces/OptimizeOptions.md)

View File

@@ -1,31 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ColumnOrdering
# Interface: ColumnOrdering
## Properties
### ascending?
```ts
optional ascending: boolean;
```
***
### columnName
```ts
columnName: string;
```
***
### nullsFirst?
```ts
optional nullsFirst: boolean;
```

View File

@@ -1,64 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / LsmWriteSpec
# Interface: LsmWriteSpec
Specification selecting Lance's MemWAL LSM-style write path for
`mergeInsert`.
`specType` is `"bucket"`, `"identity"`, or `"unsharded"`. For `"bucket"`,
`column` and `numBuckets` are required; for `"identity"`, `column` is
required.
## Properties
### column?
```ts
optional column: string;
```
Bucket and identity variants: the sharding column.
***
### maintainedIndexes?
```ts
optional maintainedIndexes: string[];
```
Names of indexes the MemWAL should keep up to date during writes.
***
### numBuckets?
```ts
optional numBuckets: number;
```
Bucket variant: the number of buckets, in `[1, 1024]`.
***
### specType
```ts
specType: "bucket" | "identity" | "unsharded";
```
One of `"bucket"`, `"identity"`, or `"unsharded"`.
***
### writerConfigDefaults?
```ts
optional writerConfigDefaults: Record<string, string>;
```
Default `ShardWriter` configuration recorded in the MemWAL index.

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.29.1-beta.0</version>
<version>0.28.0-beta.11</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.29.1-beta.0</version>
<version>0.28.0-beta.11</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>7.0.0-beta.13</lance-core.version>
<lance-core.version>7.0.0-beta.9</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.29.1-beta.0"
version = "0.28.0-beta.11"
publish = false
license.workspace = true
description.workspace = true

View File

@@ -109,209 +109,3 @@ describe("Query outputSchema", () => {
expect(schema.fields.length).toBe(3);
});
});
describe("Query orderBy", () => {
let tmpDir: tmp.DirResult;
let table: Table;
beforeEach(async () => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
const db = await connect(tmpDir.name);
// Create table with numeric data for sorting
const schema = new Schema([
new Field("id", new Int64(), true),
new Field("score", new Float32(), true),
new Field("name", new Utf8(), true),
]);
const data = makeArrowTable(
[
{ id: 1n, score: 3.5, name: "charlie" },
{ id: 2n, score: 1.2, name: "alice" },
{ id: 3n, score: 2.8, name: "bob" },
{ id: 4n, score: 0.5, name: "david" },
{ id: 5n, score: 4.1, name: "eve" },
],
{ schema },
);
table = await db.createTable("test", data);
});
afterEach(() => {
tmpDir.removeCallback();
});
it("should sort by single column ascending", async () => {
const results = await table
.query()
.orderBy({ columnName: "score", ascending: true, nullsFirst: false })
.toArray();
expect(results.length).toBe(5);
// Verify ascending order
expect(results[0].score).toBeCloseTo(0.5, 0.001);
expect(results[1].score).toBeCloseTo(1.2, 0.001);
expect(results[2].score).toBeCloseTo(2.8, 0.001);
expect(results[3].score).toBeCloseTo(3.5, 0.001);
expect(results[4].score).toBeCloseTo(4.1, 0.001);
});
it("should sort by single column descending", async () => {
const results = await table
.query()
.orderBy({ columnName: "score", ascending: false, nullsFirst: false })
.toArray();
expect(results.length).toBe(5);
// Verify descending order
expect(results[0].score).toBeCloseTo(4.1, 0.001);
expect(results[1].score).toBeCloseTo(3.5, 0.001);
expect(results[2].score).toBeCloseTo(2.8, 0.001);
expect(results[3].score).toBeCloseTo(1.2, 0.001);
expect(results[4].score).toBeCloseTo(0.5, 0.001);
});
it("should use ascending as default direction", async () => {
const results = await table
.query()
.orderBy({ columnName: "score" })
.toArray();
expect(results.length).toBe(5);
// Verify ascending order (default)
expect(results[0].score).toBeCloseTo(0.5, 0.001);
expect(results[1].score).toBeCloseTo(1.2, 0.001);
expect(results[2].score).toBeCloseTo(2.8, 0.001);
expect(results[3].score).toBeCloseTo(3.5, 0.001);
expect(results[4].score).toBeCloseTo(4.1, 0.001);
});
it("should sort by string column", async () => {
const results = await table
.query()
.orderBy({ columnName: "name" })
.toArray();
expect(results.length).toBe(5);
// Verify alphabetical order
expect(results[0].name).toBe("alice");
expect(results[1].name).toBe("bob");
expect(results[2].name).toBe("charlie");
expect(results[3].name).toBe("david");
expect(results[4].name).toBe("eve");
});
it("should support method chaining with where", async () => {
const results = await table
.query()
.where("score > 2.0")
.orderBy({ columnName: "score" })
.toArray();
expect(results.length).toBe(3);
// Verify filtered and sorted
expect(results[0].score).toBeCloseTo(2.8, 0.001);
expect(results[1].score).toBeCloseTo(3.5, 0.001);
expect(results[2].score).toBeCloseTo(4.1, 0.001);
});
it("should support method chaining with limit", async () => {
const results = await table
.query()
.orderBy({ columnName: "score", ascending: false })
.limit(3)
.toArray();
expect(results.length).toBe(3);
// Verify top 3 in descending order
expect(results[0].score).toBeCloseTo(4.1, 0.001);
expect(results[1].score).toBeCloseTo(3.5, 0.001);
expect(results[2].score).toBeCloseTo(2.8, 0.001);
});
it("should support method chaining with offset", async () => {
const results = await table
.query()
.orderBy({ columnName: "score" })
.offset(2)
.limit(2)
.toArray();
expect(results.length).toBe(2);
// Verify results skip first 2 and take next 2
expect(results[0].score).toBeCloseTo(2.8, 0.001);
expect(results[1].score).toBeCloseTo(3.5, 0.001);
});
it("should support method chaining with select", async () => {
const results = await table
.query()
.orderBy({ columnName: "name" })
.select(["name", "score"])
.toArray();
expect(results.length).toBe(5);
// Verify only selected columns are present
expect(Object.keys(results[0])).toEqual(["name", "score"]);
expect(Object.keys(results[4])).toEqual(["name", "score"]);
// Verify sorted by name
expect(results[0].name).toBe("alice");
expect(results[4].name).toBe("eve");
});
it("should support complex method chaining", async () => {
const results = await table
.query()
.where("score > 1.0")
.orderBy({ columnName: "score", ascending: false })
.limit(3)
.select(["id", "score", "name"])
.toArray();
expect(results.length).toBe(3);
// Verify filtered, sorted, limited, and projected
expect(results[0].score).toBeCloseTo(4.1, 0.001);
expect(results[1].score).toBeCloseTo(3.5, 0.001);
expect(results[2].score).toBeCloseTo(2.8, 0.001);
expect(Object.keys(results[0])).toEqual(["id", "score", "name"]);
});
it("should support multi-column ordering and null placement", async () => {
const schema = new Schema([
new Field("group", new Int64(), true),
new Field("score", new Float32(), true),
new Field("name", new Utf8(), true),
]);
const data = makeArrowTable(
[
{ group: 1n, score: null, name: "z" },
{ group: 1n, score: 1.0, name: "b" },
{ group: 1n, score: 1.0, name: "a" },
{ group: 2n, score: 0.5, name: "c" },
],
{ schema },
);
const nullTable = await (await connect(tmpDir.name)).createTable(
"test_multi_order",
data,
{ mode: "overwrite" },
);
const results = await nullTable
.query()
.orderBy([
{ columnName: "group", ascending: true, nullsFirst: false },
{ columnName: "score", ascending: true, nullsFirst: true },
{ columnName: "name", ascending: true, nullsFirst: false },
])
.toArray();
expect(results.map((r) => [r.group, r.score, r.name])).toEqual([
[1n, null, "z"],
[1n, 1.0, "a"],
[1n, 1.0, "b"],
[2n, 0.5, "c"],
]);
});
});

View File

@@ -2348,130 +2348,3 @@ describe("when creating a table with Float32Array vectors", () => {
expect((fsl.children[0].type as Float32).precision).toBe(1);
});
});
describe("setUnenforcedPrimaryKey", () => {
let tmpDir: tmp.DirResult;
beforeEach(() => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
});
afterEach(() => tmpDir.removeCallback());
it("sets a single-column primary key (string or one-element array)", async () => {
const conn = await connect(tmpDir.name);
const schema = new arrow.Schema([
new arrow.Field("id", new arrow.Int64(), false),
]);
const t1 = await conn.createEmptyTable("t1", schema);
await t1.setUnenforcedPrimaryKey("id");
const t2 = await conn.createEmptyTable("t2", schema);
await t2.setUnenforcedPrimaryKey(["id"]);
});
it("rejects a compound primary key", async () => {
const conn = await connect(tmpDir.name);
const table = await conn.createEmptyTable(
"t",
new arrow.Schema([
new arrow.Field("id", new arrow.Int64(), false),
new arrow.Field("name", new arrow.Utf8(), false),
]),
);
await expect(
table.setUnenforcedPrimaryKey(["id", "name"]),
).rejects.toThrow();
});
it("rejects changing the primary key once set", async () => {
const conn = await connect(tmpDir.name);
const table = await conn.createEmptyTable(
"t",
new arrow.Schema([
new arrow.Field("id", new arrow.Int64(), false),
new arrow.Field("name", new arrow.Utf8(), false),
]),
);
await table.setUnenforcedPrimaryKey("id");
await expect(table.setUnenforcedPrimaryKey("name")).rejects.toThrow();
await expect(table.setUnenforcedPrimaryKey("id")).rejects.toThrow();
});
});
describe("setLsmWriteSpec / unsetLsmWriteSpec", () => {
let tmpDir: tmp.DirResult;
beforeEach(() => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
});
afterEach(() => tmpDir.removeCallback());
async function makeTable(conn: Connection): Promise<Table> {
return await conn.createEmptyTable(
"t",
new arrow.Schema([new arrow.Field("id", new arrow.Int64(), false)]),
);
}
it("installs and removes a bucket spec", async () => {
const conn = await connect(tmpDir.name);
const table = await makeTable(conn);
await table.setUnenforcedPrimaryKey("id");
await table.setLsmWriteSpec({
specType: "bucket",
column: "id",
numBuckets: 4,
});
await table.unsetLsmWriteSpec();
// A second unset errors — there is no spec left to remove.
await expect(table.unsetLsmWriteSpec()).rejects.toThrow();
// A fresh spec can be installed after unset.
await table.setLsmWriteSpec({
specType: "bucket",
column: "id",
numBuckets: 8,
});
});
it("installs an unsharded spec", async () => {
const conn = await connect(tmpDir.name);
const table = await makeTable(conn);
await table.setUnenforcedPrimaryKey("id");
await table.setLsmWriteSpec({ specType: "unsharded" });
await table.unsetLsmWriteSpec();
});
it("installs an identity spec", async () => {
const conn = await connect(tmpDir.name);
const table = await makeTable(conn);
await table.setUnenforcedPrimaryKey("id");
await table.setLsmWriteSpec({ specType: "identity", column: "id" });
await table.unsetLsmWriteSpec();
});
it("rejects an invalid spec", async () => {
const conn = await connect(tmpDir.name);
const table = await makeTable(conn);
await table.setUnenforcedPrimaryKey("id");
// num_buckets out of range.
await expect(
table.setLsmWriteSpec({
specType: "bucket",
column: "id",
numBuckets: 0,
}),
).rejects.toThrow();
// Column mismatch.
await expect(
table.setLsmWriteSpec({
specType: "bucket",
column: "missing",
numBuckets: 4,
}),
).rejects.toThrow();
});
});

View File

@@ -38,14 +38,5 @@ test("filtering examples", async () => {
// --8<-- [start:sql_search]
await tbl.query().where("id = 10").limit(10).toArray();
// --8<-- [end:sql_search]
// --8<-- [start:orderby_search]
await tbl
.query()
.where("id > 10")
.orderBy({ columnName: "id", ascending: false })
.limit(5)
.toArray();
// --8<-- [end:orderby_search]
});
});

View File

@@ -82,7 +82,6 @@ export {
VectorQuery,
TakeQuery,
QueryExecutionOptions,
ColumnOrdering,
FullTextSearchOptions,
RecordBatchIterator,
FullTextQuery,
@@ -113,7 +112,6 @@ export {
UpdateOptions,
OptimizeOptions,
Version,
LsmWriteSpec,
ColumnAlteration,
} from "./table";

View File

@@ -79,12 +79,6 @@ export interface QueryExecutionOptions {
timeoutMs?: number;
}
export interface ColumnOrdering {
columnName: string;
ascending?: boolean;
nullsFirst?: boolean;
}
/**
* Options that control the behavior of a full text search
*/
@@ -423,21 +417,6 @@ export class StandardQueryBase<
return this;
}
/**
* Sort the results by the specified column(s).
* @returns This query builder.
*/
orderBy(ordering: ColumnOrdering | ColumnOrdering[]): this {
const orderings = Array.isArray(ordering) ? ordering : [ordering];
const normalized = orderings.map((o) => ({
columnName: o.columnName,
ascending: o.ascending ?? true,
nullsFirst: o.nullsFirst ?? false,
}));
this.doCall((inner) => inner.orderBy(normalized));
return this;
}
/**
* Skip searching un-indexed data. This can make search faster, but will miss
* any data that is not yet indexed.

View File

@@ -106,27 +106,6 @@ export interface Version {
metadata: Record<string, string>;
}
/**
* Specification selecting Lance's MemWAL LSM-style write path for
* `mergeInsert`.
*
* `specType` is `"bucket"`, `"identity"`, or `"unsharded"`. For `"bucket"`,
* `column` and `numBuckets` are required; for `"identity"`, `column` is
* required.
*/
export interface LsmWriteSpec {
/** One of `"bucket"`, `"identity"`, or `"unsharded"`. */
specType: "bucket" | "identity" | "unsharded";
/** Bucket and identity variants: the sharding column. */
column?: string;
/** Bucket variant: the number of buckets, in `[1, 1024]`. */
numBuckets?: number;
/** Names of indexes the MemWAL should keep up to date during writes. */
maintainedIndexes?: string[];
/** Default `ShardWriter` configuration recorded in the MemWAL index. */
writerConfigDefaults?: Record<string, string>;
}
/**
* A Table is a collection of Records in a LanceDB Database.
*
@@ -470,54 +449,6 @@ export abstract class Table {
* containing the new version number of the table after dropping the columns.
*/
abstract dropColumns(columnNames: string[]): Promise<DropColumnsResult>;
/**
* Set the unenforced primary key for this table to a single column.
*
* "Unenforced" means LanceDB does not check uniqueness on writes; the
* column is recorded in the schema as the primary key for use by features
* such as `merge_insert`. Only single-column primary keys are supported,
* and the key cannot be changed once set.
* @param {string | string[]} columns The primary key column. A one-element
* array is also accepted; passing more than one column is rejected.
* @returns {Promise<void>}
*/
abstract setUnenforcedPrimaryKey(columns: string | string[]): Promise<void>;
/**
* Install an {@link LsmWriteSpec} on this table, selecting Lance's MemWAL
* LSM-style write path for future `mergeInsert` calls.
*
* `LsmWriteSpec` chooses one of three sharding strategies via `specType`:
*
* - `"bucket"` — hash-bucket writes by the single-column unenforced primary
* key (`column` and `numBuckets` required).
* - `"identity"` — shard by the raw value of a scalar `column`.
* - `"unsharded"` — route every write to a single shard.
*
* All variants require the table to have an unenforced primary key
* ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally
* requires it to be the single column being bucketed.
* @param {LsmWriteSpec} spec The sharding spec to install.
* @returns {Promise<void>}
* @example
* ```ts
* await table.setUnenforcedPrimaryKey("id");
* await table.setLsmWriteSpec({
* specType: "bucket",
* column: "id",
* numBuckets: 16,
* maintainedIndexes: ["id_idx"],
* });
* ```
*/
abstract setLsmWriteSpec(spec: LsmWriteSpec): Promise<void>;
/**
* Remove the {@link LsmWriteSpec} from this table, reverting to the standard
* `mergeInsert` write path.
*
* Errors if no spec is currently set.
* @returns {Promise<void>}
*/
abstract unsetLsmWriteSpec(): Promise<void>;
/** Retrieve the version of the table */
abstract version(): Promise<number>;
@@ -966,19 +897,6 @@ export class LocalTable extends Table {
return await this.inner.dropColumns(columnNames);
}
async setUnenforcedPrimaryKey(columns: string | string[]): Promise<void> {
const cols = typeof columns === "string" ? [columns] : columns;
return await this.inner.setUnenforcedPrimaryKey(cols);
}
async setLsmWriteSpec(spec: LsmWriteSpec): Promise<void> {
return await this.inner.setLsmWriteSpec(spec);
}
async unsetLsmWriteSpec(): Promise<void> {
return await this.inner.unsetLsmWriteSpec();
}
async version(): Promise<number> {
return await this.inner.version();
}

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.11",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -3,12 +3,6 @@
use std::sync::Arc;
use crate::error::NapiErrorExt;
use crate::error::convert_error;
use crate::iterator::RecordBatchIterator;
use crate::rerankers::RerankHybridCallbackArgs;
use crate::rerankers::Reranker;
use crate::util::{parse_distance_type, schema_to_buffer};
use arrow_array::{
Array, Float16Array as ArrowFloat16Array, Float32Array as ArrowFloat32Array,
Float64Array as ArrowFloat64Array, UInt8Array as ArrowUInt8Array,
@@ -25,27 +19,16 @@ use lancedb::query::QueryBase;
use lancedb::query::QueryExecutionOptions;
use lancedb::query::Select;
use lancedb::query::TakeQuery as LanceDbTakeQuery;
use lancedb::query::{ColumnOrdering as LanceDbColumnOrdering, VectorQuery as LanceDbVectorQuery};
use lancedb::query::VectorQuery as LanceDbVectorQuery;
use napi::bindgen_prelude::*;
use napi_derive::napi;
#[napi(object)]
pub struct ColumnOrdering {
pub ascending: bool,
pub nulls_first: bool,
pub column_name: String,
}
impl From<ColumnOrdering> for LanceDbColumnOrdering {
fn from(value: ColumnOrdering) -> Self {
match (value.ascending, value.nulls_first) {
(true, true) => Self::asc_nulls_first(value.column_name),
(true, false) => Self::asc_nulls_last(value.column_name),
(false, true) => Self::desc_nulls_first(value.column_name),
(false, false) => Self::desc_nulls_last(value.column_name),
}
}
}
use crate::error::NapiErrorExt;
use crate::error::convert_error;
use crate::iterator::RecordBatchIterator;
use crate::rerankers::RerankHybridCallbackArgs;
use crate::rerankers::Reranker;
use crate::util::{parse_distance_type, schema_to_buffer};
fn bytes_to_arrow_array(data: Uint8Array, dtype: String) -> napi::Result<Arc<dyn Array>> {
let buf = arrow_buffer::Buffer::from(data.to_vec());
@@ -145,18 +128,6 @@ impl Query {
self.inner = self.inner.clone().with_row_id();
}
#[napi]
pub fn order_by(&mut self, ordering: Option<Vec<ColumnOrdering>>) -> napi::Result<()> {
let ordering = ordering.map(|ordering| {
ordering
.into_iter()
.map(LanceDbColumnOrdering::from)
.collect()
});
self.inner = self.inner.clone().order_by(ordering);
Ok(())
}
#[napi(catch_unwind)]
pub async fn output_schema(&self) -> napi::Result<Buffer> {
let schema = self.inner.output_schema().await.default_error()?;
@@ -357,18 +328,6 @@ impl VectorQuery {
Ok(())
}
#[napi]
pub fn order_by(&mut self, ordering: Option<Vec<ColumnOrdering>>) -> napi::Result<()> {
let ordering = ordering.map(|ordering| {
ordering
.into_iter()
.map(LanceDbColumnOrdering::from)
.collect()
});
self.inner = self.inner.clone().order_by(ordering);
Ok(())
}
#[napi(catch_unwind)]
pub async fn output_schema(&self) -> napi::Result<Buffer> {
let schema = self.inner.output_schema().await.default_error()?;

View File

@@ -344,31 +344,6 @@ impl Table {
Ok(res.into())
}
#[napi(catch_unwind)]
pub async fn set_unenforced_primary_key(&self, columns: Vec<String>) -> napi::Result<()> {
self.inner_ref()?
.set_unenforced_primary_key(columns)
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn set_lsm_write_spec(&self, spec: LsmWriteSpec) -> napi::Result<()> {
let native_spec = lancedb::table::LsmWriteSpec::try_from(spec)?;
self.inner_ref()?
.set_lsm_write_spec(native_spec)
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn unset_lsm_write_spec(&self) -> napi::Result<()> {
self.inner_ref()?
.unset_lsm_write_spec()
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn version(&self) -> napi::Result<i64> {
self.inner_ref()?
@@ -563,63 +538,6 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
}
}
/// Specification selecting Lance's MemWAL LSM-style write path for
/// `mergeInsert`.
///
/// `specType` must be `"bucket"`, `"identity"`, or `"unsharded"`. For
/// `"bucket"`, `column` and `numBuckets` are required; for `"identity"`,
/// `column` is required.
#[napi(object)]
#[derive(Clone, Debug)]
pub struct LsmWriteSpec {
/// One of `"bucket"`, `"identity"`, or `"unsharded"`.
pub spec_type: String,
/// Bucket and identity variants: the sharding column.
pub column: Option<String>,
/// Bucket variant: the number of buckets, in `[1, 1024]`.
pub num_buckets: Option<u32>,
/// Names of indexes the MemWAL should keep up to date during writes.
pub maintained_indexes: Option<Vec<String>>,
/// Default `ShardWriter` configuration recorded in the MemWAL index.
pub writer_config_defaults: Option<HashMap<String, String>>,
}
impl TryFrom<LsmWriteSpec> for lancedb::table::LsmWriteSpec {
type Error = napi::Error;
fn try_from(value: LsmWriteSpec) -> napi::Result<Self> {
let maintained = value.maintained_indexes.unwrap_or_default();
let writer_config_defaults = value.writer_config_defaults.unwrap_or_default();
let spec = match value.spec_type.as_str() {
"bucket" => {
let column = value.column.ok_or_else(|| {
napi::Error::from_reason("LsmWriteSpec bucket requires `column`")
})?;
let num_buckets = value.num_buckets.ok_or_else(|| {
napi::Error::from_reason("LsmWriteSpec bucket requires `numBuckets`")
})?;
Self::bucket(column, num_buckets)
}
"identity" => {
let column = value.column.ok_or_else(|| {
napi::Error::from_reason("LsmWriteSpec identity requires `column`")
})?;
Self::identity(column)
}
"unsharded" => Self::unsharded(),
other => {
return Err(napi::Error::from_reason(format!(
"LsmWriteSpec `specType` must be 'bucket', 'identity', or 'unsharded', got '{}'",
other
)));
}
};
Ok(spec
.with_maintained_indexes(maintained)
.with_writer_config_defaults(writer_config_defaults))
}
}
/// Statistics about a compaction operation.
#[napi(object)]
#[derive(Clone, Debug)]

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.32.1-beta.0"
current_version = "0.31.0-beta.11"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.32.1-beta.0"
version = "0.31.0-beta.11"
publish = false
edition.workspace = true
description = "Python bindings for LanceDB"

View File

@@ -217,9 +217,6 @@ class Table:
async def uri(self) -> str: ...
async def initial_storage_options(self) -> Optional[Dict[str, str]]: ...
async def latest_storage_options(self) -> Optional[Dict[str, str]]: ...
async def set_unenforced_primary_key(self, columns: List[str]) -> None: ...
async def set_lsm_write_spec(self, spec: LsmWriteSpec) -> None: ...
async def unset_lsm_write_spec(self) -> None: ...
@property
def tags(self) -> Tags: ...
def query(self) -> Query: ...
@@ -258,11 +255,6 @@ class RecordBatchStream:
def __aiter__(self) -> "RecordBatchStream": ...
async def __anext__(self) -> pa.RecordBatch: ...
class ColumnOrdering(TypedDict):
column_name: str
ascending: bool
nulls_first: bool
class Query:
def where(self, filter: str): ...
def where_expr(self, expr: PyExpr): ...
@@ -276,7 +268,6 @@ class Query:
def postfilter(self): ...
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
def nearest_to_text(self, query: dict) -> FTSQuery: ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
async def output_schema(self) -> pa.Schema: ...
async def execute(
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
@@ -305,7 +296,6 @@ class FTSQuery:
def get_query(self) -> str: ...
def add_query_vector(self, query_vec: pa.Array) -> None: ...
def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
async def output_schema(self) -> pa.Schema: ...
async def execute(
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
@@ -331,7 +321,6 @@ class VectorQuery:
def maximum_nprobes(self, maximum_nprobes: int): ...
def bypass_vector_index(self): ...
def nearest_to_text(self, query: dict) -> HybridQuery: ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
def to_query_request(self) -> PyQueryRequest: ...
class HybridQuery:
@@ -350,7 +339,6 @@ class HybridQuery:
def minimum_nprobes(self, minimum_nprobes: int): ...
def maximum_nprobes(self, maximum_nprobes: int): ...
def bypass_vector_index(self): ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
def to_vector_query(self) -> VectorQuery: ...
def to_fts_query(self) -> FTSQuery: ...
def get_limit(self) -> int: ...
@@ -380,7 +368,6 @@ class PyQueryRequest:
bypass_vector_index: Optional[bool]
postfilter: Optional[bool]
norm: Optional[str]
order_by: Optional[List[ColumnOrdering]]
class CompactionStats:
fragments_removed: int
@@ -421,37 +408,6 @@ class MergeResult:
num_deleted_rows: int
num_attempts: int
class LsmWriteSpec:
"""Specification selecting Lance's MemWAL LSM-style write path for
`merge_insert`."""
@staticmethod
def bucket(column: str, num_buckets: int) -> "LsmWriteSpec": ...
@staticmethod
def identity(column: str) -> "LsmWriteSpec": ...
@staticmethod
def unsharded() -> "LsmWriteSpec": ...
def with_maintained_indexes(self, indexes: List[str]) -> "LsmWriteSpec":
"""Return a copy of this spec asking the MemWAL to keep the named
indexes up to date as rows are appended."""
...
def with_writer_config_defaults(self, defaults: Dict[str, str]) -> "LsmWriteSpec":
"""Return a copy of this spec recording the given default
`ShardWriter` configuration in the MemWAL index."""
...
@property
def spec_type(self) -> str:
"""One of 'bucket', 'identity', or 'unsharded'."""
...
@property
def column(self) -> Optional[str]: ...
@property
def num_buckets(self) -> Optional[int]: ...
@property
def maintained_indexes(self) -> List[str]: ...
@property
def writer_config_defaults(self) -> Dict[str, str]: ...
class AddColumnsResult:
version: int

View File

@@ -968,32 +968,22 @@ class Permutation:
new.transform_fn = transform
return new
def take_offsets(self, offsets: list[int]) -> Any:
"""
Take rows from the permutation by offset
The returned value is passed through the permutation's current transform,
so `with_format` and `with_transform` affect this method in the same way
they affect iteration.
"""
async def do_take_offsets():
return await self.reader.take_offsets(offsets, selection=self.selection)
batch = LOOP.run(do_take_offsets())
return self.transform_fn(batch)
def __getitem__(self, index: int) -> Any:
"""
Returns a single row from the permutation by offset
"""
return self.take_offsets([index])
return self.__getitems__([index])
def __getitems__(self, indices: list[int]) -> Any:
"""
Returns rows from the permutation by offset
"""
return self.take_offsets(indices)
async def do_getitems():
return await self.reader.take_offsets(indices, selection=self.selection)
batch = LOOP.run(do_getitems())
return self.transform_fn(batch)
@deprecated(details="Use with_skip instead")
def skip(self, skip: int) -> "Permutation":

View File

@@ -92,12 +92,6 @@ def ensure_vector_query(
return val
class ColumnOrdering(pydantic.BaseModel):
column_name: str
ascending: bool = True
nulls_first: bool = False
class FullTextQueryType(str, Enum):
MATCH = "match"
MATCH_PHRASE = "match_phrase"
@@ -510,8 +504,6 @@ class Query(pydantic.BaseModel):
# Bypass the vector index and use a brute force search
bypass_vector_index: Optional[bool] = None
order_by: Optional[List[ColumnOrdering]] = None
@classmethod
def from_inner(cls, req: PyQueryRequest) -> Self:
query = cls()
@@ -532,8 +524,6 @@ class Query(pydantic.BaseModel):
query.refine_factor = req.refine_factor
query.bypass_vector_index = req.bypass_vector_index
query.postfilter = req.postfilter
if req.order_by is not None:
query.order_by = [ColumnOrdering(**o) for o in req.order_by]
if req.full_text_search is not None:
query.full_text_query = FullTextSearchQuery(
columns=None,
@@ -582,22 +572,9 @@ class LanceQueryBuilder(ABC):
If "auto", the query type is inferred based on the query.
vector_column_name: str
The name of the vector column to use for vector search.
ordering_field_name: Optional[str]
.. deprecated:: 0.27.0
Use ``order_by()`` method instead.
fts_columns: Optional[Union[str, List[str]]]
The columns to search in for full text search.
fast_search: bool
Skip flat search of unindexed data.
"""
if ordering_field_name is not None:
import warnings
warnings.warn(
"ordering_field_name is deprecated, use .order_by() method instead.",
DeprecationWarning,
stacklevel=2,
)
# Check hybrid search first as it supports empty query pattern
if query_type == "hybrid":
# hybrid fts and vector query
@@ -694,7 +671,6 @@ class LanceQueryBuilder(ABC):
self._text = None
self._ef = None
self._bypass_vector_index = None
self._order_by = None
@deprecation.deprecated(
deprecated_in="0.3.1",
@@ -971,24 +947,6 @@ class LanceQueryBuilder(ABC):
""" # noqa: E501
return self._table._explain_plan(self.to_query_object(), verbose=verbose)
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> Self:
"""
Set the ordering for the results.
Parameters
----------
ordering: Optional[List[ColumnOrdering]]
The ordering to use for the results. If None, then the default ordering
will be used.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
self._order_by = ordering
return self
def analyze_plan(self) -> str:
"""
Run the query and return its execution plan with runtime metrics.
@@ -1356,7 +1314,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
fast_search=self._fast_search,
ef=self._ef,
bypass_vector_index=self._bypass_vector_index,
order_by=self._order_by,
)
def to_batches(
@@ -1508,9 +1465,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
super().__init__(table)
self._query = query
self._phrase_query = False
# Deprecated compatibility parameter. Native FTS ordering is now
# configured through order_by(); LanceQueryBuilder.create emits the warning.
_ = ordering_field_name
self.ordering_field_name = ordering_field_name
self._reranker = None
self._fast_search = fast_search
if isinstance(fts_columns, str):
@@ -1559,7 +1514,6 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
),
offset=self._offset,
fast_search=self._fast_search,
order_by=self._order_by,
)
def output_schema(self) -> pa.Schema:
@@ -1625,7 +1579,6 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
limit=self._limit,
with_row_id=self._with_row_id,
offset=self._offset,
order_by=self._order_by,
)
def output_schema(self) -> pa.Schema:
@@ -2549,27 +2502,6 @@ class AsyncStandardQuery(AsyncQueryBase):
self._inner.offset(offset)
return self
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> Self:
"""
Set the ordering for the results.
Parameters
----------
ordering: Optional[List[ColumnOrdering]]
The ordering to use for the results. If None, then the default ordering
will be used.
"""
if ordering is None:
self._inner.order_by(None)
else:
self._inner.order_by(
[
o.model_dump() if hasattr(o, "model_dump") else o.dict()
for o in ordering
]
)
return self
def fast_search(self) -> Self:
"""
Skip searching un-indexed data.

View File

@@ -14,7 +14,6 @@ from lancedb._lancedb import (
DeleteResult,
DropColumnsResult,
IndexConfig,
LsmWriteSpec,
MergeResult,
UpdateResult,
)
@@ -656,18 +655,6 @@ class RemoteTable(Table):
def drop_columns(self, columns: Iterable[str]) -> DropColumnsResult:
return LOOP.run(self._table.drop_columns(columns))
def set_unenforced_primary_key(self, columns: Union[str, Iterable[str]]) -> None:
"""Not supported on LanceDB Cloud."""
return LOOP.run(self._table.set_unenforced_primary_key(columns))
def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
"""Not supported on LanceDB Cloud."""
return LOOP.run(self._table.set_lsm_write_spec(spec))
def unset_lsm_write_spec(self) -> None:
"""Not supported on LanceDB Cloud."""
return LOOP.run(self._table.unset_lsm_write_spec())
def drop_index(self, index_name: str):
return LOOP.run(self._table.drop_index(index_name))

View File

@@ -154,7 +154,6 @@ if TYPE_CHECKING:
AlterColumnsResult,
DeleteResult,
DropColumnsResult,
LsmWriteSpec,
MergeResult,
UpdateResult,
)
@@ -3264,21 +3263,6 @@ class LanceTable(Table):
def drop_columns(self, columns: Iterable[str]) -> DropColumnsResult:
return LOOP.run(self._table.drop_columns(columns))
def set_unenforced_primary_key(self, columns: Union[str, Iterable[str]]) -> None:
"""Set the unenforced primary key. See
[`AsyncTable.set_unenforced_primary_key`][lancedb.AsyncTable.set_unenforced_primary_key]."""
return LOOP.run(self._table.set_unenforced_primary_key(columns))
def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
"""Install an LsmWriteSpec. See
[`AsyncTable.set_lsm_write_spec`][lancedb.AsyncTable.set_lsm_write_spec]."""
return LOOP.run(self._table.set_lsm_write_spec(spec))
def unset_lsm_write_spec(self) -> None:
"""Remove the LsmWriteSpec. See
[`AsyncTable.unset_lsm_write_spec`][lancedb.AsyncTable.unset_lsm_write_spec]."""
return LOOP.run(self._table.unset_lsm_write_spec())
def uses_v2_manifest_paths(self) -> bool:
"""
Check if the table is using the new v2 manifest paths.
@@ -3824,69 +3808,6 @@ class AsyncTable:
Any attempt to use the table after it has been closed will raise an error."""
return self._inner.close()
async def set_unenforced_primary_key(
self, columns: Union[str, Iterable[str]]
) -> None:
"""Set the unenforced primary key for this table to the given
ordered list of columns.
"Unenforced" means LanceDB does not check uniqueness on writes; the
columns are recorded in the schema as the primary key so that
features such as `merge_insert` can use them. Calling this again
replaces any previously-set primary key.
Parameters
----------
columns : str or Iterable[str]
Either a single column name (single-column key) or an ordered
iterable of column names (composite key). Each column dtype
must be one of: int32, int64, utf8, large_utf8, binary,
large_binary, fixed_size_binary.
"""
if isinstance(columns, str):
columns = [columns]
else:
columns = list(columns)
await self._inner.set_unenforced_primary_key(columns)
async def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
"""Install an LsmWriteSpec on this table.
The spec selects Lance's MemWAL LSM-style write path for future
`merge_insert` calls. ``LsmWriteSpec`` chooses one of three sharding
strategies:
- ``LsmWriteSpec.bucket(column, num_buckets)`` — hash-bucket writes by
the single-column unenforced primary key.
- ``LsmWriteSpec.identity(column)`` — shard by the raw value of a
scalar column.
- ``LsmWriteSpec.unsharded()`` — route every write to a single shard.
All variants require the table to have an unenforced primary key set
via [`set_unenforced_primary_key`]; bucket sharding additionally
requires it to be the single column being bucketed.
Parameters
----------
spec : LsmWriteSpec
The sharding spec to install.
Examples
--------
>>> from lancedb._lancedb import LsmWriteSpec
>>> # table.set_unenforced_primary_key("id")
>>> # table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 16))
"""
await self._inner.set_lsm_write_spec(spec)
async def unset_lsm_write_spec(self) -> None:
"""Remove the LsmWriteSpec from this table.
Reverts to the standard `merge_insert` write path. Errors if no spec
is currently set.
"""
await self._inner.unset_lsm_write_spec()
@property
def name(self) -> str:
"""The name of the table."""
@@ -4591,8 +4512,6 @@ class AsyncTable:
async_query = async_query.fast_search()
if query.with_row_id:
async_query = async_query.with_row_id()
if query.order_by:
async_query = async_query.order_by(query.order_by)
if query.vector:
async_query = async_query.nearest_to(query.vector).distance_range(

View File

@@ -29,7 +29,6 @@ from lancedb.query import (
MultiMatchQuery,
PhraseQuery,
BooleanQuery,
ColumnOrdering,
Occur,
LanceFtsQueryBuilder,
)
@@ -500,36 +499,6 @@ async def test_search_fts_specify_column_async(async_table):
pass
def test_search_order_by_descending(table):
table.create_fts_index("text")
rows = (
table.search("puppy")
.order_by([ColumnOrdering(column_name="count", ascending=False)])
.limit(20)
.select(["text", "count"])
.to_list()
)
for r in rows:
assert "puppy" in r["text"]
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
def test_search_order_by_ascending(table):
table.create_fts_index("text")
rows = (
table.search("puppy")
.order_by([ColumnOrdering(column_name="count", ascending=True)])
.limit(20)
.select(["text", "count"])
.to_list()
)
for r in rows:
assert "puppy" in r["text"]
assert sorted(rows, key=lambda x: x["count"]) == rows
def test_create_index_from_table(tmp_path, table):
table.create_fts_index("text")
df = table.search("puppy").limit(5).select(["text"]).to_pandas()

View File

@@ -1,149 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
"""Tests for installing and clearing an LsmWriteSpec via
`Table.set_lsm_write_spec` / `Table.unset_lsm_write_spec`.
"""
from datetime import timedelta
import lancedb
import pyarrow as pa
import pytest
from lancedb._lancedb import LsmWriteSpec
SCHEMA = pa.schema(
[
pa.field("id", pa.utf8(), nullable=False),
pa.field("v", pa.int32(), nullable=False),
]
)
def _batch(ids, vs):
return pa.RecordBatch.from_arrays(
[pa.array(ids, type=pa.utf8()), pa.array(vs, type=pa.int32())],
schema=SCHEMA,
)
def _reader(ids, vs):
return pa.RecordBatchReader.from_batches(SCHEMA, [_batch(ids, vs)])
def _make_table(tmp_path):
db = lancedb.connect(tmp_path, read_consistency_interval=timedelta(seconds=0))
table = db.create_table("t", _reader(["seed"], [0]))
return db, table
def test_set_lsm_write_spec_validates(tmp_path):
_db, table = _make_table(tmp_path)
# No PK set yet.
with pytest.raises(Exception, match="primary key"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
table.set_unenforced_primary_key("id")
# Column mismatch.
with pytest.raises(Exception, match="match"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("v", 4))
# Out-of-range num_buckets.
with pytest.raises(Exception, match="num_buckets"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 0))
with pytest.raises(Exception, match="num_buckets"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 1025))
# Happy path then mutation rejected.
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
with pytest.raises(Exception, match="mutation"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 8))
def test_unset_lsm_write_spec(tmp_path):
_db, table = _make_table(tmp_path)
# unset errors when no spec is set.
with pytest.raises(Exception, match="no LSM write spec"):
table.unset_lsm_write_spec()
# Install a spec, then remove it; afterwards a fresh spec can be set.
table.set_unenforced_primary_key("id")
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
table.unset_lsm_write_spec()
# A second unset errors — there is no spec left to remove.
with pytest.raises(Exception, match="no LSM write spec"):
table.unset_lsm_write_spec()
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 8))
def test_set_unsharded_spec(tmp_path):
_db, table = _make_table(tmp_path)
# Lance MemWAL still requires a primary key on the dataset; Unsharded
# just skips per-row hashing.
table.set_unenforced_primary_key("id")
table.set_lsm_write_spec(LsmWriteSpec.unsharded())
table.unset_lsm_write_spec()
def test_lsm_write_spec_repr():
s = LsmWriteSpec.bucket("id", 4)
assert s.spec_type == "bucket"
assert s.column == "id"
assert s.num_buckets == 4
assert s.maintained_indexes == []
assert "bucket" in repr(s)
assert "id" in repr(s)
assert "4" in repr(s)
u = LsmWriteSpec.unsharded()
assert u.spec_type == "unsharded"
assert u.column is None
assert u.num_buckets is None
assert "unsharded" in repr(u)
def test_lsm_write_spec_with_maintained_indexes():
s = LsmWriteSpec.bucket("id", 4).with_maintained_indexes(["idx_a", "idx_b"])
assert s.maintained_indexes == ["idx_a", "idx_b"]
@pytest.mark.asyncio
async def test_async_set_unset_lsm_write_spec(tmp_path):
db = await lancedb.connect_async(
tmp_path, read_consistency_interval=timedelta(seconds=0)
)
table = await db.create_table(
"t",
pa.RecordBatchReader.from_batches(SCHEMA, [_batch(["seed"], [0])]),
)
await table.set_unenforced_primary_key("id")
await table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
await table.unset_lsm_write_spec()
# A second unset errors.
with pytest.raises(Exception, match="no LSM write spec"):
await table.unset_lsm_write_spec()
def test_set_identity_spec(tmp_path):
_db, table = _make_table(tmp_path)
# Identity sharding still requires an unenforced primary key on the
# table; it shards by the raw value of the given column.
table.set_unenforced_primary_key("id")
table.set_lsm_write_spec(LsmWriteSpec.identity("v"))
table.unset_lsm_write_spec()
def test_lsm_write_spec_identity_and_writer_config_defaults():
s = LsmWriteSpec.identity("v")
assert s.spec_type == "identity"
assert s.column == "v"
assert s.num_buckets is None
assert "identity" in repr(s)
s = s.with_writer_config_defaults({"durable_write": "false"})
assert s.writer_config_defaults == {"durable_write": "false"}
assert "durable_write" in repr(s)

View File

@@ -1080,29 +1080,3 @@ def test_getitems_invalid_offset(some_permutation: Permutation):
"""Test __getitems__ with an out-of-range offset raises an error."""
with pytest.raises(Exception):
some_permutation.__getitems__([999999])
def test_take_offsets(some_permutation: Permutation):
result = some_permutation.take_offsets([0, 1, 2])
assert isinstance(result, list)
assert "id" in result[0]
assert "value" in result[0]
assert len(result) == 3
def test_take_offsets_empty_identity_permutation(mem_db):
tbl = mem_db.create_table(
"test_table", pa.table({"id": range(10), "value": range(10)})
)
permutation = Permutation.identity(tbl)
result = permutation.take_offsets([])
assert result == []
def test_take_offsets_empty_permutation(some_permutation: Permutation):
result = some_permutation.take_offsets([])
assert result == []

View File

@@ -1,79 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
"""Tests for Table.set_unenforced_primary_key."""
from datetime import timedelta
import lancedb
import pyarrow as pa
import pytest
def _empty_table(path, schema):
db = lancedb.connect(path, read_consistency_interval=timedelta(seconds=0))
return db.create_table("t", schema=schema)
def test_set_unenforced_primary_key_accepts_string_or_one_element_list(tmp_path):
schema = pa.schema([pa.field("id", pa.int64(), nullable=False)])
# Bare string.
table = _empty_table(tmp_path / "s", schema)
table.set_unenforced_primary_key("id")
# One-element list.
table = _empty_table(tmp_path / "l", schema)
table.set_unenforced_primary_key(["id"])
def test_set_unenforced_primary_key_rejects_compound_and_empty(tmp_path):
table = _empty_table(
tmp_path,
pa.schema(
[
pa.field("a", pa.utf8(), nullable=False),
pa.field("b", pa.int64(), nullable=False),
]
),
)
# Compound keys are not supported.
with pytest.raises(Exception, match="compound"):
table.set_unenforced_primary_key(["a", "b"])
# Empty input.
with pytest.raises(Exception, match="required"):
table.set_unenforced_primary_key([])
def test_set_unenforced_primary_key_is_immutable(tmp_path):
table = _empty_table(
tmp_path,
pa.schema(
[
pa.field("a", pa.utf8(), nullable=False),
pa.field("b", pa.int64(), nullable=False),
]
),
)
table.set_unenforced_primary_key("a")
# The primary key cannot be changed or re-set once installed.
with pytest.raises(Exception, match="already set"):
table.set_unenforced_primary_key("b")
with pytest.raises(Exception, match="already set"):
table.set_unenforced_primary_key("a")
def test_set_unenforced_primary_key_validates(tmp_path):
table = _empty_table(
tmp_path / "t", pa.schema([pa.field("id", pa.utf8(), nullable=False)])
)
# Unknown column.
with pytest.raises(Exception, match="not found"):
table.set_unenforced_primary_key("nonexistent")
# Unsupported dtype (Float32 not in the supported set).
bad = _empty_table(
tmp_path / "bad", pa.schema([pa.field("id", pa.float32(), nullable=False)])
)
with pytest.raises(Exception, match="not supported"):
bad.set_unenforced_primary_key("id")

View File

@@ -25,7 +25,6 @@ from lancedb.query import (
AsyncHybridQuery,
AsyncQueryBase,
AsyncVectorQuery,
ColumnOrdering,
LanceVectorQueryBuilder,
MatchQuery,
PhraseQuery,
@@ -165,71 +164,6 @@ def test_offset(table):
assert len(results_with_offset.to_pandas()) == 1
def test_order_by_plain_query(mem_db):
table = mem_db.create_table(
"test_order_by",
pa.table(
{
"group": [1, 1, 1, 2],
"score": [None, 1.0, 1.0, 0.5],
"name": ["z", "b", "a", "c"],
}
),
)
res = (
table.search()
.order_by(
[
ColumnOrdering(column_name="group", ascending=True, nulls_first=False),
ColumnOrdering(column_name="score", ascending=True, nulls_first=True),
ColumnOrdering(column_name="name", ascending=True, nulls_first=False),
]
)
.to_arrow()
)
assert res.select(["group", "score", "name"]).to_pylist() == [
{"group": 1, "score": None, "name": "z"},
{"group": 1, "score": 1.0, "name": "a"},
{"group": 1, "score": 1.0, "name": "b"},
{"group": 2, "score": 0.5, "name": "c"},
]
@pytest.mark.asyncio
async def test_order_by_async_query(mem_db_async: AsyncConnection):
table = await mem_db_async.create_table(
"test_order_by_async",
pa.table(
{
"group": [1, 1, 1, 2],
"score": [None, 1.0, 1.0, 0.5],
"name": ["z", "b", "a", "c"],
}
),
)
res = await (
table.query()
.order_by(
[
ColumnOrdering(column_name="group", ascending=True, nulls_first=False),
ColumnOrdering(column_name="score", ascending=True, nulls_first=True),
ColumnOrdering(column_name="name", ascending=True, nulls_first=False),
]
)
.to_arrow()
)
assert res.select(["group", "score", "name"]).to_pylist() == [
{"group": 1, "score": None, "name": "z"},
{"group": 1, "score": 1.0, "name": "a"},
{"group": 1, "score": 1.0, "name": "b"},
{"group": 2, "score": 0.5, "name": "c"},
]
def test_query_builder(table):
rs = (
LanceVectorQueryBuilder(table, [0, 0], "vector")

View File

@@ -16,7 +16,6 @@ from packaging.version import Version
import lancedb
from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.query import ColumnOrdering
from lancedb.remote import ClientConfig
from lancedb.remote.errors import HttpError, RetryError
import pytest
@@ -661,18 +660,6 @@ def test_query_sync_maximal():
"ef": None,
"filter": "id > 0",
"columns": ["id", "name"],
"order_by": [
{
"column_name": "score",
"ascending": False,
"nulls_first": True,
},
{
"column_name": "id",
"ascending": True,
"nulls_first": False,
},
],
"vector_column": "vector2",
"fast_search": True,
"with_row_id": True,
@@ -690,14 +677,6 @@ def test_query_sync_maximal():
.refine_factor(10)
.nprobes(5)
.where("id > 0", prefilter=True)
.order_by(
[
ColumnOrdering(
column_name="score", ascending=False, nulls_first=True
),
ColumnOrdering(column_name="id", ascending=True, nulls_first=False),
]
)
.with_row_id(True)
.select(["id", "name"])
.to_list()

View File

@@ -15,8 +15,8 @@ use pyo3::{
use query::{FTSQuery, HybridQuery, Query, VectorQuery};
use session::Session;
use table::{
AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, LsmWriteSpec,
MergeResult, Table, UpdateResult,
AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, MergeResult,
Table, UpdateResult,
};
pub mod arrow;
@@ -52,7 +52,6 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<AlterColumnsResult>()?;
m.add_class::<AddResult>()?;
m.add_class::<MergeResult>()?;
m.add_class::<LsmWriteSpec>()?;
m.add_class::<DeleteResult>()?;
m.add_class::<DropColumnsResult>()?;
m.add_class::<UpdateResult>()?;

View File

@@ -23,7 +23,7 @@ use lancedb::query::QueryBase;
use lancedb::query::QueryExecutionOptions;
use lancedb::query::QueryFilter;
use lancedb::query::{
ColumnOrdering, ExecutableQuery, Query as LanceDbQuery, Select, TakeQuery as LanceDbTakeQuery,
ExecutableQuery, Query as LanceDbQuery, Select, TakeQuery as LanceDbTakeQuery,
VectorQuery as LanceDbVectorQuery,
};
use lancedb::table::AnyQuery;
@@ -207,48 +207,6 @@ impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
#[derive(Clone)]
pub struct PyQueryVectors(Vec<Arc<dyn Array>>);
#[derive(Clone, FromPyObject)]
#[pyo3(from_item_all)]
pub struct PyColumnOrdering {
pub column_name: String,
pub ascending: bool,
pub nulls_first: bool,
}
impl From<ColumnOrdering> for PyColumnOrdering {
fn from(ordering: ColumnOrdering) -> Self {
Self {
column_name: ordering.column_name,
ascending: ordering.ascending,
nulls_first: ordering.nulls_first,
}
}
}
impl From<PyColumnOrdering> for ColumnOrdering {
fn from(ordering: PyColumnOrdering) -> Self {
Self {
column_name: ordering.column_name,
ascending: ordering.ascending,
nulls_first: ordering.nulls_first,
}
}
}
impl<'py> IntoPyObject<'py> for PyColumnOrdering {
type Target = PyDict;
type Output = Bound<'py, Self::Target>;
type Error = PyErr;
fn into_pyobject(self, py: pyo3::Python<'py>) -> PyResult<Self::Output> {
let dict = PyDict::new(py);
dict.set_item("column_name", self.column_name)?;
dict.set_item("ascending", self.ascending)?;
dict.set_item("nulls_first", self.nulls_first)?;
Ok(dict)
}
}
impl<'py> IntoPyObject<'py> for PyQueryVectors {
type Target = PyList;
type Output = Bound<'py, Self::Target>;
@@ -288,7 +246,6 @@ pub struct PyQueryRequest {
pub bypass_vector_index: Option<bool>,
pub postfilter: Option<bool>,
pub norm: Option<String>,
pub order_by: Option<Vec<PyColumnOrdering>>,
}
impl From<AnyQuery> for PyQueryRequest {
@@ -316,9 +273,6 @@ impl From<AnyQuery> for PyQueryRequest {
bypass_vector_index: None,
postfilter: None,
norm: None,
order_by: query_request
.order_by
.map(|order_by| order_by.into_iter().map(PyColumnOrdering::from).collect()),
},
AnyQuery::VectorQuery(vector_query) => Self {
limit: vector_query.base.limit,
@@ -343,10 +297,6 @@ impl From<AnyQuery> for PyQueryRequest {
bypass_vector_index: Some(!vector_query.use_index),
postfilter: Some(!vector_query.base.prefilter),
norm: vector_query.base.norm.map(|n| n.to_string()),
order_by: vector_query
.base
.order_by
.map(|order_by| order_by.into_iter().map(PyColumnOrdering::from).collect()),
},
}
}
@@ -525,13 +475,6 @@ impl Query {
})
}
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
let ordering =
ordering.map(|ordering| ordering.into_iter().map(ColumnOrdering::from).collect());
self.inner = self.inner.clone().order_by(ordering);
Ok(())
}
#[pyo3(signature = ())]
pub fn output_schema(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner.clone();
@@ -704,13 +647,6 @@ impl FTSQuery {
self.inner = self.inner.clone().offset(offset as usize);
}
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
let ordering =
ordering.map(|ordering| ordering.into_iter().map(ColumnOrdering::from).collect());
self.inner = self.inner.clone().order_by(ordering);
Ok(())
}
pub fn fast_search(&mut self) {
self.inner = self.inner.clone().fast_search();
}
@@ -846,13 +782,6 @@ impl VectorQuery {
self.inner = self.inner.clone().offset(offset as usize);
}
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
let ordering =
ordering.map(|ordering| ordering.into_iter().map(ColumnOrdering::from).collect());
self.inner = self.inner.clone().order_by(ordering);
Ok(())
}
pub fn fast_search(&mut self) {
self.inner = self.inner.clone().fast_search();
}
@@ -1025,12 +954,6 @@ impl HybridQuery {
self.inner_fts.offset(offset);
}
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
self.inner_vec.order_by(ordering.clone())?;
self.inner_fts.order_by(ordering)?;
Ok(())
}
pub fn fast_search(&mut self) {
self.inner_vec.fast_search();
self.inner_fts.fast_search();

View File

@@ -171,141 +171,6 @@ impl From<lancedb::table::MergeResult> for MergeResult {
}
}
/// Specification selecting Lance's MemWAL LSM-style write path for
/// `merge_insert`.
///
/// Constructed via the `bucket(...)`, `identity(...)`, or `unsharded()`
/// classmethods, then optionally chain `with_maintained_indexes(...)` and
/// `with_writer_config_defaults(...)`.
#[pyclass(from_py_object)]
#[derive(Clone, Debug)]
pub struct LsmWriteSpec {
inner: lancedb::table::LsmWriteSpec,
}
#[pymethods]
impl LsmWriteSpec {
/// Hash-bucket sharding by the unenforced primary key column.
#[staticmethod]
pub fn bucket(column: String, num_buckets: u32) -> Self {
Self {
inner: lancedb::table::LsmWriteSpec::bucket(column, num_buckets),
}
}
/// Identity sharding — shard by the raw value of `column`.
#[staticmethod]
pub fn identity(column: String) -> Self {
Self {
inner: lancedb::table::LsmWriteSpec::identity(column),
}
}
/// No sharding — every `merge_insert` call writes to a single
/// MemWAL shard.
#[staticmethod]
pub fn unsharded() -> Self {
Self {
inner: lancedb::table::LsmWriteSpec::unsharded(),
}
}
/// Replace the list of indexes the MemWAL should keep up to date as
/// rows are appended. Each name must reference an index that
/// already exists on the table at the time `set_lsm_write_spec`
/// is called.
pub fn with_maintained_indexes(&self, indexes: Vec<String>) -> Self {
Self {
inner: self.inner.clone().with_maintained_indexes(indexes),
}
}
/// Replace the default `ShardWriter` configuration recorded in the
/// MemWAL index, so every writer starts from the same defaults.
pub fn with_writer_config_defaults(&self, defaults: HashMap<String, String>) -> Self {
Self {
inner: self.inner.clone().with_writer_config_defaults(defaults),
}
}
pub fn __repr__(&self) -> String {
match &self.inner {
lancedb::table::LsmWriteSpec::Bucket {
column,
num_buckets,
maintained_indexes,
writer_config_defaults,
} => format!(
"LsmWriteSpec.bucket(column={:?}, num_buckets={}, maintained_indexes={:?}, writer_config_defaults={:?})",
column, num_buckets, maintained_indexes, writer_config_defaults,
),
lancedb::table::LsmWriteSpec::Identity {
column,
maintained_indexes,
writer_config_defaults,
} => format!(
"LsmWriteSpec.identity(column={:?}, maintained_indexes={:?}, writer_config_defaults={:?})",
column, maintained_indexes, writer_config_defaults,
),
lancedb::table::LsmWriteSpec::Unsharded {
maintained_indexes,
writer_config_defaults,
} => format!(
"LsmWriteSpec.unsharded(maintained_indexes={:?}, writer_config_defaults={:?})",
maintained_indexes, writer_config_defaults,
),
}
}
/// Discriminator string identifying the variant ("bucket", "identity",
/// or "unsharded").
#[getter]
pub fn spec_type(&self) -> &'static str {
match &self.inner {
lancedb::table::LsmWriteSpec::Bucket { .. } => "bucket",
lancedb::table::LsmWriteSpec::Identity { .. } => "identity",
lancedb::table::LsmWriteSpec::Unsharded { .. } => "unsharded",
}
}
/// Bucket and identity variants: the sharding column. `None` for unsharded.
#[getter]
pub fn column(&self) -> Option<String> {
match &self.inner {
lancedb::table::LsmWriteSpec::Bucket { column, .. }
| lancedb::table::LsmWriteSpec::Identity { column, .. } => Some(column.clone()),
lancedb::table::LsmWriteSpec::Unsharded { .. } => None,
}
}
/// Bucket variant only: the number of buckets.
#[getter]
pub fn num_buckets(&self) -> Option<u32> {
match &self.inner {
lancedb::table::LsmWriteSpec::Bucket { num_buckets, .. } => Some(*num_buckets),
_ => None,
}
}
/// Names of indexes the MemWAL should keep up to date during writes.
#[getter]
pub fn maintained_indexes(&self) -> Vec<String> {
self.inner.maintained_indexes().to_vec()
}
/// Default `ShardWriter` configuration recorded by this spec.
#[getter]
pub fn writer_config_defaults(&self) -> HashMap<String, String> {
self.inner.writer_config_defaults().clone()
}
}
impl From<LsmWriteSpec> for lancedb::table::LsmWriteSpec {
fn from(spec: LsmWriteSpec) -> Self {
spec.inner
}
}
#[pyclass(get_all, from_py_object)]
#[derive(Clone, Debug)]
pub struct AddColumnsResult {
@@ -940,37 +805,6 @@ impl Table {
})
}
pub fn set_unenforced_primary_key<'a>(
self_: PyRef<'a, Self>,
columns: Vec<String>,
) -> PyResult<Bound<'a, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
inner
.set_unenforced_primary_key(columns)
.await
.infer_error()
})
}
pub fn set_lsm_write_spec<'a>(
self_: PyRef<'a, Self>,
spec: LsmWriteSpec,
) -> PyResult<Bound<'a, PyAny>> {
let inner = self_.inner_ref()?.clone();
let native_spec = lancedb::table::LsmWriteSpec::from(spec);
future_into_py(self_.py(), async move {
inner.set_lsm_write_spec(native_spec).await.infer_error()
})
}
pub fn unset_lsm_write_spec(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
inner.unset_lsm_write_spec().await.infer_error()
})
}
pub fn uses_v2_manifest_paths(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.29.1-beta.0"
version = "0.28.0-beta.11"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -450,10 +450,6 @@ impl PermutationReader {
}
pub async fn take_offsets(&self, offsets: &[u64], selection: Select) -> Result<RecordBatch> {
if offsets.is_empty() {
return Ok(RecordBatch::new_empty(self.output_schema(selection).await?));
}
if let Some(permutation_table) = &self.permutation_table {
let offset_map = self.get_offset_map(permutation_table).await?;
let row_ids = offsets
@@ -959,62 +955,4 @@ mod tests {
.to_vec();
assert_eq!(idx_values, &all_idx_values[4997..5000]);
}
#[tokio::test]
async fn test_take_offsets_empty_identity_reader() {
let base_table = lance_datagen::gen_batch()
.col("idx", lance_datagen::array::step::<Int32Type>())
.into_mem_table("tbl", RowCount::from(10), BatchCount::from(1))
.await;
let reader = PermutationReader::identity(base_table.base_table().clone()).await;
let batch = reader.take_offsets(&[], Select::All).await.unwrap();
assert_eq!(batch.num_rows(), 0);
assert_eq!(batch.num_columns(), 1);
assert_eq!(batch.schema().field(0).name(), "idx");
}
#[tokio::test]
async fn test_take_offsets_empty_with_permutation_table() {
let (base_table, row_ids_table, _) = setup_permutation_tables(5).await;
let reader = PermutationReader::try_from_tables(
base_table.base_table().clone(),
row_ids_table.base_table().clone(),
0,
)
.await
.unwrap();
let batch = reader.take_offsets(&[], Select::All).await.unwrap();
assert_eq!(batch.num_rows(), 0);
assert_eq!(batch.schema().fields().len(), 2);
assert_eq!(batch.schema().field(0).name(), "idx");
assert_eq!(batch.schema().field(1).name(), "other_col");
}
#[tokio::test]
async fn test_take_offsets_empty_with_column_selection() {
let (base_table, row_ids_table, _) = setup_permutation_tables(5).await;
let reader = PermutationReader::try_from_tables(
base_table.base_table().clone(),
row_ids_table.base_table().clone(),
0,
)
.await
.unwrap();
let batch = reader
.take_offsets(&[], Select::Columns(vec!["idx".to_string()]))
.await
.unwrap();
assert_eq!(batch.num_rows(), 0);
assert_eq!(batch.num_columns(), 1);
assert_eq!(batch.schema().field(0).name(), "idx");
}
}

View File

@@ -11,8 +11,6 @@ use datafusion_expr::Expr;
use datafusion_physical_plan::ExecutionPlan;
use futures::{FutureExt, TryFutureExt, TryStreamExt, stream, try_join};
use half::f16;
/// Re-export Lance ColumnOrdering type for use in query ordering
pub use lance::dataset::scanner::ColumnOrdering;
use lance::dataset::{ROW_ID, scanner::DatasetRecordBatchStream};
use lance_arrow::RecordBatchExt;
use lance_datafusion::exec::execute_plan;
@@ -512,11 +510,6 @@ pub trait QueryBase {
/// the scores are converted to ranks and then normalized. If "Score", the
/// scores are normalized directly.
fn norm(self, norm: NormalizeMethod) -> Self;
/// Sort the results by the specified column(s).
///
/// This allows ordering query results by one or more columns in either ascending or descending order.
fn order_by(self, ordering: Option<Vec<ColumnOrdering>>) -> Self;
}
pub trait HasQuery {
@@ -581,11 +574,6 @@ impl<T: HasQuery> QueryBase for T {
self.mut_query().norm = Some(norm);
self
}
fn order_by(mut self, ordering: Option<Vec<ColumnOrdering>>) -> Self {
self.mut_query().order_by = ordering;
self
}
}
/// Options for controlling the execution of a query
@@ -762,11 +750,6 @@ pub struct QueryRequest {
///
/// By default, this is false (scoring columns are auto-projected for backward compatibility).
pub disable_scoring_autoprojection: bool,
/// Sort the results by the specified column(s).
///
/// This allows ordering query results by one or more columns in either ascending or descending order.
pub order_by: Option<Vec<ColumnOrdering>>,
}
impl Default for QueryRequest {
@@ -783,7 +766,6 @@ impl Default for QueryRequest {
reranker: None,
norm: None,
disable_scoring_autoprojection: false,
order_by: None,
}
}
}

View File

@@ -518,21 +518,6 @@ impl<S: HttpSend> RemoteTable<S> {
}
}
if let Some(order_by) = &params.order_by {
body["order_by"] = serde_json::Value::Array(
order_by
.iter()
.map(|o| {
serde_json::json!({
"column_name": o.column_name,
"ascending": o.ascending,
"nulls_first": o.nulls_first,
})
})
.collect(),
);
}
Ok(())
}
@@ -1667,24 +1652,6 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
Ok(merge_insert_response)
}
async fn set_unenforced_primary_key(&self, _columns: &[&str]) -> Result<()> {
Err(Error::NotSupported {
message: "set_unenforced_primary_key is not supported on LanceDB cloud.".into(),
})
}
async fn set_lsm_write_spec(&self, _spec: crate::table::LsmWriteSpec) -> Result<()> {
Err(Error::NotSupported {
message: "set_lsm_write_spec is not supported on LanceDB cloud.".into(),
})
}
async fn unset_lsm_write_spec(&self) -> Result<()> {
Err(Error::NotSupported {
message: "unset_lsm_write_spec is not supported on LanceDB cloud.".into(),
})
}
async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
Ok(Box::new(RemoteTags { inner: self }))
}
@@ -2111,7 +2078,7 @@ mod tests {
use crate::{
DistanceType, Error, Table,
index::{Index, IndexStatistics, IndexType, vector::IvfPqIndexBuilder},
query::{ColumnOrdering, ExecutableQuery, QueryBase},
query::{ExecutableQuery, QueryBase},
remote::ARROW_FILE_CONTENT_TYPE,
};
@@ -3021,18 +2988,6 @@ mod tests {
"distance_type": "cosine",
"bypass_vector_index": true,
"columns": ["a", "b"],
"order_by": [
{
"column_name": "score",
"ascending": false,
"nulls_first": true,
},
{
"column_name": "id",
"ascending": true,
"nulls_first": false,
}
],
"nprobes": 12,
"minimum_nprobes": 12,
"maximum_nprobes": 12,
@@ -3064,10 +3019,6 @@ mod tests {
.limit(42)
.offset(10)
.select(Select::columns(&["a", "b"]))
.order_by(Some(vec![
ColumnOrdering::desc_nulls_first("score".to_string()),
ColumnOrdering::asc_nulls_last("id".to_string()),
]))
.nearest_to(vec![0.1, 0.2, 0.3])
.unwrap()
.column("my_vector")

View File

@@ -74,7 +74,6 @@ pub(crate) mod dataset;
pub mod delete;
pub mod merge;
pub mod optimize;
mod primary_key;
pub mod query;
pub mod schema_evolution;
pub mod update;
@@ -273,176 +272,6 @@ pub trait Tags: Send + Sync {
pub use self::merge::MergeResult;
/// Specification selecting Lance's MemWAL LSM-style write path for
/// `merge_insert`.
///
/// Construct via [`LsmWriteSpec::bucket`], [`LsmWriteSpec::identity`], or
/// [`LsmWriteSpec::unsharded`], then optionally chain
/// [`LsmWriteSpec::with_maintained_indexes`] (indexes the MemWAL keeps up to
/// date) and [`LsmWriteSpec::with_writer_config_defaults`] (default
/// `ShardWriter` configuration recorded in the MemWAL index).
///
/// All variants require the table to have an unenforced primary key.
///
/// Install a spec with [`Table::set_lsm_write_spec`] and remove it with
/// [`Table::unset_lsm_write_spec`]. The actual `merge_insert` dispatch
/// onto the MemWAL writer is a follow-up.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LsmWriteSpec {
/// Hash-bucket sharding by the unenforced primary key column.
///
/// `column` must equal the table's currently-set single-column
/// unenforced primary key. `num_buckets` must be in `[1, 1024]`.
/// Iceberg-compatible Murmur3-x86-32 (seed 0) is used so each row's
/// `bucket(column, num_buckets)` value is stable across processes.
Bucket {
column: String,
num_buckets: u32,
/// Names of indexes (already created on the table) that the
/// MemWAL should maintain in-memory as rows are appended.
maintained_indexes: Vec<String>,
/// Default `ShardWriter` configuration recorded in the MemWAL index.
writer_config_defaults: HashMap<String, String>,
},
/// Identity sharding — shard by the raw value of `column`.
///
/// Use this when the data is already partitioned by `column`; each
/// distinct value of `column` becomes its own shard.
Identity {
column: String,
/// Names of indexes (already created on the table) that the
/// MemWAL should maintain in-memory as rows are appended.
maintained_indexes: Vec<String>,
/// Default `ShardWriter` configuration recorded in the MemWAL index.
writer_config_defaults: HashMap<String, String>,
},
/// No sharding — every `merge_insert` call writes to a single MemWAL shard.
Unsharded {
/// Names of indexes (already created on the table) that the
/// MemWAL should maintain in-memory as rows are appended.
maintained_indexes: Vec<String>,
/// Default `ShardWriter` configuration recorded in the MemWAL index.
writer_config_defaults: HashMap<String, String>,
},
}
impl LsmWriteSpec {
/// Construct a hash-bucket sharding spec with no maintained indexes.
pub fn bucket(column: impl Into<String>, num_buckets: u32) -> Self {
Self::Bucket {
column: column.into(),
num_buckets,
maintained_indexes: Vec::new(),
writer_config_defaults: HashMap::new(),
}
}
/// Construct an identity-sharding spec (shard by the raw value of
/// `column`) with no maintained indexes.
pub fn identity(column: impl Into<String>) -> Self {
Self::Identity {
column: column.into(),
maintained_indexes: Vec::new(),
writer_config_defaults: HashMap::new(),
}
}
/// Construct an unsharded spec with no maintained indexes.
pub fn unsharded() -> Self {
Self::Unsharded {
maintained_indexes: Vec::new(),
writer_config_defaults: HashMap::new(),
}
}
/// Replace the list of indexes the MemWAL should keep up to date as
/// rows are appended. Each name must reference an index that already
/// exists on the table at the time `set_lsm_write_spec` is called.
pub fn with_maintained_indexes<I, S>(mut self, indexes: I) -> Self
where
I: IntoIterator<Item = S>,
S: Into<String>,
{
let v: Vec<String> = indexes.into_iter().map(Into::into).collect();
match &mut self {
Self::Bucket {
maintained_indexes, ..
}
| Self::Identity {
maintained_indexes, ..
}
| Self::Unsharded {
maintained_indexes, ..
} => *maintained_indexes = v,
}
self
}
/// Replace the default `ShardWriter` configuration recorded in the MemWAL
/// index, so every writer starts from the same defaults. Keys are
/// `ShardWriter` config field names (`Duration` knobs use a `_ms` suffix);
/// values are their string encodings.
pub fn with_writer_config_defaults<I, K, V>(mut self, defaults: I) -> Self
where
I: IntoIterator<Item = (K, V)>,
K: Into<String>,
V: Into<String>,
{
let m: HashMap<String, String> = defaults
.into_iter()
.map(|(k, v)| (k.into(), v.into()))
.collect();
match &mut self {
Self::Bucket {
writer_config_defaults,
..
}
| Self::Identity {
writer_config_defaults,
..
}
| Self::Unsharded {
writer_config_defaults,
..
} => *writer_config_defaults = m,
}
self
}
/// Borrow the list of index names this spec asks MemWAL to maintain.
pub fn maintained_indexes(&self) -> &[String] {
match self {
Self::Bucket {
maintained_indexes, ..
}
| Self::Identity {
maintained_indexes, ..
}
| Self::Unsharded {
maintained_indexes, ..
} => maintained_indexes,
}
}
/// Borrow the default `ShardWriter` configuration recorded by this spec.
pub fn writer_config_defaults(&self) -> &HashMap<String, String> {
match self {
Self::Bucket {
writer_config_defaults,
..
}
| Self::Identity {
writer_config_defaults,
..
}
| Self::Unsharded {
writer_config_defaults,
..
} => writer_config_defaults,
}
}
}
/// A trait for anything "table-like". This is used for both native tables (which target
/// Lance datasets) and remote tables (which target LanceDB cloud)
///
@@ -516,43 +345,6 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
params: MergeInsertBuilder,
new_data: Box<dyn RecordBatchReader + Send>,
) -> Result<MergeResult>;
/// Set the unenforced primary key for the table to a single column.
///
/// "Unenforced" means LanceDB does not check uniqueness on writes; the
/// column is recorded in the schema as the primary key for use by
/// features such as `merge_insert`. Only single-column primary keys are
/// supported, and the key cannot be changed once set.
///
/// The default implementation returns `NotSupported`; table types
/// backed by a Lance dataset override it.
async fn set_unenforced_primary_key(&self, _columns: &[&str]) -> Result<()> {
Err(Error::NotSupported {
message: "set_unenforced_primary_key is not supported on this table type".into(),
})
}
/// Install an [`LsmWriteSpec`] on this table.
///
/// The spec selects Lance's MemWAL LSM-style write path for future
/// `merge_insert` calls.
///
/// The default implementation returns `NotSupported`. Implementations
/// that support the MemWAL LSM write path must override this.
async fn set_lsm_write_spec(&self, _spec: LsmWriteSpec) -> Result<()> {
Err(Error::NotSupported {
message: "set_lsm_write_spec is not supported on this table type".into(),
})
}
/// Remove the [`LsmWriteSpec`] from this table.
///
/// This is a no-op if no spec is currently set.
///
/// The default implementation returns `NotSupported`. Implementations
/// that support the MemWAL LSM write path must override this.
async fn unset_lsm_write_spec(&self) -> Result<()> {
Err(Error::NotSupported {
message: "unset_lsm_write_spec is not supported on this table type".into(),
})
}
/// Gets the table tag manager.
async fn tags(&self) -> Result<Box<dyn Tags + '_>>;
/// Optimize the dataset.
@@ -1271,68 +1063,6 @@ impl Table {
self.inner.drop_columns(columns).await
}
/// Set the unenforced primary key for this table to a single column.
///
/// "Unenforced" means LanceDB does not check uniqueness on writes; the
/// column is recorded in the schema as the primary key so that features
/// such as `merge_insert` can use it.
///
/// Only single-column primary keys are supported, and the key cannot be
/// changed once set — calling this on a table that already has an
/// unenforced primary key fails. `columns` is an iterable for binding
/// ergonomics but must yield exactly one column:
///
/// - `table.set_unenforced_primary_key(["id"])`
pub async fn set_unenforced_primary_key<I, S>(&self, columns: I) -> Result<()>
where
I: IntoIterator<Item = S>,
S: Into<String>,
{
let owned: Vec<String> = columns.into_iter().map(Into::into).collect();
let borrowed: Vec<&str> = owned.iter().map(String::as_str).collect();
self.inner.set_unenforced_primary_key(&borrowed).await
}
/// Install an [`LsmWriteSpec`] on this table, selecting Lance's MemWAL
/// LSM-style write path for future `merge_insert` calls.
///
/// [`LsmWriteSpec`] chooses one of three sharding strategies:
///
/// - [`LsmWriteSpec::bucket`] — hash-bucket writes by the single-column
/// unenforced primary key.
/// - [`LsmWriteSpec::identity`] — shard by the raw value of a scalar column.
/// - [`LsmWriteSpec::unsharded`] — route every write to a single shard.
///
/// All variants require the table to have an unenforced primary key
/// ([`Table::set_unenforced_primary_key`]); bucket sharding additionally
/// requires it to be the single column being bucketed.
///
/// # Example
///
/// ```
/// # use lancedb::table::{LsmWriteSpec, Table};
/// # async fn example(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
/// table.set_unenforced_primary_key(["id"]).await?;
/// table
/// .set_lsm_write_spec(
/// LsmWriteSpec::bucket("id", 16).with_maintained_indexes(["id_idx"]),
/// )
/// .await?;
/// # Ok(())
/// # }
/// ```
pub async fn set_lsm_write_spec(&self, spec: LsmWriteSpec) -> Result<()> {
self.inner.set_lsm_write_spec(spec).await
}
/// Remove the [`LsmWriteSpec`] from this table, reverting to the standard
/// `merge_insert` write path.
///
/// Errors if no spec is currently set.
pub async fn unset_lsm_write_spec(&self) -> Result<()> {
self.inner.unset_lsm_write_spec().await
}
/// Retrieve the version of the table
///
/// LanceDb supports versioning. Every operation that modifies the table increases
@@ -2661,13 +2391,28 @@ impl BaseTable for NativeTable {
message: "Multi-column (composite) indices are not yet supported".to_string(),
});
}
let schema = self.schema().await?;
let dataset = self.dataset.get().await?;
let Some(field_path) = dataset.schema().resolve_case_insensitive(&opts.columns[0]) else {
return Err(Error::Schema {
message: format!(
"Unable to get field named {:?}. Valid fields: {:?}",
opts.columns[0],
dataset.schema().field_paths()
),
});
};
let field = (*field_path.last().expect("resolved field path is non-empty")).clone();
let names = field_path
.iter()
.map(|f| f.name.as_str())
.collect::<Vec<_>>();
let column = lance_core::datatypes::format_field_path(&names);
drop(dataset);
let field = schema.field_with_name(&opts.columns[0])?;
let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?;
let index_type = self.get_index_type_for_field(field, &opts.index);
let columns = [field.name().as_str()];
let field = Field::from(&field);
let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?;
let index_type = self.get_index_type_for_field(&field, &opts.index);
let columns = [column.as_str()];
self.dataset.ensure_mutable()?;
let mut dataset = (*self.dataset.get().await?).clone();
let mut builder = dataset
@@ -2739,18 +2484,6 @@ impl BaseTable for NativeTable {
merge::execute_merge_insert(self, params, new_data).await
}
async fn set_unenforced_primary_key(&self, columns: &[&str]) -> Result<()> {
primary_key::set_unenforced_primary_key(self, columns).await
}
async fn set_lsm_write_spec(&self, spec: LsmWriteSpec) -> Result<()> {
merge::lsm::set_lsm_write_spec(self, spec).await
}
async fn unset_lsm_write_spec(&self) -> Result<()> {
merge::lsm::unset_lsm_write_spec(self).await
}
/// Delete rows from the table
async fn delete(&self, predicate: &str) -> Result<DeleteResult> {
// Delegate to the submodule implementation
@@ -2825,11 +2558,11 @@ impl BaseTable for NativeTable {
let mut columns = Vec::with_capacity(idx.fields.len());
for field_id in &idx.fields {
let Some(field) = dataset.schema().field_by_id(*field_id) else {
let Ok(field_path) = dataset.schema().field_path(*field_id) else {
log::warn!("The index {} ({}) referenced a field with id {} which does not exist in the schema", idx.name, idx.uuid, field_id);
return None;
};
columns.push(field.name.clone());
columns.push(field_path);
}
let name = idx.name.clone();
@@ -3043,7 +2776,7 @@ mod tests {
use arrow_array::{
Array, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, RecordBatch,
RecordBatchIterator, RecordBatchReader, StringArray,
RecordBatchIterator, RecordBatchReader, StringArray, StructArray,
builder::{ListBuilder, StringBuilder},
};
use arrow_array::{BinaryArray, LargeBinaryArray};
@@ -3052,7 +2785,6 @@ mod tests {
use futures::TryStreamExt;
use lance::Dataset;
use lance::io::{ObjectStoreParams, WrappingObjectStore};
use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION;
use tempfile::tempdir;
use super::*;
@@ -3650,6 +3382,57 @@ mod tests {
assert_eq!(stats.num_unindexed_rows, 0);
}
#[tokio::test]
async fn test_create_scalar_index_on_nested_field() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let metadata_fields = vec![Arc::new(Field::new("user_id", DataType::Int32, false))];
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new(
"metadata",
DataType::Struct(metadata_fields.clone().into()),
false,
),
]));
let metadata = StructArray::new(
metadata_fields.into(),
vec![Arc::new(Int32Array::from_iter_values(0..10))],
None,
);
let batch = RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from_iter_values(0..10)),
Arc::new(metadata),
],
)
.unwrap();
let conn = ConnectBuilder::new(uri).execute().await.unwrap();
let table = conn
.create_table("my_table", batch)
.execute()
.await
.unwrap();
table
.create_index(
&["metadata.user_id"],
Index::BTree(BTreeIndexBuilder::default()),
)
.execute()
.await
.unwrap();
let index_configs = table.list_indices().await.unwrap();
assert_eq!(index_configs.len(), 1);
assert_eq!(
index_configs[0].columns,
vec!["metadata.user_id".to_string()]
);
}
#[tokio::test]
async fn test_create_bitmap_index() {
let tmp_dir = tempdir().unwrap();
@@ -4147,395 +3930,6 @@ mod tests {
);
}
#[tokio::test]
async fn test_set_unenforced_primary_key() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("name", DataType::Utf8, true),
Field::new("score", DataType::Float64, true),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(arrow_array::Int64Array::from(vec![1, 2, 3])),
Arc::new(StringArray::from(vec!["a", "b", "c"])),
Arc::new(arrow_array::Float64Array::from(vec![1.0, 2.0, 3.0])),
],
)
.unwrap();
let reader: Box<dyn arrow_array::RecordBatchReader + Send> =
Box::new(RecordBatchIterator::new(vec![Ok(batch)], schema.clone()));
let conn = ConnectBuilder::new(uri)
.read_consistency_interval(Duration::from_secs(0))
.execute()
.await
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// Reject empty input.
let err = table
.set_unenforced_primary_key(Vec::<&str>::new())
.await
.expect_err("empty input should be rejected");
assert!(matches!(err, Error::InvalidInput { .. }), "got {:?}", err);
// Reject compound (multi-column) input.
let err = table
.set_unenforced_primary_key(["id", "name"])
.await
.expect_err("compound primary key should be rejected");
assert!(matches!(err, Error::InvalidInput { .. }), "got {:?}", err);
// Reject unknown column.
let err = table
.set_unenforced_primary_key(["nonexistent"])
.await
.expect_err("nonexistent column should be rejected");
assert!(matches!(err, Error::InvalidInput { .. }), "got {:?}", err);
// Reject unsupported dtype (Float64).
let err = table
.set_unenforced_primary_key(["score"])
.await
.expect_err("Float64 should be rejected");
assert!(matches!(err, Error::InvalidInput { .. }), "got {:?}", err);
// None of the rejected calls set a primary key.
let lance_schema = table.as_native().unwrap().manifest().await.unwrap().schema;
assert!(lance_schema.unenforced_primary_key().is_empty());
// Happy path: set the primary key to "id".
table.set_unenforced_primary_key(["id"]).await.unwrap();
let lance_schema = table.as_native().unwrap().manifest().await.unwrap().schema;
let pk = lance_schema.unenforced_primary_key();
assert_eq!(pk.len(), 1);
assert_eq!(pk[0].name, "id");
// Position metadata is 1-indexed.
assert_eq!(
pk[0].metadata.get(LANCE_UNENFORCED_PRIMARY_KEY_POSITION),
Some(&"1".to_string())
);
// The primary key is immutable: re-setting it is rejected, whether to
// the same column or a different one.
let err = table
.set_unenforced_primary_key(["id"])
.await
.expect_err("re-setting the same primary key should be rejected");
assert!(matches!(err, Error::InvalidInput { .. }), "got {:?}", err);
let err = table
.set_unenforced_primary_key(["name"])
.await
.expect_err("changing the primary key should be rejected");
assert!(matches!(err, Error::InvalidInput { .. }), "got {:?}", err);
// The primary key is unchanged after the rejected calls.
let lance_schema = table.as_native().unwrap().manifest().await.unwrap().schema;
let pk = lance_schema.unenforced_primary_key();
assert_eq!(pk.len(), 1);
assert_eq!(pk[0].name, "id");
}
#[tokio::test]
async fn test_set_unenforced_primary_key_concurrent() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![Arc::new(arrow_array::Int64Array::from(vec![1, 2, 3]))],
)
.unwrap();
let reader: Box<dyn arrow_array::RecordBatchReader + Send> =
Box::new(RecordBatchIterator::new(vec![Ok(batch)], schema.clone()));
// A long read-consistency interval keeps each handle pinned to the
// version it opened, so the second handle commits against a stale
// base — the same situation as two processes racing.
let conn = ConnectBuilder::new(uri)
.read_consistency_interval(Duration::from_secs(3600))
.execute()
.await
.unwrap();
conn.create_table("t", reader).execute().await.unwrap();
let table_a = conn.open_table("t").execute().await.unwrap();
let table_b = conn.open_table("t").execute().await.unwrap();
// Handle A sets the primary key first.
table_a.set_unenforced_primary_key(["id"]).await.unwrap();
// Handle B committed against a stale base that had no primary key, so
// its own up-front check did not see A's key. The commit itself must
// still fail rather than silently overriding A's primary key. (The
// cross-process race on a *different* column is caught by the Lance
// commit layer.)
let err = table_b
.set_unenforced_primary_key(["id"])
.await
.expect_err("concurrent primary key commit on a stale base should fail");
assert!(
!matches!(err, Error::InvalidInput { .. }),
"expected a commit-time conflict, not an up-front input error: {:?}",
err
);
// The committed primary key is exactly what A set — no corruption.
let fresh = conn.open_table("t").execute().await.unwrap();
let lance_schema = fresh.as_native().unwrap().manifest().await.unwrap().schema;
let pk = lance_schema.unenforced_primary_key();
assert_eq!(pk.len(), 1);
assert_eq!(pk[0].name, "id");
}
#[tokio::test]
async fn test_set_lsm_write_spec() {
use arrow_array::StringArray;
use lance::dataset::mem_wal::DatasetMemWalExt;
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("name", DataType::Utf8, true),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(arrow_array::Int64Array::from(vec![1, 2, 3])),
Arc::new(StringArray::from(vec!["a", "b", "c"])),
],
)
.unwrap();
let reader: Box<dyn arrow_array::RecordBatchReader + Send> =
Box::new(RecordBatchIterator::new(vec![Ok(batch)], schema.clone()));
let conn = ConnectBuilder::new(uri)
.read_consistency_interval(Duration::from_secs(0))
.execute()
.await
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// Reject when no PK is set.
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
.await
.expect_err("should reject without PK");
assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
// Set PK, then a mismatched column on the spec must be rejected.
table.set_unenforced_primary_key(["id"]).await.unwrap();
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("name", 4))
.await
.expect_err("should reject column != PK");
assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
// Reject num_buckets out of range.
for bad in [0u32, 1025] {
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", bad))
.await
.expect_err("should reject");
assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
}
// Happy path: install spec; verify MemWAL details record it.
table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
.await
.unwrap();
let native_tbl = table.as_native().unwrap();
let dataset = native_tbl.dataset.get().await.unwrap();
let details = dataset
.mem_wal_index_details()
.await
.unwrap()
.expect("MemWAL index should be initialized");
assert_eq!(details.num_shards, 4);
assert_eq!(details.sharding_specs.len(), 1);
let installed = &details.sharding_specs[0];
assert_eq!(installed.fields.len(), 1);
let f = &installed.fields[0];
assert_eq!(f.transform.as_deref(), Some("bucket"));
assert_eq!(
f.parameters.get("num_buckets").map(String::as_str),
Some("4")
);
// Bucket parameters must hold only `num_buckets`.
assert_eq!(f.parameters.len(), 1);
// Mutation rejected.
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 8))
.await
.expect_err("mutation should be rejected");
assert!(matches!(err, Error::InvalidInput { .. }), "got {:?}", err);
}
#[tokio::test]
async fn test_set_lsm_write_spec_unsharded() {
use lance::dataset::mem_wal::DatasetMemWalExt;
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![Arc::new(arrow_array::Int64Array::from(vec![1]))],
)
.unwrap();
let reader: Box<dyn arrow_array::RecordBatchReader + Send> =
Box::new(RecordBatchIterator::new(vec![Ok(batch)], schema.clone()));
let conn = ConnectBuilder::new(uri)
.read_consistency_interval(Duration::from_secs(0))
.execute()
.await
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// Lance's MemWAL still requires *some* unenforced primary key on
// the dataset; Unsharded just skips the per-row hashing step.
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(LsmWriteSpec::unsharded())
.await
.unwrap();
let dataset = table.as_native().unwrap().dataset.get().await.unwrap();
let details = dataset
.mem_wal_index_details()
.await
.unwrap()
.expect("MemWAL index should be initialized");
assert_eq!(details.num_shards, 1);
assert_eq!(details.sharding_specs.len(), 1);
let f = &details.sharding_specs[0].fields[0];
assert_eq!(f.transform.as_deref(), Some("unsharded"));
assert!(f.source_ids.is_empty());
}
#[tokio::test]
async fn test_set_lsm_write_spec_identity() {
use lance::dataset::mem_wal::DatasetMemWalExt;
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("region", DataType::Utf8, true),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(arrow_array::Int64Array::from(vec![1, 2, 3])),
Arc::new(StringArray::from(vec!["a", "b", "c"])),
],
)
.unwrap();
let reader: Box<dyn arrow_array::RecordBatchReader + Send> =
Box::new(RecordBatchIterator::new(vec![Ok(batch)], schema.clone()));
let conn = ConnectBuilder::new(uri)
.read_consistency_interval(Duration::from_secs(0))
.execute()
.await
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(
LsmWriteSpec::identity("region")
.with_writer_config_defaults([("durable_write", "false")]),
)
.await
.unwrap();
let dataset = table.as_native().unwrap().dataset.get().await.unwrap();
let details = dataset
.mem_wal_index_details()
.await
.unwrap()
.expect("MemWAL index should be initialized");
// Identity sharding records an open-ended shard count.
assert_eq!(details.num_shards, 0);
assert_eq!(details.sharding_specs.len(), 1);
let f = &details.sharding_specs[0].fields[0];
assert_eq!(f.transform.as_deref(), Some("identity"));
// Writer config defaults round-trip into the MemWAL index.
assert_eq!(
details
.writer_config_defaults
.get("durable_write")
.map(String::as_str),
Some("false")
);
}
#[tokio::test]
async fn test_unset_lsm_write_spec() {
use lance::dataset::mem_wal::DatasetMemWalExt;
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![Arc::new(arrow_array::Int64Array::from(vec![1]))],
)
.unwrap();
let reader: Box<dyn arrow_array::RecordBatchReader + Send> =
Box::new(RecordBatchIterator::new(vec![Ok(batch)], schema.clone()));
let conn = ConnectBuilder::new(uri)
.read_consistency_interval(Duration::from_secs(0))
.execute()
.await
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// unset errors when no spec is set.
table.unset_lsm_write_spec().await.unwrap_err();
// Install a spec, then unset it.
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
.await
.unwrap();
{
let dataset = table.as_native().unwrap().dataset.get().await.unwrap();
assert!(dataset.mem_wal_index_details().await.unwrap().is_some());
}
table.unset_lsm_write_spec().await.unwrap();
{
let dataset = table.as_native().unwrap().dataset.get().await.unwrap();
assert!(dataset.mem_wal_index_details().await.unwrap().is_none());
}
// A second unset errors; a fresh spec can still be installed afterwards.
table.unset_lsm_write_spec().await.unwrap_err();
table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 8))
.await
.unwrap();
{
let dataset = table.as_native().unwrap().dataset.get().await.unwrap();
assert!(dataset.mem_wal_index_details().await.unwrap().is_some());
}
}
#[tokio::test]
pub async fn test_stats() {
let tmp_dir = tempdir().unwrap();

View File

@@ -16,8 +16,6 @@ use crate::error::{Error, Result};
use super::{BaseTable, NativeTable};
pub(crate) mod lsm;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct MergeResult {
// The commit version associated with the operation.

View File

@@ -1,101 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! MemWAL LSM write-path spec management.
//!
//! [`set_lsm_write_spec`] installs a [`super::super::LsmWriteSpec`] on a
//! table, which selects Lance's MemWAL LSM-style write path for future
//! `merge_insert` calls. [`unset_lsm_write_spec`] removes it. The actual
//! `merge_insert` dispatch and writer are a follow-up.
use lance::dataset::mem_wal::DatasetMemWalExt;
use lance::index::DatasetIndexExt;
use crate::error::{Error, Result};
use crate::table::{LsmWriteSpec, NativeTable};
// =============================================================================
// set_lsm_write_spec
// =============================================================================
/// Install an [`LsmWriteSpec`] on the table.
///
/// The bucket / unsharded sharding spec is constructed and validated by Lance's
/// [`InitializeMemWalBuilder`](lance::dataset::mem_wal::InitializeMemWalBuilder).
#[allow(clippy::redundant_pub_crate)]
pub(crate) async fn set_lsm_write_spec(table: &NativeTable, spec: LsmWriteSpec) -> Result<()> {
table.dataset.ensure_mutable()?;
{
let dataset = table.dataset.get().await?;
if dataset.mem_wal_index_details().await?.is_some() {
return Err(Error::InvalidInput {
message: "set_lsm_write_spec: an LSM write spec is already set on this table; mutation is not supported".into(),
});
}
}
let mut dataset = (*table.dataset.get().await?).clone();
let mut builder = dataset.initialize_mem_wal();
let (maintained_indexes, writer_config_defaults) = match spec {
LsmWriteSpec::Bucket {
column,
num_buckets,
maintained_indexes,
writer_config_defaults,
} => {
builder = builder.bucket_sharding(column, num_buckets);
(maintained_indexes, writer_config_defaults)
}
LsmWriteSpec::Identity {
column,
maintained_indexes,
writer_config_defaults,
} => {
builder = builder.identity_sharding(column);
(maintained_indexes, writer_config_defaults)
}
LsmWriteSpec::Unsharded {
maintained_indexes,
writer_config_defaults,
} => {
builder = builder.unsharded();
(maintained_indexes, writer_config_defaults)
}
};
builder = builder.maintained_indexes(maintained_indexes);
for (key, value) in writer_config_defaults {
builder = builder.add_writer_config_default(key, value);
}
builder.execute().await?;
table.dataset.update(dataset);
Ok(())
}
// =============================================================================
// unset_lsm_write_spec
// =============================================================================
/// Remove the [`LsmWriteSpec`] from the table by dropping the MemWAL index.
///
/// Errors if no spec is currently set.
#[allow(clippy::redundant_pub_crate)]
pub(crate) async fn unset_lsm_write_spec(table: &NativeTable) -> Result<()> {
table.dataset.ensure_mutable()?;
{
let dataset = table.dataset.get().await?;
if dataset.mem_wal_index_details().await?.is_none() {
return Err(Error::InvalidInput {
message: "unset_lsm_write_spec: no LSM write spec is set on this table".into(),
});
}
}
let mut dataset = (*table.dataset.get().await?).clone();
dataset
.drop_index(lance_index::mem_wal::MEM_WAL_INDEX_NAME)
.await?;
table.dataset.update(dataset);
Ok(())
}

View File

@@ -1,113 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Table-level unenforced primary key support.
//!
//! [`set_unenforced_primary_key`] records a column as the table's primary key
//! by writing Lance schema field metadata. "Unenforced" means LanceDB does not
//! check uniqueness on write; the key is metadata for features such as
//! `merge_insert` to consume.
//!
//! Only a single-column primary key is supported, and the key cannot be
//! changed once set.
use arrow_schema::DataType;
use lance_core::datatypes::{LANCE_UNENFORCED_PRIMARY_KEY, LANCE_UNENFORCED_PRIMARY_KEY_POSITION};
use crate::error::{Error, Result};
use crate::table::NativeTable;
/// Set the unenforced primary key on `table` to the single column in `columns`.
///
/// Fails if `columns` is not exactly one column (compound primary keys are not
/// supported), if the column does not exist or has an unsupported dtype, or if
/// the table already has an unenforced primary key (changing the primary key
/// is not supported).
pub(super) async fn set_unenforced_primary_key(
table: &NativeTable,
columns: &[&str],
) -> Result<()> {
table.dataset.ensure_mutable()?;
if columns.is_empty() {
return Err(Error::InvalidInput {
message: "set_unenforced_primary_key: a column is required".into(),
});
}
if columns.len() > 1 {
return Err(Error::InvalidInput {
message: format!(
"set_unenforced_primary_key: compound primary keys are not supported, got {} columns",
columns.len()
),
});
}
let column = columns[0];
let updates = {
let dataset = table.dataset.get().await?;
let schema = dataset.schema();
// The primary key is immutable once set. The Lance commit layer is the
// source of truth for this (it also covers the concurrent-writer race);
// this check just fails fast with a clear message.
if !schema.unenforced_primary_key().is_empty() {
return Err(Error::InvalidInput {
message: "set_unenforced_primary_key: an unenforced primary key is already set on this table; changing it is not supported".into(),
});
}
let field = schema.field(column).ok_or_else(|| Error::InvalidInput {
message: format!(
"set_unenforced_primary_key: column '{}' not found on table",
column
),
})?;
if !is_supported_pk_dtype(&field.data_type()) {
return Err(Error::InvalidInput {
message: format!(
"set_unenforced_primary_key: column '{}' has dtype {:?} which is not supported as a primary key. Supported: Int32, Int64, Utf8, LargeUtf8, Binary, LargeBinary, FixedSizeBinary",
column,
field.data_type()
),
});
}
// Position metadata is 1-indexed; `Schema::unenforced_primary_key`
// treats position 0 as a legacy "no specific position" fallback.
let mut metadata = field.metadata.clone();
metadata.remove(LANCE_UNENFORCED_PRIMARY_KEY);
metadata.insert(
LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_string(),
"1".to_string(),
);
vec![(field_id_to_u32(field.id, &field.name)?, metadata)]
};
let mut dataset = (*table.dataset.get().await?).clone();
dataset.replace_field_metadata(updates).await?;
table.dataset.update(dataset);
Ok(())
}
fn field_id_to_u32(id: i32, name: &str) -> Result<u32> {
u32::try_from(id).map_err(|_| Error::Runtime {
message: format!(
"internal: field '{}' has unexpected negative field id {}",
name, id
),
})
}
fn is_supported_pk_dtype(dtype: &DataType) -> bool {
matches!(
dtype,
DataType::Int32
| DataType::Int64
| DataType::Utf8
| DataType::LargeUtf8
| DataType::Binary
| DataType::LargeBinary
| DataType::FixedSizeBinary(_)
)
}

View File

@@ -242,10 +242,6 @@ pub async fn create_plan(
scanner.disable_scoring_autoprojection();
}
if let Some(order_by) = &query.base.order_by {
scanner.order_by(Some(order_by.clone()))?;
}
Ok(scanner.create_plan().await?)
}