Compare commits

...

4 Commits

Author SHA1 Message Date
Will Jones
e244d753e5 docs(nodejs): regenerate Table.md for setLsmWriteSpec
JSDoc was updated in the previous commit; regenerate the markdown to match.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 15:00:53 -07:00
Will Jones
a69dd0f62b fix: remove primary key constraint from MemWAL bucket sharding
Lance v7.0.0-rc.1 intentionally removed the requirement for
bucket_sharding to match (or even require) the unenforced primary key
column. Update LanceDB to match: drop the PK-related doc comments and
the test assertions that expected rejection when no PK is set or when
the bucket column differs from the PK.

The Rust changes are taken from #3435; this commit additionally applies
the equivalent updates to the Python and TypeScript bindings.

See https://github.com/lance-format/lance/issues/6917

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 14:18:58 -07:00
Will Jones
02b112e931 test: ignore latest_version_hint.json in manifest dir assertions
Lance v7.0.0-rc.1 writes _versions/latest_version_hint.json on
non-lexically-ordered stores (e.g. the local filesystem) to speed up
latest-version lookup. The V2-manifest-path tests iterate the whole
_versions directory and asserted every entry matches the manifest
filename pattern; filter to *.manifest files so the hint file is ignored.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 12:54:31 -07:00
lancedb automation
05504f280a chore: update lance dependency to v7.0.0-rc.1 2026-05-21 22:55:23 +00:00
11 changed files with 86 additions and 129 deletions

74
Cargo.lock generated
View File

@@ -3284,8 +3284,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow-array",
"rand 0.9.4",
@@ -4506,8 +4506,8 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
[[package]]
name = "lance"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arc-swap",
"arrow",
@@ -4525,6 +4525,7 @@ dependencies = [
"async_cell",
"aws-credential-types",
"aws-sdk-dynamodb",
"bitpacking",
"byteorder",
"bytes",
"chrono",
@@ -4554,6 +4555,7 @@ dependencies = [
"lance-table",
"lance-tokenizer",
"log",
"moka",
"object_store",
"permutation",
"pin-project",
@@ -4577,8 +4579,8 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4598,8 +4600,8 @@ dependencies = [
[[package]]
name = "lance-bitpacking"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrayref",
"paste",
@@ -4608,8 +4610,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4644,8 +4646,8 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow",
"arrow-array",
@@ -4675,8 +4677,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow",
"arrow-array",
@@ -4694,8 +4696,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4730,8 +4732,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4762,8 +4764,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arc-swap",
"arrow",
@@ -4827,8 +4829,8 @@ dependencies = [
[[package]]
name = "lance-io"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow",
"arrow-arith",
@@ -4870,8 +4872,8 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4887,8 +4889,8 @@ dependencies = [
[[package]]
name = "lance-namespace"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow",
"async-trait",
@@ -4900,8 +4902,8 @@ dependencies = [
[[package]]
name = "lance-namespace-impls"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow",
"arrow-ipc",
@@ -4936,9 +4938,9 @@ dependencies = [
[[package]]
name = "lance-namespace-reqwest-client"
version = "0.7.6"
version = "0.7.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f65e31bdaa13e01dab6e7cf566da31df243c34a542f0d915d3601ec0e01e61d2"
checksum = "6369eee4682fb11edf538388b43c61ce288b8302fe89bb40944d7daa7faaae99"
dependencies = [
"reqwest 0.12.28",
"serde",
@@ -4950,8 +4952,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow",
"arrow-array",
@@ -4990,8 +4992,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -5002,8 +5004,8 @@ dependencies = [
[[package]]
name = "lance-tokenizer"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
dependencies = [
"jieba-rs",
"lindera",

View File

@@ -13,20 +13,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false }

View File

@@ -701,15 +701,11 @@ LSM-style write path for future `mergeInsert` calls.
`LsmWriteSpec` chooses one of three sharding strategies via `specType`:
- `"bucket"` — hash-bucket writes by the single-column unenforced primary
key (`column` and `numBuckets` required).
- `"bucket"` — hash-bucket writes by a scalar `column` (`column` and
`numBuckets` required).
- `"identity"` — shard by the raw value of a scalar `column`.
- `"unsharded"` — route every write to a single shard.
All variants require the table to have an unenforced primary key
([Table#setUnenforcedPrimaryKey](Table.md#setunenforcedprimarykey)); bucket sharding additionally
requires it to be the single column being bucketed.
#### Parameters
* **spec**: [`LsmWriteSpec`](../interfaces/LsmWriteSpec.md)
@@ -722,7 +718,6 @@ requires it to be the single column being bucketed.
#### Example
```ts
await table.setUnenforcedPrimaryKey("id");
await table.setLsmWriteSpec({
specType: "bucket",
column: "id",

View File

@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>7.0.0-beta.13</lance-core.version>
<lance-core.version>7.0.0-rc.1</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -7,6 +7,13 @@ import * as tmp from "tmp";
import { Connection, Table, connect, connectNamespace } from "../lancedb";
import { LocalTable } from "../lancedb/table";
// The `_versions` directory may also contain `latest_version_hint.json`, which
// Lance writes on non-lexically-ordered stores (e.g. the local filesystem) to
// speed up latest-version lookup. Only assert on manifest files.
function manifestFiles(versionsDir: string): string[] {
return readdirSync(versionsDir).filter((f) => f.endsWith(".manifest"));
}
describe("when connecting", () => {
let tmpDir: tmp.DirResult;
beforeEach(() => {
@@ -171,7 +178,7 @@ describe("given a connection", () => {
let manifestDir =
tmpDir.name + "/test_manifest_paths_v2_empty.lance/_versions";
readdirSync(manifestDir).forEach((file) => {
manifestFiles(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
@@ -180,7 +187,7 @@ describe("given a connection", () => {
})) as LocalTable;
expect(await table.usesV2ManifestPaths()).toBe(true);
manifestDir = tmpDir.name + "/test_manifest_paths_v2.lance/_versions";
readdirSync(manifestDir).forEach((file) => {
manifestFiles(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
});
@@ -199,14 +206,14 @@ describe("given a connection", () => {
const manifestDir =
tmpDir.name + "/test_manifest_path_migration.lance/_versions";
readdirSync(manifestDir).forEach((file) => {
manifestFiles(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d\.manifest$/);
});
await table.migrateManifestPathsV2();
expect(await table.usesV2ManifestPaths()).toBe(true);
readdirSync(manifestDir).forEach((file) => {
manifestFiles(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
});

View File

@@ -537,19 +537,14 @@ export abstract class Table {
*
* `LsmWriteSpec` chooses one of three sharding strategies via `specType`:
*
* - `"bucket"` — hash-bucket writes by the single-column unenforced primary
* key (`column` and `numBuckets` required).
* - `"bucket"` — hash-bucket writes by a scalar `column` (`column` and
* `numBuckets` required).
* - `"identity"` — shard by the raw value of a scalar `column`.
* - `"unsharded"` — route every write to a single shard.
*
* All variants require the table to have an unenforced primary key
* ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally
* requires it to be the single column being bucketed.
* @param {LsmWriteSpec} spec The sharding spec to install.
* @returns {Promise<void>}
* @example
* ```ts
* await table.setUnenforcedPrimaryKey("id");
* await table.setLsmWriteSpec({
* specType: "bucket",
* column: "id",

View File

@@ -3875,15 +3875,11 @@ class AsyncTable:
strategies:
- ``LsmWriteSpec.bucket(column, num_buckets)`` — hash-bucket writes by
the single-column unenforced primary key.
a scalar column.
- ``LsmWriteSpec.identity(column)`` — shard by the raw value of a
scalar column.
- ``LsmWriteSpec.unsharded()`` — route every write to a single shard.
All variants require the table to have an unenforced primary key set
via [`set_unenforced_primary_key`]; bucket sharding additionally
requires it to be the single column being bucketed.
Parameters
----------
spec : LsmWriteSpec
@@ -3892,7 +3888,6 @@ class AsyncTable:
Examples
--------
>>> from lancedb._lancedb import LsmWriteSpec
>>> # table.set_unenforced_primary_key("id")
>>> # table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 16))
"""
await self._inner.set_lsm_write_spec(spec)

View File

@@ -450,6 +450,13 @@ async def test_create_exist_ok_async(tmp_db_async: lancedb.AsyncConnection):
# await db.create_table("test", schema=bad_schema, exist_ok=True)
def _manifest_files(versions_dir):
# The `_versions` directory may also contain `latest_version_hint.json`,
# which Lance writes on non-lexically-ordered stores (e.g. the local
# filesystem) to speed up latest-version lookup. Only assert on manifests.
return [f for f in os.listdir(versions_dir) if f.endswith(".manifest")]
@pytest.mark.asyncio
async def test_create_table_v2_manifest_paths_async(tmp_path):
db_with_v2_paths = await lancedb.connect_async(
@@ -465,7 +472,7 @@ async def test_create_table_v2_manifest_paths_async(tmp_path):
)
assert await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_manifest_paths.lance" / "_versions"
for manifest in os.listdir(manifests_dir):
for manifest in _manifest_files(manifests_dir):
assert re.match(r"\d{20}\.manifest", manifest)
# Start a table in V1 mode then migrate
@@ -475,13 +482,13 @@ async def test_create_table_v2_manifest_paths_async(tmp_path):
)
assert not await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_migration.lance" / "_versions"
for manifest in os.listdir(manifests_dir):
for manifest in _manifest_files(manifests_dir):
assert re.match(r"\d\.manifest", manifest)
await tbl.migrate_manifest_paths_v2()
assert await tbl.uses_v2_manifest_paths()
for manifest in os.listdir(manifests_dir):
for manifest in _manifest_files(manifests_dir):
assert re.match(r"\d{20}\.manifest", manifest)

View File

@@ -40,16 +40,6 @@ def _make_table(tmp_path):
def test_set_lsm_write_spec_validates(tmp_path):
_db, table = _make_table(tmp_path)
# No PK set yet.
with pytest.raises(Exception, match="primary key"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
table.set_unenforced_primary_key("id")
# Column mismatch.
with pytest.raises(Exception, match="match"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("v", 4))
# Out-of-range num_buckets.
with pytest.raises(Exception, match="num_buckets"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 0))
@@ -70,7 +60,6 @@ def test_unset_lsm_write_spec(tmp_path):
table.unset_lsm_write_spec()
# Install a spec, then remove it; afterwards a fresh spec can be set.
table.set_unenforced_primary_key("id")
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
table.unset_lsm_write_spec()
# A second unset errors — there is no spec left to remove.
@@ -81,9 +70,7 @@ def test_unset_lsm_write_spec(tmp_path):
def test_set_unsharded_spec(tmp_path):
_db, table = _make_table(tmp_path)
# Lance MemWAL still requires a primary key on the dataset; Unsharded
# just skips per-row hashing.
table.set_unenforced_primary_key("id")
# Unsharded routes every write to a single shard, skipping per-row hashing.
table.set_lsm_write_spec(LsmWriteSpec.unsharded())
table.unset_lsm_write_spec()
@@ -120,7 +107,6 @@ async def test_async_set_unset_lsm_write_spec(tmp_path):
pa.RecordBatchReader.from_batches(SCHEMA, [_batch(["seed"], [0])]),
)
await table.set_unenforced_primary_key("id")
await table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
await table.unset_lsm_write_spec()
# A second unset errors.
@@ -130,9 +116,7 @@ async def test_async_set_unset_lsm_write_spec(tmp_path):
def test_set_identity_spec(tmp_path):
_db, table = _make_table(tmp_path)
# Identity sharding still requires an unenforced primary key on the
# table; it shards by the raw value of the given column.
table.set_unenforced_primary_key("id")
# Identity sharding shards by the raw value of the given column.
table.set_lsm_write_spec(LsmWriteSpec.identity("v"))
table.unset_lsm_write_spec()

View File

@@ -185,7 +185,7 @@ pub struct LsmWriteSpec {
#[pymethods]
impl LsmWriteSpec {
/// Hash-bucket sharding by the unenforced primary key column.
/// Hash-bucket sharding by a scalar column.
#[staticmethod]
pub fn bucket(column: String, num_buckets: u32) -> Self {
Self {

View File

@@ -282,17 +282,15 @@ pub use self::merge::MergeResult;
/// date) and [`LsmWriteSpec::with_writer_config_defaults`] (default
/// `ShardWriter` configuration recorded in the MemWAL index).
///
/// All variants require the table to have an unenforced primary key.
///
/// Install a spec with [`Table::set_lsm_write_spec`] and remove it with
/// [`Table::unset_lsm_write_spec`]. The actual `merge_insert` dispatch
/// onto the MemWAL writer is a follow-up.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LsmWriteSpec {
/// Hash-bucket sharding by the unenforced primary key column.
/// Hash-bucket sharding by a scalar column.
///
/// `column` must equal the table's currently-set single-column
/// unenforced primary key. `num_buckets` must be in `[1, 1024]`.
/// `column` must be a non-nested column with a supported scalar type.
/// `num_buckets` must be in `[1, 1024]`.
/// Iceberg-compatible Murmur3-x86-32 (seed 0) is used so each row's
/// `bucket(column, num_buckets)` value is stable across processes.
Bucket {
@@ -1298,21 +1296,15 @@ impl Table {
///
/// [`LsmWriteSpec`] chooses one of three sharding strategies:
///
/// - [`LsmWriteSpec::bucket`] — hash-bucket writes by the single-column
/// unenforced primary key.
/// - [`LsmWriteSpec::bucket`] — hash-bucket writes by a scalar column.
/// - [`LsmWriteSpec::identity`] — shard by the raw value of a scalar column.
/// - [`LsmWriteSpec::unsharded`] — route every write to a single shard.
///
/// All variants require the table to have an unenforced primary key
/// ([`Table::set_unenforced_primary_key`]); bucket sharding additionally
/// requires it to be the single column being bucketed.
///
/// # Example
///
/// ```
/// # use lancedb::table::{LsmWriteSpec, Table};
/// # async fn example(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
/// table.set_unenforced_primary_key(["id"]).await?;
/// table
/// .set_lsm_write_spec(
/// LsmWriteSpec::bucket("id", 16).with_maintained_indexes(["id_idx"]),
@@ -4600,21 +4592,6 @@ mod tests {
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// Reject when no PK is set.
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
.await
.expect_err("should reject without PK");
assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
// Set PK, then a mismatched column on the spec must be rejected.
table.set_unenforced_primary_key(["id"]).await.unwrap();
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("name", 4))
.await
.expect_err("should reject column != PK");
assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
// Reject num_buckets out of range.
for bad in [0u32, 1025] {
let err = table
@@ -4680,9 +4657,6 @@ mod tests {
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// Lance's MemWAL still requires *some* unenforced primary key on
// the dataset; Unsharded just skips the per-row hashing step.
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(LsmWriteSpec::unsharded())
.await
@@ -4729,7 +4703,6 @@ mod tests {
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(
LsmWriteSpec::identity("region")
@@ -4785,7 +4758,6 @@ mod tests {
table.unset_lsm_write_spec().await.unwrap_err();
// Install a spec, then unset it.
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
.await