diff --git a/nodejs/lancedb/table.ts b/nodejs/lancedb/table.ts index fe495392a..0264ba321 100644 --- a/nodejs/lancedb/table.ts +++ b/nodejs/lancedb/table.ts @@ -537,19 +537,14 @@ export abstract class Table { * * `LsmWriteSpec` chooses one of three sharding strategies via `specType`: * - * - `"bucket"` — hash-bucket writes by the single-column unenforced primary - * key (`column` and `numBuckets` required). + * - `"bucket"` — hash-bucket writes by a scalar `column` (`column` and + * `numBuckets` required). * - `"identity"` — shard by the raw value of a scalar `column`. * - `"unsharded"` — route every write to a single shard. - * - * All variants require the table to have an unenforced primary key - * ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally - * requires it to be the single column being bucketed. * @param {LsmWriteSpec} spec The sharding spec to install. * @returns {Promise} * @example * ```ts - * await table.setUnenforcedPrimaryKey("id"); * await table.setLsmWriteSpec({ * specType: "bucket", * column: "id", diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 3a9ae0801..937c1df40 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -3875,15 +3875,11 @@ class AsyncTable: strategies: - ``LsmWriteSpec.bucket(column, num_buckets)`` — hash-bucket writes by - the single-column unenforced primary key. + a scalar column. - ``LsmWriteSpec.identity(column)`` — shard by the raw value of a scalar column. - ``LsmWriteSpec.unsharded()`` — route every write to a single shard. - All variants require the table to have an unenforced primary key set - via [`set_unenforced_primary_key`]; bucket sharding additionally - requires it to be the single column being bucketed. 
- Parameters ---------- spec : LsmWriteSpec @@ -3892,7 +3888,6 @@ class AsyncTable: Examples -------- >>> from lancedb._lancedb import LsmWriteSpec - >>> # table.set_unenforced_primary_key("id") >>> # table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 16)) """ await self._inner.set_lsm_write_spec(spec) diff --git a/python/python/tests/test_lsm_write_spec.py b/python/python/tests/test_lsm_write_spec.py index b81153994..e867ea23c 100644 --- a/python/python/tests/test_lsm_write_spec.py +++ b/python/python/tests/test_lsm_write_spec.py @@ -40,16 +40,6 @@ def _make_table(tmp_path): def test_set_lsm_write_spec_validates(tmp_path): _db, table = _make_table(tmp_path) - # No PK set yet. - with pytest.raises(Exception, match="primary key"): - table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4)) - - table.set_unenforced_primary_key("id") - - # Column mismatch. - with pytest.raises(Exception, match="match"): - table.set_lsm_write_spec(LsmWriteSpec.bucket("v", 4)) - # Out-of-range num_buckets. with pytest.raises(Exception, match="num_buckets"): table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 0)) @@ -70,7 +60,6 @@ def test_unset_lsm_write_spec(tmp_path): table.unset_lsm_write_spec() # Install a spec, then remove it; afterwards a fresh spec can be set. - table.set_unenforced_primary_key("id") table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4)) table.unset_lsm_write_spec() # A second unset errors — there is no spec left to remove. @@ -81,9 +70,7 @@ def test_unset_lsm_write_spec(tmp_path): def test_set_unsharded_spec(tmp_path): _db, table = _make_table(tmp_path) - # Lance MemWAL still requires a primary key on the dataset; Unsharded - # just skips per-row hashing. - table.set_unenforced_primary_key("id") + # Unsharded routes every write to a single shard, skipping per-row hashing. 
table.set_lsm_write_spec(LsmWriteSpec.unsharded()) table.unset_lsm_write_spec() @@ -120,7 +107,6 @@ async def test_async_set_unset_lsm_write_spec(tmp_path): pa.RecordBatchReader.from_batches(SCHEMA, [_batch(["seed"], [0])]), ) - await table.set_unenforced_primary_key("id") await table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4)) await table.unset_lsm_write_spec() # A second unset errors. @@ -130,9 +116,7 @@ async def test_async_set_unset_lsm_write_spec(tmp_path): def test_set_identity_spec(tmp_path): _db, table = _make_table(tmp_path) - # Identity sharding still requires an unenforced primary key on the - # table; it shards by the raw value of the given column. - table.set_unenforced_primary_key("id") + # Identity sharding shards by the raw value of the given column. table.set_lsm_write_spec(LsmWriteSpec.identity("v")) table.unset_lsm_write_spec() diff --git a/python/src/table.rs b/python/src/table.rs index 546bec555..10acc9c04 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -185,7 +185,7 @@ pub struct LsmWriteSpec { #[pymethods] impl LsmWriteSpec { - /// Hash-bucket sharding by the unenforced primary key column. + /// Hash-bucket sharding by a scalar column. #[staticmethod] pub fn bucket(column: String, num_buckets: u32) -> Self { Self { diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 03f967e6e..8a4ff9d7b 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -282,17 +282,15 @@ pub use self::merge::MergeResult; /// date) and [`LsmWriteSpec::with_writer_config_defaults`] (default /// `ShardWriter` configuration recorded in the MemWAL index). /// -/// All variants require the table to have an unenforced primary key. -/// /// Install a spec with [`Table::set_lsm_write_spec`] and remove it with /// [`Table::unset_lsm_write_spec`]. The actual `merge_insert` dispatch /// onto the MemWAL writer is a follow-up. 
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum LsmWriteSpec { - /// Hash-bucket sharding by the unenforced primary key column. + /// Hash-bucket sharding by a scalar column. /// - /// `column` must equal the table's currently-set single-column - /// unenforced primary key. `num_buckets` must be in `[1, 1024]`. + /// `column` must be a non-nested column with a supported scalar type. + /// `num_buckets` must be in `[1, 1024]`. /// Iceberg-compatible Murmur3-x86-32 (seed 0) is used so each row's /// `bucket(column, num_buckets)` value is stable across processes. Bucket { @@ -1298,21 +1296,15 @@ impl Table { /// /// [`LsmWriteSpec`] chooses one of three sharding strategies: /// - /// - [`LsmWriteSpec::bucket`] — hash-bucket writes by the single-column - /// unenforced primary key. + /// - [`LsmWriteSpec::bucket`] — hash-bucket writes by a scalar column. /// - [`LsmWriteSpec::identity`] — shard by the raw value of a scalar column. /// - [`LsmWriteSpec::unsharded`] — route every write to a single shard. /// - /// All variants require the table to have an unenforced primary key - /// ([`Table::set_unenforced_primary_key`]); bucket sharding additionally - /// requires it to be the single column being bucketed. - /// /// # Example /// /// ``` /// # use lancedb::table::{LsmWriteSpec, Table}; /// # async fn example(table: &Table) -> Result<(), Box<dyn std::error::Error>> { - /// table.set_unenforced_primary_key(["id"]).await?; /// table /// .set_lsm_write_spec( /// LsmWriteSpec::bucket("id", 16).with_maintained_indexes(["id_idx"]), @@ -4600,21 +4592,6 @@ mod tests { .unwrap(); let table = conn.create_table("t", reader).execute().await.unwrap(); - // Reject when no PK is set. - let err = table - .set_lsm_write_spec(LsmWriteSpec::bucket("id", 4)) - .await - .expect_err("should reject without PK"); - assert!(matches!(err, Error::Lance { .. }), "got {:?}", err); - - // Set PK, then a mismatched column on the spec must be rejected. 
- table.set_unenforced_primary_key(["id"]).await.unwrap(); - let err = table - .set_lsm_write_spec(LsmWriteSpec::bucket("name", 4)) - .await - .expect_err("should reject column != PK"); - assert!(matches!(err, Error::Lance { .. }), "got {:?}", err); - // Reject num_buckets out of range. for bad in [0u32, 1025] { let err = table @@ -4680,9 +4657,6 @@ mod tests { .unwrap(); let table = conn.create_table("t", reader).execute().await.unwrap(); - // Lance's MemWAL still requires *some* unenforced primary key on - // the dataset; Unsharded just skips the per-row hashing step. - table.set_unenforced_primary_key(["id"]).await.unwrap(); table .set_lsm_write_spec(LsmWriteSpec::unsharded()) .await @@ -4729,7 +4703,6 @@ mod tests { .unwrap(); let table = conn.create_table("t", reader).execute().await.unwrap(); - table.set_unenforced_primary_key(["id"]).await.unwrap(); table .set_lsm_write_spec( LsmWriteSpec::identity("region") @@ -4785,7 +4758,6 @@ mod tests { table.unset_lsm_write_spec().await.unwrap_err(); // Install a spec, then unset it. - table.set_unenforced_primary_key(["id"]).await.unwrap(); table .set_lsm_write_spec(LsmWriteSpec::bucket("id", 4)) .await