fix: remove primary key constraint from MemWAL bucket sharding

Lance v7.0.0-rc.1 intentionally removed the requirement that the bucket_sharding column match the unenforced primary key column, and no longer requires the table to have an unenforced primary key at all. Update LanceDB to match: drop the PK-related doc comments and the test assertions that expected rejection when no PK is set or when the bucket column differs from the PK. The Rust changes are taken from #3435; this commit additionally applies the equivalent updates to the Python and TypeScript bindings. See https://github.com/lance-format/lance/issues/6917 for the upstream discussion. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 15:00:39 +00:00 · 2026-05-22 14:18:58 -07:00
parent 02b112e931
commit a69dd0f62b
5 changed files with 10 additions and 64 deletions
--- a/nodejs/lancedb/table.ts
+++ b/nodejs/lancedb/table.ts
@@ -537,19 +537,14 @@ export abstract class Table {
   *
   * `LsmWriteSpec` chooses one of three sharding strategies via `specType`:
   *
-   * - `"bucket"` — hash-bucket writes by the single-column unenforced primary
-   *   key (`column` and `numBuckets` required).
+   * - `"bucket"` — hash-bucket writes by a scalar `column` (`column` and
+   *   `numBuckets` required).
   * - `"identity"` — shard by the raw value of a scalar `column`.
   * - `"unsharded"` — route every write to a single shard.
-   *
-   * All variants require the table to have an unenforced primary key
-   * ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally
-   * requires it to be the single column being bucketed.
   * @param {LsmWriteSpec} spec The sharding spec to install.
   * @returns {Promise<void>}
   * @example
   * ```ts
-   * await table.setUnenforcedPrimaryKey("id");
   * await table.setLsmWriteSpec({
   *   specType: "bucket",
   *   column: "id",
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -3875,15 +3875,11 @@ class AsyncTable:
        strategies:

        - ``LsmWriteSpec.bucket(column, num_buckets)`` — hash-bucket writes by
-          the single-column unenforced primary key.
+          a scalar column.
        - ``LsmWriteSpec.identity(column)`` — shard by the raw value of a
          scalar column.
        - ``LsmWriteSpec.unsharded()`` — route every write to a single shard.

-        All variants require the table to have an unenforced primary key set
-        via [`set_unenforced_primary_key`]; bucket sharding additionally
-        requires it to be the single column being bucketed.
-
        Parameters
        ----------
        spec : LsmWriteSpec
@@ -3892,7 +3888,6 @@ class AsyncTable:
        Examples
        --------
        >>> from lancedb._lancedb import LsmWriteSpec
-        >>> # table.set_unenforced_primary_key("id")
        >>> # table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 16))
        """
        await self._inner.set_lsm_write_spec(spec)
--- a/python/python/tests/test_lsm_write_spec.py
+++ b/python/python/tests/test_lsm_write_spec.py
@@ -40,16 +40,6 @@ def _make_table(tmp_path):
 def test_set_lsm_write_spec_validates(tmp_path):
    _db, table = _make_table(tmp_path)

-    # No PK set yet.
-    with pytest.raises(Exception, match="primary key"):
-        table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
-
-    table.set_unenforced_primary_key("id")
-
-    # Column mismatch.
-    with pytest.raises(Exception, match="match"):
-        table.set_lsm_write_spec(LsmWriteSpec.bucket("v", 4))
-
    # Out-of-range num_buckets.
    with pytest.raises(Exception, match="num_buckets"):
        table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 0))
@@ -70,7 +60,6 @@ def test_unset_lsm_write_spec(tmp_path):
        table.unset_lsm_write_spec()

    # Install a spec, then remove it; afterwards a fresh spec can be set.
-    table.set_unenforced_primary_key("id")
    table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
    table.unset_lsm_write_spec()
    # A second unset errors — there is no spec left to remove.
@@ -81,9 +70,7 @@ def test_unset_lsm_write_spec(tmp_path):

 def test_set_unsharded_spec(tmp_path):
    _db, table = _make_table(tmp_path)
-    # Lance MemWAL still requires a primary key on the dataset; Unsharded
-    # just skips per-row hashing.
-    table.set_unenforced_primary_key("id")
+    # Unsharded routes every write to a single shard, skipping per-row hashing.
    table.set_lsm_write_spec(LsmWriteSpec.unsharded())
    table.unset_lsm_write_spec()

@@ -120,7 +107,6 @@ async def test_async_set_unset_lsm_write_spec(tmp_path):
        pa.RecordBatchReader.from_batches(SCHEMA, [_batch(["seed"], [0])]),
    )

-    await table.set_unenforced_primary_key("id")
    await table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
    await table.unset_lsm_write_spec()
    # A second unset errors.
@@ -130,9 +116,7 @@ async def test_async_set_unset_lsm_write_spec(tmp_path):

 def test_set_identity_spec(tmp_path):
    _db, table = _make_table(tmp_path)
-    # Identity sharding still requires an unenforced primary key on the
-    # table; it shards by the raw value of the given column.
-    table.set_unenforced_primary_key("id")
+    # Identity sharding shards by the raw value of the given column.
    table.set_lsm_write_spec(LsmWriteSpec.identity("v"))
    table.unset_lsm_write_spec()

--- a/python/src/table.rs
+++ b/python/src/table.rs
@@ -185,7 +185,7 @@ pub struct LsmWriteSpec {

 #[pymethods]
 impl LsmWriteSpec {
-    /// Hash-bucket sharding by the unenforced primary key column.
+    /// Hash-bucket sharding by a scalar column.
    #[staticmethod]
    pub fn bucket(column: String, num_buckets: u32) -> Self {
        Self {
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -282,17 +282,15 @@ pub use self::merge::MergeResult;
 /// date) and [`LsmWriteSpec::with_writer_config_defaults`] (default
 /// `ShardWriter` configuration recorded in the MemWAL index).
 ///
-/// All variants require the table to have an unenforced primary key.
-///
 /// Install a spec with [`Table::set_lsm_write_spec`] and remove it with
 /// [`Table::unset_lsm_write_spec`]. The actual `merge_insert` dispatch
 /// onto the MemWAL writer is a follow-up.
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub enum LsmWriteSpec {
-    /// Hash-bucket sharding by the unenforced primary key column.
+    /// Hash-bucket sharding by a scalar column.
    ///
-    /// `column` must equal the table's currently-set single-column
-    /// unenforced primary key. `num_buckets` must be in `[1, 1024]`.
+    /// `column` must be a non-nested column with a supported scalar type.
+    /// `num_buckets` must be in `[1, 1024]`.
    /// Iceberg-compatible Murmur3-x86-32 (seed 0) is used so each row's
    /// `bucket(column, num_buckets)` value is stable across processes.
    Bucket {
@@ -1298,21 +1296,15 @@ impl Table {
    ///
    /// [`LsmWriteSpec`] chooses one of three sharding strategies:
    ///
-    /// - [`LsmWriteSpec::bucket`] — hash-bucket writes by the single-column
-    ///   unenforced primary key.
+    /// - [`LsmWriteSpec::bucket`] — hash-bucket writes by a scalar column.
    /// - [`LsmWriteSpec::identity`] — shard by the raw value of a scalar column.
    /// - [`LsmWriteSpec::unsharded`] — route every write to a single shard.
    ///
-    /// All variants require the table to have an unenforced primary key
-    /// ([`Table::set_unenforced_primary_key`]); bucket sharding additionally
-    /// requires it to be the single column being bucketed.
-    ///
    /// # Example
    ///
    /// ```
    /// # use lancedb::table::{LsmWriteSpec, Table};
    /// # async fn example(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
-    /// table.set_unenforced_primary_key(["id"]).await?;
    /// table
    ///     .set_lsm_write_spec(
    ///         LsmWriteSpec::bucket("id", 16).with_maintained_indexes(["id_idx"]),
@@ -4600,21 +4592,6 @@ mod tests {
            .unwrap();
        let table = conn.create_table("t", reader).execute().await.unwrap();

-        // Reject when no PK is set.
-        let err = table
-            .set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
-            .await
-            .expect_err("should reject without PK");
-        assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
-
-        // Set PK, then a mismatched column on the spec must be rejected.
-        table.set_unenforced_primary_key(["id"]).await.unwrap();
-        let err = table
-            .set_lsm_write_spec(LsmWriteSpec::bucket("name", 4))
-            .await
-            .expect_err("should reject column != PK");
-        assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
-
        // Reject num_buckets out of range.
        for bad in [0u32, 1025] {
            let err = table
@@ -4680,9 +4657,6 @@ mod tests {
            .unwrap();
        let table = conn.create_table("t", reader).execute().await.unwrap();

-        // Lance's MemWAL still requires *some* unenforced primary key on
-        // the dataset; Unsharded just skips the per-row hashing step.
-        table.set_unenforced_primary_key(["id"]).await.unwrap();
        table
            .set_lsm_write_spec(LsmWriteSpec::unsharded())
            .await
@@ -4729,7 +4703,6 @@ mod tests {
            .unwrap();
        let table = conn.create_table("t", reader).execute().await.unwrap();

-        table.set_unenforced_primary_key(["id"]).await.unwrap();
        table
            .set_lsm_write_spec(
                LsmWriteSpec::identity("region")
@@ -4785,7 +4758,6 @@ mod tests {
        table.unset_lsm_write_spec().await.unwrap_err();

        // Install a spec, then unset it.
-        table.set_unenforced_primary_key(["id"]).await.unwrap();
        table
            .set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
            .await