feat: support setting LSM write spec for a table (#3396)

## Summary

Split out from #3354.

Adds `LsmWriteSpec` and `Table::set_lsm_write_spec` / `unset_lsm_write_spec` to install and clear the spec that selects Lance's MemWAL LSM-style write path for `merge_insert`.

`LsmWriteSpec` offers three sharding strategies, all built on Lance's `InitializeMemWalBuilder`:

- `LsmWriteSpec::bucket(column, num_buckets)` — hash-bucket sharding by the single-column unenforced primary key.
- `LsmWriteSpec::identity(column)` — identity sharding by the raw value of a scalar column.
- `LsmWriteSpec::unsharded()` — a single MemWAL shard.

Each can be refined with `with_maintained_indexes(...)` (indexes the MemWAL keeps up to date as rows are appended) and `with_writer_config_defaults(...)` (default `ShardWriter` configuration recorded in the MemWAL index, so every writer starts from the same defaults). All variants require the table to have an unenforced primary key.

- `set_lsm_write_spec` installs the spec by initializing the MemWAL index; `unset_lsm_write_spec` removes it (dropping the MemWAL index), reverting to the standard `merge_insert` path. `unset` returns an error if no spec is currently set.
- Bindings: Python (`LsmWriteSpec.bucket` / `.identity` / `.unsharded`, `set_lsm_write_spec` / `unset_lsm_write_spec`) and TypeScript (`setLsmWriteSpec` with `specType` `"bucket"` / `"identity"` / `"unsharded"`). `RemoteTable` returns `NotSupported`.

The actual `merge_insert` LSM dispatch and `ShardWriter` write path are a follow-up — this PR only installs and clears the spec.
2026-07-04 11:30:46 +00:00 · 2026-05-18 00:11:33 -07:00
parent 6a431ff0a0
commit 0d30b31998
19 changed files with 1386 additions and 50 deletions
--- a/python/python/lancedb/_lancedb.pyi
+++ b/python/python/lancedb/_lancedb.pyi
@@ -218,6 +218,8 @@ class Table:
    async def initial_storage_options(self) -> Optional[Dict[str, str]]: ...
    async def latest_storage_options(self) -> Optional[Dict[str, str]]: ...
    async def set_unenforced_primary_key(self, columns: List[str]) -> None: ...
+    # Install an LsmWriteSpec on the table.
+    async def set_lsm_write_spec(self, spec: LsmWriteSpec) -> None: ...
+    # Remove the LsmWriteSpec from the table.
+    async def unset_lsm_write_spec(self) -> None: ...
    @property
    def tags(self) -> Tags: ...
    def query(self) -> Query: ...
@@ -419,6 +421,37 @@ class MergeResult:
    num_deleted_rows: int
    num_attempts: int

+class LsmWriteSpec:
+    """Specification selecting Lance's MemWAL LSM-style write path for
+    `merge_insert`.
+
+    Build one with `bucket`, `identity`, or `unsharded`, then optionally
+    refine it with `with_maintained_indexes` and
+    `with_writer_config_defaults`, each of which returns a copy.
+    """
+
+    @staticmethod
+    def bucket(column: str, num_buckets: int) -> "LsmWriteSpec":
+        """Hash-bucket sharding by `column` into `num_buckets` buckets."""
+        ...
+    @staticmethod
+    def identity(column: str) -> "LsmWriteSpec":
+        """Identity sharding by the raw value of `column`."""
+        ...
+    @staticmethod
+    def unsharded() -> "LsmWriteSpec":
+        """No sharding: every write goes to a single MemWAL shard."""
+        ...
+    def with_maintained_indexes(self, indexes: List[str]) -> "LsmWriteSpec":
+        """Return a copy of this spec asking the MemWAL to keep the named
+        indexes up to date as rows are appended."""
+        ...
+    def with_writer_config_defaults(self, defaults: Dict[str, str]) -> "LsmWriteSpec":
+        """Return a copy of this spec recording the given default
+        `ShardWriter` configuration in the MemWAL index."""
+        ...
+    @property
+    def spec_type(self) -> str:
+        """One of 'bucket', 'identity', or 'unsharded'."""
+        ...
+    @property
+    def column(self) -> Optional[str]:
+        """Sharding column for 'bucket' and 'identity' specs; None for
+        'unsharded'."""
+        ...
+    @property
+    def num_buckets(self) -> Optional[int]:
+        """Number of buckets for 'bucket' specs; None otherwise."""
+        ...
+    @property
+    def maintained_indexes(self) -> List[str]:
+        """Names of the indexes the MemWAL should keep up to date."""
+        ...
+    @property
+    def writer_config_defaults(self) -> Dict[str, str]:
+        """Default `ShardWriter` configuration recorded by this spec."""
+        ...
+
 class AddColumnsResult:
    version: int

--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -14,6 +14,7 @@ from lancedb._lancedb import (
    DeleteResult,
    DropColumnsResult,
    IndexConfig,
+    LsmWriteSpec,
    MergeResult,
    UpdateResult,
 )
@@ -659,6 +660,14 @@ class RemoteTable(Table):
        """Not supported on LanceDB Cloud."""
        return LOOP.run(self._table.set_unenforced_primary_key(columns))

+    def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
+        """Not supported on LanceDB Cloud."""
+        return LOOP.run(self._table.set_lsm_write_spec(spec))
+
+    def unset_lsm_write_spec(self) -> None:
+        """Not supported on LanceDB Cloud."""
+        return LOOP.run(self._table.unset_lsm_write_spec())
+
    def drop_index(self, index_name: str):
        return LOOP.run(self._table.drop_index(index_name))

--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -154,6 +154,7 @@ if TYPE_CHECKING:
        AlterColumnsResult,
        DeleteResult,
        DropColumnsResult,
+        LsmWriteSpec,
        MergeResult,
        UpdateResult,
    )
@@ -3268,6 +3269,16 @@ class LanceTable(Table):
        [`AsyncTable.set_unenforced_primary_key`][lancedb.AsyncTable.set_unenforced_primary_key]."""
        return LOOP.run(self._table.set_unenforced_primary_key(columns))

+    def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
+        """Install an LsmWriteSpec. See
+        [`AsyncTable.set_lsm_write_spec`][lancedb.AsyncTable.set_lsm_write_spec]."""
+        return LOOP.run(self._table.set_lsm_write_spec(spec))
+
+    def unset_lsm_write_spec(self) -> None:
+        """Remove the LsmWriteSpec. See
+        [`AsyncTable.unset_lsm_write_spec`][lancedb.AsyncTable.unset_lsm_write_spec]."""
+        return LOOP.run(self._table.unset_lsm_write_spec())
+
    def uses_v2_manifest_paths(self) -> bool:
        """
        Check if the table is using the new v2 manifest paths.
@@ -3838,6 +3849,44 @@ class AsyncTable:
            columns = list(columns)
        await self._inner.set_unenforced_primary_key(columns)

+    async def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
+        """Install an LsmWriteSpec on this table.
+
+        The spec selects Lance's MemWAL LSM-style write path for future
+        `merge_insert` calls. ``LsmWriteSpec`` chooses one of three sharding
+        strategies:
+
+        - ``LsmWriteSpec.bucket(column, num_buckets)`` — hash-bucket writes by
+          the single-column unenforced primary key.
+        - ``LsmWriteSpec.identity(column)`` — shard by the raw value of a
+          scalar column.
+        - ``LsmWriteSpec.unsharded()`` — route every write to a single shard.
+
+        All variants require the table to have an unenforced primary key set
+        via
+        [`set_unenforced_primary_key`][lancedb.AsyncTable.set_unenforced_primary_key];
+        bucket sharding additionally requires it to be the single column
+        being bucketed.
+
+        Parameters
+        ----------
+        spec : LsmWriteSpec
+            The sharding spec to install.
+
+        Examples
+        --------
+        >>> from lancedb._lancedb import LsmWriteSpec
+        >>> # table.set_unenforced_primary_key("id")
+        >>> # table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 16))
+        """
+        await self._inner.set_lsm_write_spec(spec)
+
+    async def unset_lsm_write_spec(self) -> None:
+        """Remove the LsmWriteSpec from this table.
+
+        Reverts to the standard `merge_insert` write path. Errors if no spec
+        is currently set. After a successful removal a new spec can be
+        installed with `set_lsm_write_spec`.
+        """
+        await self._inner.unset_lsm_write_spec()
+
    @property
    def name(self) -> str:
        """The name of the table."""
--- a/python/python/tests/test_lsm_write_spec.py
+++ b/python/python/tests/test_lsm_write_spec.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+"""Tests for installing and clearing an LsmWriteSpec via
+`Table.set_lsm_write_spec` / `Table.unset_lsm_write_spec`.
+"""
+
+from datetime import timedelta
+
+import lancedb
+import pyarrow as pa
+import pytest
+from lancedb._lancedb import LsmWriteSpec
+
+SCHEMA = pa.schema(
+    [
+        pa.field("id", pa.utf8(), nullable=False),
+        pa.field("v", pa.int32(), nullable=False),
+    ]
+)
+
+
+def _batch(ids, vs):
+    return pa.RecordBatch.from_arrays(
+        [pa.array(ids, type=pa.utf8()), pa.array(vs, type=pa.int32())],
+        schema=SCHEMA,
+    )
+
+
+def _reader(ids, vs):
+    return pa.RecordBatchReader.from_batches(SCHEMA, [_batch(ids, vs)])
+
+
+def _make_table(tmp_path):
+    db = lancedb.connect(tmp_path, read_consistency_interval=timedelta(seconds=0))
+    table = db.create_table("t", _reader(["seed"], [0]))
+    return db, table
+
+
+def test_set_lsm_write_spec_validates(tmp_path):
+    """Invalid specs, and replacing an installed spec, are rejected."""
+    _db, table = _make_table(tmp_path)
+
+    # No PK set yet.
+    with pytest.raises(Exception, match="primary key"):
+        table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
+
+    table.set_unenforced_primary_key("id")
+
+    # Column mismatch.
+    with pytest.raises(Exception, match="match"):
+        table.set_lsm_write_spec(LsmWriteSpec.bucket("v", 4))
+
+    # Out-of-range num_buckets.
+    with pytest.raises(Exception, match="num_buckets"):
+        table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 0))
+    with pytest.raises(Exception, match="num_buckets"):
+        table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 1025))
+
+    # Happy path then mutation rejected.
+    table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
+    with pytest.raises(Exception, match="mutation"):
+        table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 8))
+
+
+def test_unset_lsm_write_spec(tmp_path):
+    """Unset errors without a spec, removes an installed one, and allows
+    a different spec to be installed afterwards."""
+    _db, table = _make_table(tmp_path)
+
+    # unset errors when no spec is set.
+    with pytest.raises(Exception, match="no LSM write spec"):
+        table.unset_lsm_write_spec()
+
+    # Install a spec, then remove it; afterwards a fresh spec can be set.
+    table.set_unenforced_primary_key("id")
+    table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
+    table.unset_lsm_write_spec()
+    # A second unset errors — there is no spec left to remove.
+    with pytest.raises(Exception, match="no LSM write spec"):
+        table.unset_lsm_write_spec()
+    table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 8))
+
+
+def test_set_unsharded_spec(tmp_path):
+    """An unsharded spec can be installed and removed once a PK is set."""
+    _db, table = _make_table(tmp_path)
+    # Lance MemWAL still requires a primary key on the dataset; Unsharded
+    # just skips per-row hashing.
+    table.set_unenforced_primary_key("id")
+    table.set_lsm_write_spec(LsmWriteSpec.unsharded())
+    table.unset_lsm_write_spec()
+
+
+def test_lsm_write_spec_repr():
+    """Getters and repr reflect the bucket and unsharded constructors."""
+    s = LsmWriteSpec.bucket("id", 4)
+    assert s.spec_type == "bucket"
+    assert s.column == "id"
+    assert s.num_buckets == 4
+    assert s.maintained_indexes == []
+    assert "bucket" in repr(s)
+    assert "id" in repr(s)
+    assert "4" in repr(s)
+
+    u = LsmWriteSpec.unsharded()
+    assert u.spec_type == "unsharded"
+    assert u.column is None
+    assert u.num_buckets is None
+    assert "unsharded" in repr(u)
+
+
+def test_lsm_write_spec_with_maintained_indexes():
+    """`with_maintained_indexes` stores the given names in order."""
+    s = LsmWriteSpec.bucket("id", 4).with_maintained_indexes(["idx_a", "idx_b"])
+    assert s.maintained_indexes == ["idx_a", "idx_b"]
+
+
+@pytest.mark.asyncio
+async def test_async_set_unset_lsm_write_spec(tmp_path):
+    """Async API: set then unset succeeds; a second unset errors."""
+    db = await lancedb.connect_async(
+        tmp_path, read_consistency_interval=timedelta(seconds=0)
+    )
+    table = await db.create_table(
+        "t",
+        pa.RecordBatchReader.from_batches(SCHEMA, [_batch(["seed"], [0])]),
+    )
+
+    await table.set_unenforced_primary_key("id")
+    await table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
+    await table.unset_lsm_write_spec()
+    # A second unset errors.
+    with pytest.raises(Exception, match="no LSM write spec"):
+        await table.unset_lsm_write_spec()
+
+
+def test_set_identity_spec(tmp_path):
+    """An identity spec on a non-PK column can be installed and removed."""
+    _db, table = _make_table(tmp_path)
+    # Identity sharding still requires an unenforced primary key on the
+    # table; it shards by the raw value of the given column.
+    table.set_unenforced_primary_key("id")
+    table.set_lsm_write_spec(LsmWriteSpec.identity("v"))
+    table.unset_lsm_write_spec()
+
+
+def test_lsm_write_spec_identity_and_writer_config_defaults():
+    """Identity getters, and writer config defaults appearing in getter
+    and repr."""
+    s = LsmWriteSpec.identity("v")
+    assert s.spec_type == "identity"
+    assert s.column == "v"
+    assert s.num_buckets is None
+    assert "identity" in repr(s)
+
+    s = s.with_writer_config_defaults({"durable_write": "false"})
+    assert s.writer_config_defaults == {"durable_write": "false"}
+    assert "durable_write" in repr(s)
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -15,8 +15,8 @@ use pyo3::{
 use query::{FTSQuery, HybridQuery, Query, VectorQuery};
 use session::Session;
 use table::{
-    AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, MergeResult,
-    Table, UpdateResult,
+    AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, LsmWriteSpec,
+    MergeResult, Table, UpdateResult,
 };

 pub mod arrow;
@@ -52,6 +52,7 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<AlterColumnsResult>()?;
    m.add_class::<AddResult>()?;
    m.add_class::<MergeResult>()?;
+    m.add_class::<LsmWriteSpec>()?;
    m.add_class::<DeleteResult>()?;
    m.add_class::<DropColumnsResult>()?;
    m.add_class::<UpdateResult>()?;
--- a/python/src/table.rs
+++ b/python/src/table.rs
@@ -171,6 +171,141 @@ impl From<lancedb::table::MergeResult> for MergeResult {
    }
 }

+/// Specification selecting Lance's MemWAL LSM-style write path for
+/// `merge_insert`.
+///
+/// Constructed via the `bucket(...)`, `identity(...)`, or `unsharded()`
+/// static methods, then optionally refined by chaining
+/// `with_maintained_indexes(...)` and `with_writer_config_defaults(...)`.
+#[pyclass(from_py_object)]
+#[derive(Clone, Debug)]
+pub struct LsmWriteSpec {
+    /// The wrapped native `lancedb::table::LsmWriteSpec`.
+    inner: lancedb::table::LsmWriteSpec,
+}
+
+#[pymethods]
+impl LsmWriteSpec {
+    /// Hash-bucket sharding by the unenforced primary key column.
+    ///
+    /// `column` and `num_buckets` are handed to the native constructor
+    /// unchanged; this wrapper performs no validation of its own.
+    #[staticmethod]
+    pub fn bucket(column: String, num_buckets: u32) -> Self {
+        Self {
+            inner: lancedb::table::LsmWriteSpec::bucket(column, num_buckets),
+        }
+    }
+
+    /// Identity sharding — shard by the raw value of `column`.
+    #[staticmethod]
+    pub fn identity(column: String) -> Self {
+        Self {
+            inner: lancedb::table::LsmWriteSpec::identity(column),
+        }
+    }
+
+    /// No sharding — every `merge_insert` call writes to a single
+    /// MemWAL shard.
+    #[staticmethod]
+    pub fn unsharded() -> Self {
+        Self {
+            inner: lancedb::table::LsmWriteSpec::unsharded(),
+        }
+    }
+
+    /// Return a copy of this spec whose list of maintained indexes is
+    /// replaced by `indexes` — the indexes the MemWAL should keep up to
+    /// date as rows are appended. Each name must reference an index that
+    /// already exists on the table at the time `set_lsm_write_spec`
+    /// is called.
+    pub fn with_maintained_indexes(&self, indexes: Vec<String>) -> Self {
+        Self {
+            inner: self.inner.clone().with_maintained_indexes(indexes),
+        }
+    }
+
+    /// Return a copy of this spec whose default `ShardWriter`
+    /// configuration is replaced by `defaults`. The defaults are recorded
+    /// in the MemWAL index, so every writer starts from the same values.
+    pub fn with_writer_config_defaults(&self, defaults: HashMap<String, String>) -> Self {
+        Self {
+            inner: self.inner.clone().with_writer_config_defaults(defaults),
+        }
+    }
+
+    /// Python `repr()`: the constructor name followed by every field of
+    /// the variant in `Debug` formatting (`num_buckets` in `Display`).
+    pub fn __repr__(&self) -> String {
+        match &self.inner {
+            lancedb::table::LsmWriteSpec::Bucket {
+                column,
+                num_buckets,
+                maintained_indexes,
+                writer_config_defaults,
+            } => format!(
+                "LsmWriteSpec.bucket(column={:?}, num_buckets={}, maintained_indexes={:?}, writer_config_defaults={:?})",
+                column, num_buckets, maintained_indexes, writer_config_defaults,
+            ),
+            lancedb::table::LsmWriteSpec::Identity {
+                column,
+                maintained_indexes,
+                writer_config_defaults,
+            } => format!(
+                "LsmWriteSpec.identity(column={:?}, maintained_indexes={:?}, writer_config_defaults={:?})",
+                column, maintained_indexes, writer_config_defaults,
+            ),
+            lancedb::table::LsmWriteSpec::Unsharded {
+                maintained_indexes,
+                writer_config_defaults,
+            } => format!(
+                "LsmWriteSpec.unsharded(maintained_indexes={:?}, writer_config_defaults={:?})",
+                maintained_indexes, writer_config_defaults,
+            ),
+        }
+    }
+
+    /// Discriminator string identifying the variant ("bucket", "identity",
+    /// or "unsharded").
+    #[getter]
+    pub fn spec_type(&self) -> &'static str {
+        match &self.inner {
+            lancedb::table::LsmWriteSpec::Bucket { .. } => "bucket",
+            lancedb::table::LsmWriteSpec::Identity { .. } => "identity",
+            lancedb::table::LsmWriteSpec::Unsharded { .. } => "unsharded",
+        }
+    }
+
+    /// Bucket and identity variants: the sharding column. `None` for unsharded.
+    #[getter]
+    pub fn column(&self) -> Option<String> {
+        match &self.inner {
+            lancedb::table::LsmWriteSpec::Bucket { column, .. }
+            | lancedb::table::LsmWriteSpec::Identity { column, .. } => Some(column.clone()),
+            lancedb::table::LsmWriteSpec::Unsharded { .. } => None,
+        }
+    }
+
+    /// Bucket variant only: the number of buckets. `None` for identity
+    /// and unsharded.
+    #[getter]
+    pub fn num_buckets(&self) -> Option<u32> {
+        // Variants are listed explicitly (no `_` arm) so that adding a
+        // variant forces this getter to be revisited, matching the other
+        // matches in this impl.
+        match &self.inner {
+            lancedb::table::LsmWriteSpec::Bucket { num_buckets, .. } => Some(*num_buckets),
+            lancedb::table::LsmWriteSpec::Identity { .. }
+            | lancedb::table::LsmWriteSpec::Unsharded { .. } => None,
+        }
+    }
+
+    /// Names of indexes the MemWAL should keep up to date during writes.
+    #[getter]
+    pub fn maintained_indexes(&self) -> Vec<String> {
+        self.inner.maintained_indexes().to_vec()
+    }
+
+    /// Default `ShardWriter` configuration recorded by this spec.
+    #[getter]
+    pub fn writer_config_defaults(&self) -> HashMap<String, String> {
+        self.inner.writer_config_defaults().clone()
+    }
+}
+
+impl From<LsmWriteSpec> for lancedb::table::LsmWriteSpec {
+    /// Unwrap the Python-facing wrapper into the native spec it holds.
+    fn from(spec: LsmWriteSpec) -> Self {
+        let LsmWriteSpec { inner } = spec;
+        inner
+    }
+}
+
 #[pyclass(get_all, from_py_object)]
 #[derive(Clone, Debug)]
 pub struct AddColumnsResult {
@@ -818,6 +953,24 @@ impl Table {
        })
    }

+    /// Install `spec` on the underlying table.
+    ///
+    /// The wrapper is converted to the native spec before the async call
+    /// starts; errors from the native call pass through `infer_error`.
+    pub fn set_lsm_write_spec<'a>(
+        self_: PyRef<'a, Self>,
+        spec: LsmWriteSpec,
+    ) -> PyResult<Bound<'a, PyAny>> {
+        let table = self_.inner_ref()?.clone();
+        let spec: lancedb::table::LsmWriteSpec = spec.into();
+        future_into_py(self_.py(), async move {
+            table.set_lsm_write_spec(spec).await.infer_error()
+        })
+    }
+
+    /// Remove the LSM write spec from the underlying table.
+    ///
+    /// Errors from the native call pass through `infer_error`.
+    pub fn unset_lsm_write_spec(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
+        let table = self_.inner_ref()?.clone();
+        let py = self_.py();
+        future_into_py(py, async move { table.unset_lsm_write_spec().await.infer_error() })
+    }
+
    pub fn uses_v2_manifest_paths(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {