feat: schema evolution APIs in all SDKs (#1851)

* Support `add_columns`, `alter_columns`, `drop_columns` in Remote SDK and async Python
* Add `data_type` parameter to node
* Docs updates
2025-12-27 23:12:58 +00:00 · 2024-12-04 14:47:50 -08:00
parent bd82e1f66d
commit 79eaa52184
10 changed files with 535 additions and 44 deletions
--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -490,19 +490,13 @@ class RemoteTable(Table):
        return LOOP.run(self._table.count_rows(filter))

    def add_columns(self, transforms: Dict[str, str]):
-        raise NotImplementedError(
-            "add_columns() is not yet supported on the LanceDB cloud"
-        )
+        return LOOP.run(self._table.add_columns(transforms))

-    def alter_columns(self, alterations: Iterable[Dict[str, str]]):
-        raise NotImplementedError(
-            "alter_columns() is not yet supported on the LanceDB cloud"
-        )
+    def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
+        return LOOP.run(self._table.alter_columns(*alterations))

    def drop_columns(self, columns: Iterable[str]):
-        raise NotImplementedError(
-            "drop_columns() is not yet supported on the LanceDB cloud"
-        )
+        return LOOP.run(self._table.drop_columns(columns))


 def add_index(tbl: pa.Table, i: int) -> pa.Table:
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -967,8 +967,6 @@ class Table(ABC):
        """
        Add new columns with defined values.

-        This is not yet available in LanceDB Cloud.
-
        Parameters
        ----------
        transforms: Dict[str, str]
@@ -978,20 +976,21 @@ class Table(ABC):
        """

    @abstractmethod
-    def alter_columns(self, alterations: Iterable[Dict[str, str]]):
+    def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
        """
        Alter column names and nullability.

-        This is not yet available in LanceDB Cloud.
-
        alterations : Iterable[Dict[str, Any]]
            A sequence of dictionaries, each with the following keys:
            - "path": str
                The column path to alter. For a top-level column, this is the name.
                For a nested column, this is the dot-separated path, e.g. "a.b.c".
-            - "name": str, optional
+            - "rename": str, optional
                The new name of the column. If not specified, the column name is
                not changed.
+            - "data_type": pyarrow.DataType, optional
+                The new data type of the column. Existing values will be cast
+                to this type. If not specified, the column data type is not changed.
            - "nullable": bool, optional
                Whether the column should be nullable. If not specified, the column
                nullability is not changed. Only non-nullable columns can be changed
@@ -1004,8 +1003,6 @@ class Table(ABC):
        """
        Drop columns from the table.

-        This is not yet available in LanceDB Cloud.
-
        Parameters
        ----------
        columns : Iterable[str]
@@ -2923,6 +2920,53 @@ class AsyncTable:

        return await self._inner.update(updates_sql, where)

+    async def add_columns(self, transforms: Dict[str, str]):
+        """
+        Add new columns with defined values.
+
+        Parameters
+        ----------
+        transforms: Dict[str, str]
+            A map of column name to a SQL expression to use to calculate the
+            value of the new column. These expressions will be evaluated for
+            each row in the table, and can reference existing columns.
+        """
+        await self._inner.add_columns(list(transforms.items()))
+
+    async def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
+        """
+        Alter column names, data types, and nullability.
+
+        *alterations : Dict[str, Any]
+            One or more dictionaries, each with the following keys:
+            - "path": str
+                The column path to alter. For a top-level column, this is the name.
+                For a nested column, this is the dot-separated path, e.g. "a.b.c".
+            - "rename": str, optional
+                The new name of the column. If not specified, the column name is
+                not changed.
+            - "data_type": pyarrow.DataType, optional
+                The new data type of the column. Existing values will be cast
+                to this type. If not specified, the column data type is not changed.
+            - "nullable": bool, optional
+                Whether the column should be nullable. If not specified, the column
+                nullability is not changed. Only non-nullable columns can be changed
+                to nullable. Currently, you cannot change a nullable column to
+                non-nullable.
+        """
+        await self._inner.alter_columns(alterations)
+
+    async def drop_columns(self, columns: Iterable[str]):
+        """
+        Drop columns from the table.
+
+        Parameters
+        ----------
+        columns : Iterable[str]
+            The names of the columns to drop.
+        """
+        await self._inner.drop_columns(columns)
+
    async def version(self) -> int:
        """
        Retrieve the version of the table
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -1292,6 +1292,19 @@ def test_add_columns(tmp_path):
    assert table.to_arrow().column_names == ["id", "new_col"]
    assert table.to_arrow()["new_col"].to_pylist() == [2, 3]

+    table.add_columns({"null_int": "cast(null as bigint)"})
+    assert table.schema.field("null_int").type == pa.int64()
+
+
+@pytest.mark.asyncio
+async def test_add_columns_async(db_async: AsyncConnection):
+    data = pa.table({"id": [0, 1]})
+    table = await db_async.create_table("my_table", data=data)
+    await table.add_columns({"new_col": "id + 2"})
+    data = await table.to_arrow()
+    assert data.column_names == ["id", "new_col"]
+    assert data["new_col"].to_pylist() == [2, 3]
+

 def test_alter_columns(tmp_path):
    db = lancedb.connect(tmp_path)
@@ -1301,6 +1314,18 @@ def test_alter_columns(tmp_path):
    assert table.to_arrow().column_names == ["new_id"]


+@pytest.mark.asyncio
+async def test_alter_columns_async(db_async: AsyncConnection):
+    data = pa.table({"id": [0, 1]})
+    table = await db_async.create_table("my_table", data=data)
+    await table.alter_columns({"path": "id", "rename": "new_id"})
+    assert (await table.to_arrow()).column_names == ["new_id"]
+    await table.alter_columns(dict(path="new_id", data_type=pa.int16(), nullable=True))
+    data = await table.to_arrow()
+    assert data.column(0).type == pa.int16()
+    assert data.schema.field(0).nullable
+
+
 def test_drop_columns(tmp_path):
    db = lancedb.connect(tmp_path)
    data = pa.table({"id": [0, 1], "category": ["a", "b"]})
@@ -1309,6 +1334,14 @@ def test_drop_columns(tmp_path):
    assert table.to_arrow().column_names == ["id"]


+@pytest.mark.asyncio
+async def test_drop_columns_async(db_async: AsyncConnection):
+    data = pa.table({"id": [0, 1], "category": ["a", "b"]})
+    table = await db_async.create_table("my_table", data=data)
+    await table.drop_columns(["category"])
+    assert (await table.to_arrow()).column_names == ["id"]
+
+
@pytest.mark.asyncio
 async def test_time_travel(db_async: AsyncConnection):
    # Setup
--- a/python/src/table.rs
+++ b/python/src/table.rs
@@ -1,14 +1,18 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The LanceDB Authors
 use arrow::{
+    datatypes::DataType,
    ffi_stream::ArrowArrayStreamReader,
    pyarrow::{FromPyArrow, ToPyArrow},
 };
 use lancedb::table::{
-    AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
+    AddDataMode, ColumnAlteration, Duration, NewColumnTransform, OptimizeAction, OptimizeOptions,
+    Table as LanceDbTable,
 };
 use pyo3::{
    exceptions::{PyRuntimeError, PyValueError},
    pyclass, pymethods,
-    types::{IntoPyDict, PyDict, PyDictMethods, PyString},
+    types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods, PyString},
    Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
 };
 use pyo3_asyncio_0_21::tokio::future_into_py;
@@ -406,6 +410,72 @@ impl Table {
                .infer_error()
        })
    }
+
+    pub fn add_columns(
+        self_: PyRef<'_, Self>,
+        definitions: Vec<(String, String)>,
+    ) -> PyResult<Bound<'_, PyAny>> {
+        let definitions = NewColumnTransform::SqlExpressions(definitions);
+
+        let inner = self_.inner_ref()?.clone();
+        future_into_py(self_.py(), async move {
+            inner.add_columns(definitions, None).await.infer_error()?;
+            Ok(())
+        })
+    }
+
+    pub fn alter_columns<'a>(
+        self_: PyRef<'a, Self>,
+        alterations: Vec<Bound<PyDict>>,
+    ) -> PyResult<Bound<'a, PyAny>> {
+        let alterations = alterations
+            .iter()
+            .map(|alteration| {
+                let path = alteration
+                    .get_item("path")?
+                    .ok_or_else(|| PyValueError::new_err("Missing path"))?
+                    .extract()?;
+                let rename = {
+                    // We prefer rename, but support name for backwards compatibility
+                    let rename = if let Ok(Some(rename)) = alteration.get_item("rename") {
+                        Some(rename)
+                    } else {
+                        alteration.get_item("name")?
+                    };
+                    rename.map(|name| name.extract()).transpose()?
+                };
+                let nullable = alteration
+                    .get_item("nullable")?
+                    .map(|val| val.extract())
+                    .transpose()?;
+                let data_type = alteration
+                    .get_item("data_type")?
+                    .map(|val| DataType::from_pyarrow_bound(&val))
+                    .transpose()?;
+                Ok(ColumnAlteration {
+                    path,
+                    rename,
+                    nullable,
+                    data_type,
+                })
+            })
+            .collect::<PyResult<Vec<_>>>()?;
+
+        let inner = self_.inner_ref()?.clone();
+        future_into_py(self_.py(), async move {
+            inner.alter_columns(&alterations).await.infer_error()?;
+            Ok(())
+        })
+    }
+
+    pub fn drop_columns(self_: PyRef<Self>, columns: Vec<String>) -> PyResult<Bound<PyAny>> {
+        let inner = self_.inner_ref()?.clone();
+        future_into_py(self_.py(), async move {
+            let column_refs = columns.iter().map(String::as_str).collect::<Vec<&str>>();
+            inner.drop_columns(&column_refs).await.infer_error()?;
+            Ok(())
+        })
+    }
 }

 #[derive(FromPyObject)]