feat: schema evolution APIs in all SDKs (#1851)

* Support `add_columns`, `alter_columns`, `drop_columns` in Remote SDK
and async Python
* Add `data_type` parameter to node
* Docs updates
This commit is contained in:
Will Jones
2024-12-04 14:47:50 -08:00
committed by GitHub
parent bd82e1f66d
commit 79eaa52184
10 changed files with 535 additions and 44 deletions

View File

@@ -490,19 +490,13 @@ class RemoteTable(Table):
return LOOP.run(self._table.count_rows(filter))
def add_columns(self, transforms: Dict[str, str]):
raise NotImplementedError(
"add_columns() is not yet supported on the LanceDB cloud"
)
return LOOP.run(self._table.add_columns(transforms))
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
raise NotImplementedError(
"alter_columns() is not yet supported on the LanceDB cloud"
)
def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
return LOOP.run(self._table.alter_columns(*alterations))
def drop_columns(self, columns: Iterable[str]):
raise NotImplementedError(
"drop_columns() is not yet supported on the LanceDB cloud"
)
return LOOP.run(self._table.drop_columns(columns))
def add_index(tbl: pa.Table, i: int) -> pa.Table:

View File

@@ -967,8 +967,6 @@ class Table(ABC):
"""
Add new columns with defined values.
This is not yet available in LanceDB Cloud.
Parameters
----------
transforms: Dict[str, str]
@@ -978,20 +976,21 @@ class Table(ABC):
"""
@abstractmethod
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
"""
Alter column names and nullability.
This is not yet available in LanceDB Cloud.
alterations : Iterable[Dict[str, Any]]
A sequence of dictionaries, each with the following keys:
- "path": str
The column path to alter. For a top-level column, this is the name.
For a nested column, this is the dot-separated path, e.g. "a.b.c".
- "name": str, optional
- "rename": str, optional
The new name of the column. If not specified, the column name is
not changed.
- "data_type": pyarrow.DataType, optional
The new data type of the column. Existing values will be casted
to this type. If not specified, the column data type is not changed.
- "nullable": bool, optional
Whether the column should be nullable. If not specified, the column
nullability is not changed. Only non-nullable columns can be changed
@@ -1004,8 +1003,6 @@ class Table(ABC):
"""
Drop columns from the table.
This is not yet available in LanceDB Cloud.
Parameters
----------
columns : Iterable[str]
@@ -2923,6 +2920,53 @@ class AsyncTable:
return await self._inner.update(updates_sql, where)
async def add_columns(self, transforms: Dict[str, str]):
"""
Add new columns with defined values.
Parameters
----------
transforms: Dict[str, str]
A map of column name to a SQL expression to use to calculate the
value of the new column. These expressions will be evaluated for
each row in the table, and can reference existing columns.
"""
await self._inner.add_columns(list(transforms.items()))
async def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
"""
Alter column names and nullability.
alterations : Iterable[Dict[str, Any]]
A sequence of dictionaries, each with the following keys:
- "path": str
The column path to alter. For a top-level column, this is the name.
For a nested column, this is the dot-separated path, e.g. "a.b.c".
- "rename": str, optional
The new name of the column. If not specified, the column name is
not changed.
- "data_type": pyarrow.DataType, optional
The new data type of the column. Existing values will be casted
to this type. If not specified, the column data type is not changed.
- "nullable": bool, optional
Whether the column should be nullable. If not specified, the column
nullability is not changed. Only non-nullable columns can be changed
to nullable. Currently, you cannot change a nullable column to
non-nullable.
"""
await self._inner.alter_columns(alterations)
async def drop_columns(self, columns: Iterable[str]):
"""
Drop columns from the table.
Parameters
----------
columns : Iterable[str]
The names of the columns to drop.
"""
await self._inner.drop_columns(columns)
async def version(self) -> int:
"""
Retrieve the version of the table

View File

@@ -1292,6 +1292,19 @@ def test_add_columns(tmp_path):
assert table.to_arrow().column_names == ["id", "new_col"]
assert table.to_arrow()["new_col"].to_pylist() == [2, 3]
table.add_columns({"null_int": "cast(null as bigint)"})
assert table.schema.field("null_int").type == pa.int64()
@pytest.mark.asyncio
async def test_add_columns_async(db_async: AsyncConnection):
data = pa.table({"id": [0, 1]})
table = await db_async.create_table("my_table", data=data)
await table.add_columns({"new_col": "id + 2"})
data = await table.to_arrow()
assert data.column_names == ["id", "new_col"]
assert data["new_col"].to_pylist() == [2, 3]
def test_alter_columns(tmp_path):
db = lancedb.connect(tmp_path)
@@ -1301,6 +1314,18 @@ def test_alter_columns(tmp_path):
assert table.to_arrow().column_names == ["new_id"]
@pytest.mark.asyncio
async def test_alter_columns_async(db_async: AsyncConnection):
data = pa.table({"id": [0, 1]})
table = await db_async.create_table("my_table", data=data)
await table.alter_columns({"path": "id", "rename": "new_id"})
assert (await table.to_arrow()).column_names == ["new_id"]
await table.alter_columns(dict(path="new_id", data_type=pa.int16(), nullable=True))
data = await table.to_arrow()
assert data.column(0).type == pa.int16()
assert data.schema.field(0).nullable
def test_drop_columns(tmp_path):
db = lancedb.connect(tmp_path)
data = pa.table({"id": [0, 1], "category": ["a", "b"]})
@@ -1309,6 +1334,14 @@ def test_drop_columns(tmp_path):
assert table.to_arrow().column_names == ["id"]
@pytest.mark.asyncio
async def test_drop_columns_async(db_async: AsyncConnection):
data = pa.table({"id": [0, 1], "category": ["a", "b"]})
table = await db_async.create_table("my_table", data=data)
await table.drop_columns(["category"])
assert (await table.to_arrow()).column_names == ["id"]
@pytest.mark.asyncio
async def test_time_travel(db_async: AsyncConnection):
# Setup

View File

@@ -1,14 +1,18 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use arrow::{
datatypes::DataType,
ffi_stream::ArrowArrayStreamReader,
pyarrow::{FromPyArrow, ToPyArrow},
};
use lancedb::table::{
AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
AddDataMode, ColumnAlteration, Duration, NewColumnTransform, OptimizeAction, OptimizeOptions,
Table as LanceDbTable,
};
use pyo3::{
exceptions::{PyRuntimeError, PyValueError},
pyclass, pymethods,
types::{IntoPyDict, PyDict, PyDictMethods, PyString},
types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods, PyString},
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
};
use pyo3_asyncio_0_21::tokio::future_into_py;
@@ -406,6 +410,72 @@ impl Table {
.infer_error()
})
}
pub fn add_columns(
self_: PyRef<'_, Self>,
definitions: Vec<(String, String)>,
) -> PyResult<Bound<'_, PyAny>> {
let definitions = NewColumnTransform::SqlExpressions(definitions);
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
inner.add_columns(definitions, None).await.infer_error()?;
Ok(())
})
}
pub fn alter_columns<'a>(
self_: PyRef<'a, Self>,
alterations: Vec<Bound<PyDict>>,
) -> PyResult<Bound<'a, PyAny>> {
let alterations = alterations
.iter()
.map(|alteration| {
let path = alteration
.get_item("path")?
.ok_or_else(|| PyValueError::new_err("Missing path"))?
.extract()?;
let rename = {
// We prefer rename, but support name for backwards compatibility
let rename = if let Ok(Some(rename)) = alteration.get_item("rename") {
Some(rename)
} else {
alteration.get_item("name")?
};
rename.map(|name| name.extract()).transpose()?
};
let nullable = alteration
.get_item("nullable")?
.map(|val| val.extract())
.transpose()?;
let data_type = alteration
.get_item("data_type")?
.map(|val| DataType::from_pyarrow_bound(&val))
.transpose()?;
Ok(ColumnAlteration {
path,
rename,
nullable,
data_type,
})
})
.collect::<PyResult<Vec<_>>>()?;
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
inner.alter_columns(&alterations).await.infer_error()?;
Ok(())
})
}
pub fn drop_columns(self_: PyRef<Self>, columns: Vec<String>) -> PyResult<Bound<PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
let column_refs = columns.iter().map(String::as_str).collect::<Vec<&str>>();
inner.drop_columns(&column_refs).await.infer_error()?;
Ok(())
})
}
}
#[derive(FromPyObject)]