mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 23:12:58 +00:00
feat: schema evolution APIs in all SDKs (#1851)
* Support `add_columns`, `alter_columns`, `drop_columns` in Remote SDK and async Python * Add `data_type` parameter to node * Docs updates
This commit is contained in:
@@ -490,19 +490,13 @@ class RemoteTable(Table):
|
||||
return LOOP.run(self._table.count_rows(filter))
|
||||
|
||||
def add_columns(self, transforms: Dict[str, str]):
|
||||
raise NotImplementedError(
|
||||
"add_columns() is not yet supported on the LanceDB cloud"
|
||||
)
|
||||
return LOOP.run(self._table.add_columns(transforms))
|
||||
|
||||
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
|
||||
raise NotImplementedError(
|
||||
"alter_columns() is not yet supported on the LanceDB cloud"
|
||||
)
|
||||
def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
|
||||
return LOOP.run(self._table.alter_columns(*alterations))
|
||||
|
||||
def drop_columns(self, columns: Iterable[str]):
|
||||
raise NotImplementedError(
|
||||
"drop_columns() is not yet supported on the LanceDB cloud"
|
||||
)
|
||||
return LOOP.run(self._table.drop_columns(columns))
|
||||
|
||||
|
||||
def add_index(tbl: pa.Table, i: int) -> pa.Table:
|
||||
|
||||
@@ -967,8 +967,6 @@ class Table(ABC):
|
||||
"""
|
||||
Add new columns with defined values.
|
||||
|
||||
This is not yet available in LanceDB Cloud.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
transforms: Dict[str, str]
|
||||
@@ -978,20 +976,21 @@ class Table(ABC):
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
|
||||
def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
|
||||
"""
|
||||
Alter column names and nullability.
|
||||
|
||||
This is not yet available in LanceDB Cloud.
|
||||
|
||||
alterations : Iterable[Dict[str, Any]]
|
||||
A sequence of dictionaries, each with the following keys:
|
||||
- "path": str
|
||||
The column path to alter. For a top-level column, this is the name.
|
||||
For a nested column, this is the dot-separated path, e.g. "a.b.c".
|
||||
- "name": str, optional
|
||||
- "rename": str, optional
|
||||
The new name of the column. If not specified, the column name is
|
||||
not changed.
|
||||
- "data_type": pyarrow.DataType, optional
|
||||
The new data type of the column. Existing values will be cast
|
||||
to this type. If not specified, the column data type is not changed.
|
||||
- "nullable": bool, optional
|
||||
Whether the column should be nullable. If not specified, the column
|
||||
nullability is not changed. Only non-nullable columns can be changed
|
||||
@@ -1004,8 +1003,6 @@ class Table(ABC):
|
||||
"""
|
||||
Drop columns from the table.
|
||||
|
||||
This is not yet available in LanceDB Cloud.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns : Iterable[str]
|
||||
@@ -2923,6 +2920,53 @@ class AsyncTable:
|
||||
|
||||
return await self._inner.update(updates_sql, where)
|
||||
|
||||
async def add_columns(self, transforms: Dict[str, str]):
    """
    Add new columns whose values are computed from SQL expressions.

    Parameters
    ----------
    transforms: Dict[str, str]
        A map of new column name to the SQL expression used to calculate
        that column's value. Each expression is evaluated for every row
        in the table and may reference existing columns.
    """
    # The inner table receives the mapping as a list of (name, expression) pairs.
    definitions = [(name, expression) for name, expression in transforms.items()]
    await self._inner.add_columns(definitions)
|
||||
|
||||
async def alter_columns(self, *alterations: "Dict[str, Any]"):
    """
    Alter column names, data types, and nullability.

    Parameters
    ----------
    *alterations : Dict[str, Any]
        One dictionary per column to alter, each passed as its own
        positional argument, with the following keys:
        - "path": str
            The column path to alter. For a top-level column, this is the name.
            For a nested column, this is the dot-separated path, e.g. "a.b.c".
        - "rename": str, optional
            The new name of the column. If not specified, the column name is
            not changed.
        - "data_type": pyarrow.DataType, optional
            The new data type of the column. Existing values will be cast
            to this type. If not specified, the column data type is not changed.
        - "nullable": bool, optional
            Whether the column should be nullable. If not specified, the column
            nullability is not changed. Only non-nullable columns can be changed
            to nullable. Currently, you cannot change a nullable column to
            non-nullable.
    """
    # The collected positional arguments are forwarded as a single tuple.
    await self._inner.alter_columns(alterations)
|
||||
|
||||
async def drop_columns(self, columns: Iterable[str]):
    """
    Drop columns from the table.

    Parameters
    ----------
    columns : Iterable[str]
        The names of the columns to drop. Any iterable of names is
        accepted (list, tuple, set, generator, ...).

    Raises
    ------
    TypeError
        If a single ``str`` is passed instead of an iterable of names.
    """
    # A bare string is itself an iterable of characters; reject it up front
    # so it is never split into one-letter column names.
    if isinstance(columns, str):
        raise TypeError(
            "columns must be an iterable of column names, not a single str"
        )
    # The native binding takes a sequence of strings, so one-shot or
    # unordered iterables are materialised into a list first.
    await self._inner.drop_columns(list(columns))
|
||||
|
||||
async def version(self) -> int:
|
||||
"""
|
||||
Retrieve the version of the table
|
||||
|
||||
@@ -1292,6 +1292,19 @@ def test_add_columns(tmp_path):
|
||||
assert table.to_arrow().column_names == ["id", "new_col"]
|
||||
assert table.to_arrow()["new_col"].to_pylist() == [2, 3]
|
||||
|
||||
table.add_columns({"null_int": "cast(null as bigint)"})
|
||||
assert table.schema.field("null_int").type == pa.int64()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_add_columns_async(db_async: AsyncConnection):
    # Seed a single-column table, then derive a second column from it.
    source = pa.table({"id": [0, 1]})
    table = await db_async.create_table("my_table", data=source)

    await table.add_columns({"new_col": "id + 2"})

    result = await table.to_arrow()
    assert result.column_names == ["id", "new_col"]
    assert result["new_col"].to_pylist() == [2, 3]
|
||||
|
||||
|
||||
def test_alter_columns(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
@@ -1301,6 +1314,18 @@ def test_alter_columns(tmp_path):
|
||||
assert table.to_arrow().column_names == ["new_id"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_alter_columns_async(db_async: AsyncConnection):
    source = pa.table({"id": [0, 1]})
    table = await db_async.create_table("my_table", data=source)

    # First alteration: rename only.
    await table.alter_columns({"path": "id", "rename": "new_id"})
    renamed = await table.to_arrow()
    assert renamed.column_names == ["new_id"]

    # Second alteration: change the data type and nullability together.
    await table.alter_columns(
        {"path": "new_id", "data_type": pa.int16(), "nullable": True}
    )
    altered = await table.to_arrow()
    assert altered.column(0).type == pa.int16()
    assert altered.schema.field(0).nullable
|
||||
|
||||
|
||||
def test_drop_columns(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
data = pa.table({"id": [0, 1], "category": ["a", "b"]})
|
||||
@@ -1309,6 +1334,14 @@ def test_drop_columns(tmp_path):
|
||||
assert table.to_arrow().column_names == ["id"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_drop_columns_async(db_async: AsyncConnection):
    # Start with two columns so that dropping one leaves a visible remainder.
    source = pa.table({"id": [0, 1], "category": ["a", "b"]})
    table = await db_async.create_table("my_table", data=source)

    await table.drop_columns(["category"])

    remaining = await table.to_arrow()
    assert remaining.column_names == ["id"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_time_travel(db_async: AsyncConnection):
|
||||
# Setup
|
||||
|
||||
@@ -1,14 +1,18 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
use arrow::{
|
||||
datatypes::DataType,
|
||||
ffi_stream::ArrowArrayStreamReader,
|
||||
pyarrow::{FromPyArrow, ToPyArrow},
|
||||
};
|
||||
use lancedb::table::{
|
||||
AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
|
||||
AddDataMode, ColumnAlteration, Duration, NewColumnTransform, OptimizeAction, OptimizeOptions,
|
||||
Table as LanceDbTable,
|
||||
};
|
||||
use pyo3::{
|
||||
exceptions::{PyRuntimeError, PyValueError},
|
||||
pyclass, pymethods,
|
||||
types::{IntoPyDict, PyDict, PyDictMethods, PyString},
|
||||
types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods, PyString},
|
||||
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
|
||||
};
|
||||
use pyo3_asyncio_0_21::tokio::future_into_py;
|
||||
@@ -406,6 +410,72 @@ impl Table {
|
||||
.infer_error()
|
||||
})
|
||||
}
|
||||
|
||||
pub fn add_columns(
|
||||
self_: PyRef<'_, Self>,
|
||||
definitions: Vec<(String, String)>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let definitions = NewColumnTransform::SqlExpressions(definitions);
|
||||
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.add_columns(definitions, None).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn alter_columns<'a>(
|
||||
self_: PyRef<'a, Self>,
|
||||
alterations: Vec<Bound<PyDict>>,
|
||||
) -> PyResult<Bound<'a, PyAny>> {
|
||||
let alterations = alterations
|
||||
.iter()
|
||||
.map(|alteration| {
|
||||
let path = alteration
|
||||
.get_item("path")?
|
||||
.ok_or_else(|| PyValueError::new_err("Missing path"))?
|
||||
.extract()?;
|
||||
let rename = {
|
||||
// We prefer rename, but support name for backwards compatibility
|
||||
let rename = if let Ok(Some(rename)) = alteration.get_item("rename") {
|
||||
Some(rename)
|
||||
} else {
|
||||
alteration.get_item("name")?
|
||||
};
|
||||
rename.map(|name| name.extract()).transpose()?
|
||||
};
|
||||
let nullable = alteration
|
||||
.get_item("nullable")?
|
||||
.map(|val| val.extract())
|
||||
.transpose()?;
|
||||
let data_type = alteration
|
||||
.get_item("data_type")?
|
||||
.map(|val| DataType::from_pyarrow_bound(&val))
|
||||
.transpose()?;
|
||||
Ok(ColumnAlteration {
|
||||
path,
|
||||
rename,
|
||||
nullable,
|
||||
data_type,
|
||||
})
|
||||
})
|
||||
.collect::<PyResult<Vec<_>>>()?;
|
||||
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.alter_columns(&alterations).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn drop_columns(self_: PyRef<Self>, columns: Vec<String>) -> PyResult<Bound<PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let column_refs = columns.iter().map(String::as_str).collect::<Vec<&str>>();
|
||||
inner.drop_columns(&column_refs).await.infer_error()?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
|
||||
Reference in New Issue
Block a user