feat: {add|alter|drop}_columns APIs (#1015)

Initial work for #959. This exposes the basic functionality of each
operation in all of the APIs. User guide documentation will be added in a later PR.
This commit is contained in:
Will Jones
2024-02-26 11:04:53 -08:00
committed by Weston Pace
parent ad1e81a1d1
commit 464a36ad38
14 changed files with 660 additions and 11 deletions

View File

@@ -15,7 +15,7 @@ import logging
import uuid
from concurrent.futures import Future
from functools import cached_property
from typing import Dict, Optional, Union
from typing import Dict, Iterable, Optional, Union
import pyarrow as pa
from lance import json_to_schema
@@ -473,6 +473,21 @@ class RemoteTable(Table):
"count_rows() is not yet supported on the LanceDB cloud"
)
def add_columns(self, transforms: Dict[str, str]):
    """Reject the request: adding columns is not available for this table.

    Parameters
    ----------
    transforms : Dict[str, str]
        Accepted for signature compatibility only; it is never inspected.

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = "add_columns() is not yet supported on the LanceDB cloud"
    raise NotImplementedError(message)
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
    """Reject the request: altering columns is not available for this table.

    Parameters
    ----------
    alterations : Iterable[Dict[str, str]]
        Accepted for signature compatibility only; it is never inspected.

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = "alter_columns() is not yet supported on the LanceDB cloud"
    raise NotImplementedError(message)
def drop_columns(self, columns: Iterable[str]):
    """Reject the request: dropping columns is not available for this table.

    Parameters
    ----------
    columns : Iterable[str]
        Accepted for signature compatibility only; it is never inspected.

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = "drop_columns() is not yet supported on the LanceDB cloud"
    raise NotImplementedError(message)
def add_index(tbl: pa.Table, i: int) -> pa.Table:
return tbl.add_column(

View File

@@ -659,6 +659,56 @@ class Table(ABC):
For most cases, the default should be fine.
"""
@abstractmethod
def add_columns(self, transforms: Dict[str, str]):
    """
    Create new columns whose values are computed from SQL expressions.

    Not supported on LanceDB Cloud yet.

    Parameters
    ----------
    transforms: Dict[str, str]
        Maps each new column's name to the SQL expression that produces
        its value. The expression is evaluated once per row of the table
        and may refer to columns that already exist.
    """
@abstractmethod
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
    """
    Alter column names and nullability.

    This is not yet available in LanceDB Cloud.

    Parameters
    ----------
    alterations : Iterable[Dict[str, Any]]
        A sequence of dictionaries, each with the following keys:

        - "path": str
            The column path to alter. For a top-level column, this is the
            name. For a nested column, this is the dot-separated path,
            e.g. "a.b.c".
        - "name": str, optional
            The new name of the column. If not specified, the column name
            is not changed.
        - "nullable": bool, optional
            Whether the column should be nullable. If not specified, the
            column nullability is not changed. Only non-nullable columns
            can be changed to nullable. Currently, you cannot change a
            nullable column to non-nullable.

        Note that the "nullable" value is a bool even though the
        signature is annotated as ``Dict[str, str]``.
    """
@abstractmethod
def drop_columns(self, columns: Iterable[str]):
    """
    Remove the named columns from the table.

    Not supported on LanceDB Cloud yet.

    Parameters
    ----------
    columns : Iterable[str]
        Names of the columns that should be removed.
    """
class _LanceDatasetRef(ABC):
@property
@@ -1528,6 +1578,22 @@ class LanceTable(Table):
"""
return self.to_lance().optimize.compact_files(*args, **kwargs)
def add_columns(self, transforms: Dict[str, str]):
    """Add new columns by delegating to the underlying dataset.

    Parameters
    ----------
    transforms : Dict[str, str]
        Passed unchanged to ``self._dataset_mut.add_columns``.
    """
    dataset = self._dataset_mut
    dataset.add_columns(transforms)
def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
    """Alter columns by delegating to the underlying dataset.

    Alterations may be supplied either as separate positional dictionaries
    (``tbl.alter_columns({...}, {...})``) or as a single iterable of
    dictionaries (``tbl.alter_columns([{...}, {...}])``).

    Parameters
    ----------
    *alterations
        One dictionary per change. The key "rename" is accepted as an
        alias for "name" and is translated to "name" before the
        alteration is forwarded. All other keys are forwarded untouched.
        The caller's dictionaries are never modified.
    """
    from collections.abc import Mapping

    # A lone non-mapping argument is the "one iterable of dicts" call form;
    # unpack it so both call styles reach the dataset identically.
    if len(alterations) == 1 and not isinstance(alterations[0], Mapping):
        alterations = tuple(alterations[0])

    modified = []
    for alter in alterations:
        # Work on a copy: translating the alias must not leak back into the
        # dictionary the caller handed us.
        alter = dict(alter)
        # The underlying dataset API uses "name"; "rename" is kept as a
        # friendlier alias, so both spellings are accepted here.
        if "rename" in alter:
            alter["name"] = alter.pop("rename")
        modified.append(alter)
    self._dataset_mut.alter_columns(*modified)
def drop_columns(self, columns: Iterable[str]):
    """Drop columns by delegating to the underlying dataset.

    Parameters
    ----------
    columns : Iterable[str]
        Passed unchanged to ``self._dataset_mut.drop_columns``.
    """
    dataset = self._dataset_mut
    dataset.drop_columns(columns)
def _sanitize_schema(
data: pa.Table,

View File

@@ -56,7 +56,7 @@ embeddings = ["openai>=1.6.1", "sentence-transformers", "torch", "pillow", "open
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[tool.ruff]
[tool.ruff.lint]
select = ["F", "E", "W", "I", "G", "TCH", "PERF"]
[tool.pytest.ini_options]

View File

@@ -898,3 +898,29 @@ def test_restore_consistency(tmp_path):
table.add([{"id": 2}])
assert table_fixed.version == table.version - 1
assert table_ref_latest.version == table.version
# Schema evolution
def test_add_columns(tmp_path):
    """A column defined by a SQL expression is added after the existing one."""
    conn = lancedb.connect(tmp_path)
    tbl = LanceTable.create(conn, "my_table", data=pa.table({"id": [0, 1]}))

    tbl.add_columns({"new_col": "id + 2"})

    # The new column follows the original one and holds id + 2 per row.
    expected_names = ["id", "new_col"]
    expected_values = [2, 3]
    assert tbl.to_arrow().column_names == expected_names
    assert tbl.to_arrow()["new_col"].to_pylist() == expected_values
def test_alter_columns(tmp_path):
    """Renaming through the "rename" key changes the column's name."""
    conn = lancedb.connect(tmp_path)
    tbl = LanceTable.create(conn, "my_table", data=pa.table({"id": [0, 1]}))

    rename_spec = {"path": "id", "rename": "new_id"}
    tbl.alter_columns(rename_spec)

    expected_names = ["new_id"]
    assert tbl.to_arrow().column_names == expected_names
def test_drop_columns(tmp_path):
    """Dropping a column leaves only the remaining columns in the table."""
    conn = lancedb.connect(tmp_path)
    source = pa.table({"id": [0, 1], "category": ["a", "b"]})
    tbl = LanceTable.create(conn, "my_table", data=source)

    to_remove = ["category"]
    tbl.drop_columns(to_remove)

    expected_names = ["id"]
    assert tbl.to_arrow().column_names == expected_names