feat(python)!: support inserting and upserting subschemas (#1965)

BREAKING CHANGE: For a field "vector", list of integers will now be
converted to binary (uint8) vectors instead of f32 vectors. Use float
values instead for f32 vectors.

* Adds proper support for inserting and upserting subsets of the full
schema. I thought I had previously implemented this in #1827, but it
turns out I had not tested carefully enough.
* Refactors `_santize_data` and other utility functions to be simpler
and not require `numpy` or `combine_chunks()`.
* Added a new suite of unit tests to validate sanitization utilities.

## Examples

```python
import pandas as pd
import lancedb

db = lancedb.connect("memory://demo")
intial_data = pd.DataFrame({
    "a": [1, 2, 3],
    "b": [4, 5, 6],
    "c": [7, 8, 9]
})
table = db.create_table("demo", intial_data)

# Insert a subschema
new_data = pd.DataFrame({"a": [10, 11]})
table.add(new_data)
table.to_pandas()
```
```
    a    b    c
0   1  4.0  7.0
1   2  5.0  8.0
2   3  6.0  9.0
3  10  NaN  NaN
4  11  NaN  NaN
```


```python
# Upsert a subschema
upsert_data = pd.DataFrame({
    "a": [3, 10, 15],
    "b": [6, 7, 8],
})
table.merge_insert(on="a").when_matched_update_all().when_not_matched_insert_all().execute(upsert_data)
table.to_pandas()
```
```
    a    b    c
0   1  4.0  7.0
1   2  5.0  8.0
2   3  6.0  9.0
3  10  7.0  NaN
4  11  NaN  NaN
5  15  8.0  NaN
```
This commit is contained in:
Will Jones
2025-01-08 10:11:10 -08:00
committed by GitHub
parent 3c0a64be8f
commit c557e77f09
10 changed files with 874 additions and 292 deletions

View File

@@ -242,8 +242,8 @@ def test_add_subschema(mem_db: DBConnection):
data = {"price": 10.0, "item": "foo"}
table.add([data])
data = {"price": 2.0, "vector": [3.1, 4.1]}
table.add([data])
data = pd.DataFrame({"price": [2.0], "vector": [[3.1, 4.1]]})
table.add(data)
data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
table.add([data])
@@ -259,7 +259,7 @@ def test_add_subschema(mem_db: DBConnection):
data = {"item": "foo"}
# We can't omit a column if it's not nullable
with pytest.raises(RuntimeError, match="Invalid user input"):
with pytest.raises(RuntimeError, match="Append with different schema"):
table.add([data])
# We can add it if we make the column nullable
@@ -292,6 +292,7 @@ def test_add_nullability(mem_db: DBConnection):
]
)
table = mem_db.create_table("test", schema=schema)
assert table.schema.field("vector").nullable is False
nullable_schema = pa.schema(
[
@@ -320,7 +321,10 @@ def test_add_nullability(mem_db: DBConnection):
schema=nullable_schema,
)
# We can't add nullable schema if it contains nulls
with pytest.raises(Exception, match="Vector column vector has NaNs"):
with pytest.raises(
Exception,
match="Casting field 'vector' with null values to non-nullable",
):
table.add(data)
# But we can make it nullable
@@ -776,6 +780,38 @@ def test_merge_insert(mem_db: DBConnection):
assert table.to_arrow().sort_by("a") == expected
# We vary the data format because there are slight differences in how
# subschemas are handled in different formats
@pytest.mark.parametrize(
"data_format",
[
lambda table: table,
lambda table: table.to_pandas(),
lambda table: table.to_pylist(),
],
ids=["pa.Table", "pd.DataFrame", "rows"],
)
def test_merge_insert_subschema(mem_db: DBConnection, data_format):
initial_data = pa.table(
{"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]}
)
table = mem_db.create_table("my_table", data=initial_data)
new_data = pa.table({"id": [2, 3], "c": ["y", "y"]})
new_data = data_format(new_data)
(
table.merge_insert(on="id")
.when_matched_update_all()
.when_not_matched_insert_all()
.execute(new_data)
)
expected = pa.table(
{"id": [0, 1, 2, 3], "a": [1.0, 2.0, 3.0, None], "c": ["x", "x", "y", "y"]}
)
assert table.to_arrow().sort_by("id") == expected
@pytest.mark.asyncio
async def test_merge_insert_async(mem_db_async: AsyncConnection):
data = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})