Mirror of https://github.com/lancedb/lancedb.git, synced 2025-12-26 22:59:57 +00:00
feat(python)!: support inserting and upserting subschemas (#1965)
BREAKING CHANGE: for a field named "vector", a list of integers is now converted to a binary (uint8) vector instead of an f32 vector. Use float values to get f32 vectors (see the sketch after the examples below).

* Adds proper support for inserting and upserting subsets of the full schema. I thought I had implemented this in #1827, but it turns out I had not tested it carefully enough.
* Refactors `_sanitize_data` and other utility functions to be simpler and not require `numpy` or `combine_chunks()`.
* Adds a new suite of unit tests to validate the sanitization utilities.

## Examples

```python
import pandas as pd
import lancedb

db = lancedb.connect("memory://demo")

initial_data = pd.DataFrame({
    "a": [1, 2, 3],
    "b": [4, 5, 6],
    "c": [7, 8, 9],
})
table = db.create_table("demo", initial_data)

# Insert a subschema
new_data = pd.DataFrame({"a": [10, 11]})
table.add(new_data)
table.to_pandas()
```

```
    a    b    c
0   1  4.0  7.0
1   2  5.0  8.0
2   3  6.0  9.0
3  10  NaN  NaN
4  11  NaN  NaN
```

```python
# Upsert a subschema
upsert_data = pd.DataFrame({
    "a": [3, 10, 15],
    "b": [6, 7, 8],
})
table.merge_insert(on="a").when_matched_update_all().when_not_matched_insert_all().execute(upsert_data)
table.to_pandas()
```

```
    a    b    c
0   1  4.0  7.0
1   2  5.0  8.0
2   3  6.0  9.0
3  10  7.0  NaN
4  11  NaN  NaN
5  15  8.0  NaN
```
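To make the breaking change concrete, here is a minimal sketch of the new type inference for a "vector" field. It is based only on the behavior described above, not on verified output; the in-memory URI and table names are placeholders.

```python
import lancedb

db = lancedb.connect("memory://type-inference-demo")

# Integer values: per the note above, a "vector" field of Python ints is now
# inferred as a binary (uint8) vector.
int_table = db.create_table("int_vectors", [{"vector": [1, 2, 3, 4]}])
print(int_table.schema.field("vector").type)

# Float values keep the previous behavior and produce an f32 vector.
float_table = db.create_table("float_vectors", [{"vector": [1.0, 2.0, 3.0, 4.0]}])
print(float_table.schema.field("vector").type)
```

If existing code relied on integer lists becoming f32 vectors, convert the values to floats before calling `create_table` or `add`.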
```diff
@@ -242,8 +242,8 @@ def test_add_subschema(mem_db: DBConnection):
 
     data = {"price": 10.0, "item": "foo"}
     table.add([data])
-    data = {"price": 2.0, "vector": [3.1, 4.1]}
-    table.add([data])
+    data = pd.DataFrame({"price": [2.0], "vector": [[3.1, 4.1]]})
+    table.add(data)
     data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
     table.add([data])
 
@@ -259,7 +259,7 @@ def test_add_subschema(mem_db: DBConnection):
 
     data = {"item": "foo"}
     # We can't omit a column if it's not nullable
-    with pytest.raises(RuntimeError, match="Invalid user input"):
+    with pytest.raises(RuntimeError, match="Append with different schema"):
         table.add([data])
 
     # We can add it if we make the column nullable
@@ -292,6 +292,7 @@ def test_add_nullability(mem_db: DBConnection):
         ]
     )
     table = mem_db.create_table("test", schema=schema)
+    assert table.schema.field("vector").nullable is False
 
     nullable_schema = pa.schema(
         [
@@ -320,7 +321,10 @@ def test_add_nullability(mem_db: DBConnection):
         schema=nullable_schema,
     )
     # We can't add nullable schema if it contains nulls
-    with pytest.raises(Exception, match="Vector column vector has NaNs"):
+    with pytest.raises(
+        Exception,
+        match="Casting field 'vector' with null values to non-nullable",
+    ):
         table.add(data)
 
     # But we can make it nullable
@@ -776,6 +780,38 @@ def test_merge_insert(mem_db: DBConnection):
     assert table.to_arrow().sort_by("a") == expected
 
 
+# We vary the data format because there are slight differences in how
+# subschemas are handled in different formats
+@pytest.mark.parametrize(
+    "data_format",
+    [
+        lambda table: table,
+        lambda table: table.to_pandas(),
+        lambda table: table.to_pylist(),
+    ],
+    ids=["pa.Table", "pd.DataFrame", "rows"],
+)
+def test_merge_insert_subschema(mem_db: DBConnection, data_format):
+    initial_data = pa.table(
+        {"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]}
+    )
+    table = mem_db.create_table("my_table", data=initial_data)
+
+    new_data = pa.table({"id": [2, 3], "c": ["y", "y"]})
+    new_data = data_format(new_data)
+    (
+        table.merge_insert(on="id")
+        .when_matched_update_all()
+        .when_not_matched_insert_all()
+        .execute(new_data)
+    )
+
+    expected = pa.table(
+        {"id": [0, 1, 2, 3], "a": [1.0, 2.0, 3.0, None], "c": ["x", "x", "y", "y"]}
+    )
+    assert table.to_arrow().sort_by("id") == expected
+
+
 @pytest.mark.asyncio
 async def test_merge_insert_async(mem_db_async: AsyncConnection):
     data = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
```
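For reference, the non-nullable-column behavior exercised in the hunks above can be reproduced outside the test suite. This is a sketch under this PR's assumptions: the URI, table name, and schema are arbitrary, and the exact exception text is whatever the library raises (the updated test expects it to match "Append with different schema").

```python
import lancedb
import pyarrow as pa

db = lancedb.connect("memory://subschema-demo")
schema = pa.schema(
    [
        pa.field("item", pa.string(), nullable=True),
        pa.field("price", pa.float64(), nullable=False),
    ]
)
table = db.create_table("test", schema=schema)

# Omitting the nullable "item" column works; missing values are filled with nulls.
table.add([{"price": 10.0}])

# Omitting the non-nullable "price" column is rejected.
try:
    table.add([{"item": "foo"}])
except Exception as exc:
    print(exc)
```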