feat(python)!: support inserting and upserting subschemas (#1965)

BREAKING CHANGE: For a field "vector", a list of integers will now be converted to binary (uint8) vectors instead of f32 vectors. Use float values instead for f32 vectors.

* Adds proper support for inserting and upserting subsets of the full schema. I thought I had previously implemented this in #1827, but it turns out I had not tested carefully enough.
* Refactors `_sanitize_data` and other utility functions to be simpler and not require `numpy` or `combine_chunks()`.
* Adds a new suite of unit tests to validate sanitization utilities.

## Examples

```python
import pandas as pd
import lancedb

db = lancedb.connect("memory://demo")
initial_data = pd.DataFrame({
    "a": [1, 2, 3],
    "b": [4, 5, 6],
    "c": [7, 8, 9]
})
table = db.create_table("demo", initial_data)

# Insert a subschema
new_data = pd.DataFrame({"a": [10, 11]})
table.add(new_data)
table.to_pandas()
```

```
    a    b    c
0   1  4.0  7.0
1   2  5.0  8.0
2   3  6.0  9.0
3  10  NaN  NaN
4  11  NaN  NaN
```

```python
# Upsert a subschema
upsert_data = pd.DataFrame({
    "a": [3, 10, 15],
    "b": [6, 7, 8],
})
table.merge_insert(on="a").when_matched_update_all().when_not_matched_insert_all().execute(upsert_data)
table.to_pandas()
```

```
    a    b    c
0   1  4.0  7.0
1   2  5.0  8.0
2   3  6.0  9.0
3  10  7.0  NaN
4  11  NaN  NaN
5  15  8.0  NaN
```
2025-12-26 22:59:57 +00:00 · 2025-01-08 10:11:10 -08:00
parent 3c0a64be8f
commit c557e77f09
10 changed files with 874 additions and 292 deletions
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -242,8 +242,8 @@ def test_add_subschema(mem_db: DBConnection):

    data = {"price": 10.0, "item": "foo"}
    table.add([data])
-    data = {"price": 2.0, "vector": [3.1, 4.1]}
-    table.add([data])
+    data = pd.DataFrame({"price": [2.0], "vector": [[3.1, 4.1]]})
+    table.add(data)
    data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
    table.add([data])

@@ -259,7 +259,7 @@ def test_add_subschema(mem_db: DBConnection):

    data = {"item": "foo"}
    # We can't omit a column if it's not nullable
-    with pytest.raises(RuntimeError, match="Invalid user input"):
+    with pytest.raises(RuntimeError, match="Append with different schema"):
        table.add([data])

    # We can add it if we make the column nullable
@@ -292,6 +292,7 @@ def test_add_nullability(mem_db: DBConnection):
        ]
    )
    table = mem_db.create_table("test", schema=schema)
+    assert table.schema.field("vector").nullable is False

    nullable_schema = pa.schema(
        [
@@ -320,7 +321,10 @@ def test_add_nullability(mem_db: DBConnection):
        schema=nullable_schema,
    )
    # We can't add nullable schema if it contains nulls
-    with pytest.raises(Exception, match="Vector column vector has NaNs"):
+    with pytest.raises(
+        Exception,
+        match="Casting field 'vector' with null values to non-nullable",
+    ):
        table.add(data)

    # But we can make it nullable
@@ -776,6 +780,38 @@ def test_merge_insert(mem_db: DBConnection):
    assert table.to_arrow().sort_by("a") == expected


+# We vary the data format because there are slight differences in how
+# subschemas are handled in different formats
+@pytest.mark.parametrize(
+    "data_format",
+    [
+        lambda table: table,
+        lambda table: table.to_pandas(),
+        lambda table: table.to_pylist(),
+    ],
+    ids=["pa.Table", "pd.DataFrame", "rows"],
+)
+def test_merge_insert_subschema(mem_db: DBConnection, data_format):
+    initial_data = pa.table(
+        {"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]}
+    )
+    table = mem_db.create_table("my_table", data=initial_data)
+
+    new_data = pa.table({"id": [2, 3], "c": ["y", "y"]})
+    new_data = data_format(new_data)
+    (
+        table.merge_insert(on="id")
+        .when_matched_update_all()
+        .when_not_matched_insert_all()
+        .execute(new_data)
+    )
+
+    expected = pa.table(
+        {"id": [0, 1, 2, 3], "a": [1.0, 2.0, 3.0, None], "c": ["x", "x", "y", "y"]}
+    )
+    assert table.to_arrow().sort_by("id") == expected
+
+
@pytest.mark.asyncio
 async def test_merge_insert_async(mem_db_async: AsyncConnection):
    data = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})