mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-24 07:20:40 +00:00
feat: flexible null handling and insert subschemas in Python (#1827)
* Test that we can insert subschemas (i.e. omit nullable columns) in Python.
  * More work is needed to support this in Node. See: https://github.com/lancedb/lancedb/issues/1832
* Test that we can insert data whose schema is nullable into a table whose schema is non-nullable, as long as the data contains no nulls.
* Add a `"null"` option for `on_bad_vectors`, which fills bad vectors with null.
* Null values are no longer considered bad if the field itself is nullable.
This commit is contained in:
@@ -240,6 +240,121 @@ def test_add(db):
|
||||
_add(table, schema)
|
||||
|
||||
|
||||
def test_add_subschema(tmp_path):
    """Check that rows may leave out nullable columns when added to a table.

    Rows missing a nullable column are expected to read back with null in
    that column. Leaving out the non-nullable ``price`` column is expected
    to fail until the column is altered to be nullable.
    """

    def schema_with_price(*, nullable):
        # The two schemas used below differ only in the nullability of "price".
        return pa.schema(
            [
                pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
                pa.field("item", pa.string(), nullable=True),
                pa.field("price", pa.float64(), nullable=nullable),
            ]
        )

    db = lancedb.connect(tmp_path)
    schema = schema_with_price(nullable=False)
    table = db.create_table("test", schema=schema)

    # Each row is added in its own call and omits a different subset of the
    # nullable columns.
    partial_rows = [
        {"price": 10.0, "item": "foo"},
        {"price": 2.0, "vector": [3.1, 4.1]},
        {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"},
    ]
    for row in partial_rows:
        table.add([row])

    after_partial_adds = pa.table(
        {
            "vector": [None, [3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", None, "bar"],
            "price": [10.0, 2.0, 3.0],
        },
        schema=schema,
    )
    assert table.to_arrow() == after_partial_adds

    row_without_price = {"item": "foo"}
    # Omitting a non-nullable column is rejected.
    with pytest.raises(OSError, match="Invalid user input"):
        table.add([row_without_price])

    # Once the column is nullable, the very same row is accepted.
    table.alter_columns({"path": "price", "nullable": True})
    table.add([row_without_price])

    after_alter = pa.table(
        {
            "vector": [None, [3.1, 4.1], [5.9, 26.5], None],
            "item": ["foo", None, "bar", "foo"],
            "price": [10.0, 2.0, 3.0, None],
        },
        schema=schema_with_price(nullable=True),
    )
    assert table.to_arrow() == after_alter
|
||||
|
||||
|
||||
def test_add_nullability(tmp_path):
    """Check how nullable input data interacts with a non-nullable table schema.

    Data declared with a nullable schema is expected to be accepted when it
    holds no nulls. A null vector is expected to be rejected until the
    ``vector`` column is altered to be nullable.
    """

    def make_schema(*, vector_nullable, id_nullable):
        # Every schema in this test has the same two columns; only the
        # nullability flags vary.
        return pa.schema(
            [
                pa.field(
                    "vector", pa.list_(pa.float32(), 2), nullable=vector_nullable
                ),
                pa.field("id", pa.string(), nullable=id_nullable),
            ]
        )

    db = lancedb.connect(tmp_path)
    schema = make_schema(vector_nullable=False, id_nullable=False)
    table = db.create_table("test", schema=schema)

    nullable_schema = make_schema(vector_nullable=True, id_nullable=True)

    # A nullable schema is fine as long as the values contain no nulls.
    rows_without_nulls = pa.table(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "id": ["foo", "bar"],
        },
        schema=nullable_schema,
    )
    table.add(rows_without_nulls)
    assert table.to_arrow() == rows_without_nulls.cast(schema)

    # An actual null in the non-nullable vector column is rejected.
    row_with_null_vector = pa.table(
        {
            "vector": [None],
            "id": ["baz"],
        },
        schema=nullable_schema,
    )
    with pytest.raises(Exception, match="Vector column vector has NaNs"):
        table.add(row_with_null_vector)

    # After making the column nullable, the same data is accepted.
    table.alter_columns({"path": "vector", "nullable": True})
    table.add(row_with_null_vector)

    # Only "vector" was altered, so "id" is still expected to be non-nullable.
    expected = pa.table(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5], None],
            "id": ["foo", "bar", "baz"],
        },
        schema=make_schema(vector_nullable=True, id_nullable=False),
    )
    assert table.to_arrow() == expected
|
||||
|
||||
|
||||
def test_add_pydantic_model(db):
|
||||
# https://github.com/lancedb/lancedb/issues/562
|
||||
|
||||
|
||||
Reference in New Issue
Block a user