Set default to error instead of drop (#259)

when encountering bad input data, we can default to principle of least
surprise and raise an exception.

Co-authored-by: Chang She <chang@lancedb.com>
This commit is contained in:
Chang She
2023-07-05 22:44:18 -07:00
committed by GitHub
parent bb3df62dce
commit 507eeae9c8
6 changed files with 40 additions and 43 deletions

View File

@@ -181,7 +181,21 @@ def test_create_index_method():
def test_add_with_nans(db):
# By default we drop bad input vectors
# by default we raise an error on bad input vectors
bad_data = [
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [5], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
{"vector": [np.nan, 5.0], "item": "bar", "price": 20.0},
]
for row in bad_data:
with pytest.raises(ValueError):
LanceTable.create(
db,
"error_test",
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, row],
)
table = LanceTable.create(
db,
"drop_test",
@@ -191,6 +205,7 @@ def test_add_with_nans(db):
{"vector": [5], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
],
on_bad_vectors="drop",
)
assert len(table) == 1
@@ -210,18 +225,3 @@ def test_add_with_nans(db):
arrow_tbl = table.to_lance().to_table(filter="item == 'bar'")
v = arrow_tbl["vector"].to_pylist()[0]
assert np.allclose(v, np.array([0.0, 0.0]))
bad_data = [
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [5], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
{"vector": [np.nan, 5.0], "item": "bar", "price": 20.0},
]
for row in bad_data:
with pytest.raises(ValueError):
LanceTable.create(
db,
"raise_test",
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, row],
on_bad_vectors="raise",
)