mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-24 15:30:38 +00:00
feat: hook up new writer for insert (#3029)
This hooks up a new writer implementation for the `add()` method. The main immediate benefit is it allows streaming requests to remote tables, and at the same time allowing retries for most inputs. In NodeJS, we always convert the data to `Vec<RecordBatch>`, so it's always retry-able. For Python, all are retry-able, except `Iterator` and `pa.RecordBatchReader`, which can only be consumed once. Some, like `pa.datasets.Dataset` are retry-able *and* streaming. A lot of the changes here are to make the new DataFusion write pipeline maintain the same behavior as the existing Python-based preprocessing, such as: * casting input data to target schema * rejecting NaN values if `on_bad_vectors="error"` * applying embedding functions. In future PRs, we'll enhance these by moving the embedding calls into DataFusion and making sure we parallelize them. See: https://github.com/lancedb/lancedb/issues/3048 --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -810,7 +810,7 @@ def test_create_index_name_and_train_parameters(
|
||||
)
|
||||
|
||||
|
||||
def test_add_with_nans(mem_db: DBConnection):
|
||||
def test_create_with_nans(mem_db: DBConnection):
|
||||
# by default we raise an error on bad input vectors
|
||||
bad_data = [
|
||||
{"vector": [np.nan], "item": "bar", "price": 20.0},
|
||||
@@ -854,6 +854,57 @@ def test_add_with_nans(mem_db: DBConnection):
|
||||
assert np.allclose(v, np.array([0.0, 0.0]))
|
||||
|
||||
|
||||
def test_add_with_nans(mem_db: DBConnection):
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||
pa.field("item", pa.string(), nullable=True),
|
||||
pa.field("price", pa.float64(), nullable=False),
|
||||
],
|
||||
)
|
||||
table = mem_db.create_table("test", schema=schema)
|
||||
# by default we raise an error on bad input vectors
|
||||
bad_data = [
|
||||
{"vector": [np.nan], "item": "bar", "price": 20.0},
|
||||
{"vector": [5], "item": "bar", "price": 20.0},
|
||||
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
|
||||
{"vector": [np.nan, 5.0], "item": "bar", "price": 20.0},
|
||||
]
|
||||
for row in bad_data:
|
||||
with pytest.raises(ValueError):
|
||||
table.add(
|
||||
data=[row],
|
||||
)
|
||||
|
||||
table.add(
|
||||
[
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [2.1, 4.1], "item": "foo", "price": 9.0},
|
||||
{"vector": [np.nan], "item": "bar", "price": 20.0},
|
||||
{"vector": [5], "item": "bar", "price": 20.0},
|
||||
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
|
||||
],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
assert len(table) == 2
|
||||
table.delete("true")
|
||||
|
||||
# We can fill bad input with some value
|
||||
table.add(
|
||||
data=[
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [np.nan], "item": "bar", "price": 20.0},
|
||||
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
|
||||
],
|
||||
on_bad_vectors="fill",
|
||||
fill_value=0.0,
|
||||
)
|
||||
assert len(table) == 3
|
||||
arrow_tbl = table.search().where("item == 'bar'").to_arrow()
|
||||
v = arrow_tbl["vector"].to_pylist()[0]
|
||||
assert np.allclose(v, np.array([0.0, 0.0]))
|
||||
|
||||
|
||||
def test_restore(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"my_table",
|
||||
|
||||
Reference in New Issue
Block a user