feat: hook up new writer for insert (#3029)

This hooks up a new writer implementation for the `add()` method. The
main immediate benefit is it allows streaming requests to remote tables,
and at the same time allowing retries for most inputs.

In NodeJS, we always convert the data to `Vec<RecordBatch>`, so it's
always retry-able.

For Python, all are retry-able, except `Iterator` and
`pa.RecordBatchReader`, which can only be consumed once. Some, like
`pa.datasets.Dataset` are retry-able *and* streaming.

A lot of the changes here are to make the new DataFusion write pipeline
maintain the same behavior as the existing Python-based preprocessing,
such as:

* casting input data to target schema
* rejecting NaN values if `on_bad_vectors="error"`
* applying embedding functions.

In future PRs, we'll enhance these by moving the embedding calls into
DataFusion and making sure we parallelize them. See:
https://github.com/lancedb/lancedb/issues/3048

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Will Jones
2026-02-23 14:43:31 -08:00
committed by GitHub
parent 367262662d
commit 0e486511fa
20 changed files with 2446 additions and 359 deletions

View File

@@ -810,7 +810,7 @@ def test_create_index_name_and_train_parameters(
)
def test_add_with_nans(mem_db: DBConnection):
def test_create_with_nans(mem_db: DBConnection):
# by default we raise an error on bad input vectors
bad_data = [
{"vector": [np.nan], "item": "bar", "price": 20.0},
@@ -854,6 +854,57 @@ def test_add_with_nans(mem_db: DBConnection):
assert np.allclose(v, np.array([0.0, 0.0]))
def test_add_with_nans(mem_db: DBConnection):
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
pa.field("item", pa.string(), nullable=True),
pa.field("price", pa.float64(), nullable=False),
],
)
table = mem_db.create_table("test", schema=schema)
# by default we raise an error on bad input vectors
bad_data = [
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [5], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
{"vector": [np.nan, 5.0], "item": "bar", "price": 20.0},
]
for row in bad_data:
with pytest.raises(ValueError):
table.add(
data=[row],
)
table.add(
[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [2.1, 4.1], "item": "foo", "price": 9.0},
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [5], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
],
on_bad_vectors="drop",
)
assert len(table) == 2
table.delete("true")
# We can fill bad input with some value
table.add(
data=[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
],
on_bad_vectors="fill",
fill_value=0.0,
)
assert len(table) == 3
arrow_tbl = table.search().where("item == 'bar'").to_arrow()
v = arrow_tbl["vector"].to_pylist()[0]
assert np.allclose(v, np.array([0.0, 0.0]))
def test_restore(mem_db: DBConnection):
table = mem_db.create_table(
"my_table",