Files
lancedb/python/python/lancedb/arrow.py
Will Jones 0e486511fa feat: hook up new writer for insert (#3029)
This hooks up a new writer implementation for the `add()` method. The
main immediate benefit is it allows streaming requests to remote tables,
and at the same time allowing retries for most inputs.

In NodeJS, we always convert the data to `Vec<RecordBatch>`, so it's
always retry-able.

For Python, all are retry-able, except `Iterator` and
`pa.RecordBatchReader`, which can only be consumed once. Some, like
`pa.datasets.Dataset` are retry-able *and* streaming.

A lot of the changes here are to make the new DataFusion write pipeline
maintain the same behavior as the existing Python-based preprocessing,
such as:

* casting input data to target schema
* rejecting NaN values if `on_bad_vectors="error"`
* applying embedding functions.

In future PRs, we'll enhance these by moving the embedding calls into
DataFusion and making sure we parallelize them. See:
https://github.com/lancedb/lancedb/issues/3048

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 14:43:31 -08:00

114 lines
3.3 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from functools import singledispatch
from typing import List, Optional, Tuple, Union
from lancedb.pydantic import LanceModel, model_to_dict
import pyarrow as pa
from ._lancedb import RecordBatchStream
class AsyncRecordBatchReader:
"""
An async iterator over a stream of RecordBatches.
Also allows access to the schema of the stream
"""
def __init__(
self,
inner: Union[RecordBatchStream, pa.Table],
max_batch_length: Optional[int] = None,
):
"""
Attributes
----------
schema : pa.Schema
The schema of the batches produced by the stream.
Accessing the schema does not consume any data from the stream
"""
if isinstance(inner, pa.Table):
self._inner = self._async_iter_from_table(inner, max_batch_length)
self.schema: pa.Schema = inner.schema
elif isinstance(inner, RecordBatchStream):
self._inner = inner
self.schema: pa.Schema = inner.schema
else:
raise TypeError("inner must be a RecordBatchStream or a Table")
async def read_all(self) -> List[pa.RecordBatch]:
"""
Read all the record batches from the stream
This consumes the entire stream and returns a list of record batches
If there are a lot of results this may consume a lot of memory
"""
return [batch async for batch in self]
def __aiter__(self):
return self
async def __anext__(self) -> pa.RecordBatch:
return await self._inner.__anext__()
@staticmethod
async def _async_iter_from_table(
table: pa.Table, max_batch_length: Optional[int] = None
):
"""
Create an AsyncRecordBatchReader from a Table
This is useful when you have a Table that you want to iterate
over asynchronously
"""
batches = table.to_batches(max_chunksize=max_batch_length)
for batch in batches:
yield batch
def peek_reader(
reader: pa.RecordBatchReader,
) -> Tuple[pa.RecordBatch, pa.RecordBatchReader]:
if not isinstance(reader, pa.RecordBatchReader):
raise TypeError("reader must be a RecordBatchReader")
batch = reader.read_next_batch()
def all_batches():
yield batch
yield from reader
return batch, pa.RecordBatchReader.from_batches(batch.schema, all_batches())
@singledispatch
def to_arrow(data) -> pa.Table:
"""Convert a single data object to a pa.Table."""
raise NotImplementedError(f"to_arrow not implemented for type {type(data)}")
@to_arrow.register(pa.RecordBatch)
def _arrow_from_batch(data: pa.RecordBatch) -> pa.Table:
return pa.Table.from_batches([data])
@to_arrow.register(pa.Table)
def _arrow_from_table(data: pa.Table) -> pa.Table:
return data
@to_arrow.register(list)
def _arrow_from_list(data: list) -> pa.Table:
if not data:
raise ValueError("Cannot create table from empty list without a schema")
if isinstance(data[0], LanceModel):
schema = data[0].__class__.to_arrow_schema()
dicts = [model_to_dict(d) for d in data]
return pa.Table.from_pylist(dicts, schema=schema)
return pa.Table.from_pylist(data)