fix: handle empty list with schema in table creation (#2548)

## Summary
Fixes an IndexError when creating a table with empty list data and a
provided schema. Previously, `_into_pyarrow_reader()` would attempt to
access `data[0]` on an empty list, causing an IndexError. It now builds an
empty table from the provided schema instead, and raises a ValueError if
no schema was given.

Also adds regression tests for GitHub issues #1968 and #303 to prevent
future regressions with empty table scenarios.

## Changes
- Fix IndexError in `_into_pyarrow_reader()` for empty list + schema
case
- Add an optional `schema: Optional[pa.Schema] = None` parameter to `_into_pyarrow_reader()` to handle empty data gracefully
- Add `test_create_table_empty_list_with_schema` for the IndexError fix
- Add `test_create_table_empty_list_no_schema_error` for the new ValueError
- Add `test_add_table_with_empty_embeddings` for issue #1968
- Add `test_search_empty_table` for issue #303

## Test plan
- [x] All new regression tests pass
- [x] Existing tests continue to pass
- [x] Code formatted with `make format`
This commit is contained in:
Tristan Zajonc
2025-07-24 19:23:43 -07:00
committed by GitHub
parent 050f0086b8
commit 055bf91d3e
3 changed files with 69 additions and 2 deletions

View File

@@ -102,7 +102,9 @@ if TYPE_CHECKING:
)
def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
def _into_pyarrow_reader(
data, schema: Optional[pa.Schema] = None
) -> pa.RecordBatchReader:
from lancedb.dependencies import datasets
if _check_for_hugging_face(data):
@@ -123,6 +125,12 @@ def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
raise ValueError("Cannot add a single dictionary to a table. Use a list.")
if isinstance(data, list):
# Handle empty list case
if not data:
if schema is None:
raise ValueError("Cannot create table from empty list without a schema")
return pa.Table.from_pylist(data, schema=schema).to_reader()
# convert to list of dict if data is a bunch of LanceModels
if isinstance(data[0], LanceModel):
schema = data[0].__class__.to_arrow_schema()
@@ -236,7 +244,7 @@ def _sanitize_data(
# 1. There might be embedding columns missing that will be added
# in the add_embeddings step.
# 2. If `allow_subschemas` is True, there might be columns missing.
reader = _into_pyarrow_reader(data)
reader = _into_pyarrow_reader(data, target_schema)
reader = _append_vector_columns(reader, target_schema, metadata=metadata)

View File

@@ -1339,3 +1339,20 @@ async def test_query_timeout_async(tmp_path):
.nearest_to([0.0, 0.0])
.to_list(timeout=timedelta(0))
)
def test_search_empty_table(mem_db):
    """Vector search against a table with no rows yields an empty list.

    Regression test for issue #303
    (https://github.com/lancedb/lancedb/issues/303), where searching an
    empty table produced a scary error message.
    """
    fields = [
        pa.field("vector", pa.list_(pa.float32(), 2)),
        pa.field("id", pa.int64()),
    ]
    empty_table = mem_db.create_table(
        "test_empty_search", schema=pa.schema(fields)
    )

    # No rows were ever inserted, so the query must come back empty
    # rather than raising.
    query = empty_table.search([1.0, 2.0]).limit(5)
    assert query.to_list() == []

View File

@@ -1804,3 +1804,45 @@ def test_stats(mem_db: DBConnection):
},
},
}
def test_create_table_empty_list_with_schema(mem_db: DBConnection):
    """An empty ``data`` list plus an explicit schema creates an empty table.

    Regression test for the ``IndexError: list index out of range`` raised
    when calling ``create_table(name, data=[], schema=schema)``.
    """
    vector_field = pa.field("vector", pa.list_(pa.float32(), 2))
    id_field = pa.field("id", pa.int64())
    expected_schema = pa.schema([vector_field, id_field])

    created = mem_db.create_table(
        "test_empty_list", data=[], schema=expected_schema
    )

    # The table must exist, hold no rows, and carry the schema unchanged.
    assert created.count_rows() == 0
    assert created.schema == expected_schema
def test_create_table_empty_list_no_schema_error(mem_db: DBConnection):
    """An empty ``data`` list without a schema is rejected with ValueError."""
    expected_message = "Cannot create table from empty list without a schema"
    with pytest.raises(ValueError, match=expected_message):
        mem_db.create_table("test_empty_no_schema", data=[])
def test_add_table_with_empty_embeddings(tmp_path):
    """Reproduce the exact scenario reported in issue #1968.

    Regression test for https://github.com/lancedb/lancedb/issues/1968:
    create a table from a ``LanceModel`` schema only, then add one row
    with ``on_bad_vectors="drop"`` and check that the row is stored.
    """

    class MySchema(LanceModel):
        text: str
        embedding: Vector(16)

    connection = lancedb.connect(tmp_path)
    table = connection.create_table("test", schema=MySchema)

    rows = [{"text": "bar", "embedding": [0.1] * 16}]
    table.add(rows, on_bad_vectors="drop")

    assert table.count_rows() == 1