fix: handle empty list with schema in table creation (#2548)

## Summary
Fixes an IndexError raised when creating a table with empty list data
and a provided schema. Previously, `_into_pyarrow_reader()` would attempt
to access `data[0]` on empty lists, causing an IndexError. It now
handles empty lists by using the provided schema.

Also adds regression tests for GitHub issues #1968 and #303 to guard
against future breakage in empty-table scenarios.

## Changes
- Fix IndexError in `_into_pyarrow_reader()` for empty list + schema
case
- Add Optional[pa.Schema] parameter to handle empty data gracefully  
- Add `test_create_table_empty_list_with_schema` for the IndexError fix
- Add `test_create_empty_then_add_data` for issue #1968
- Add `test_search_empty_table` for issue #303

## Test plan
- [x] All new regression tests pass
- [x] Existing tests continue to pass
- [x] Code formatted with `make format`
This commit is contained in:
Tristan Zajonc
2025-07-24 19:23:43 -07:00
committed by GitHub
parent 050f0086b8
commit 055bf91d3e
3 changed files with 69 additions and 2 deletions

View File

@@ -1804,3 +1804,45 @@ def test_stats(mem_db: DBConnection):
},
},
}
def test_create_table_empty_list_with_schema(mem_db: DBConnection):
    """Create a table from ``data=[]`` plus an explicit schema.

    Regression test for ``IndexError: list index out of range`` raised by
    ``create_table(name, data=[], schema=schema)``. The resulting table must
    be empty and must report exactly the schema that was passed in.
    """
    vector_field = pa.field("vector", pa.list_(pa.float32(), 2))
    id_field = pa.field("id", pa.int64())
    expected_schema = pa.schema([vector_field, id_field])

    empty_table = mem_db.create_table(
        "test_empty_list", data=[], schema=expected_schema
    )

    assert empty_table.count_rows() == 0
    assert empty_table.schema == expected_schema
def test_create_table_empty_list_no_schema_error(mem_db: DBConnection):
    """Expect a ValueError when ``data=[]`` is given without any schema."""
    # Pattern that pytest.raises matches against the raised error's message.
    expected_message = "Cannot create table from empty list without a schema"

    with pytest.raises(ValueError, match=expected_message):
        mem_db.create_table("test_empty_no_schema", data=[])
def test_add_table_with_empty_embeddings(tmp_path):
    """Add a row to a table that was created from a schema alone.

    Regression test for issue #1968:
    https://github.com/lancedb/lancedb/issues/1968

    The table is created with no initial data, one row is added with
    ``on_bad_vectors="drop"``, and that row must be present afterwards.
    """
    connection = lancedb.connect(tmp_path)

    class MySchema(LanceModel):
        text: str
        embedding: Vector(16)

    # Schema-only creation: the table starts out with no rows.
    schema_only_table = connection.create_table("test", schema=MySchema)

    rows = [{"text": "bar", "embedding": [0.1] * 16}]
    schema_only_table.add(rows, on_bad_vectors="drop")

    assert schema_only_table.count_rows() == 1