From 055bf91d3ebae12d7cd182dd1c3343e7ff16706b Mon Sep 17 00:00:00 2001 From: Tristan Zajonc Date: Thu, 24 Jul 2025 19:23:43 -0700 Subject: [PATCH] fix: handle empty list with schema in table creation (#2548) ## Summary Fixes IndexError when creating tables with empty list data and a provided schema. Previously, `_into_pyarrow_reader()` would attempt to access `data[0]` on empty lists, causing an IndexError. Now properly handles empty lists by using the provided schema. Also adds regression tests for GitHub issues #1968 and #303 to prevent future regressions with empty table scenarios. ## Changes - Fix IndexError in `_into_pyarrow_reader()` for empty list + schema case - Add Optional[pa.Schema] parameter to handle empty data gracefully - Add `test_create_table_empty_list_with_schema` for the IndexError fix - Add `test_add_table_with_empty_embeddings` for issue #1968 - Add `test_search_empty_table` for issue #303 ## Test plan - [x] All new regression tests pass - [x] Existing tests continue to pass - [x] Code formatted with `make format` --- python/python/lancedb/table.py | 12 +++++++-- python/python/tests/test_query.py | 17 +++++++++++++ python/python/tests/test_table.py | 42 +++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index e47b63e6..c60582cb 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -102,7 +102,9 @@ if TYPE_CHECKING: ) -def _into_pyarrow_reader(data) -> pa.RecordBatchReader: +def _into_pyarrow_reader( + data, schema: Optional[pa.Schema] = None +) -> pa.RecordBatchReader: from lancedb.dependencies import datasets if _check_for_hugging_face(data): @@ -123,6 +125,12 @@ def _into_pyarrow_reader(data) -> pa.RecordBatchReader: raise ValueError("Cannot add a single dictionary to a table. 
Use a list.") if isinstance(data, list): + # Handle empty list case + if not data: + if schema is None: + raise ValueError("Cannot create table from empty list without a schema") + return pa.Table.from_pylist(data, schema=schema).to_reader() + # convert to list of dict if data is a bunch of LanceModels if isinstance(data[0], LanceModel): schema = data[0].__class__.to_arrow_schema() @@ -236,7 +244,7 @@ def _sanitize_data( # 1. There might be embedding columns missing that will be added # in the add_embeddings step. # 2. If `allow_subschemas` is True, there might be columns missing. - reader = _into_pyarrow_reader(data) + reader = _into_pyarrow_reader(data, target_schema) reader = _append_vector_columns(reader, target_schema, metadata=metadata) diff --git a/python/python/tests/test_query.py b/python/python/tests/test_query.py index 72a504c3..56e987da 100644 --- a/python/python/tests/test_query.py +++ b/python/python/tests/test_query.py @@ -1339,3 +1339,20 @@ async def test_query_timeout_async(tmp_path): .nearest_to([0.0, 0.0]) .to_list(timeout=timedelta(0)) ) + + +def test_search_empty_table(mem_db): + """Test searching on empty table should not crash + + Regression test for issue #303: + https://github.com/lancedb/lancedb/issues/303 + Searching on empty table produces scary error message + """ + schema = pa.schema( + [pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("id", pa.int64())] + ) + table = mem_db.create_table("test_empty_search", schema=schema) + + # Search on empty table should return empty results, not crash + results = table.search([1.0, 2.0]).limit(5).to_list() + assert results == [] diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index 60751e64..1e88d794 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -1804,3 +1804,45 @@ def test_stats(mem_db: DBConnection): }, }, } + + +def test_create_table_empty_list_with_schema(mem_db: DBConnection): + """Test creating table with 
empty list data and schema + + Regression test for IndexError: list index out of range + when calling create_table(name, data=[], schema=schema) + """ + schema = pa.schema( + [pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("id", pa.int64())] + ) + table = mem_db.create_table("test_empty_list", data=[], schema=schema) + assert table.count_rows() == 0 + assert table.schema == schema + + +def test_create_table_empty_list_no_schema_error(mem_db: DBConnection): + """Test that creating table with empty list and no schema raises error""" + with pytest.raises( + ValueError, match="Cannot create table from empty list without a schema" + ): + mem_db.create_table("test_empty_no_schema", data=[]) + + +def test_add_table_with_empty_embeddings(tmp_path): + """Test exact scenario from issue #1968 + + Regression test for issue #1968: + https://github.com/lancedb/lancedb/issues/1968 + """ + db = lancedb.connect(tmp_path) + + class MySchema(LanceModel): + text: str + embedding: Vector(16) + + table = db.create_table("test", schema=MySchema) + table.add( + [{"text": "bar", "embedding": [0.1] * 16}], + on_bad_vectors="drop", + ) + assert table.count_rows() == 1