feat: infer vector type to float32 if integers are out of uint8 range (#2856)

## Summary
- infer integer vector columns as float32 when any value exceeds uint8
range or is negative
- keep uint8 for integer vectors whose values are all within range or that contain only nulls
- add sync/async tests covering large integer vector inference

## Testing
- ./.venv/bin/pytest python/python/tests/test_table.py -k "large_int_vectors"
This commit is contained in:
BubbleCal
2025-12-08 17:10:25 +08:00
committed by GitHub
parent 0960e19559
commit 39a18baf59
2 changed files with 54 additions and 1 deletions

View File

@@ -46,6 +46,39 @@ def test_basic(mem_db: DBConnection):
assert table.to_arrow() == expected_data
def test_create_table_infers_large_int_vectors(mem_db: DBConnection):
    """Check that an integer vector holding a value above 255 is stored as float32.

    The input row contains 300, which does not fit in uint8. Both the table
    schema and the materialized Arrow column are expected to report
    ``pa.list_(pa.float32(), 2)``, and the values must read back as floats.
    """
    expected_type = pa.list_(pa.float32(), 2)
    rows = [{"vector": [0, 300]}]

    table = mem_db.create_table(
        "int_vector_overflow",
        data=rows,
        mode="overwrite",
        exist_ok=True,
    )

    # The declared schema must already carry the float32 element type.
    assert table.schema.field("vector").type == expected_type

    # The stored data must agree with the schema and keep the original values.
    stored = table.to_arrow().column("vector")
    assert stored.type == expected_type
    assert stored.to_pylist() == [[0.0, 300.0]]
@pytest.mark.asyncio
async def test_create_table_async_infers_large_int_vectors(
    mem_db_async: AsyncConnection,
):
    """Check the async path stores an out-of-uint8-range integer vector as float32.

    Every value in the input row (256, 257) is above 255. The awaited schema
    and the awaited Arrow column are both expected to report
    ``pa.list_(pa.float32(), 2)``, and the values must read back as floats.
    """
    expected_type = pa.list_(pa.float32(), 2)
    rows = [{"vector": [256, 257]}]

    table = await mem_db_async.create_table(
        "int_vector_overflow_async",
        data=rows,
        mode="overwrite",
        exist_ok=True,
    )

    # On the async table, schema() is awaited before the field is inspected.
    schema = await table.schema()
    assert schema.field("vector").type == expected_type

    # to_arrow() is likewise awaited before the column is pulled out.
    arrow_table = await table.to_arrow()
    stored = arrow_table.column("vector")
    assert stored.type == expected_type
    assert stored.to_pylist() == [[256.0, 257.0]]
def test_input_data_type(mem_db: DBConnection, tmp_path):
schema = pa.schema(
{