feat: infer vector columns when name contains 'vector' or 'embedding' (#2547)

## Summary

- Enhanced vector column detection to use substring matching instead of
exact matching
- Now detects columns whose names contain "vector" or "embedding"
(case-insensitive); a short sketch follows this list
- Added integer vector support to the Node.js implementation (matching
Python)
- Comprehensive test coverage for both float and integer vector types
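
The name check itself is a simple case-insensitive substring rule. A minimal TypeScript sketch, using the helper name the PR describes for `arrow.ts` (the Python side uses the equivalent `_name_suggests_vector_column()`); treat it as an illustration, not the exact implementation:

```typescript
// Sketch of the case-insensitive substring rule described in this PR; the
// actual helper in arrow.ts may differ in detail.
function nameSuggestsVectorColumn(name: string): boolean {
  const lower = name.toLowerCase();
  return lower.includes("vector") || lower.includes("embedding");
}

nameSuggestsVectorColumn("text_embedding"); // true
nameSuggestsVectorColumn("VECTOR_COL");     // true
nameSuggestsVectorColumn("normal_list");    // false
```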

## Changes

### Python (`python/python/lancedb/table.py`)
- Updated `_infer_target_schema()` to use substring matching via the new
`_name_suggests_vector_column()` helper
- Preserved original field names instead of renaming matched columns to
"vector"
- Consolidated the duplicated float/integer branches into a single code path
for better maintainability

### Node.js (`nodejs/lancedb/arrow.ts`)
- Enhanced type inference with the `nameSuggestsVectorColumn()` helper
function
- Added an `isAllIntegers()` helper with a performance optimization (it
checks only the first 10 elements); see the sketch after this list
- Implemented integer vector support using the `Uint8` type (matching
Python)
- Improved type safety by removing `any` usage
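
A hedged sketch of the two Node.js helpers described above, using apache-arrow types. The names follow the PR description, but the exact implementation in `arrow.ts` may differ:

```typescript
import { Field, FixedSizeList, Float32, Uint8 } from "apache-arrow";

// Sketch: sample only a small prefix of the values (the PR checks the first
// 10 elements) instead of scanning the whole array.
function isAllIntegers(values: number[]): boolean {
  return values.slice(0, 10).every((v) => Number.isInteger(v));
}

// Hypothetical illustration of how the target type could be chosen:
// integer vectors map to FixedSizeList<Uint8> (matching Python's uint8),
// everything else to FixedSizeList<Float32>.
function targetVectorType(values: number[], dim: number): FixedSizeList {
  const child = isAllIntegers(values)
    ? new Field("item", new Uint8(), true)
    : new Field("item", new Float32(), true);
  return new FixedSizeList(dim, child);
}
```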

### Tests
- **Python**: Added
`test_infer_target_schema_with_vector_embedding_names()` in
`test_util.py`
- **Node.js**: Added comprehensive test case in `arrow.test.ts`
- Both test suites cover various naming patterns and integer/float
vector types

## Examples of newly supported column names:
- `user_vector`, `text_embedding`, `doc_embeddings`
- `my_vector_field`, `embedding_model`
- `VECTOR_COL`, `Vector_Mixed` (case-insensitive)
- Both float and integer arrays are converted to fixed-size lists, as in the
usage sketch below
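
For illustration, a hypothetical end-to-end check with the Node.js binding. This assumes `makeArrowTable` applies the name-based inference to plain records; the exact defaults may differ:

```typescript
import { makeArrowTable } from "@lancedb/lancedb";

// Records with a list column whose name contains "embedding".
const table = makeArrowTable([
  { id: 1, text_embedding: [0.1, 0.2, 0.3, 0.4] },
  { id: 2, text_embedding: [0.5, 0.6, 0.7, 0.8] },
]);

// Per this PR, `text_embedding` should come out as a fixed-size list of
// Float32 rather than a plain variable-length list.
for (const field of table.schema.fields) {
  console.log(field.name, String(field.type));
}
```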

## Test plan
- [x] All existing tests pass (backward compatibility maintained)
- [x] New tests pass for both Python and Node.js implementations
- [x] Integer vector detection works correctly in Node.js
- [x] Code passes linting and formatting checks
- [x] Performance optimized for large vector arrays

Fixes #2546

🤖 Generated with [Claude Code](https://claude.ai/code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
Author: Will Jones
Date: 2025-08-04 15:36:49 -07:00
Committed by: GitHub
Parent: 0a1ea1858d
Commit: 9d683e4f0b
4 changed files with 224 additions and 27 deletions

python/python/lancedb/table.py

@@ -2926,6 +2926,12 @@ def has_nan_values(arr: Union[pa.ListArray, pa.ChunkedArray]) -> pa.BooleanArray
     return pc.is_in(indices, has_nan_indices)
+
+def _name_suggests_vector_column(field_name: str) -> bool:
+    """Check if a field name indicates a vector column."""
+    name_lower = field_name.lower()
+    return "vector" in name_lower or "embedding" in name_lower
+
 def _infer_target_schema(
     reader: pa.RecordBatchReader,
 ) -> Tuple[pa.Schema, pa.RecordBatchReader]:
@@ -2933,35 +2939,27 @@ def _infer_target_schema(
     peeked = None
     for i, field in enumerate(schema):
-        if (
-            field.name == VECTOR_COLUMN_NAME
-            and (pa.types.is_list(field.type) or pa.types.is_large_list(field.type))
-            and pa.types.is_floating(field.type.value_type)
-        ):
+        is_list_type = pa.types.is_list(field.type) or pa.types.is_large_list(
+            field.type
+        )
+        if _name_suggests_vector_column(field.name) and is_list_type:
             if peeked is None:
                 peeked, reader = peek_reader(reader)
             # Use the most common length of the list as the dimensions
             dim = _modal_list_size(peeked.column(i))
-            new_field = pa.field(
-                VECTOR_COLUMN_NAME,
-                pa.list_(pa.float32(), dim),
-                nullable=field.nullable,
-            )
-            schema = schema.set(i, new_field)
-        elif (
-            field.name == VECTOR_COLUMN_NAME
-            and (pa.types.is_list(field.type) or pa.types.is_large_list(field.type))
-            and pa.types.is_integer(field.type.value_type)
-        ):
-            if peeked is None:
-                peeked, reader = peek_reader(reader)
-            # Use the most common length of the list as the dimensions
-            dim = _modal_list_size(peeked.column(i))
+            # Determine target type based on value type
+            if pa.types.is_floating(field.type.value_type):
+                target_type = pa.list_(pa.float32(), dim)
+            elif pa.types.is_integer(field.type.value_type):
+                target_type = pa.list_(pa.uint8(), dim)
+            else:
+                continue  # Skip non-numeric types
             new_field = pa.field(
-                VECTOR_COLUMN_NAME,
-                pa.list_(pa.uint8(), dim),
+                field.name,  # preserve original field name
+                target_type,
                 nullable=field.nullable,
             )
+            schema = schema.set(i, new_field)

test_util.py

@@ -390,6 +390,87 @@ def test_infer_target_schema():
     assert output == expected
+
+
+def test_infer_target_schema_with_vector_embedding_names():
+    """Test that _infer_target_schema detects vector columns with 'vector'/'embedding'.
+
+    This tests the enhanced column name detection for vector inference.
+    """
+    # Test float vectors with various naming patterns
+    example = pa.schema(
+        {
+            "user_vector": pa.list_(pa.float64()),
+            "text_embedding": pa.list_(pa.float64()),
+            "doc_embeddings": pa.list_(pa.float64()),
+            "my_vector_field": pa.list_(pa.float64()),
+            "embedding_model": pa.list_(pa.float64()),
+            "VECTOR_COL": pa.list_(pa.float64()),  # uppercase
+            "Vector_Mixed": pa.list_(pa.float64()),  # mixed case
+            "normal_list": pa.list_(pa.float64()),  # should not be converted
+        }
+    )
+    data = pa.table(
+        {
+            "user_vector": [[1.0, 2.0]],
+            "text_embedding": [[3.0, 4.0]],
+            "doc_embeddings": [[5.0, 6.0]],
+            "my_vector_field": [[7.0, 8.0]],
+            "embedding_model": [[9.0, 10.0]],
+            "VECTOR_COL": [[11.0, 12.0]],
+            "Vector_Mixed": [[13.0, 14.0]],
+            "normal_list": [[15.0, 16.0]],
+        },
+        schema=example,
+    )
+    expected = pa.schema(
+        {
+            "user_vector": pa.list_(pa.float32(), 2),  # converted
+            "text_embedding": pa.list_(pa.float32(), 2),  # converted
+            "doc_embeddings": pa.list_(pa.float32(), 2),  # converted
+            "my_vector_field": pa.list_(pa.float32(), 2),  # converted
+            "embedding_model": pa.list_(pa.float32(), 2),  # converted
+            "VECTOR_COL": pa.list_(pa.float32(), 2),  # converted
+            "Vector_Mixed": pa.list_(pa.float32(), 2),  # converted
+            "normal_list": pa.list_(pa.float64()),  # not converted
+        }
+    )
+    output, _ = _infer_target_schema(data.to_reader())
+    assert output == expected
+
+    # Test integer vectors with various naming patterns
+    example_int = pa.schema(
+        {
+            "user_vector": pa.list_(pa.int32()),
+            "text_embedding": pa.list_(pa.int64()),
+            "doc_embeddings": pa.list_(pa.int16()),
+            "normal_list": pa.list_(pa.int32()),  # should not be converted
+        }
+    )
+    data_int = pa.table(
+        {
+            "user_vector": [[1, 2]],
+            "text_embedding": [[3, 4]],
+            "doc_embeddings": [[5, 6]],
+            "normal_list": [[7, 8]],
+        },
+        schema=example_int,
+    )
+    expected_int = pa.schema(
+        {
+            "user_vector": pa.list_(pa.uint8(), 2),  # converted
+            "text_embedding": pa.list_(pa.uint8(), 2),  # converted
+            "doc_embeddings": pa.list_(pa.uint8(), 2),  # converted
+            "normal_list": pa.list_(pa.int32()),  # not converted
+        }
+    )
+    output_int, _ = _infer_target_schema(data_int.to_reader())
+    assert output_int == expected_int
+
+
 @pytest.mark.parametrize(
     "data",
     [