mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-14 02:20:40 +00:00
feat: infer vector columns when name contains 'vector' or 'embedding' (#2547)
## Summary

- Enhanced vector column detection to use substring matching instead of exact matching
- Now detects columns with names containing "vector" or "embedding" (case-insensitive)
- Added integer vector support to Node.js implementation (matching Python)
- Comprehensive test coverage for both float and integer vector types

## Changes

### Python (`python/python/lancedb/table.py`)

- Updated `_infer_target_schema()` to use substring matching with helper function `_is_vector_column()`
- Preserved original field names instead of forcing "vector"
- Consolidated duplicate logic for better maintainability

### Node.js (`nodejs/lancedb/arrow.ts`)

- Enhanced type inference with `nameSuggestsVectorColumn()` helper function
- Added `isAllIntegers()` function with performance optimization (checks first 10 elements)
- Implemented integer vector support using `Uint8` type (matching Python)
- Improved type safety by removing `any` usage

### Tests

- **Python**: Added `test_infer_target_schema_with_vector_embedding_names()` in `test_util.py`
- **Node.js**: Added comprehensive test case in `arrow.test.ts`
- Both test suites cover various naming patterns and integer/float vector types

## Examples of newly supported column names:

- `user_vector`, `text_embedding`, `doc_embeddings`
- `my_vector_field`, `embedding_model`
- `VECTOR_COL`, `Vector_Mixed` (case-insensitive)
- Both float and integer arrays are properly converted to fixed-size lists

## Test plan

- [x] All existing tests pass (backward compatibility maintained)
- [x] New tests pass for both Python and Node.js implementations
- [x] Integer vector detection works correctly in Node.js
- [x] Code passes linting and formatting checks
- [x] Performance optimized for large vector arrays

Fixes #2546

🤖 Generated with [Claude Code](https://claude.ai/code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -390,6 +390,87 @@ def test_infer_target_schema():
|
||||
assert output == expected
|
||||
|
||||
|
||||
def test_infer_target_schema_with_vector_embedding_names():
    """Check name-based vector column inference in ``_infer_target_schema``.

    Variable-length list columns whose name contains "vector" or "embedding"
    (in lower, upper, or mixed case) are expected to be inferred as fixed-size
    lists of length 2: ``float32`` for float input and ``uint8`` for integer
    input. A list column named ``normal_list`` is expected to keep its
    original type in both cases.
    """
    # Name of the one column that must be left untouched in both scenarios.
    untouched = "normal_list"

    # --- Scenario 1: float64 list columns -------------------------------
    # Insertion order matters: schema equality is sensitive to field order.
    float_rows = {
        "user_vector": [[1.0, 2.0]],
        "text_embedding": [[3.0, 4.0]],
        "doc_embeddings": [[5.0, 6.0]],
        "my_vector_field": [[7.0, 8.0]],
        "embedding_model": [[9.0, 10.0]],
        "VECTOR_COL": [[11.0, 12.0]],  # uppercase
        "Vector_Mixed": [[13.0, 14.0]],  # mixed case
        untouched: [[15.0, 16.0]],  # should not be converted
    }
    variable_f64 = pa.list_(pa.float64())
    fixed_f32 = pa.list_(pa.float32(), 2)

    float_input = pa.table(
        float_rows,
        schema=pa.schema({name: variable_f64 for name in float_rows}),
    )
    float_expected = pa.schema(
        {
            name: variable_f64 if name == untouched else fixed_f32
            for name in float_rows
        }
    )

    float_inferred, _ = _infer_target_schema(float_input.to_reader())
    assert float_inferred == float_expected

    # --- Scenario 2: integer list columns of assorted widths ------------
    # Each entry is (column name, list element type, single-row data).
    int_columns = [
        ("user_vector", pa.int32(), [[1, 2]]),
        ("text_embedding", pa.int64(), [[3, 4]]),
        ("doc_embeddings", pa.int16(), [[5, 6]]),
        (untouched, pa.int32(), [[7, 8]]),  # should not be converted
    ]
    fixed_u8 = pa.list_(pa.uint8(), 2)

    int_input = pa.table(
        {name: rows for name, _, rows in int_columns},
        schema=pa.schema(
            {name: pa.list_(element) for name, element, _ in int_columns}
        ),
    )
    int_expected = pa.schema(
        {
            name: pa.list_(element) if name == untouched else fixed_u8
            for name, element, _ in int_columns
        }
    )

    int_inferred, _ = _infer_target_schema(int_input.to_reader())
    assert int_inferred == int_expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
|
||||
Reference in New Issue
Block a user