feat: infer vector columns when name contains 'vector' or 'embedding' (#2547)

## Summary

- Enhanced vector column detection to use substring matching instead of exact matching
- Now detects columns with names containing "vector" or "embedding" (case-insensitive)
- Added integer vector support to Node.js implementation (matching Python)
- Comprehensive test coverage for both float and integer vector types

## Changes

### Python (`python/python/lancedb/table.py`)

- Updated `_infer_target_schema()` to use substring matching with helper function `_is_vector_column()`
- Preserved original field names instead of forcing "vector"
- Consolidated duplicate logic for better maintainability

### Node.js (`nodejs/lancedb/arrow.ts`)

- Enhanced type inference with `nameSuggestsVectorColumn()` helper function
- Added `isAllIntegers()` function with performance optimization (checks first 10 elements)
- Implemented integer vector support using `Uint8` type (matching Python)
- Improved type safety by removing `any` usage

### Tests

- **Python**: Added `test_infer_target_schema_with_vector_embedding_names()` in `test_util.py`
- **Node.js**: Added comprehensive test case in `arrow.test.ts`
- Both test suites cover various naming patterns and integer/float vector types

## Examples of newly supported column names:

- `user_vector`, `text_embedding`, `doc_embeddings`
- `my_vector_field`, `embedding_model`
- `VECTOR_COL`, `Vector_Mixed` (case-insensitive)
- Both float and integer arrays are properly converted to fixed-size lists

## Test plan

- [x] All existing tests pass (backward compatibility maintained)
- [x] New tests pass for both Python and Node.js implementations
- [x] Integer vector detection works correctly in Node.js
- [x] Code passes linting and formatting checks
- [x] Performance optimized for large vector arrays

Fixes #2546

🤖 Generated with [Claude Code](https://claude.ai/code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-05-14 02:20:40 +00:00 · 2025-08-04 15:36:49 -07:00
parent 0a1ea1858d
commit 9d683e4f0b
4 changed files with 224 additions and 27 deletions
--- a/python/python/tests/test_util.py
+++ b/python/python/tests/test_util.py
@@ -390,6 +390,87 @@ def test_infer_target_schema():
    assert output == expected


+def test_infer_target_schema_with_vector_embedding_names():
+    """Check name-based vector column detection in _infer_target_schema.
+
+    Columns whose names contain 'vector' or 'embedding' in any case get converted.
+    """
+
+    # Float64 lists: keyword as prefix, infix, suffix, plural, and in other cases
+    example = pa.schema(
+        {
+            "user_vector": pa.list_(pa.float64()),
+            "text_embedding": pa.list_(pa.float64()),
+            "doc_embeddings": pa.list_(pa.float64()),
+            "my_vector_field": pa.list_(pa.float64()),
+            "embedding_model": pa.list_(pa.float64()),
+            "VECTOR_COL": pa.list_(pa.float64()),  # all-uppercase keyword
+            "Vector_Mixed": pa.list_(pa.float64()),  # mixed-case keyword
+            "normal_list": pa.list_(pa.float64()),  # no keyword: must stay as-is
+        }
+    )
+    data = pa.table(
+        {
+            "user_vector": [[1.0, 2.0]],
+            "text_embedding": [[3.0, 4.0]],
+            "doc_embeddings": [[5.0, 6.0]],
+            "my_vector_field": [[7.0, 8.0]],
+            "embedding_model": [[9.0, 10.0]],
+            "VECTOR_COL": [[11.0, 12.0]],
+            "Vector_Mixed": [[13.0, 14.0]],
+            "normal_list": [[15.0, 16.0]],
+        },
+        schema=example,
+    )
+
+    expected = pa.schema(
+        {
+            "user_vector": pa.list_(pa.float32(), 2),  # fixed-size float32
+            "text_embedding": pa.list_(pa.float32(), 2),  # fixed-size float32
+            "doc_embeddings": pa.list_(pa.float32(), 2),  # fixed-size float32
+            "my_vector_field": pa.list_(pa.float32(), 2),  # fixed-size float32
+            "embedding_model": pa.list_(pa.float32(), 2),  # fixed-size float32
+            "VECTOR_COL": pa.list_(pa.float32(), 2),  # fixed-size float32
+            "Vector_Mixed": pa.list_(pa.float32(), 2),  # fixed-size float32
+            "normal_list": pa.list_(pa.float64()),  # same type as the input
+        }
+    )
+
+    output, _ = _infer_target_schema(data.to_reader())
+    assert output == expected
+
+    # Integer lists: int32/int64/int16 elements are all expected to become uint8
+    example_int = pa.schema(
+        {
+            "user_vector": pa.list_(pa.int32()),
+            "text_embedding": pa.list_(pa.int64()),
+            "doc_embeddings": pa.list_(pa.int16()),
+            "normal_list": pa.list_(pa.int32()),  # no keyword: must stay as-is
+        }
+    )
+    data_int = pa.table(
+        {
+            "user_vector": [[1, 2]],
+            "text_embedding": [[3, 4]],
+            "doc_embeddings": [[5, 6]],
+            "normal_list": [[7, 8]],
+        },
+        schema=example_int,
+    )
+
+    expected_int = pa.schema(
+        {
+            "user_vector": pa.list_(pa.uint8(), 2),  # fixed-size uint8
+            "text_embedding": pa.list_(pa.uint8(), 2),  # fixed-size uint8
+            "doc_embeddings": pa.list_(pa.uint8(), 2),  # fixed-size uint8
+            "normal_list": pa.list_(pa.int32()),  # same type as the input
+        }
+    )
+
+    output_int, _ = _infer_target_schema(data_int.to_reader())
+    assert output_int == expected_int
+
+
@pytest.mark.parametrize(
    "data",
    [