feat: infer vector columns when name contains 'vector' or 'embedding' (#2547)

## Summary

- Enhanced vector column detection to use substring matching instead of
exact matching
- Now detects columns whose names contain "vector" or "embedding"
(case-insensitive); a short sketch follows this list
- Added integer vector support to the Node.js implementation (matching
Python)
- Comprehensive test coverage for both float and integer vector types
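
The name check itself is a simple case-insensitive substring rule. A minimal TypeScript sketch, using the helper name the PR describes for `arrow.ts` (the Python side uses the equivalent `_name_suggests_vector_column()`); treat it as an illustration, not the exact implementation:

```typescript
// Sketch of the case-insensitive substring rule described in this PR; the
// actual helper in arrow.ts may differ in detail.
function nameSuggestsVectorColumn(name: string): boolean {
  const lower = name.toLowerCase();
  return lower.includes("vector") || lower.includes("embedding");
}

nameSuggestsVectorColumn("text_embedding"); // true
nameSuggestsVectorColumn("VECTOR_COL");     // true
nameSuggestsVectorColumn("normal_list");    // false
```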

## Changes

### Python (`python/python/lancedb/table.py`)
- Updated `_infer_target_schema()` to use substring matching via the new
`_name_suggests_vector_column()` helper
- Preserved original field names instead of renaming matched columns to
"vector"
- Consolidated the duplicated float/integer branches into a single code path
for better maintainability

### Node.js (`nodejs/lancedb/arrow.ts`)
- Enhanced type inference with the `nameSuggestsVectorColumn()` helper
function
- Added an `isAllIntegers()` helper with a performance optimization (it
checks only the first 10 elements); see the sketch after this list
- Implemented integer vector support using the `Uint8` type (matching
Python)
- Improved type safety by removing `any` usage
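
A hedged sketch of the two Node.js helpers described above, using apache-arrow types. The names follow the PR description, but the exact implementation in `arrow.ts` may differ:

```typescript
import { Field, FixedSizeList, Float32, Uint8 } from "apache-arrow";

// Sketch: sample only a small prefix of the values (the PR checks the first
// 10 elements) instead of scanning the whole array.
function isAllIntegers(values: number[]): boolean {
  return values.slice(0, 10).every((v) => Number.isInteger(v));
}

// Hypothetical illustration of how the target type could be chosen:
// integer vectors map to FixedSizeList<Uint8> (matching Python's uint8),
// everything else to FixedSizeList<Float32>.
function targetVectorType(values: number[], dim: number): FixedSizeList {
  const child = isAllIntegers(values)
    ? new Field("item", new Uint8(), true)
    : new Field("item", new Float32(), true);
  return new FixedSizeList(dim, child);
}
```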

### Tests
- **Python**: Added
`test_infer_target_schema_with_vector_embedding_names()` in
`test_util.py`
- **Node.js**: Added comprehensive test case in `arrow.test.ts`
- Both test suites cover various naming patterns and integer/float
vector types

## Examples of newly supported column names:
- `user_vector`, `text_embedding`, `doc_embeddings`
- `my_vector_field`, `embedding_model`
- `VECTOR_COL`, `Vector_Mixed` (case-insensitive)
- Both float and integer arrays are converted to fixed-size lists, as in the
usage sketch below
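
For illustration, a hypothetical end-to-end check with the Node.js binding. This assumes `makeArrowTable` applies the name-based inference to plain records; the exact defaults may differ:

```typescript
import { makeArrowTable } from "@lancedb/lancedb";

// Records with a list column whose name contains "embedding".
const table = makeArrowTable([
  { id: 1, text_embedding: [0.1, 0.2, 0.3, 0.4] },
  { id: 2, text_embedding: [0.5, 0.6, 0.7, 0.8] },
]);

// Per this PR, `text_embedding` should come out as a fixed-size list of
// Float32 rather than a plain variable-length list.
for (const field of table.schema.fields) {
  console.log(field.name, String(field.type));
}
```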

## Test plan
- [x] All existing tests pass (backward compatibility maintained)
- [x] New tests pass for both Python and Node.js implementations
- [x] Integer vector detection works correctly in Node.js
- [x] Code passes linting and formatting checks
- [x] Performance optimized for large vector arrays

Fixes #2546

🤖 Generated with [Claude Code](https://claude.ai/code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
Author: Will Jones
Date: 2025-08-04 15:36:49 -07:00
Committed by: GitHub
Parent: 0a1ea1858d
Commit: 9d683e4f0b
4 changed files with 224 additions and 27 deletions

python/python/lancedb/table.py

@@ -2926,6 +2926,12 @@ def has_nan_values(arr: Union[pa.ListArray, pa.ChunkedArray]) -> pa.BooleanArray
     return pc.is_in(indices, has_nan_indices)
+
+def _name_suggests_vector_column(field_name: str) -> bool:
+    """Check if a field name indicates a vector column."""
+    name_lower = field_name.lower()
+    return "vector" in name_lower or "embedding" in name_lower
+
 def _infer_target_schema(
     reader: pa.RecordBatchReader,
 ) -> Tuple[pa.Schema, pa.RecordBatchReader]:
@@ -2933,35 +2939,27 @@ def _infer_target_schema(
     peeked = None
     for i, field in enumerate(schema):
-        if (
-            field.name == VECTOR_COLUMN_NAME
-            and (pa.types.is_list(field.type) or pa.types.is_large_list(field.type))
-            and pa.types.is_floating(field.type.value_type)
-        ):
+        is_list_type = pa.types.is_list(field.type) or pa.types.is_large_list(
+            field.type
+        )
+        if _name_suggests_vector_column(field.name) and is_list_type:
             if peeked is None:
                 peeked, reader = peek_reader(reader)
             # Use the most common length of the list as the dimensions
             dim = _modal_list_size(peeked.column(i))
-            new_field = pa.field(
-                VECTOR_COLUMN_NAME,
-                pa.list_(pa.float32(), dim),
-                nullable=field.nullable,
-            )
-            schema = schema.set(i, new_field)
-        elif (
-            field.name == VECTOR_COLUMN_NAME
-            and (pa.types.is_list(field.type) or pa.types.is_large_list(field.type))
-            and pa.types.is_integer(field.type.value_type)
-        ):
-            if peeked is None:
-                peeked, reader = peek_reader(reader)
-            # Use the most common length of the list as the dimensions
-            dim = _modal_list_size(peeked.column(i))
+            # Determine target type based on value type
+            if pa.types.is_floating(field.type.value_type):
+                target_type = pa.list_(pa.float32(), dim)
+            elif pa.types.is_integer(field.type.value_type):
+                target_type = pa.list_(pa.uint8(), dim)
+            else:
+                continue  # Skip non-numeric types
             new_field = pa.field(
-                VECTOR_COLUMN_NAME,
-                pa.list_(pa.uint8(), dim),
+                field.name,  # preserve original field name
+                target_type,
                 nullable=field.nullable,
             )
+            schema = schema.set(i, new_field)

test_util.py

@@ -390,6 +390,87 @@ def test_infer_target_schema():
     assert output == expected
+
+
+def test_infer_target_schema_with_vector_embedding_names():
+    """Test that _infer_target_schema detects vector columns with 'vector'/'embedding'.
+
+    This tests the enhanced column name detection for vector inference.
+    """
+    # Test float vectors with various naming patterns
+    example = pa.schema(
+        {
+            "user_vector": pa.list_(pa.float64()),
+            "text_embedding": pa.list_(pa.float64()),
+            "doc_embeddings": pa.list_(pa.float64()),
+            "my_vector_field": pa.list_(pa.float64()),
+            "embedding_model": pa.list_(pa.float64()),
+            "VECTOR_COL": pa.list_(pa.float64()),  # uppercase
+            "Vector_Mixed": pa.list_(pa.float64()),  # mixed case
+            "normal_list": pa.list_(pa.float64()),  # should not be converted
+        }
+    )
+    data = pa.table(
+        {
+            "user_vector": [[1.0, 2.0]],
+            "text_embedding": [[3.0, 4.0]],
+            "doc_embeddings": [[5.0, 6.0]],
+            "my_vector_field": [[7.0, 8.0]],
+            "embedding_model": [[9.0, 10.0]],
+            "VECTOR_COL": [[11.0, 12.0]],
+            "Vector_Mixed": [[13.0, 14.0]],
+            "normal_list": [[15.0, 16.0]],
+        },
+        schema=example,
+    )
+    expected = pa.schema(
+        {
+            "user_vector": pa.list_(pa.float32(), 2),  # converted
+            "text_embedding": pa.list_(pa.float32(), 2),  # converted
+            "doc_embeddings": pa.list_(pa.float32(), 2),  # converted
+            "my_vector_field": pa.list_(pa.float32(), 2),  # converted
+            "embedding_model": pa.list_(pa.float32(), 2),  # converted
+            "VECTOR_COL": pa.list_(pa.float32(), 2),  # converted
+            "Vector_Mixed": pa.list_(pa.float32(), 2),  # converted
+            "normal_list": pa.list_(pa.float64()),  # not converted
+        }
+    )
+    output, _ = _infer_target_schema(data.to_reader())
+    assert output == expected
+
+    # Test integer vectors with various naming patterns
+    example_int = pa.schema(
+        {
+            "user_vector": pa.list_(pa.int32()),
+            "text_embedding": pa.list_(pa.int64()),
+            "doc_embeddings": pa.list_(pa.int16()),
+            "normal_list": pa.list_(pa.int32()),  # should not be converted
+        }
+    )
+    data_int = pa.table(
+        {
+            "user_vector": [[1, 2]],
+            "text_embedding": [[3, 4]],
+            "doc_embeddings": [[5, 6]],
+            "normal_list": [[7, 8]],
+        },
+        schema=example_int,
+    )
+    expected_int = pa.schema(
+        {
+            "user_vector": pa.list_(pa.uint8(), 2),  # converted
+            "text_embedding": pa.list_(pa.uint8(), 2),  # converted
+            "doc_embeddings": pa.list_(pa.uint8(), 2),  # converted
+            "normal_list": pa.list_(pa.int32()),  # not converted
+        }
+    )
+    output_int, _ = _infer_target_schema(data_int.to_reader())
+    assert output_int == expected_int
+
+
 @pytest.mark.parametrize(
     "data",
     [