mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-02 03:40:40 +00:00
feat: infer vector columns when name contains 'vector' or 'embedding' (#2547)
## Summary

- Enhanced vector column detection to use substring matching instead of exact matching
- Now detects columns with names containing "vector" or "embedding" (case-insensitive)
- Added integer vector support to Node.js implementation (matching Python)
- Comprehensive test coverage for both float and integer vector types

## Changes

### Python (`python/python/lancedb/table.py`)

- Updated `_infer_target_schema()` to use substring matching with helper function `_is_vector_column()`
- Preserved original field names instead of forcing "vector"
- Consolidated duplicate logic for better maintainability

### Node.js (`nodejs/lancedb/arrow.ts`)

- Enhanced type inference with `nameSuggestsVectorColumn()` helper function
- Added `isAllIntegers()` function with performance optimization (checks first 10 elements)
- Implemented integer vector support using `Uint8` type (matching Python)
- Improved type safety by removing `any` usage

### Tests

- **Python**: Added `test_infer_target_schema_with_vector_embedding_names()` in `test_util.py`
- **Node.js**: Added comprehensive test case in `arrow.test.ts`
- Both test suites cover various naming patterns and integer/float vector types

## Examples of newly supported column names:

- `user_vector`, `text_embedding`, `doc_embeddings`
- `my_vector_field`, `embedding_model`
- `VECTOR_COL`, `Vector_Mixed` (case-insensitive)
- Both float and integer arrays are properly converted to fixed-size lists

## Test plan

- [x] All existing tests pass (backward compatibility maintained)
- [x] New tests pass for both Python and Node.js implementations
- [x] Integer vector detection works correctly in Node.js
- [x] Code passes linting and formatting checks
- [x] Performance optimized for large vector arrays

Fixes #2546

🤖 Generated with [Claude Code](https://claude.ai/code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,7 @@ import {
|
||||
Struct,
|
||||
Timestamp,
|
||||
Type,
|
||||
Uint8,
|
||||
Utf8,
|
||||
Vector,
|
||||
makeVector as arrowMakeVector,
|
||||
@@ -51,6 +52,15 @@ import {
|
||||
sanitizeTable,
|
||||
sanitizeType,
|
||||
} from "./sanitize";
|
||||
|
||||
/**
 * Check if a field name indicates a vector column.
 *
 * A name qualifies when it contains the substring "vector" or "embedding",
 * compared case-insensitively (e.g. `user_vector`, `VECTOR_COL`,
 * `doc_embeddings`).
 *
 * @param fieldName - The column/field name to inspect. `undefined` is
 *   tolerated because type inference passes `path[path.length - 1]`, which
 *   yields `undefined` when the path is empty.
 * @returns `true` when the name suggests a vector column, `false` otherwise
 *   (including for a missing or empty name).
 */
function nameSuggestsVectorColumn(fieldName: string | undefined): boolean {
  // Guard first: calling toLowerCase() on a missing name would throw, whereas
  // a plain equality check against "vector" would simply be false.
  if (typeof fieldName !== "string") {
    return false;
  }
  const nameLower = fieldName.toLowerCase();
  return nameLower.includes("vector") || nameLower.includes("embedding");
}
|
||||
|
||||
export * from "apache-arrow";
|
||||
export type SchemaLike =
|
||||
| Schema
|
||||
@@ -591,10 +601,17 @@ function inferType(
|
||||
return undefined;
|
||||
}
|
||||
// Try to automatically detect embedding columns.
|
||||
if (valueType instanceof Float && path[path.length - 1] === "vector") {
|
||||
// We default to Float32 for vectors.
|
||||
const child = new Field("item", new Float32(), true);
|
||||
return new FixedSizeList(value.length, child);
|
||||
if (nameSuggestsVectorColumn(path[path.length - 1])) {
|
||||
// Check if value is a Uint8Array for integer vector type determination
|
||||
if (value instanceof Uint8Array) {
|
||||
// For integer vectors, we default to Uint8 (matching Python implementation)
|
||||
const child = new Field("item", new Uint8(), true);
|
||||
return new FixedSizeList(value.length, child);
|
||||
} else {
|
||||
// For float vectors, we default to Float32
|
||||
const child = new Field("item", new Float32(), true);
|
||||
return new FixedSizeList(value.length, child);
|
||||
}
|
||||
} else {
|
||||
const child = new Field("item", valueType, true);
|
||||
return new List(child);
|
||||
|
||||
Reference in New Issue
Block a user