From 2261eb95a03576901c32985a9794daf11beb2e8f Mon Sep 17 00:00:00 2001 From: Neha Prasad Date: Fri, 19 Sep 2025 21:47:28 +0530 Subject: [PATCH] fix(node): handle undefined vector fields with embedding functions (#2655) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fixes an issue where passing `{ vector: undefined }` with an embedding function threw a "Found field not in schema" error instead of calling the embedding function, as already happens for `null` or omitted vector fields. **Changes:** - Modified `rowPathsAndValues` to skip undefined values during schema inference - Added test case verifying undefined, null, and omitted vector fields all work correctly **Before:** `{ vector: undefined }` → Error **After:** `{ vector: undefined }` → Calls embedding function Closes #2647 --- nodejs/__test__/embedding.test.ts | 54 +++++++++++++++++++++++++++++++ nodejs/lancedb/arrow.ts | 6 +++- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/nodejs/__test__/embedding.test.ts b/nodejs/__test__/embedding.test.ts index 30d8b1fc..e56e8063 100644 --- a/nodejs/__test__/embedding.test.ts +++ b/nodejs/__test__/embedding.test.ts @@ -256,6 +256,60 @@ describe("embedding functions", () => { expect(actual).toHaveProperty("text"); }); + it("should handle undefined vector field with embedding function correctly", async () => { + @register("undefined_test") + class MockEmbeddingFunction extends EmbeddingFunction { + ndims() { + return 3; + } + embeddingDataType(): Float { + return new Float32(); + } + async computeQueryEmbeddings(_data: string) { + return [1, 2, 3]; + } + async computeSourceEmbeddings(data: string[]) { + return Array.from({ length: data.length }).fill([ + 1, 2, 3, + ]) as number[][]; + } + } + const func = getRegistry() + .get("undefined_test")! 
+ .create(); + const schema = new Schema([ + new Field("text", new Utf8(), true), + new Field( + "vector", + new FixedSizeList(3, new Field("item", new Float32(), true)), + true, + ), + ]); + + const db = await connect(tmpDir.name); + const table = await db.createEmptyTable("test_undefined", schema, { + embeddingFunction: { + function: func, + sourceColumn: "text", + vectorColumn: "vector", + }, + }); + + // Test that undefined, null, and omitted vector fields all work + await table.add([{ text: "test1", vector: undefined }]); + await table.add([{ text: "test2", vector: null }]); + await table.add([{ text: "test3" }]); + + const rows = await table.query().toArray(); + expect(rows.length).toBe(3); + + // All rows should have vectors computed by the embedding function + for (const row of rows) { + expect(row.vector).toBeDefined(); + expect(JSON.parse(JSON.stringify(row.vector))).toEqual([1, 2, 3]); + } + }); + test.each([new Float16(), new Float32(), new Float64()])( "should be able to provide manual embeddings with multiple float datatype", async (floatType) => { diff --git a/nodejs/lancedb/arrow.ts b/nodejs/lancedb/arrow.ts index ed399742..52a892ff 100644 --- a/nodejs/lancedb/arrow.ts +++ b/nodejs/lancedb/arrow.ts @@ -512,7 +512,11 @@ function* rowPathsAndValues( if (isObject(value)) { yield* rowPathsAndValues(value, [...basePath, key]); } else { - yield [[...basePath, key], value]; + // Skip undefined values - they should be treated the same as missing fields + // for embedding function purposes + if (value !== undefined) { + yield [[...basePath, key], value]; + } } } }