fix(node): allow undefined/omitted values for nullable vector fields (#2656)

**Problem**: When a vector field is marked as nullable, users should be
able to omit it or pass `undefined`, but this was throwing an error:
"Table has embeddings: 'vector', but no embedding function was provided"

fixes: #2646

**Solution**: Modified `validateSchemaEmbeddings` to check
`field.nullable` before treating `undefined` values as missing embedding
fields.

**Changes**:
- Fixed validation logic in `nodejs/lancedb/arrow.ts`
- Enabled previously skipped test for nullable fields
- Added reproduction test case

**Behavior**:
-  `{ vector: undefined }` now works for nullable fields
-  `{}` (omitted field) now works for nullable fields  
-  `{ vector: null }` still works (unchanged)
-  Non-nullable fields still properly throw errors (unchanged)

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: neha <neha@posthog.com>
This commit is contained in:
Neha Prasad
2025-10-02 23:23:05 +05:30
committed by GitHub
parent 1aa0fd16e7
commit 9e2a68541e
2 changed files with 84 additions and 16 deletions

View File

@@ -211,8 +211,7 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
},
);
// TODO: https://github.com/lancedb/lancedb/issues/1832
it.skip("should be able to omit nullable fields", async () => {
it("should be able to omit nullable fields", async () => {
const db = await connect(tmpDir.name);
const schema = new arrow.Schema([
new arrow.Field(
@@ -236,23 +235,36 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
await table.add([data3]);
let res = await table.query().limit(10).toArray();
const resVector = res.map((r) => r.get("vector").toArray());
const resVector = res.map((r) =>
r.vector ? Array.from(r.vector) : null,
);
expect(resVector).toEqual([null, data2.vector, data3.vector]);
const resItem = res.map((r) => r.get("item").toArray());
const resItem = res.map((r) => r.item);
expect(resItem).toEqual(["foo", null, "bar"]);
const resPrice = res.map((r) => r.get("price").toArray());
const resPrice = res.map((r) => r.price);
expect(resPrice).toEqual([10.0, 2.0, 3.0]);
const data4 = { item: "foo" };
// We can't omit a column if it's not nullable
await expect(table.add([data4])).rejects.toThrow("Invalid user input");
await expect(table.add([data4])).rejects.toThrow(
"Append with different schema",
);
// But we can alter columns to make them nullable
await table.alterColumns([{ path: "price", nullable: true }]);
await table.add([data4]);
res = (await table.query().limit(10).toArray()).map((r) => r.toJSON());
expect(res).toEqual([data1, data2, data3, data4]);
res = (await table.query().limit(10).toArray()).map((r) => ({
...r.toJSON(),
vector: r.vector ? Array.from(r.vector) : null,
}));
// Rust fills missing nullable fields with null
expect(res).toEqual([
{ ...data1, vector: null },
{ ...data2, item: null },
data3,
{ ...data4, price: null, vector: null },
]);
});
it("should be able to insert nullable data for non-nullable fields", async () => {
@@ -330,6 +342,43 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
const table = await db.createTable("my_table", data);
expect(await table.countRows()).toEqual(2);
});
it("should allow undefined and omitted nullable vector fields", async () => {
// Test for the bug: can't pass undefined or omit vector column
const db = await connect("memory://");
const schema = new arrow.Schema([
new arrow.Field("id", new arrow.Int32(), true),
new arrow.Field(
"vector",
new arrow.FixedSizeList(
32,
new arrow.Field("item", new arrow.Float32(), true),
),
true, // nullable = true
),
]);
const table = await db.createEmptyTable("test_table", schema);
// Should not throw error for undefined value
await table.add([{ id: 0, vector: undefined }]);
// Should not throw error for omitted field
await table.add([{ id: 1 }]);
// Should still work for null
await table.add([{ id: 2, vector: null }]);
// Should still work for actual vector
const testVector = new Array(32).fill(0.5);
await table.add([{ id: 3, vector: testVector }]);
expect(await table.countRows()).toEqual(4);
const res = await table.query().limit(10).toArray();
const resVector = res.map((r) =>
r.vector ? Array.from(r.vector) : null,
);
expect(resVector).toEqual([null, null, null, testVector]);
});
},
);
@@ -1454,7 +1503,9 @@ describe("when optimizing a dataset", () => {
it("delete unverified", async () => {
const version = await table.version();
const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${version - 1}.manifest`;
const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${
version - 1
}.manifest`;
fs.rmSync(versionFile);
let stats = await table.optimize({ deleteUnverified: false });

View File

@@ -1285,19 +1285,36 @@ function validateSchemaEmbeddings(
if (isFixedSizeList(field.type)) {
field = sanitizeField(field);
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
// Check if there's an embedding function registered for this field
let hasEmbeddingFunction = false;
// Check schema metadata for embedding functions
if (schema.metadata.has("embedding_functions")) {
const embeddings = JSON.parse(
schema.metadata.get("embedding_functions")!,
);
if (
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
embeddings.find((f: any) => f["vectorColumn"] === field.name) ===
undefined
) {
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
if (embeddings.find((f: any) => f["vectorColumn"] === field.name)) {
hasEmbeddingFunction = true;
}
}
// Check passed embedding function parameter
if (embeddings && embeddings.vectorColumn === field.name) {
hasEmbeddingFunction = true;
}
// If the field is nullable AND there's no embedding function, allow undefined/omitted values
if (field.nullable && !hasEmbeddingFunction) {
fields.push(field);
} else {
// Either not nullable OR has embedding function - require explicit values
if (hasEmbeddingFunction) {
// Don't add to missingEmbeddingFields since this is expected to be filled by embedding function
fields.push(field);
} else {
missingEmbeddingFields.push(field);
}
} else {
missingEmbeddingFields.push(field);
}
} else {
fields.push(field);