fix: undefined values should become null in nullable fields (#2658)

### Bug Fix: Undefined Values in Nullable Fields

**Issue**: When inserting data with `undefined` values into nullable
fields, LanceDB was incorrectly coercing them to default values (`false`
for booleans, `NaN` for numbers, `""` for strings) instead of `null`.

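**Fix**: Modified the `makeVector()` function in `arrow.ts` to accept an
optional `nullable` flag. When the flag is set (and an explicit type is
given), `undefined` values are converted to `null` before the data is
passed to Apache Arrow. Callers now pass `field.nullable` through.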

fixes: #2645

**Result**: Now `{ text: undefined, number: undefined, bool: undefined
}` correctly becomes `{ text: null, number: null, bool: null }` when
fields are marked as nullable in the schema.

**Files Changed**: 
- `nodejs/lancedb/arrow.ts` (core fix)
- `nodejs/__test__/arrow.test.ts` (test coverage)

This ensures proper null handling for nullable fields, as users expect.

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
Neha Prasad
2025-09-24 02:59:52 +05:30
committed by GitHub
parent 1befebf614
commit b0800b4b71
2 changed files with 50 additions and 3 deletions

View File

@@ -1039,3 +1039,33 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
});
},
);
// Regression test: an `undefined` entry in a nullable column must be read
// back from the Arrow table as `null`, while real values pass through.
describe("undefined values handling", () => {
  it("should handle mixed undefined and actual values", () => {
    // All three columns are declared nullable (third Field argument).
    const nullableFields = [
      new Field("text", new Utf8(), true),
      new Field("number", new Int32(), true),
      new Field("bool", new Bool(), true),
    ];
    const schema = new Schema(nullableFields);

    // Each input row leaves exactly one column undefined.
    const inputRows = [
      { text: undefined, number: 42, bool: true },
      { text: "hello", number: undefined, bool: false },
      { text: "world", number: 123, bool: undefined },
    ];

    // Same rows, with every undefined replaced by the expected null.
    const expectedRows = [
      { text: null, number: 42, bool: true },
      { text: "hello", number: null, bool: false },
      { text: "world", number: 123, bool: null },
    ];

    const rows = makeArrowTable(inputRows, { schema }).toArray();

    expect(rows).toHaveLength(3);
    expectedRows.forEach((expected, index) => {
      expect(rows[index].text).toBe(expected.text);
      expect(rows[index].number).toBe(expected.number);
      expect(rows[index].bool).toBe(expected.bool);
    });
  });
});

View File

@@ -705,7 +705,7 @@ function transposeData(
}
return current;
});
return makeVector(values, field.type);
return makeVector(values, field.type, undefined, field.nullable);
}
}
@@ -752,9 +752,15 @@ function makeVector(
values: unknown[],
type?: DataType,
stringAsDictionary?: boolean,
nullable?: boolean,
// biome-ignore lint/suspicious/noExplicitAny: skip
): Vector<any> {
if (type !== undefined) {
// Convert undefined values to null for nullable fields
if (nullable) {
values = values.map((v) => (v === undefined ? null : v));
}
// workaround for: https://github.com/apache/arrow-js/issues/68
if (DataType.isBool(type)) {
const hasNonNullValue = values.some((v) => v !== null && v !== undefined);
@@ -769,6 +775,7 @@ function makeVector(
return arrowMakeVector(data);
}
}
// No need for inference, let Arrow create it
if (type instanceof Int) {
if (DataType.isInt(type) && type.bitWidth === 64) {
@@ -893,7 +900,12 @@ async function applyEmbeddingsFromMetadata(
for (const field of schema.fields) {
if (!(field.name in columns)) {
const nullValues = new Array(table.numRows).fill(null);
columns[field.name] = makeVector(nullValues, field.type);
columns[field.name] = makeVector(
nullValues,
field.type,
undefined,
field.nullable,
);
}
}
@@ -957,7 +969,12 @@ async function applyEmbeddings<T>(
} else if (schema != null) {
const destField = schema.fields.find((f) => f.name === destColumn);
if (destField != null) {
newColumns[destColumn] = makeVector([], destField.type);
newColumns[destColumn] = makeVector(
[],
destField.type,
undefined,
destField.nullable,
);
} else {
throw new Error(
`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,