fix: improve error handling and embedding logic in arrow.ts (#2433)

- Enhanced error messages for schema inference failures to suggest
providing an explicit schema.
- Updated embedding application logic to check for existing destination
columns, allowing for filling embeddings in columns that are all null.
- Added comments for clarity on handling existing columns during
embedding application.

Fixes https://github.com/lancedb/lancedb/issues/2183

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

## Summary by CodeRabbit

- **Bug Fixes**
  - Improved error messages for schema inference to enhance readability.
- Prevented redundant embedding application by skipping columns that
already contain data, avoiding unnecessary errors and computations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
satya-nutella
2025-06-19 01:15:11 +05:30
committed by GitHub
parent b77314168d
commit 9645fe52c2
2 changed files with 31 additions and 11 deletions

View File

@@ -592,14 +592,14 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
).rejects.toThrow("column vector was missing");
});
it("will provide a nice error if run twice", async function () {
it("will skip embedding application if already applied", async function () {
const records = sampleRecords();
const table = await convertToTable(records, dummyEmbeddingConfig);
// fromTableToBuffer will try and apply the embeddings again
await expect(
fromTableToBuffer(table, dummyEmbeddingConfig),
).rejects.toThrow("already existed");
// but should skip since the column already has non-null values
const result = await fromTableToBuffer(table, dummyEmbeddingConfig);
expect(result.byteLength).toBeGreaterThan(0);
});
});

View File

@@ -417,7 +417,9 @@ function inferSchema(
} else {
const inferredType = inferType(value, path, opts);
if (inferredType === undefined) {
throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
throw new Error(`Failed to infer data type for field ${path.join(
".",
)} at row ${rowI}. \
Consider providing an explicit schema.`);
}
pathTree.set(path, inferredType);
@@ -799,11 +801,17 @@ async function applyEmbeddingsFromMetadata(
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
);
}
// Check if destination column exists and handle accordingly
if (columns[destColumn] !== undefined) {
throw new Error(
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
);
const existingColumn = columns[destColumn];
// If the column exists but is all null, we can fill it with embeddings
if (existingColumn.nullCount !== existingColumn.length) {
// Column has non-null values, skip embedding application
continue;
}
}
if (table.batches.length > 1) {
throw new Error(
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
@@ -903,11 +911,23 @@ async function applyEmbeddings<T>(
);
}
} else {
// Check if destination column exists and handle accordingly
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
throw new Error(
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
);
const existingColumn = newColumns[destColumn];
// If the column exists but is all null, we can fill it with embeddings
if (existingColumn.nullCount !== existingColumn.length) {
// Column has non-null values, skip embedding application and return table as-is
let newTable = new ArrowTable(newColumns);
if (schema != null) {
newTable = alignTable(newTable, schema as Schema);
}
return new ArrowTable(
new Schema(newTable.schema.fields, schemaMetadata),
newTable.batches,
);
}
}
if (table.batches.length > 1) {
throw new Error(
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",