From 9645fe52c2c279f8c1c7a696df4fa46f4c088542 Mon Sep 17 00:00:00 2001 From: satya-nutella <80167324+satya-nutella@users.noreply.github.com> Date: Thu, 19 Jun 2025 01:15:11 +0530 Subject: [PATCH] fix: improve error handling and embedding logic in arrow.ts (#2433) - Enhanced error messages for schema inference failures to suggest providing an explicit schema. - Updated embedding application logic to check for existing destination columns, allowing for filling embeddings in columns that are all null. - Added comments for clarity on handling existing columns during embedding application. Fixes https://github.com/lancedb/lancedb/issues/2183 ## Summary by CodeRabbit ## Summary by CodeRabbit - **Bug Fixes** - Improved error messages for schema inference to enhance readability. - Prevented redundant embedding application by skipping columns that already contain data, avoiding unnecessary errors and computations. --- nodejs/__test__/arrow.test.ts | 8 ++++---- nodejs/lancedb/arrow.ts | 34 +++++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/nodejs/__test__/arrow.test.ts b/nodejs/__test__/arrow.test.ts index 78aef4c8..3f0703a0 100644 --- a/nodejs/__test__/arrow.test.ts +++ b/nodejs/__test__/arrow.test.ts @@ -592,14 +592,14 @@ describe.each([arrow15, arrow16, arrow17, arrow18])( ).rejects.toThrow("column vector was missing"); }); - it("will provide a nice error if run twice", async function () { + it("will skip embedding application if already applied", async function () { const records = sampleRecords(); const table = await convertToTable(records, dummyEmbeddingConfig); // fromTableToBuffer will try and apply the embeddings again - await expect( - fromTableToBuffer(table, dummyEmbeddingConfig), - ).rejects.toThrow("already existed"); + // but should skip since the column already has non-null values + const result = await fromTableToBuffer(table, dummyEmbeddingConfig); + expect(result.byteLength).toBeGreaterThan(0); }); }); diff --git a/nodejs/lancedb/arrow.ts b/nodejs/lancedb/arrow.ts index 944f62f7..20aa4cea 100644 --- a/nodejs/lancedb/arrow.ts +++ b/nodejs/lancedb/arrow.ts @@ -417,7 +417,9 @@ function inferSchema( } else { const inferredType = inferType(value, path, opts); if (inferredType === undefined) { - throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \ + throw new Error(`Failed to infer data type for field ${path.join( + ".", + )} at row ${rowI}. \ Consider providing an explicit schema.`); } pathTree.set(path, inferredType); @@ -799,11 +801,17 @@ async function applyEmbeddingsFromMetadata( `Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`, ); } + + // Check if destination column exists and handle accordingly if (columns[destColumn] !== undefined) { - throw new Error( - `Attempt to apply embeddings to table failed because column ${destColumn} already existed`, - ); + const existingColumn = columns[destColumn]; + // If the column exists but is all null, we can fill it with embeddings + if (existingColumn.nullCount !== existingColumn.length) { + // Column has non-null values, skip embedding application + continue; + } } + if (table.batches.length > 1) { throw new Error( "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch", @@ -903,11 +911,23 @@ async function applyEmbeddings( ); } } else { + // Check if destination column exists and handle accordingly if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) { - throw new Error( - `Attempt to apply embeddings to table failed because column ${destColumn} already existed`, - ); + const existingColumn = newColumns[destColumn]; + // If the column exists but is all null, we can fill it with embeddings + if (existingColumn.nullCount !== existingColumn.length) { + // Column has non-null values, skip embedding application and return table as-is + let newTable = new ArrowTable(newColumns); + if (schema != null) { + newTable = alignTable(newTable, schema as Schema); + } + return new ArrowTable( + new Schema(newTable.schema.fields, schemaMetadata), + newTable.batches, + ); + } } + if (table.batches.length > 1) { throw new Error( "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",