mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-18 04:20:39 +00:00
fix(nodejs): add better error handling when missing embedding functions (#1290)
note: running the default lint command `npm run lint -- --fix` seems to have made a lot of unrelated changes.
This commit is contained in:
@@ -20,7 +20,7 @@ import {
|
||||
type Vector,
|
||||
FixedSizeList,
|
||||
vectorFromArray,
|
||||
type Schema,
|
||||
Schema,
|
||||
Table as ArrowTable,
|
||||
RecordBatchStreamWriter,
|
||||
List,
|
||||
@@ -85,6 +85,7 @@ export class MakeArrowTableOptions {
|
||||
vectorColumns: Record<string, VectorColumnOptions> = {
|
||||
vector: new VectorColumnOptions(),
|
||||
};
|
||||
embeddings?: EmbeddingFunction<unknown>;
|
||||
|
||||
/**
|
||||
* If true then string columns will be encoded with dictionary encoding
|
||||
@@ -208,6 +209,7 @@ export function makeArrowTable(
|
||||
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
||||
if (opt.schema !== undefined && opt.schema !== null) {
|
||||
opt.schema = sanitizeSchema(opt.schema);
|
||||
opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
|
||||
}
|
||||
const columns: Record<string, Vector> = {};
|
||||
// TODO: sample dataset to find missing columns
|
||||
@@ -287,8 +289,8 @@ export function makeArrowTable(
|
||||
// then patch the schema of the batches so we can use
|
||||
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
||||
const firstTable = new ArrowTable(columns);
|
||||
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
||||
const batchesFixed = firstTable.batches.map(
|
||||
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
||||
(batch) => new RecordBatch(opt.schema!, batch.data),
|
||||
);
|
||||
return new ArrowTable(opt.schema, batchesFixed);
|
||||
@@ -648,3 +650,41 @@ function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
|
||||
/**
 * Create an Arrow table from a schema alone.
 *
 * The schema is first passed through `sanitizeSchema`, and the table is then
 * constructed from the result without supplying any record batches.
 *
 * @param schema - The schema the new table should carry.
 * @returns A table constructed from the sanitized schema only.
 */
export function createEmptyTable(schema: Schema): ArrowTable {
  const sanitized = sanitizeSchema(schema);
  return new ArrowTable(sanitized);
}
|
||||
|
||||
/**
 * Check that every vector column declared in `schema` can be populated.
 *
 * A `FixedSizeList` field whose name is absent (`undefined`) in the first row
 * of `data` is treated as an embedding column that still has to be computed.
 * Those fields are left out of the returned schema. If any exist and no
 * embedding function was supplied, an error is thrown instead.
 *
 * Only the first row is inspected. When `data` is empty, no field is
 * considered missing and all fields are kept.
 *
 * Note that `embeddings` is only checked for presence; this function does not
 * verify that it actually targets the missing columns.
 *
 * @param schema - The schema to validate.
 * @param data - The rows about to be converted; only `data[0]` is examined.
 * @param embeddings - The embedding function, or `undefined` if none is set.
 * @returns A new schema containing the fields that are not missing from the
 *   data, carrying over the metadata of `schema`.
 * @throws Error if at least one `FixedSizeList` field is missing from the
 *   first row and `embeddings` is `undefined`.
 */
function validateSchemaEmbeddings(
  schema: Schema,
  data: Array<Record<string, unknown>>,
  embeddings: EmbeddingFunction<unknown> | undefined,
): Schema {
  const fields: Schema["fields"] = [];
  const missingEmbeddingFields: Schema["fields"] = [];
  const hasRows = data.length !== 0;

  for (const field of schema.fields) {
    // A fixed-size-list column that the first row does not provide has to be
    // produced by an embedding function, so it is tracked separately.
    const isMissingVector =
      hasRows &&
      field.type instanceof FixedSizeList &&
      data[0]?.[field.name] === undefined;

    if (isMissingVector) {
      missingEmbeddingFields.push(field);
    } else {
      fields.push(field);
    }
  }

  if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
    throw new Error(
      `Table has embeddings: "${missingEmbeddingFields
        .map((f) => f.name)
        .join(",")}", but no embedding function was provided`,
    );
  }

  // Pass the metadata through so that validation does not silently strip it.
  return new Schema(fields, schema.metadata);
}
|
||||
|
||||
Reference in New Issue
Block a user