// SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The LanceDB Authors import { Data as ArrowData, Table as ArrowTable, Binary, Bool, BufferType, DataType, DateUnit, Date_, Decimal, Dictionary, Duration, Field, FixedSizeBinary, FixedSizeList, Float, Float32, Float64, Int, Int32, Int64, LargeBinary, List, Null, Precision, RecordBatch, RecordBatchFileReader, RecordBatchFileWriter, RecordBatchStreamWriter, Schema, Struct, Timestamp, Type, Utf8, Vector, makeVector as arrowMakeVector, vectorFromArray as badVectorFromArray, makeBuilder, makeData, makeTable, } from "apache-arrow"; import { Buffers } from "apache-arrow/data"; import { type EmbeddingFunction } from "./embedding/embedding_function"; import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry"; import { sanitizeField, sanitizeSchema, sanitizeTable, sanitizeType, } from "./sanitize"; export * from "apache-arrow"; export type SchemaLike = | Schema | { fields: FieldLike[]; metadata: Map; get names(): unknown[]; }; export type FieldLike = | Field | { type: string; name: string; nullable?: boolean; metadata?: Map; }; export type DataLike = // biome-ignore lint/suspicious/noExplicitAny: | import("apache-arrow").Data> | { // biome-ignore lint/suspicious/noExplicitAny: type: any; length: number; offset: number; stride: number; nullable: boolean; children: DataLike[]; get nullCount(): number; // biome-ignore lint/suspicious/noExplicitAny: values: Buffers[BufferType.DATA]; // biome-ignore lint/suspicious/noExplicitAny: typeIds: Buffers[BufferType.TYPE]; // biome-ignore lint/suspicious/noExplicitAny: nullBitmap: Buffers[BufferType.VALIDITY]; // biome-ignore lint/suspicious/noExplicitAny: valueOffsets: Buffers[BufferType.OFFSET]; }; export type RecordBatchLike = | RecordBatch | { schema: SchemaLike; data: DataLike; }; export type TableLike = | ArrowTable | { schema: SchemaLike; batches: RecordBatchLike[] }; export type IntoVector = | Float32Array | Float64Array | number[] | Promise; export function isArrowTable(value: object): value is TableLike { if (value instanceof ArrowTable) return true; return "schema" in value && "batches" in value; } export function isNull(value: unknown): value is Null { return value instanceof Null || DataType.isNull(value); } export function isInt(value: unknown): value is Int { return value instanceof Int || DataType.isInt(value); } export function isFloat(value: unknown): value is Float { return value instanceof Float || DataType.isFloat(value); } export function isBinary(value: unknown): value is Binary { return value instanceof Binary || DataType.isBinary(value); } export function isLargeBinary(value: unknown): value is LargeBinary { return value instanceof LargeBinary || DataType.isLargeBinary(value); } export function isUtf8(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isUtf8(value); } export function isLargeUtf8(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isLargeUtf8(value); } export function isBool(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isBool(value); } export function isDecimal(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isDecimal(value); } export function isDate(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isDate(value); } export function isTime(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isTime(value); } export function isTimestamp(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isTimestamp(value); } export function isInterval(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isInterval(value); } export function isDuration(value: unknown): value is Utf8 { return value instanceof Utf8 || DataType.isDuration(value); } export function isList(value: unknown): value is List { return value instanceof List || DataType.isList(value); } export function isStruct(value: unknown): value is Struct { return value instanceof Struct || DataType.isStruct(value); } export function isUnion(value: unknown): value is Struct { return value instanceof Struct || DataType.isUnion(value); } export function isFixedSizeBinary(value: unknown): value is FixedSizeBinary { return value instanceof FixedSizeBinary || DataType.isFixedSizeBinary(value); } export function isFixedSizeList(value: unknown): value is FixedSizeList { return value instanceof FixedSizeList || DataType.isFixedSizeList(value); } /** Data type accepted by NodeJS SDK */ export type Data = Record[] | TableLike; /* * Options to control how a column should be converted to a vector array */ export class VectorColumnOptions { /** Vector column type. */ type: Float = new Float32(); constructor(values?: Partial) { Object.assign(this, values); } } // biome-ignore lint/suspicious/noExplicitAny: skip function vectorFromArray(data: any, type?: DataType) { // Workaround for: https://github.com/apache/arrow/issues/45862 // If FSL type with float if (DataType.isFixedSizeList(type) && DataType.isFloat(type.valueType)) { const extendedData = [...data, new Array(type.listSize).fill(0.0)]; const array = badVectorFromArray(extendedData, type); return array.slice(0, data.length); } else if (type === undefined) { return badVectorFromArray(data); } else { return badVectorFromArray(data, type); } } /** Options to control the makeArrowTable call. */ export class MakeArrowTableOptions { /* * Schema of the data. * * If this is not provided then the data type will be inferred from the * JS type. Integer numbers will become int64, floating point numbers * will become float64 and arrays will become variable sized lists with * the data type inferred from the first element in the array. * * The schema must be specified if there are no records (e.g. to make * an empty table) */ schema?: SchemaLike; /* * Mapping from vector column name to expected type * * Lance expects vector columns to be fixed size list arrays (i.e. tensors) * However, `makeArrowTable` will not infer this by default (it creates * variable size list arrays). This field can be used to indicate that a column * should be treated as a vector column and converted to a fixed size list. * * The keys should be the names of the vector columns. The value specifies the * expected data type of the vector columns. * * If `schema` is provided then this field is ignored. * * By default, the column named "vector" will be assumed to be a float32 * vector column. */ vectorColumns: Record = { vector: new VectorColumnOptions(), }; embeddings?: EmbeddingFunction; embeddingFunction?: EmbeddingFunctionConfig; /** * If true then string columns will be encoded with dictionary encoding * * Set this to true if your string columns tend to repeat the same values * often. For more precise control use the `schema` property to specify the * data type for individual columns. * * If `schema` is provided then this property is ignored. */ dictionaryEncodeStrings: boolean = false; constructor(values?: Partial) { Object.assign(this, values); } } /** * An enhanced version of the {@link makeTable} function from Apache Arrow * that supports nested fields and embeddings columns. * * (typically you do not need to call this function. It will be called automatically * when creating a table or adding data to it) * * This function converts an array of Record (row-major JS objects) * to an Arrow Table (a columnar structure) * * If a schema is provided then it will be used to determine the resulting array * types. Fields will also be reordered to fit the order defined by the schema. * * If a schema is not provided then the types will be inferred and the field order * will be controlled by the order of properties in the first record. If a type * is inferred it will always be nullable. * * If not all fields are found in the data, then a subset of the schema will be * returned. * * If the input is empty then a schema must be provided to create an empty table. * * When a schema is not specified then data types will be inferred. The inference * rules are as follows: * * - boolean => Bool * - number => Float64 * - bigint => Int64 * - String => Utf8 * - Buffer => Binary * - Record => Struct * - Array => List * @example * ```ts * import { fromTableToBuffer, makeArrowTable } from "../arrow"; * import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow"; * * const schema = new Schema([ * new Field("a", new Int32()), * new Field("b", new Float32()), * new Field("c", new FixedSizeList(3, new Field("item", new Float16()))), * ]); * const table = makeArrowTable([ * { a: 1, b: 2, c: [1, 2, 3] }, * { a: 4, b: 5, c: [4, 5, 6] }, * { a: 7, b: 8, c: [7, 8, 9] }, * ], { schema }); * ``` * * By default it assumes that the column named `vector` is a vector column * and it will be converted into a fixed size list array of type float32. * The `vectorColumns` option can be used to support other vector column * names and data types. * * ```ts * const schema = new Schema([ * new Field("a", new Float64()), * new Field("b", new Float64()), * new Field( * "vector", * new FixedSizeList(3, new Field("item", new Float32())) * ), * ]); * const table = makeArrowTable([ * { a: 1, b: 2, vector: [1, 2, 3] }, * { a: 4, b: 5, vector: [4, 5, 6] }, * { a: 7, b: 8, vector: [7, 8, 9] }, * ]); * assert.deepEqual(table.schema, schema); * ``` * * You can specify the vector column types and names using the options as well * * ```ts * const schema = new Schema([ * new Field('a', new Float64()), * new Field('b', new Float64()), * new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))), * new Field('vec2', new FixedSizeList(3, new Field('item', new Float16()))) * ]); * const table = makeArrowTable([ * { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] }, * { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] }, * { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] } * ], { * vectorColumns: { * vec1: { type: new Float16() }, * vec2: { type: new Float16() } * } * } * assert.deepEqual(table.schema, schema) * ``` */ export function makeArrowTable( data: Array>, options?: Partial, metadata?: Map, ): ArrowTable { const opt = new MakeArrowTableOptions(options !== undefined ? options : {}); let schema: Schema | undefined = undefined; if (opt.schema !== undefined && opt.schema !== null) { schema = sanitizeSchema(opt.schema); schema = validateSchemaEmbeddings( schema as Schema, data, options?.embeddingFunction, ); } let schemaMetadata = schema?.metadata || new Map(); if (metadata !== undefined) { schemaMetadata = new Map([...schemaMetadata, ...metadata]); } if ( data.length === 0 && (options?.schema === undefined || options?.schema === null) ) { throw new Error("At least one record or a schema needs to be provided"); } else if (data.length === 0) { if (schema === undefined) { throw new Error("A schema must be provided if data is empty"); } else { schema = new Schema(schema.fields, schemaMetadata); return new ArrowTable(schema); } } let inferredSchema = inferSchema(data, schema, opt); inferredSchema = new Schema(inferredSchema.fields, schemaMetadata); const finalColumns: Record = {}; for (const field of inferredSchema.fields) { finalColumns[field.name] = transposeData(data, field); } return new ArrowTable(inferredSchema, finalColumns); } function inferSchema( data: Array>, schema: Schema | undefined, opts: MakeArrowTableOptions, ): Schema { // We will collect all fields we see in the data. const pathTree = new PathTree(); for (const [rowI, row] of data.entries()) { for (const [path, value] of rowPathsAndValues(row)) { if (!pathTree.has(path)) { // First time seeing this field. if (schema !== undefined) { const field = getFieldForPath(schema, path); if (field === undefined) { throw new Error( `Found field not in schema: ${path.join(".")} at row ${rowI}`, ); } else { pathTree.set(path, field.type); } } else { const inferredType = inferType(value, path, opts); if (inferredType === undefined) { throw new Error(`Failed to infer data type for field ${path.join( ".", )} at row ${rowI}. \ Consider providing an explicit schema.`); } pathTree.set(path, inferredType); } } else if (schema === undefined) { const currentType = pathTree.get(path); const newType = inferType(value, path, opts); if (currentType !== newType) { new Error(`Failed to infer schema for data. Previously inferred type \ ${currentType} but found ${newType} at row ${rowI}. Consider \ providing an explicit schema.`); } } } } if (schema === undefined) { function fieldsFromPathTree(pathTree: PathTree): Field[] { const fields = []; for (const [name, value] of pathTree.map.entries()) { if (value instanceof PathTree) { const children = fieldsFromPathTree(value); fields.push(new Field(name, new Struct(children), true)); } else { fields.push(new Field(name, value, true)); } } return fields; } const fields = fieldsFromPathTree(pathTree); return new Schema(fields); } else { function takeMatchingFields( fields: Field[], pathTree: PathTree, ): Field[] { const outFields = []; for (const field of fields) { if (pathTree.map.has(field.name)) { const value = pathTree.get([field.name]); if (value instanceof PathTree) { const struct = field.type as Struct; const children = takeMatchingFields(struct.children, value); outFields.push( new Field(field.name, new Struct(children), field.nullable), ); } else { outFields.push( new Field(field.name, value as DataType, field.nullable), ); } } } return outFields; } const fields = takeMatchingFields(schema.fields, pathTree); return new Schema(fields); } } function* rowPathsAndValues( row: Record, basePath: string[] = [], ): Generator<[string[], unknown]> { for (const [key, value] of Object.entries(row)) { if (isObject(value)) { yield* rowPathsAndValues(value, [...basePath, key]); } else { yield [[...basePath, key], value]; } } } function isObject(value: unknown): value is Record { return ( typeof value === "object" && value !== null && !Array.isArray(value) && !(value instanceof RegExp) && !(value instanceof Date) && !(value instanceof Set) && !(value instanceof Map) && !(value instanceof Buffer) ); } function getFieldForPath(schema: Schema, path: string[]): Field | undefined { let current: Field | Schema = schema; for (const key of path) { if (current instanceof Schema) { const field: Field | undefined = current.fields.find( (f) => f.name === key, ); if (field === undefined) { return undefined; } current = field; } else if (current instanceof Field && DataType.isStruct(current.type)) { const struct: Struct = current.type; const field = struct.children.find((f) => f.name === key); if (field === undefined) { return undefined; } current = field; } else { return undefined; } } if (current instanceof Field) { return current; } else { return undefined; } } /** * Try to infer which Arrow type to use for a given value. * * May return undefined if the type cannot be inferred. */ function inferType( value: unknown, path: string[], opts: MakeArrowTableOptions, ): DataType | undefined { if (typeof value === "bigint") { return new Int64(); } else if (typeof value === "number") { // Even if it's an integer, it's safer to assume Float64. Users can // always provide an explicit schema or use BigInt if they mean integer. return new Float64(); } else if (typeof value === "string") { if (opts.dictionaryEncodeStrings) { return new Dictionary(new Utf8(), new Int32()); } else { return new Utf8(); } } else if (typeof value === "boolean") { return new Bool(); } else if (value instanceof Buffer) { return new Binary(); } else if (Array.isArray(value)) { if (value.length === 0) { return undefined; // Without any values we can't infer the type } if (path.length === 1 && Object.hasOwn(opts.vectorColumns, path[0])) { const floatType = sanitizeType(opts.vectorColumns[path[0]].type); return new FixedSizeList( value.length, new Field("item", floatType, true), ); } const valueType = inferType(value[0], path, opts); if (valueType === undefined) { return undefined; } // Try to automatically detect embedding columns. if (valueType instanceof Float && path[path.length - 1] === "vector") { // We default to Float32 for vectors. const child = new Field("item", new Float32(), true); return new FixedSizeList(value.length, child); } else { const child = new Field("item", valueType, true); return new List(child); } } else { // TODO: timestamp return undefined; } } class PathTree { map: Map>; constructor(entries?: [string[], V][]) { this.map = new Map(); if (entries !== undefined) { for (const [path, value] of entries) { this.set(path, value); } } } has(path: string[]): boolean { let ref: PathTree = this; for (const part of path) { if (!(ref instanceof PathTree) || !ref.map.has(part)) { return false; } ref = ref.map.get(part) as PathTree; } return true; } get(path: string[]): V | undefined { let ref: PathTree = this; for (const part of path) { if (!(ref instanceof PathTree) || !ref.map.has(part)) { return undefined; } ref = ref.map.get(part) as PathTree; } return ref as V; } set(path: string[], value: V): void { let ref: PathTree = this; for (const part of path.slice(0, path.length - 1)) { if (!ref.map.has(part)) { ref.map.set(part, new PathTree()); } ref = ref.map.get(part) as PathTree; } ref.map.set(path[path.length - 1], value); } } function transposeData( data: Record[], field: Field, path: string[] = [], ): Vector { if (field.type instanceof Struct) { const childFields = field.type.children; const fullPath = [...path, field.name]; const childVectors = childFields.map((child) => { return transposeData(data, child, fullPath); }); const structData = makeData({ type: field.type, children: childVectors as unknown as ArrowData[], }); return arrowMakeVector(structData); } else { const valuesPath = [...path, field.name]; const values = data.map((datum) => { let current: unknown = datum; for (const key of valuesPath) { if (current == null) { return null; } if ( isObject(current) && (Object.hasOwn(current, key) || key in current) ) { current = current[key]; } else { return null; } } return current; }); return makeVector(values, field.type); } } /** * Create an empty Arrow table with the provided schema */ export function makeEmptyTable( schema: SchemaLike, metadata?: Map, ): ArrowTable { return makeArrowTable([], { schema }, metadata); } /** * Helper function to convert Array> to a variable sized list array */ // @ts-expect-error (Vector is not assignable to Vector) function makeListVector(lists: unknown[][]): Vector { if (lists.length === 0 || lists[0].length === 0) { throw Error("Cannot infer list vector from empty array or empty list"); } const sampleList = lists[0]; // biome-ignore lint/suspicious/noExplicitAny: skip let inferredType: any; try { const sampleVector = makeVector(sampleList); inferredType = sampleVector.type; } catch (error: unknown) { // eslint-disable-next-line @typescript-eslint/restrict-template-expressions throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`); } const listBuilder = makeBuilder({ type: new List(new Field("item", inferredType, true)), }); for (const list of lists) { listBuilder.append(list); } return listBuilder.finish().toVector(); } /** Helper function to convert an Array of JS values to an Arrow Vector */ function makeVector( values: unknown[], type?: DataType, stringAsDictionary?: boolean, // biome-ignore lint/suspicious/noExplicitAny: skip ): Vector { if (type !== undefined) { // No need for inference, let Arrow create it if (type instanceof Int) { if (DataType.isInt(type) && type.bitWidth === 64) { // wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051 values = values.map((v) => { if (v === null) { return v; } else if (typeof v === "bigint") { return v; } else if (typeof v === "number") { return BigInt(v); } else { return v; } }); } else { // Similarly, bigint isn't supported for 16 or 32-bit ints. values = values.map((v) => { if (typeof v == "bigint") { return Number(v); } else { return v; } }); } } return vectorFromArray(values, type); } if (values.length === 0) { throw Error( "makeVector requires at least one value or the type must be specfied", ); } const sampleValue = values.find((val) => val !== null && val !== undefined); if (sampleValue === undefined) { throw Error( "makeVector cannot infer the type if all values are null or undefined", ); } if (Array.isArray(sampleValue)) { // Default Arrow inference doesn't handle list types return makeListVector(values as unknown[][]); } else if (Buffer.isBuffer(sampleValue)) { // Default Arrow inference doesn't handle Buffer return vectorFromArray(values, new Binary()); } else if ( !(stringAsDictionary ?? false) && (typeof sampleValue === "string" || sampleValue instanceof String) ) { // If the type is string then don't use Arrow's default inference unless dictionaries are requested // because it will always use dictionary encoding for strings return vectorFromArray(values, new Utf8()); } else { // Convert a JS array of values to an arrow vector return vectorFromArray(values); } } /** Helper function to apply embeddings from metadata to an input table */ async function applyEmbeddingsFromMetadata( table: ArrowTable, schema: Schema, ): Promise { const registry = getRegistry(); const functions = await registry.parseFunctions(schema.metadata); const columns = Object.fromEntries( table.schema.fields.map((field) => [ field.name, table.getChild(field.name)!, ]), ); for (const functionEntry of functions.values()) { const sourceColumn = columns[functionEntry.sourceColumn]; const destColumn = functionEntry.vectorColumn ?? "vector"; if (sourceColumn === undefined) { throw new Error( `Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`, ); } // Check if destination column exists and handle accordingly if (columns[destColumn] !== undefined) { const existingColumn = columns[destColumn]; // If the column exists but is all null, we can fill it with embeddings if (existingColumn.nullCount !== existingColumn.length) { // Column has non-null values, skip embedding application continue; } } if (table.batches.length > 1) { throw new Error( "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch", ); } const values = sourceColumn.toArray(); const vectors = await functionEntry.function.computeSourceEmbeddings(values); if (vectors.length !== values.length) { throw new Error( "Embedding function did not return an embedding for each input element", ); } let destType: DataType; const dtype = schema.fields.find((f) => f.name === destColumn)!.type; if (isFixedSizeList(dtype)) { destType = sanitizeType(dtype); } else { throw new Error( "Expected FixedSizeList as datatype for vector field, instead got: " + dtype, ); } const vector = makeVector(vectors, destType); columns[destColumn] = vector; } const newTable = new ArrowTable(columns); return alignTable(newTable, schema); } /** Helper function to apply embeddings to an input table */ async function applyEmbeddings( table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: SchemaLike, ): Promise { if (schema !== undefined && schema !== null) { schema = sanitizeSchema(schema); } if (schema?.metadata.has("embedding_functions")) { return applyEmbeddingsFromMetadata(table, schema! as Schema); } else if (embeddings == null || embeddings === undefined) { return table; } let schemaMetadata = schema?.metadata || new Map(); if (!(embeddings == null || embeddings === undefined)) { const registry = getRegistry(); const embeddingMetadata = registry.getTableMetadata([embeddings]); schemaMetadata = new Map([...schemaMetadata, ...embeddingMetadata]); } // Convert from ArrowTable to Record const colEntries = [...Array(table.numCols).keys()].map((_, idx) => { const name = table.schema.fields[idx].name; // eslint-disable-next-line @typescript-eslint/no-non-null-assertion const vec = table.getChildAt(idx)!; return [name, vec]; }); const newColumns = Object.fromEntries(colEntries); const sourceColumn = newColumns[embeddings.sourceColumn]; const destColumn = embeddings.vectorColumn ?? "vector"; const innerDestType = embeddings.function.embeddingDataType() ?? new Float32(); if (sourceColumn === undefined) { throw new Error( `Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`, ); } if (table.numRows === 0) { if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) { // We have an empty table and it already has the embedding column so no work needs to be done // Note: we don't return an error like we did below because this is a common occurrence. For example, // if we call convertToTable with 0 records and a schema that includes the embedding return table; } const dimensions = embeddings.function.ndims(); if (dimensions !== undefined) { const destType = newVectorType(dimensions, innerDestType); newColumns[destColumn] = makeVector([], destType); } else if (schema != null) { const destField = schema.fields.find((f) => f.name === destColumn); if (destField != null) { newColumns[destColumn] = makeVector([], destField.type); } else { throw new Error( `Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`, ); } } else { throw new Error( "Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`", ); } } else { // Check if destination column exists and handle accordingly if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) { const existingColumn = newColumns[destColumn]; // If the column exists but is all null, we can fill it with embeddings if (existingColumn.nullCount !== existingColumn.length) { // Column has non-null values, skip embedding application and return table as-is let newTable = new ArrowTable(newColumns); if (schema != null) { newTable = alignTable(newTable, schema as Schema); } return new ArrowTable( new Schema(newTable.schema.fields, schemaMetadata), newTable.batches, ); } } if (table.batches.length > 1) { throw new Error( "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch", ); } const values = sourceColumn.toArray(); const vectors = await embeddings.function.computeSourceEmbeddings( values as T[], ); if (vectors.length !== values.length) { throw new Error( "Embedding function did not return an embedding for each input element", ); } const destType = newVectorType(vectors[0].length, innerDestType); newColumns[destColumn] = makeVector(vectors, destType); } let newTable = new ArrowTable(newColumns); if (schema != null) { if (schema.fields.find((f) => f.name === destColumn) === undefined) { throw new Error( `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`, ); } newTable = alignTable(newTable, schema as Schema); } newTable = new ArrowTable( new Schema(newTable.schema.fields, schemaMetadata), newTable.batches, ); return newTable; } /** * Convert an Array of records into an Arrow Table, optionally applying an * embeddings function to it. * * This function calls `makeArrowTable` first to create the Arrow Table. * Any provided `makeTableOptions` (e.g. a schema) will be passed on to * that call. * * The embedding function will be passed a column of values (based on the * `sourceColumn` of the embedding function) and expects to receive back * number[][] which will be converted into a fixed size list column. By * default this will be a fixed size list of Float32 but that can be * customized by the `embeddingDataType` property of the embedding function. * * If a schema is provided in `makeTableOptions` then it should include the * embedding columns. If no schema is provded then embedding columns will * be placed at the end of the table, after all of the input columns. */ export async function convertToTable( data: Array>, embeddings?: EmbeddingFunctionConfig, makeTableOptions?: Partial, ): Promise { const table = makeArrowTable(data, makeTableOptions); return await applyEmbeddings(table, embeddings, makeTableOptions?.schema); } /** Creates the Arrow Type for a Vector column with dimension `dim` */ export function newVectorType( dim: number, innerType: unknown, ): FixedSizeList { // in Lance we always default to have the elements nullable, so we need to set it to true // otherwise we often get schema mismatches because the stored data always has schema with nullable elements const children = new Field("item", sanitizeType(innerType), true); return new FixedSizeList(dim, children); } /** * Serialize an Array of records into a buffer using the Arrow IPC File serialization * * This function will call `convertToTable` and pass on `embeddings` and `schema` * * `schema` is required if data is empty */ export async function fromRecordsToBuffer( data: Array>, embeddings?: EmbeddingFunctionConfig, schema?: Schema, ): Promise { if (schema !== undefined && schema !== null) { schema = sanitizeSchema(schema); } const table = await convertToTable(data, embeddings, { schema }); const writer = RecordBatchFileWriter.writeAll(table); return Buffer.from(await writer.toUint8Array()); } /** * Serialize an Array of records into a buffer using the Arrow IPC Stream serialization * * This function will call `convertToTable` and pass on `embeddings` and `schema` * * `schema` is required if data is empty */ export async function fromRecordsToStreamBuffer( data: Array>, embeddings?: EmbeddingFunctionConfig, schema?: Schema, ): Promise { if (schema !== undefined && schema !== null) { schema = sanitizeSchema(schema); } const table = await convertToTable(data, embeddings, { schema }); const writer = RecordBatchStreamWriter.writeAll(table); return Buffer.from(await writer.toUint8Array()); } /** * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization * * This function will apply `embeddings` to the table in a manner similar to * `convertToTable`. * * `schema` is required if the table is empty */ export async function fromTableToBuffer( table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: SchemaLike, ): Promise { if (schema !== undefined && schema !== null) { schema = sanitizeSchema(schema); } const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema); const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings); return Buffer.from(await writer.toUint8Array()); } /** * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization * * This function will apply `embeddings` to the table in a manner similar to * `convertToTable`. * * `schema` is required if the table is empty */ export async function fromDataToBuffer( data: Data, embeddings?: EmbeddingFunctionConfig, schema?: Schema, ): Promise { if (schema !== undefined && schema !== null) { schema = sanitizeSchema(schema); } if (isArrowTable(data)) { return fromTableToBuffer(sanitizeTable(data), embeddings, schema); } else { const table = await convertToTable(data, embeddings, { schema }); return fromTableToBuffer(table); } } /** * Read a single record batch from a buffer. * * Returns null if the buffer does not contain a record batch */ export async function fromBufferToRecordBatch( data: Buffer, ): Promise { const iter = await RecordBatchFileReader.readAll(Buffer.from(data)).next() .value; const recordBatch = iter?.next().value; return recordBatch || null; } /** * Create a buffer containing a single record batch */ export async function fromRecordBatchToBuffer( batch: RecordBatch, ): Promise { const writer = new RecordBatchFileWriter().writeAll([batch]); return Buffer.from(await writer.toUint8Array()); } /** * Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization * * This function will apply `embeddings` to the table in a manner similar to * `convertToTable`. * * `schema` is required if the table is empty */ export async function fromTableToStreamBuffer( table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: SchemaLike, ): Promise { const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema); const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings); return Buffer.from(await writer.toUint8Array()); } /** * Reorder the columns in `batch` so that they agree with the field order in `schema` */ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch { const alignedChildren = []; for (const field of schema.fields) { const indexInBatch = batch.schema.fields?.findIndex( (f) => f.name === field.name, ); if (indexInBatch < 0) { throw new Error( `The column ${field.name} was not found in the Arrow Table`, ); } alignedChildren.push(batch.data.children[indexInBatch]); } const newData = makeData({ type: new Struct(schema.fields), length: batch.numRows, nullCount: batch.nullCount, children: alignedChildren, }); return new RecordBatch(schema, newData); } /** * Reorder the columns in `table` so that they agree with the field order in `schema` */ function alignTable(table: ArrowTable, schema: Schema): ArrowTable { const alignedBatches = table.batches.map((batch) => alignBatch(batch, schema), ); return new ArrowTable(schema, alignedBatches); } /** * Create an empty table with the given schema */ export function createEmptyTable(schema: Schema): ArrowTable { return new ArrowTable(sanitizeSchema(schema)); } function validateSchemaEmbeddings( schema: Schema, data: Array>, embeddings: EmbeddingFunctionConfig | undefined, ): Schema { const fields = []; const missingEmbeddingFields = []; // First we check if the field is a `FixedSizeList` // Then we check if the data contains the field // if it does not, we add it to the list of missing embedding fields // Finally, we check if those missing embedding fields are `this._embeddings` // if they are not, we throw an error for (let field of schema.fields) { if (isFixedSizeList(field.type)) { field = sanitizeField(field); if (data.length !== 0 && data?.[0]?.[field.name] === undefined) { if (schema.metadata.has("embedding_functions")) { const embeddings = JSON.parse( schema.metadata.get("embedding_functions")!, ); if ( // biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f` embeddings.find((f: any) => f["vectorColumn"] === field.name) === undefined ) { missingEmbeddingFields.push(field); } } else { missingEmbeddingFields.push(field); } } else { fields.push(field); } } else { fields.push(field); } } if (missingEmbeddingFields.length > 0 && embeddings === undefined) { throw new Error( `Table has embeddings: "${missingEmbeddingFields .map((f) => f.name) .join(",")}", but no embedding function was provided`, ); } return new Schema(fields, schema.metadata); } interface JsonDataType { type: string; fields?: JsonField[]; length?: number; } interface JsonField { name: string; type: JsonDataType; nullable: boolean; metadata: Map; } // Matches format of https://github.com/lancedb/lance/blob/main/rust/lance/src/arrow/json.rs export function dataTypeToJson(dataType: DataType): JsonDataType { switch (dataType.typeId) { // For primitives, matches https://github.com/lancedb/lance/blob/e12bb9eff2a52f753668d4b62c52e4d72b10d294/rust/lance-core/src/datatypes.rs#L185 case Type.Null: return { type: "null" }; case Type.Bool: return { type: "bool" }; case Type.Int8: return { type: "int8" }; case Type.Int16: return { type: "int16" }; case Type.Int32: return { type: "int32" }; case Type.Int64: return { type: "int64" }; case Type.Uint8: return { type: "uint8" }; case Type.Uint16: return { type: "uint16" }; case Type.Uint32: return { type: "uint32" }; case Type.Uint64: return { type: "uint64" }; case Type.Int: { const bitWidth = (dataType as Int).bitWidth; const signed = (dataType as Int).isSigned; const prefix = signed ? "" : "u"; return { type: `${prefix}int${bitWidth}` }; } case Type.Float: { switch ((dataType as Float).precision) { case Precision.HALF: return { type: "halffloat" }; case Precision.SINGLE: return { type: "float" }; case Precision.DOUBLE: return { type: "double" }; } throw Error("Unsupported float precision"); } case Type.Float16: return { type: "halffloat" }; case Type.Float32: return { type: "float" }; case Type.Float64: return { type: "double" }; case Type.Utf8: return { type: "string" }; case Type.Binary: return { type: "binary" }; case Type.LargeUtf8: return { type: "large_string" }; case Type.LargeBinary: return { type: "large_binary" }; case Type.List: return { type: "list", fields: [fieldToJson((dataType as List).children[0])], }; case Type.FixedSizeList: { const fixedSizeList = dataType as FixedSizeList; return { type: "fixed_size_list", fields: [fieldToJson(fixedSizeList.children[0])], length: fixedSizeList.listSize, }; } case Type.Struct: return { type: "struct", fields: (dataType as Struct).children.map(fieldToJson), }; case Type.Date: { const unit = (dataType as Date_).unit; return { type: unit === DateUnit.DAY ? "date32:day" : "date64:ms", }; } case Type.Timestamp: { const timestamp = dataType as Timestamp; const timezone = timestamp.timezone || "-"; return { type: `timestamp:${timestamp.unit}:${timezone}`, }; } case Type.Decimal: { const decimal = dataType as Decimal; return { type: `decimal:${decimal.bitWidth}:${decimal.precision}:${decimal.scale}`, }; } case Type.Duration: { const duration = dataType as Duration; return { type: `duration:${duration.unit}` }; } case Type.FixedSizeBinary: { const byteWidth = (dataType as FixedSizeBinary).byteWidth; return { type: `fixed_size_binary:${byteWidth}` }; } case Type.Dictionary: { const dict = dataType as Dictionary; const indexType = dataTypeToJson(dict.indices); const valueType = dataTypeToJson(dict.valueType); return { type: `dict:${valueType.type}:${indexType.type}:false`, }; } } throw new Error("Unsupported data type"); } function fieldToJson(field: Field): JsonField { return { name: field.name, type: dataTypeToJson(field.type), nullable: field.nullable, metadata: field.metadata, }; }