mirror of https://github.com/lancedb/lancedb.git
A nested field such as `metadata.filename` would be present structurally, but its value was ALWAYS
null.

The script below isn't included as a file in the repo, but it may be useful for
understanding the problem for people searching on this issue, so I'm including
it here as documentation. Before this patch, any field nested more than one
level deep was accepted on insert but returned null for its subfields when queried.
```js
const lancedb = require('@lancedb/lancedb');

// Debug logger
function debug(message, data) {
  console.log(`[TEST] ${message}`, data !== undefined ? data : '');
}

// Symbols used internally by Arrow's row proxies (declared for inspection; unused below)
const kParent = Symbol.for("parent");
const kRowIndex = Symbol.for("rowIndex");

// Override console.log so noisy LanceDB info logs are filtered out
const originalConsoleLog = console.log;
console.log = function () {
  if (arguments[0] && typeof arguments[0] === 'string' && arguments[0].includes('[INFO] [LanceDB]')) {
    return;
  }
  originalConsoleLog.apply(console, arguments);
};

async function main() {
  debug('Starting test...');

  // Connect to the database
  debug('Connecting to database...');
  const db = await lancedb.connect('./.lancedb');

  // Try to open an existing table, or create a new one if it doesn't exist
  let table;
  try {
    table = await db.openTable('test_nested_fields');
    debug('Opened existing table');
  } catch (e) {
    debug('Creating new table...');
    // Create test data with nested metadata structure
    const data = [
      {
        id: 'test1',
        vector: [1, 2, 3],
        metadata: {
          filePath: "/path/to/file1.ts",
          startLine: 10,
          endLine: 20,
          text: "function test() { return true; }"
        }
      },
      {
        id: 'test2',
        vector: [4, 5, 6],
        metadata: {
          filePath: "/path/to/file2.ts",
          startLine: 30,
          endLine: 40,
          text: "function test2() { return false; }"
        }
      }
    ];
    debug('Data to be inserted:', JSON.stringify(data, null, 2));

    // Create the table
    table = await db.createTable('test_nested_fields', data);
    debug('Table created successfully');
  }

  // Query the table and get results
  debug('Querying table...');
  const results = await table.search([1, 2, 3]).limit(10).toArray();

  // Log the results
  debug('Number of results:', results.length);
  if (results.length > 0) {
    const firstResult = results[0];
    debug('First result properties:', Object.keys(firstResult));

    // Check if metadata is accessible and what properties it has
    if (firstResult.metadata) {
      debug('Metadata properties:', Object.keys(firstResult.metadata));
      debug('Metadata filePath:', firstResult.metadata.filePath);
      debug('Metadata startLine:', firstResult.metadata.startLine);

      // Destructure to see if that helps
      const { filePath, startLine, endLine, text } = firstResult.metadata;
      debug('Destructured values:', { filePath, startLine, endLine, text });

      // Check whether we got plain objects back or proxies
      debug('Result is proxy?', Object.getPrototypeOf(firstResult) !== Object.prototype);
      debug('Metadata is proxy?', Object.getPrototypeOf(firstResult.metadata) !== Object.prototype);
    } else {
      debug('Metadata is not accessible!');
    }
  }

  // Close the database
  await db.close();
}

main().catch(e => {
  console.error('Error:', e);
});
```
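
For reference, after the patch the nested subfields should round-trip intact. A minimal sketch of the expected post-fix behavior, reusing `table` and the test data from the script above (these assertions are added here for illustration and are not part of the original report):

```js
// Hypothetical post-fix expectation: nested struct values come back intact
// rather than null (test1 has the vector closest to [1, 2, 3]).
const row = (await table.search([1, 2, 3]).limit(1).toArray())[0];
console.assert(row.metadata.filePath === '/path/to/file1.ts');
console.assert(row.metadata.startLine === 10);
```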
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit
- **Bug Fixes**
- Improved handling of nested struct fields to ensure accurate
preservation of values during serialization and deserialization.
- Enhanced robustness when accessing nested object properties, reducing
errors with missing or null values.
- **Tests**
- Added tests to verify correct handling of nested struct fields through
serialization and deserialization.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---------
Co-authored-by: Will Jones <willjones127@gmail.com>
1337 lines
42 KiB
TypeScript
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import {
  Data as ArrowData,
  Table as ArrowTable,
  Binary,
  Bool,
  BufferType,
  DataType,
  DateUnit,
  Date_,
  Decimal,
  Dictionary,
  Duration,
  Field,
  FixedSizeBinary,
  FixedSizeList,
  Float,
  Float32,
  Float64,
  Int,
  Int32,
  Int64,
  Interval,
  LargeBinary,
  LargeUtf8,
  List,
  Null,
  Precision,
  RecordBatch,
  RecordBatchFileReader,
  RecordBatchFileWriter,
  RecordBatchStreamWriter,
  Schema,
  Struct,
  Time,
  Timestamp,
  Type,
  Union,
  Utf8,
  Vector,
  makeVector as arrowMakeVector,
  vectorFromArray as badVectorFromArray,
  makeBuilder,
  makeData,
  makeTable,
} from "apache-arrow";
import { Buffers } from "apache-arrow/data";
import { type EmbeddingFunction } from "./embedding/embedding_function";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import {
  sanitizeField,
  sanitizeSchema,
  sanitizeTable,
  sanitizeType,
} from "./sanitize";

export * from "apache-arrow";
export type SchemaLike =
  | Schema
  | {
      fields: FieldLike[];
      metadata: Map<string, string>;
      get names(): unknown[];
    };
export type FieldLike =
  | Field
  | {
      type: string;
      name: string;
      nullable?: boolean;
      metadata?: Map<string, string>;
    };

export type DataLike =
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  | import("apache-arrow").Data<Struct<any>>
  | {
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      type: any;
      length: number;
      offset: number;
      stride: number;
      nullable: boolean;
      children: DataLike[];
      get nullCount(): number;
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      values: Buffers<any>[BufferType.DATA];
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      typeIds: Buffers<any>[BufferType.TYPE];
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      nullBitmap: Buffers<any>[BufferType.VALIDITY];
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      valueOffsets: Buffers<any>[BufferType.OFFSET];
    };

export type RecordBatchLike =
  | RecordBatch
  | {
      schema: SchemaLike;
      data: DataLike;
    };

export type TableLike =
  | ArrowTable
  | { schema: SchemaLike; batches: RecordBatchLike[] };

export type IntoVector =
  | Float32Array
  | Float64Array
  | number[]
  | Promise<Float32Array | Float64Array | number[]>;

export function isArrowTable(value: object): value is TableLike {
  if (value instanceof ArrowTable) return true;
  return "schema" in value && "batches" in value;
}

export function isNull(value: unknown): value is Null {
  return value instanceof Null || DataType.isNull(value);
}
export function isInt(value: unknown): value is Int {
  return value instanceof Int || DataType.isInt(value);
}
export function isFloat(value: unknown): value is Float {
  return value instanceof Float || DataType.isFloat(value);
}
export function isBinary(value: unknown): value is Binary {
  return value instanceof Binary || DataType.isBinary(value);
}
export function isLargeBinary(value: unknown): value is LargeBinary {
  return value instanceof LargeBinary || DataType.isLargeBinary(value);
}
export function isUtf8(value: unknown): value is Utf8 {
  return value instanceof Utf8 || DataType.isUtf8(value);
}
export function isLargeUtf8(value: unknown): value is LargeUtf8 {
  return value instanceof LargeUtf8 || DataType.isLargeUtf8(value);
}
export function isBool(value: unknown): value is Bool {
  return value instanceof Bool || DataType.isBool(value);
}
export function isDecimal(value: unknown): value is Decimal {
  return value instanceof Decimal || DataType.isDecimal(value);
}
export function isDate(value: unknown): value is Date_ {
  return value instanceof Date_ || DataType.isDate(value);
}
export function isTime(value: unknown): value is Time {
  return value instanceof Time || DataType.isTime(value);
}
export function isTimestamp(value: unknown): value is Timestamp {
  return value instanceof Timestamp || DataType.isTimestamp(value);
}
export function isInterval(value: unknown): value is Interval {
  return value instanceof Interval || DataType.isInterval(value);
}
export function isDuration(value: unknown): value is Duration {
  return value instanceof Duration || DataType.isDuration(value);
}
export function isList(value: unknown): value is List {
  return value instanceof List || DataType.isList(value);
}
export function isStruct(value: unknown): value is Struct {
  return value instanceof Struct || DataType.isStruct(value);
}
export function isUnion(value: unknown): value is Union {
  return value instanceof Union || DataType.isUnion(value);
}
export function isFixedSizeBinary(value: unknown): value is FixedSizeBinary {
  return value instanceof FixedSizeBinary || DataType.isFixedSizeBinary(value);
}

export function isFixedSizeList(value: unknown): value is FixedSizeList {
  return value instanceof FixedSizeList || DataType.isFixedSizeList(value);
}

/** Data type accepted by NodeJS SDK */
export type Data = Record<string, unknown>[] | TableLike;

/*
 * Options to control how a column should be converted to a vector array
 */
export class VectorColumnOptions {
  /** Vector column type. */
  type: Float = new Float32();

  constructor(values?: Partial<VectorColumnOptions>) {
    Object.assign(this, values);
  }
}

// biome-ignore lint/suspicious/noExplicitAny: skip
function vectorFromArray(data: any, type?: DataType) {
  // Workaround for: https://github.com/apache/arrow/issues/45862
  // If the target is a fixed-size list of floats, pad the input with one dummy
  // row, build the vector, and slice the padding back off.
  if (DataType.isFixedSizeList(type) && DataType.isFloat(type.valueType)) {
    const extendedData = [...data, new Array(type.listSize).fill(0.0)];
    const array = badVectorFromArray(extendedData, type);
    return array.slice(0, data.length);
  } else if (type === undefined) {
    return badVectorFromArray(data);
  } else {
    return badVectorFromArray(data, type);
  }
}

/** Options to control the makeArrowTable call. */
export class MakeArrowTableOptions {
  /*
   * Schema of the data.
   *
   * If this is not provided then the data type will be inferred from the
   * JS type. Numbers (including integers) will become float64, bigints
   * will become int64, and arrays will become variable sized lists with
   * the data type inferred from the first element in the array.
   *
   * The schema must be specified if there are no records (e.g. to make
   * an empty table)
   */
  schema?: SchemaLike;

  /*
   * Mapping from vector column name to expected type
   *
   * Lance expects vector columns to be fixed size list arrays (i.e. tensors)
   * However, `makeArrowTable` will not infer this by default (it creates
   * variable size list arrays). This field can be used to indicate that a column
   * should be treated as a vector column and converted to a fixed size list.
   *
   * The keys should be the names of the vector columns. The value specifies the
   * expected data type of the vector columns.
   *
   * If `schema` is provided then this field is ignored.
   *
   * By default, the column named "vector" will be assumed to be a float32
   * vector column.
   */
  vectorColumns: Record<string, VectorColumnOptions> = {
    vector: new VectorColumnOptions(),
  };
  embeddings?: EmbeddingFunction<unknown>;
  embeddingFunction?: EmbeddingFunctionConfig;

  /**
   * If true then string columns will be encoded with dictionary encoding
   *
   * Set this to true if your string columns tend to repeat the same values
   * often. For more precise control use the `schema` property to specify the
   * data type for individual columns.
   *
   * If `schema` is provided then this property is ignored.
   */
  dictionaryEncodeStrings: boolean = false;

  constructor(values?: Partial<MakeArrowTableOptions>) {
    Object.assign(this, values);
  }
}

/**
 * An enhanced version of the {@link makeTable} function from Apache Arrow
 * that supports nested fields and embeddings columns.
 *
 * (typically you do not need to call this function. It will be called automatically
 * when creating a table or adding data to it)
 *
 * This function converts an array of Record<String, any> (row-major JS objects)
 * to an Arrow Table (a columnar structure)
 *
 * If a schema is provided then it will be used to determine the resulting array
 * types. Fields will also be reordered to fit the order defined by the schema.
 *
 * If a schema is not provided then the types will be inferred and the field order
 * will be controlled by the order of properties in the first record. If a type
 * is inferred it will always be nullable.
 *
 * If not all fields are found in the data, then a subset of the schema will be
 * returned.
 *
 * If the input is empty then a schema must be provided to create an empty table.
 *
 * When a schema is not specified then data types will be inferred. The inference
 * rules are as follows:
 *
 * - boolean => Bool
 * - number => Float64
 * - bigint => Int64
 * - String => Utf8
 * - Buffer => Binary
 * - Record<String, any> => Struct
 * - Array<any> => List
 * @example
 * ```ts
 * import { fromTableToBuffer, makeArrowTable } from "../arrow";
 * import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
 *
 * const schema = new Schema([
 *   new Field("a", new Int32()),
 *   new Field("b", new Float32()),
 *   new Field("c", new FixedSizeList(3, new Field("item", new Float16()))),
 * ]);
 * const table = makeArrowTable([
 *   { a: 1, b: 2, c: [1, 2, 3] },
 *   { a: 4, b: 5, c: [4, 5, 6] },
 *   { a: 7, b: 8, c: [7, 8, 9] },
 * ], { schema });
 * ```
 *
 * By default it assumes that the column named `vector` is a vector column
 * and it will be converted into a fixed size list array of type float32.
 * The `vectorColumns` option can be used to support other vector column
 * names and data types.
 *
 * ```ts
 * const schema = new Schema([
 *   new Field("a", new Float64()),
 *   new Field("b", new Float64()),
 *   new Field(
 *     "vector",
 *     new FixedSizeList(3, new Field("item", new Float32()))
 *   ),
 * ]);
 * const table = makeArrowTable([
 *   { a: 1, b: 2, vector: [1, 2, 3] },
 *   { a: 4, b: 5, vector: [4, 5, 6] },
 *   { a: 7, b: 8, vector: [7, 8, 9] },
 * ]);
 * assert.deepEqual(table.schema, schema);
 * ```
 *
 * You can specify the vector column types and names using the options as well
 *
 * ```ts
 * const schema = new Schema([
 *   new Field('a', new Float64()),
 *   new Field('b', new Float64()),
 *   new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
 *   new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
 * ]);
 * const table = makeArrowTable([
 *   { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
 *   { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
 *   { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
 * ], {
 *   vectorColumns: {
 *     vec1: { type: new Float16() },
 *     vec2: { type: new Float16() }
 *   }
 * });
 * assert.deepEqual(table.schema, schema)
 * ```
 */
export function makeArrowTable(
  data: Array<Record<string, unknown>>,
  options?: Partial<MakeArrowTableOptions>,
  metadata?: Map<string, string>,
): ArrowTable {
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
  let schema: Schema | undefined = undefined;
  if (opt.schema !== undefined && opt.schema !== null) {
    schema = sanitizeSchema(opt.schema);
    schema = validateSchemaEmbeddings(
      schema as Schema,
      data,
      options?.embeddingFunction,
    );
  }

  let schemaMetadata = schema?.metadata || new Map<string, string>();
  if (metadata !== undefined) {
    schemaMetadata = new Map([...schemaMetadata, ...metadata]);
  }

  if (
    data.length === 0 &&
    (options?.schema === undefined || options?.schema === null)
  ) {
    throw new Error("At least one record or a schema needs to be provided");
  } else if (data.length === 0) {
    if (schema === undefined) {
      throw new Error("A schema must be provided if data is empty");
    } else {
      schema = new Schema(schema.fields, schemaMetadata);
      return new ArrowTable(schema);
    }
  }

  let inferredSchema = inferSchema(data, schema, opt);
  inferredSchema = new Schema(inferredSchema.fields, schemaMetadata);

  const finalColumns: Record<string, Vector> = {};
  for (const field of inferredSchema.fields) {
    finalColumns[field.name] = transposeData(data, field);
  }

  return new ArrowTable(inferredSchema, finalColumns);
}

function inferSchema(
  data: Array<Record<string, unknown>>,
  schema: Schema | undefined,
  opts: MakeArrowTableOptions,
): Schema {
  // We will collect all fields we see in the data.
  const pathTree = new PathTree<DataType>();

  for (const [rowI, row] of data.entries()) {
    for (const [path, value] of rowPathsAndValues(row)) {
      if (!pathTree.has(path)) {
        // First time seeing this field.
        if (schema !== undefined) {
          const field = getFieldForPath(schema, path);
          if (field === undefined) {
            throw new Error(
              `Found field not in schema: ${path.join(".")} at row ${rowI}`,
            );
          } else {
            pathTree.set(path, field.type);
          }
        } else {
          const inferredType = inferType(value, path, opts);
          if (inferredType === undefined) {
            throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
Consider providing an explicit schema.`);
          }
          pathTree.set(path, inferredType);
        }
      } else if (schema === undefined) {
        const currentType = pathTree.get(path);
        const newType = inferType(value, path, opts);
        // Compare by string form: inferType returns a fresh DataType instance
        // on every call, so reference equality would always report a mismatch.
        if (`${currentType}` !== `${newType}`) {
          throw new Error(`Failed to infer schema for data. Previously inferred type \
${currentType} but found ${newType} at row ${rowI}. Consider \
providing an explicit schema.`);
        }
      }
    }
  }

  if (schema === undefined) {
    function fieldsFromPathTree(pathTree: PathTree<DataType>): Field[] {
      const fields = [];
      for (const [name, value] of pathTree.map.entries()) {
        if (value instanceof PathTree) {
          const children = fieldsFromPathTree(value);
          fields.push(new Field(name, new Struct(children), true));
        } else {
          fields.push(new Field(name, value, true));
        }
      }
      return fields;
    }
    const fields = fieldsFromPathTree(pathTree);
    return new Schema(fields);
  } else {
    function takeMatchingFields(
      fields: Field[],
      pathTree: PathTree<DataType>,
    ): Field[] {
      const outFields = [];
      for (const field of fields) {
        if (pathTree.map.has(field.name)) {
          const value = pathTree.get([field.name]);
          if (value instanceof PathTree) {
            const struct = field.type as Struct;
            const children = takeMatchingFields(struct.children, value);
            outFields.push(
              new Field(field.name, new Struct(children), field.nullable),
            );
          } else {
            outFields.push(
              new Field(field.name, value as DataType, field.nullable),
            );
          }
        }
      }
      return outFields;
    }
    const fields = takeMatchingFields(schema.fields, pathTree);
    return new Schema(fields);
  }
}

function* rowPathsAndValues(
  row: Record<string, unknown>,
  basePath: string[] = [],
): Generator<[string[], unknown]> {
  for (const [key, value] of Object.entries(row)) {
    if (isObject(value)) {
      yield* rowPathsAndValues(value, [...basePath, key]);
    } else {
      yield [[...basePath, key], value];
    }
  }
}
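
// Illustrative sketch (not from the original source): nested plain objects are
// flattened into one [path, leafValue] pair per leaf, e.g.
//   [...rowPathsAndValues({ id: "a", metadata: { startLine: 10 } })]
//   // => [ [["id"], "a"], [["metadata", "startLine"], 10] ]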

function isObject(value: unknown): value is Record<string, unknown> {
  return (
    typeof value === "object" &&
    value !== null &&
    !Array.isArray(value) &&
    !(value instanceof RegExp) &&
    !(value instanceof Date) &&
    !(value instanceof Set) &&
    !(value instanceof Map) &&
    !(value instanceof Buffer)
  );
}

function getFieldForPath(schema: Schema, path: string[]): Field | undefined {
  let current: Field | Schema = schema;
  for (const key of path) {
    if (current instanceof Schema) {
      const field: Field | undefined = current.fields.find(
        (f) => f.name === key,
      );
      if (field === undefined) {
        return undefined;
      }
      current = field;
    } else if (current instanceof Field && DataType.isStruct(current.type)) {
      const struct: Struct = current.type;
      const field = struct.children.find((f) => f.name === key);
      if (field === undefined) {
        return undefined;
      }
      current = field;
    } else {
      return undefined;
    }
  }
  if (current instanceof Field) {
    return current;
  } else {
    return undefined;
  }
}

/**
 * Try to infer which Arrow type to use for a given value.
 *
 * May return undefined if the type cannot be inferred.
 */
function inferType(
  value: unknown,
  path: string[],
  opts: MakeArrowTableOptions,
): DataType | undefined {
  if (typeof value === "bigint") {
    return new Int64();
  } else if (typeof value === "number") {
    // Even if it's an integer, it's safer to assume Float64. Users can
    // always provide an explicit schema or use BigInt if they mean integer.
    return new Float64();
  } else if (typeof value === "string") {
    if (opts.dictionaryEncodeStrings) {
      return new Dictionary(new Utf8(), new Int32());
    } else {
      return new Utf8();
    }
  } else if (typeof value === "boolean") {
    return new Bool();
  } else if (value instanceof Buffer) {
    return new Binary();
  } else if (Array.isArray(value)) {
    if (value.length === 0) {
      return undefined; // Without any values we can't infer the type
    }
    if (path.length === 1 && Object.hasOwn(opts.vectorColumns, path[0])) {
      const floatType = sanitizeType(opts.vectorColumns[path[0]].type);
      return new FixedSizeList(
        value.length,
        new Field("item", floatType, true),
      );
    }
    const valueType = inferType(value[0], path, opts);
    if (valueType === undefined) {
      return undefined;
    }
    // Try to automatically detect embedding columns.
    if (valueType instanceof Float && path[path.length - 1] === "vector") {
      // We default to Float32 for vectors.
      const child = new Field("item", new Float32(), true);
      return new FixedSizeList(value.length, child);
    } else {
      const child = new Field("item", valueType, true);
      return new List(child);
    }
  } else {
    // TODO: timestamp
    return undefined;
  }
}
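
// Illustrative sketch of the inference rules above (assuming the default
// MakeArrowTableOptions, where "vector" is registered as a vector column):
//   inferType(1.5, ["a"], opts)            // => Float64
//   inferType(10n, ["a"], opts)            // => Int64
//   inferType([1, 2, 3], ["vector"], opts) // => FixedSizeList(3, Float32)
//   inferType(["x"], ["tags"], opts)       // => List<Utf8>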

class PathTree<V> {
  map: Map<string, V | PathTree<V>>;

  constructor(entries?: [string[], V][]) {
    this.map = new Map();
    if (entries !== undefined) {
      for (const [path, value] of entries) {
        this.set(path, value);
      }
    }
  }
  has(path: string[]): boolean {
    let ref: PathTree<V> = this;
    for (const part of path) {
      if (!(ref instanceof PathTree) || !ref.map.has(part)) {
        return false;
      }
      ref = ref.map.get(part) as PathTree<V>;
    }
    return true;
  }
  get(path: string[]): V | undefined {
    let ref: PathTree<V> = this;
    for (const part of path) {
      if (!(ref instanceof PathTree) || !ref.map.has(part)) {
        return undefined;
      }
      ref = ref.map.get(part) as PathTree<V>;
    }
    return ref as V;
  }
  set(path: string[], value: V): void {
    let ref: PathTree<V> = this;
    for (const part of path.slice(0, path.length - 1)) {
      if (!ref.map.has(part)) {
        ref.map.set(part, new PathTree<V>());
      }
      ref = ref.map.get(part) as PathTree<V>;
    }
    ref.map.set(path[path.length - 1], value);
  }
}
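
// Illustrative sketch: a PathTree stores one value per nested path, creating
// intermediate PathTree nodes on demand:
//   const tree = new PathTree<number>([[["metadata", "startLine"], 1]]);
//   tree.has(["metadata", "startLine"]); // => true
//   tree.get(["metadata", "startLine"]); // => 1
//   tree.map.get("metadata");            // => an intermediate PathTree node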

function transposeData(
  data: Record<string, unknown>[],
  field: Field,
  path: string[] = [],
): Vector {
  if (field.type instanceof Struct) {
    const childFields = field.type.children;
    const fullPath = [...path, field.name];
    const childVectors = childFields.map((child) => {
      return transposeData(data, child, fullPath);
    });
    const structData = makeData({
      type: field.type,
      children: childVectors as unknown as ArrowData<DataType>[],
    });
    return arrowMakeVector(structData);
  } else {
    const valuesPath = [...path, field.name];
    const values = data.map((datum) => {
      let current: unknown = datum;
      for (const key of valuesPath) {
        if (current == null) {
          return null;
        }

        if (
          isObject(current) &&
          (Object.hasOwn(current, key) || key in current)
        ) {
          current = current[key];
        } else {
          return null;
        }
      }
      return current;
    });
    return makeVector(values, field.type);
  }
}
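
// Illustrative sketch: this is the row-to-column transposition at the heart of
// the nested-field fix. For a struct field, each child column is built by
// walking the full path into every row, so
//   transposeData(
//     [{ metadata: { startLine: 10 } }],
//     new Field("metadata", new Struct([
//       new Field("startLine", new Float64(), true),
//     ]), true),
//   )
// recurses with path ["metadata"] and extracts [10] for the leaf path
// ["metadata", "startLine"] instead of losing the value to null.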

/**
 * Create an empty Arrow table with the provided schema
 */
export function makeEmptyTable(
  schema: SchemaLike,
  metadata?: Map<string, string>,
): ArrowTable {
  return makeArrowTable([], { schema }, metadata);
}

/**
 * Helper function to convert Array<Array<any>> to a variable sized list array
 */
// @ts-expect-error (Vector<unknown> is not assignable to Vector<any>)
function makeListVector(lists: unknown[][]): Vector<unknown> {
  if (lists.length === 0 || lists[0].length === 0) {
    throw Error("Cannot infer list vector from empty array or empty list");
  }
  const sampleList = lists[0];
  // biome-ignore lint/suspicious/noExplicitAny: skip
  let inferredType: any;
  try {
    const sampleVector = makeVector(sampleList);
    inferredType = sampleVector.type;
  } catch (error: unknown) {
    // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
    throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`);
  }

  const listBuilder = makeBuilder({
    type: new List(new Field("item", inferredType, true)),
  });
  for (const list of lists) {
    listBuilder.append(list);
  }
  return listBuilder.finish().toVector();
}

/** Helper function to convert an Array of JS values to an Arrow Vector */
function makeVector(
  values: unknown[],
  type?: DataType,
  stringAsDictionary?: boolean,
  // biome-ignore lint/suspicious/noExplicitAny: skip
): Vector<any> {
  if (type !== undefined) {
    // No need for inference, let Arrow create it
    if (type instanceof Int) {
      if (DataType.isInt(type) && type.bitWidth === 64) {
        // wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
        values = values.map((v) => {
          if (v === null) {
            return v;
          } else if (typeof v === "bigint") {
            return v;
          } else if (typeof v === "number") {
            return BigInt(v);
          } else {
            return v;
          }
        });
      } else {
        // Similarly, bigint isn't supported for 16 or 32-bit ints.
        values = values.map((v) => {
          if (typeof v === "bigint") {
            return Number(v);
          } else {
            return v;
          }
        });
      }
    }
    return vectorFromArray(values, type);
  }
  if (values.length === 0) {
    throw Error(
      "makeVector requires at least one value or the type must be specified",
    );
  }
  const sampleValue = values.find((val) => val !== null && val !== undefined);
  if (sampleValue === undefined) {
    throw Error(
      "makeVector cannot infer the type if all values are null or undefined",
    );
  }
  if (Array.isArray(sampleValue)) {
    // Default Arrow inference doesn't handle list types
    return makeListVector(values as unknown[][]);
  } else if (Buffer.isBuffer(sampleValue)) {
    // Default Arrow inference doesn't handle Buffer
    return vectorFromArray(values, new Binary());
  } else if (
    !(stringAsDictionary ?? false) &&
    (typeof sampleValue === "string" || sampleValue instanceof String)
  ) {
    // If the type is string then don't use Arrow's default inference unless dictionaries are requested
    // because it will always use dictionary encoding for strings
    return vectorFromArray(values, new Utf8());
  } else {
    // Convert a JS array of values to an arrow vector
    return vectorFromArray(values);
  }
}
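
// Illustrative sketch: with an explicit 64-bit integer type, plain numbers are
// wrapped in BigInt before reaching Arrow (the apache/arrow#40051 workaround):
//   makeVector([1, 2, null], new Int64()); // values become [1n, 2n, null]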

/** Helper function to apply embeddings from metadata to an input table */
async function applyEmbeddingsFromMetadata(
  table: ArrowTable,
  schema: Schema,
): Promise<ArrowTable> {
  const registry = getRegistry();
  const functions = await registry.parseFunctions(schema.metadata);

  const columns = Object.fromEntries(
    table.schema.fields.map((field) => [
      field.name,
      table.getChild(field.name)!,
    ]),
  );

  for (const functionEntry of functions.values()) {
    const sourceColumn = columns[functionEntry.sourceColumn];
    const destColumn = functionEntry.vectorColumn ?? "vector";
    if (sourceColumn === undefined) {
      throw new Error(
        `Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
      );
    }
    if (columns[destColumn] !== undefined) {
      throw new Error(
        `Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
      );
    }
    if (table.batches.length > 1) {
      throw new Error(
        "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
      );
    }
    const values = sourceColumn.toArray();

    const vectors =
      await functionEntry.function.computeSourceEmbeddings(values);
    if (vectors.length !== values.length) {
      throw new Error(
        "Embedding function did not return an embedding for each input element",
      );
    }
    let destType: DataType;
    const dtype = schema.fields.find((f) => f.name === destColumn)!.type;
    if (isFixedSizeList(dtype)) {
      destType = sanitizeType(dtype);
    } else {
      throw new Error(
        "Expected FixedSizeList as datatype for vector field, instead got: " +
          dtype,
      );
    }
    const vector = makeVector(vectors, destType);
    columns[destColumn] = vector;
  }
  const newTable = new ArrowTable(columns);
  return alignTable(newTable, schema);
}

/** Helper function to apply embeddings to an input table */
async function applyEmbeddings<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunctionConfig,
  schema?: SchemaLike,
): Promise<ArrowTable> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }
  if (schema?.metadata.has("embedding_functions")) {
    return applyEmbeddingsFromMetadata(table, schema! as Schema);
  } else if (embeddings == null) {
    return table;
  }

  let schemaMetadata = schema?.metadata || new Map<string, string>();

  if (embeddings != null) {
    const registry = getRegistry();
    const embeddingMetadata = registry.getTableMetadata([embeddings]);
    schemaMetadata = new Map([...schemaMetadata, ...embeddingMetadata]);
  }

  // Convert from ArrowTable to Record<String, Vector>
  const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
    const name = table.schema.fields[idx].name;
    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
    const vec = table.getChildAt(idx)!;
    return [name, vec];
  });
  const newColumns = Object.fromEntries(colEntries);

  const sourceColumn = newColumns[embeddings.sourceColumn];
  const destColumn = embeddings.vectorColumn ?? "vector";
  const innerDestType =
    embeddings.function.embeddingDataType() ?? new Float32();
  if (sourceColumn === undefined) {
    throw new Error(
      `Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`,
    );
  }

  if (table.numRows === 0) {
    if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
      // We have an empty table and it already has the embedding column so no work needs to be done
      // Note: we don't return an error like we did below because this is a common occurrence. For example,
      // if we call convertToTable with 0 records and a schema that includes the embedding
      return table;
    }
    const dimensions = embeddings.function.ndims();
    if (dimensions !== undefined) {
      const destType = newVectorType(dimensions, innerDestType);
      newColumns[destColumn] = makeVector([], destType);
    } else if (schema != null) {
      const destField = schema.fields.find((f) => f.name === destColumn);
      if (destField != null) {
        newColumns[destColumn] = makeVector([], destField.type);
      } else {
        throw new Error(
          `Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,
        );
      }
    } else {
      throw new Error(
        "Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`",
      );
    }
  } else {
    if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
      throw new Error(
        `Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
      );
    }
    if (table.batches.length > 1) {
      throw new Error(
        "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
      );
    }
    const values = sourceColumn.toArray();
    const vectors = await embeddings.function.computeSourceEmbeddings(
      values as T[],
    );
    if (vectors.length !== values.length) {
      throw new Error(
        "Embedding function did not return an embedding for each input element",
      );
    }
    const destType = newVectorType(vectors[0].length, innerDestType);
    newColumns[destColumn] = makeVector(vectors, destType);
  }

  let newTable = new ArrowTable(newColumns);
  if (schema != null) {
    if (schema.fields.find((f) => f.name === destColumn) === undefined) {
      throw new Error(
        `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
      );
    }
    newTable = alignTable(newTable, schema as Schema);
  }

  newTable = new ArrowTable(
    new Schema(newTable.schema.fields, schemaMetadata),
    newTable.batches,
  );

  return newTable;
}

/**
 * Convert an Array of records into an Arrow Table, optionally applying an
 * embeddings function to it.
 *
 * This function calls `makeArrowTable` first to create the Arrow Table.
 * Any provided `makeTableOptions` (e.g. a schema) will be passed on to
 * that call.
 *
 * The embedding function will be passed a column of values (based on the
 * `sourceColumn` of the embedding function) and expects to receive back
 * number[][] which will be converted into a fixed size list column. By
 * default this will be a fixed size list of Float32 but that can be
 * customized by the `embeddingDataType` property of the embedding function.
 *
 * If a schema is provided in `makeTableOptions` then it should include the
 * embedding columns. If no schema is provided then embedding columns will
 * be placed at the end of the table, after all of the input columns.
 */
export async function convertToTable(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunctionConfig,
  makeTableOptions?: Partial<MakeArrowTableOptions>,
): Promise<ArrowTable> {
  const table = makeArrowTable(data, makeTableOptions);
  return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
}

/** Creates the Arrow Type for a Vector column with dimension `dim` */
export function newVectorType<T extends Float>(
  dim: number,
  innerType: unknown,
): FixedSizeList<T> {
  // In Lance we always default to having the elements nullable, so we need to
  // set it to true. Otherwise we often get schema mismatches because the stored
  // data always has a schema with nullable elements.
  const children = new Field("item", <T>sanitizeType(innerType), true);
  return new FixedSizeList(dim, children);
}
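
// Illustrative sketch:
//   newVectorType(3, new Float32())
//   // => FixedSizeList(3, Field("item", Float32, nullable=true)),
//   //    the fixed-size-list layout Lance expects for vector columns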

/**
 * Serialize an Array of records into a buffer using the Arrow IPC File serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
export async function fromRecordsToBuffer(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunctionConfig,
  schema?: Schema,
): Promise<Buffer> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }
  const table = await convertToTable(data, embeddings, { schema });
  const writer = RecordBatchFileWriter.writeAll(table);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
export async function fromRecordsToStreamBuffer(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunctionConfig,
  schema?: Schema,
): Promise<Buffer> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }
  const table = await convertToTable(data, embeddings, { schema });
  const writer = RecordBatchStreamWriter.writeAll(table);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
export async function fromTableToBuffer(
  table: ArrowTable,
  embeddings?: EmbeddingFunctionConfig,
  schema?: SchemaLike,
): Promise<Buffer> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
  const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Serialize an Array of records or an Arrow Table into a buffer using the
 * Arrow IPC File serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the data is empty
 */
export async function fromDataToBuffer(
  data: Data,
  embeddings?: EmbeddingFunctionConfig,
  schema?: Schema,
): Promise<Buffer> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }
  if (isArrowTable(data)) {
    return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
  } else {
    const table = await convertToTable(data, embeddings, { schema });
    return fromTableToBuffer(table);
  }
}

/**
 * Read a single record batch from a buffer.
 *
 * Returns null if the buffer does not contain a record batch
 */
export async function fromBufferToRecordBatch(
  data: Buffer,
): Promise<RecordBatch | null> {
  const reader = await RecordBatchFileReader.readAll(Buffer.from(data)).next()
    .value;
  const recordBatch = reader?.next().value;
  return recordBatch || null;
}

/**
 * Create a buffer containing a single record batch
 */
export async function fromRecordBatchToBuffer(
  batch: RecordBatch,
): Promise<Buffer> {
  const writer = new RecordBatchFileWriter().writeAll([batch]);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
export async function fromTableToStreamBuffer(
  table: ArrowTable,
  embeddings?: EmbeddingFunctionConfig,
  schema?: SchemaLike,
): Promise<Buffer> {
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
  const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Reorder the columns in `batch` so that they agree with the field order in `schema`
 */
function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
  const alignedChildren = [];
  for (const field of schema.fields) {
    const indexInBatch = batch.schema.fields?.findIndex(
      (f) => f.name === field.name,
    );
    if (indexInBatch < 0) {
      throw new Error(
        `The column ${field.name} was not found in the Arrow Table`,
      );
    }
    alignedChildren.push(batch.data.children[indexInBatch]);
  }
  const newData = makeData({
    type: new Struct(schema.fields),
    length: batch.numRows,
    nullCount: batch.nullCount,
    children: alignedChildren,
  });
  return new RecordBatch(schema, newData);
}

/**
 * Reorder the columns in `table` so that they agree with the field order in `schema`
 */
function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
  const alignedBatches = table.batches.map((batch) =>
    alignBatch(batch, schema),
  );
  return new ArrowTable(schema, alignedBatches);
}

/**
 * Create an empty table with the given schema
 */
export function createEmptyTable(schema: Schema): ArrowTable {
  return new ArrowTable(sanitizeSchema(schema));
}

function validateSchemaEmbeddings(
  schema: Schema,
  data: Array<Record<string, unknown>>,
  embeddings: EmbeddingFunctionConfig | undefined,
): Schema {
  const fields = [];
  const missingEmbeddingFields = [];

  // First we check if the field is a `FixedSizeList`
  // Then we check if the data contains the field
  // if it does not, we add it to the list of missing embedding fields
  // Finally, we check if those missing embedding fields are `this._embeddings`
  // if they are not, we throw an error
  for (let field of schema.fields) {
    if (isFixedSizeList(field.type)) {
      field = sanitizeField(field);
      if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
        if (schema.metadata.has("embedding_functions")) {
          const embeddings = JSON.parse(
            schema.metadata.get("embedding_functions")!,
          );
          if (
            // biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
            embeddings.find((f: any) => f["vectorColumn"] === field.name) ===
            undefined
          ) {
            missingEmbeddingFields.push(field);
          }
        } else {
          missingEmbeddingFields.push(field);
        }
      } else {
        fields.push(field);
      }
    } else {
      fields.push(field);
    }
  }

  if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
    throw new Error(
      `Table has embeddings: "${missingEmbeddingFields
        .map((f) => f.name)
        .join(",")}", but no embedding function was provided`,
    );
  }

  return new Schema(fields, schema.metadata);
}

interface JsonDataType {
  type: string;
  fields?: JsonField[];
  length?: number;
}

interface JsonField {
  name: string;
  type: JsonDataType;
  nullable: boolean;
  metadata: Map<string, string>;
}

// Matches format of https://github.com/lancedb/lance/blob/main/rust/lance/src/arrow/json.rs
export function dataTypeToJson(dataType: DataType): JsonDataType {
  switch (dataType.typeId) {
    // For primitives, matches https://github.com/lancedb/lance/blob/e12bb9eff2a52f753668d4b62c52e4d72b10d294/rust/lance-core/src/datatypes.rs#L185
    case Type.Null:
      return { type: "null" };
    case Type.Bool:
      return { type: "bool" };
    case Type.Int8:
      return { type: "int8" };
    case Type.Int16:
      return { type: "int16" };
    case Type.Int32:
      return { type: "int32" };
    case Type.Int64:
      return { type: "int64" };
    case Type.Uint8:
      return { type: "uint8" };
    case Type.Uint16:
      return { type: "uint16" };
    case Type.Uint32:
      return { type: "uint32" };
    case Type.Uint64:
      return { type: "uint64" };
    case Type.Int: {
      const bitWidth = (dataType as Int).bitWidth;
      const signed = (dataType as Int).isSigned;
      const prefix = signed ? "" : "u";
      return { type: `${prefix}int${bitWidth}` };
    }
    case Type.Float: {
      switch ((dataType as Float).precision) {
        case Precision.HALF:
          return { type: "halffloat" };
        case Precision.SINGLE:
          return { type: "float" };
        case Precision.DOUBLE:
          return { type: "double" };
      }
      throw Error("Unsupported float precision");
    }
    case Type.Float16:
      return { type: "halffloat" };
    case Type.Float32:
      return { type: "float" };
    case Type.Float64:
      return { type: "double" };
    case Type.Utf8:
      return { type: "string" };
    case Type.Binary:
      return { type: "binary" };
    case Type.LargeUtf8:
      return { type: "large_string" };
    case Type.LargeBinary:
      return { type: "large_binary" };
    case Type.List:
      return {
        type: "list",
        fields: [fieldToJson((dataType as List).children[0])],
      };
    case Type.FixedSizeList: {
      const fixedSizeList = dataType as FixedSizeList;
      return {
        type: "fixed_size_list",
        fields: [fieldToJson(fixedSizeList.children[0])],
        length: fixedSizeList.listSize,
      };
    }
    case Type.Struct:
      return {
        type: "struct",
        fields: (dataType as Struct).children.map(fieldToJson),
      };
    case Type.Date: {
      const unit = (dataType as Date_).unit;
      return {
        type: unit === DateUnit.DAY ? "date32:day" : "date64:ms",
      };
    }
    case Type.Timestamp: {
      const timestamp = dataType as Timestamp;
      const timezone = timestamp.timezone || "-";
      return {
        type: `timestamp:${timestamp.unit}:${timezone}`,
      };
    }
    case Type.Decimal: {
      const decimal = dataType as Decimal;
      return {
        type: `decimal:${decimal.bitWidth}:${decimal.precision}:${decimal.scale}`,
      };
    }
    case Type.Duration: {
      const duration = dataType as Duration;
      return { type: `duration:${duration.unit}` };
    }
    case Type.FixedSizeBinary: {
      const byteWidth = (dataType as FixedSizeBinary).byteWidth;
      return { type: `fixed_size_binary:${byteWidth}` };
    }
    case Type.Dictionary: {
      const dict = dataType as Dictionary;
      const indexType = dataTypeToJson(dict.indices);
      const valueType = dataTypeToJson(dict.valueType);
      return {
        type: `dict:${valueType.type}:${indexType.type}:false`,
      };
    }
  }
  throw new Error("Unsupported data type");
}
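
// Illustrative sketch: a vector column's type serializes to the Lance JSON
// format as
//   dataTypeToJson(new FixedSizeList(3, new Field("item", new Float32(), true)))
//   // => { type: "fixed_size_list",
//   //      fields: [{ name: "item", type: { type: "float" }, ... }],
//   //      length: 3 }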

function fieldToJson(field: Field): JsonField {
  return {
    name: field.name,
    type: dataTypeToJson(field.type),
    nullable: field.nullable,
    metadata: field.metadata,
  };
}