feat: add support for add to async python API (#1037)

In order to add support for `add` we needed to migrate the rust `Table`
trait to a `Table` struct and `TableInternal` trait (similar to the way
the connection is designed).

While doing this we also cleaned up some inconsistencies between the
SDKs:

* Python and Node are garbage collected languages and it can be
difficult to trigger something to be freed. The convention for these
languages is to have some kind of close method. I added a close method
to both the table and connection which will drop the underlying rust
object.
* We made significant improvements to table creation in
cc5f2136a6
for the `node` SDK. I copied these changes to the `nodejs` SDK.
* The nodejs tables were using fs to create tmp directories and these
were not getting cleaned up. This is mostly harmless but annoying and so
I changed it up a bit to ensure we cleanup tmp directories.
* ~~countRows in the node SDK was returning `bigint`. I changed it to
return `number`~~ (this actually happened in a previous PR)
* Tables and connections now implement `std::fmt::Display` which is
hooked into python's `__repr__`. Node has no concept of a regular "to
string" function and so I added a `display` method.
* Python method signatures are changing so that optional parameters are
always `Optional[foo] = None` instead of something like `foo = False`.
This is because we want those defaults to be in rust whenever possible
(though we still need to mention the default in documentation).
* I changed the python `AsyncConnection/AsyncTable` classes from
abstract classes with a single implementation to just classes because we
no longer have the remote implementation in python.

Note: this does NOT add the `add` function to the remote table. This PR
was already large enough, and the remote implementation is unique
enough, that I am going to do all the remote stuff at a later date (we
should have the structure in place and correct so there shouldn't be any
refactor concerns)

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
Weston Pace
2024-03-04 09:27:41 -08:00
parent 3bbcaba65b
commit 8033a44d68
42 changed files with 2822 additions and 1122 deletions

View File

@@ -1,4 +1,4 @@
// Copyright 2024 Lance Developers.
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -13,44 +13,91 @@
// limitations under the License.
import {
Int64,
Field,
makeBuilder,
RecordBatchFileWriter,
Utf8,
type Vector,
FixedSizeList,
Float,
Float32,
Schema,
Table as ArrowTable,
Table,
Vector,
vectorFromArray,
tableToIPC,
type Schema,
Table as ArrowTable,
RecordBatchStreamWriter,
List,
RecordBatch,
makeData,
Struct,
type Float,
DataType,
} from "apache-arrow";
Binary,
Float32
} from 'apache-arrow'
import { type EmbeddingFunction } from './embedding/embedding_function'
import { Table } from './native';
/** Data type accepted by NodeJS SDK */
export type Data = Record<string, unknown>[] | ArrowTable;
/*
* Options to control how a column should be converted to a vector array
*/
export class VectorColumnOptions {
/** Vector column type. */
type: Float = new Float32();
type: Float = new Float32()
constructor(values?: Partial<VectorColumnOptions>) {
Object.assign(this, values);
constructor (values?: Partial<VectorColumnOptions>) {
Object.assign(this, values)
}
}
/** Options to control the makeArrowTable call. */
export class MakeArrowTableOptions {
/** Provided schema. */
schema?: Schema;
/*
* Schema of the data.
*
* If this is not provided then the data type will be inferred from the
* JS type. Integer numbers will become int64, floating point numbers
* will become float64 and arrays will become variable sized lists with
* the data type inferred from the first element in the array.
*
* The schema must be specified if there are no records (e.g. to make
* an empty table)
*/
schema?: Schema
/** Vector columns */
/*
* Mapping from vector column name to expected type
*
* Lance expects vector columns to be fixed size list arrays (i.e. tensors)
* However, `makeArrowTable` will not infer this by default (it creates
* variable size list arrays). This field can be used to indicate that a column
* should be treated as a vector column and converted to a fixed size list.
*
* The keys should be the names of the vector columns. The value specifies the
* expected data type of the vector columns.
*
* If `schema` is provided then this field is ignored.
*
* By default, the column named "vector" will be assumed to be a float32
* vector column.
*/
vectorColumns: Record<string, VectorColumnOptions> = {
vector: new VectorColumnOptions(),
};
vector: new VectorColumnOptions()
}
constructor(values?: Partial<MakeArrowTableOptions>) {
Object.assign(this, values);
/**
* If true then string columns will be encoded with dictionary encoding
*
* Set this to true if your string columns tend to repeat the same values
* often. For more precise control use the `schema` property to specify the
* data type for individual columns.
*
* If `schema` is provided then this property is ignored.
*/
dictionaryEncodeStrings: boolean = false
constructor (values?: Partial<MakeArrowTableOptions>) {
Object.assign(this, values)
}
}
@@ -58,8 +105,30 @@ export class MakeArrowTableOptions {
* An enhanced version of the {@link makeTable} function from Apache Arrow
* that supports nested fields and embeddings columns.
*
* This function converts an array of Record<String, any> (row-major JS objects)
* to an Arrow Table (a columnar structure)
*
* Note that it currently does not support nulls.
*
* If a schema is provided then it will be used to determine the resulting array
* types. Fields will also be reordered to fit the order defined by the schema.
*
* If a schema is not provided then the types will be inferred and the field order
* will be controlled by the order of properties in the first record. If a type
* is inferred it will always be nullable.
*
* If the input is empty then a schema must be provided to create an empty table.
*
* When a schema is not specified then data types will be inferred. The inference
* rules are as follows:
*
* - boolean => Bool
* - number => Float64
* - String => Utf8
* - Buffer => Binary
* - Record<String, any> => Struct
* - Array<any> => List
*
* @param data input data
* @param options options to control the makeArrowTable call.
*
@@ -82,25 +151,27 @@ export class MakeArrowTableOptions {
* ], { schema });
* ```
*
* It guesses the vector columns if the schema is not provided. For example,
* by default it assumes that the column named `vector` is a vector column.
* By default it assumes that the column named `vector` is a vector column
* and it will be converted into a fixed size list array of type float32.
* The `vectorColumns` option can be used to support other vector column
* names and data types.
*
* ```ts
*
* const schema = new Schema([
new Field("a", new Float64()),
new Field("b", new Float64()),
new Field(
"vector",
new FixedSizeList(3, new Field("item", new Float32()))
),
]);
const table = makeArrowTable([
{ a: 1, b: 2, vector: [1, 2, 3] },
{ a: 4, b: 5, vector: [4, 5, 6] },
{ a: 7, b: 8, vector: [7, 8, 9] },
]);
assert.deepEqual(table.schema, schema);
new Field("a", new Float64()),
new Field("b", new Float64()),
new Field(
"vector",
new FixedSizeList(3, new Field("item", new Float32()))
),
]);
const table = makeArrowTable([
{ a: 1, b: 2, vector: [1, 2, 3] },
{ a: 4, b: 5, vector: [4, 5, 6] },
{ a: 7, b: 8, vector: [7, 8, 9] },
]);
assert.deepEqual(table.schema, schema);
* ```
*
* You can specify the vector column types and names using the options as well
@@ -108,81 +179,372 @@ export class MakeArrowTableOptions {
* ```typescript
*
* const schema = new Schema([
new Field('a', new Float64()),
new Field('b', new Float64()),
new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
]);
new Field('a', new Float64()),
new Field('b', new Float64()),
new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
]);
* const table = makeArrowTable([
{ a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
{ a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
{ a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
], {
vectorColumns: {
vec1: { type: new Float16() },
vec2: { type: new Float16() }
}
}
{ a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
{ a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
{ a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
], {
vectorColumns: {
vec1: { type: new Float16() },
vec2: { type: new Float16() }
}
}
* assert.deepEqual(table.schema, schema)
* ```
*/
export function makeArrowTable(
data: Record<string, any>[],
export function makeArrowTable (
data: Array<Record<string, any>>,
options?: Partial<MakeArrowTableOptions>
): Table {
if (data.length === 0) {
throw new Error("At least one record needs to be provided");
): ArrowTable {
if (data.length === 0 && (options?.schema === undefined || options?.schema === null)) {
throw new Error('At least one record or a schema needs to be provided')
}
const opt = new MakeArrowTableOptions(options ?? {});
const columns: Record<string, Vector> = {};
// TODO: sample dataset to find missing columns
const columnNames = Object.keys(data[0]);
for (const colName of columnNames) {
// eslint-disable-next-line @typescript-eslint/no-unsafe-return
let values = data.map((datum) => datum[colName]);
let vector: Vector;
const opt = new MakeArrowTableOptions(options !== undefined ? options : {})
const columns: Record<string, Vector> = {}
// TODO: sample dataset to find missing columns
// Prefer the field ordering of the schema, if present
const columnNames = ((options?.schema) != null) ? (options?.schema?.names as string[]) : Object.keys(data[0])
for (const colName of columnNames) {
if (data.length !== 0 && !Object.prototype.hasOwnProperty.call(data[0], colName)) {
// The field is present in the schema, but not in the data, skip it
continue
}
// Extract a single column from the records (transpose from row-major to col-major)
let values = data.map((datum) => datum[colName])
// By default (type === undefined) arrow will infer the type from the JS type
let type
if (opt.schema !== undefined) {
// Explicit schema is provided, highest priority
const fieldType: DataType | undefined = opt.schema.fields.filter((f) => f.name === colName)[0]?.type as DataType;
if (fieldType instanceof Int64) {
// If there is a schema provided, then use that for the type instead
type = opt.schema?.fields.filter((f) => f.name === colName)[0]?.type
if (DataType.isInt(type) && type.bitWidth === 64) {
// wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
values = values.map((v) => BigInt(v));
values = values.map((v) => {
if (v === null) {
return v
}
return BigInt(v)
})
}
vector = vectorFromArray(values, fieldType);
} else {
const vectorColumnOptions = opt.vectorColumns[colName];
// Otherwise, check to see if this column is one of the vector columns
// defined by opt.vectorColumns and, if so, use the fixed size list type
const vectorColumnOptions = opt.vectorColumns[colName]
if (vectorColumnOptions !== undefined) {
const fslType = new FixedSizeList(
(values[0] as any[]).length,
new Field("item", vectorColumnOptions.type, false)
);
vector = vectorFromArray(values, fslType);
} else {
// Normal case
vector = vectorFromArray(values);
type = newVectorType(values[0].length, vectorColumnOptions.type)
}
}
columns[colName] = vector;
try {
// Convert an Array of JS values to an arrow vector
columns[colName] = makeVector(values, type, opt.dictionaryEncodeStrings)
} catch (error: unknown) {
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
throw Error(`Could not convert column "${colName}" to Arrow: ${error}`)
}
}
return new Table(columns);
if (opt.schema != null) {
// `new ArrowTable(columns)` infers a schema which may sometimes have
// incorrect nullability (it assumes nullable=true always)
//
// `new ArrowTable(schema, columns)` will also fail because it will create a
// batch with an inferred schema and then complain that the batch schema
// does not match the provided schema.
//
// To work around this we first create a table with the wrong schema and
// then patch the schema of the batches so we can use
// `new ArrowTable(schema, batches)` which does not do any schema inference
const firstTable = new ArrowTable(columns)
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const batchesFixed = firstTable.batches.map(batch => new RecordBatch(opt.schema!, batch.data))
return new ArrowTable(opt.schema, batchesFixed)
} else {
return new ArrowTable(columns)
}
}
/**
* Convert an Arrow Table to a Buffer.
*
* @param data Arrow Table
* @param schema Arrow Schema, optional
* @returns Buffer node
* Create an empty Arrow table with the provided schema
*/
export function toBuffer(data: Data, schema?: Schema): Buffer {
let tbl: Table;
if (data instanceof Table) {
tbl = data;
} else {
tbl = makeArrowTable(data, { schema });
}
return Buffer.from(tableToIPC(tbl));
export function makeEmptyTable (schema: Schema): ArrowTable {
return makeArrowTable([], { schema })
}
// Helper function to convert Array<Array<any>> to a variable sized list array
function makeListVector (lists: any[][]): Vector<any> {
if (lists.length === 0 || lists[0].length === 0) {
throw Error('Cannot infer list vector from empty array or empty list')
}
const sampleList = lists[0]
let inferredType
try {
const sampleVector = makeVector(sampleList)
inferredType = sampleVector.type
} catch (error: unknown) {
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`)
}
const listBuilder = makeBuilder({
type: new List(new Field('item', inferredType, true))
})
for (const list of lists) {
listBuilder.append(list)
}
return listBuilder.finish().toVector()
}
// Helper function to convert an Array of JS values to an Arrow Vector
function makeVector (values: any[], type?: DataType, stringAsDictionary?: boolean): Vector<any> {
if (type !== undefined) {
// No need for inference, let Arrow create it
return vectorFromArray(values, type)
}
if (values.length === 0) {
throw Error('makeVector requires at least one value or the type must be specfied')
}
const sampleValue = values.find(val => val !== null && val !== undefined)
if (sampleValue === undefined) {
throw Error('makeVector cannot infer the type if all values are null or undefined')
}
if (Array.isArray(sampleValue)) {
// Default Arrow inference doesn't handle list types
return makeListVector(values)
} else if (Buffer.isBuffer(sampleValue)) {
// Default Arrow inference doesn't handle Buffer
return vectorFromArray(values, new Binary())
} else if (!(stringAsDictionary ?? false) && (typeof sampleValue === 'string' || sampleValue instanceof String)) {
// If the type is string then don't use Arrow's default inference unless dictionaries are requested
// because it will always use dictionary encoding for strings
return vectorFromArray(values, new Utf8())
} else {
// Convert a JS array of values to an arrow vector
return vectorFromArray(values)
}
}
async function applyEmbeddings<T> (table: ArrowTable, embeddings?: EmbeddingFunction<T>, schema?: Schema): Promise<ArrowTable> {
if (embeddings == null) {
return table
}
// Convert from ArrowTable to Record<String, Vector>
const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
const name = table.schema.fields[idx].name
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const vec = table.getChildAt(idx)!
return [name, vec]
})
const newColumns = Object.fromEntries(colEntries)
const sourceColumn = newColumns[embeddings.sourceColumn]
const destColumn = embeddings.destColumn ?? 'vector'
const innerDestType = embeddings.embeddingDataType ?? new Float32()
if (sourceColumn === undefined) {
throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`)
}
if (table.numRows === 0) {
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
// We have an empty table and it already has the embedding column so no work needs to be done
// Note: we don't return an error like we did below because this is a common occurrence. For example,
// if we call convertToTable with 0 records and a schema that includes the embedding
return table
}
if (embeddings.embeddingDimension !== undefined) {
const destType = newVectorType(embeddings.embeddingDimension, innerDestType)
newColumns[destColumn] = makeVector([], destType)
} else if (schema != null) {
const destField = schema.fields.find(f => f.name === destColumn)
if (destField != null) {
newColumns[destColumn] = makeVector([], destField.type)
} else {
throw new Error(`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`)
}
} else {
throw new Error('Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`')
}
} else {
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`)
}
if (table.batches.length > 1) {
throw new Error('Internal error: `makeArrowTable` unexpectedly created a table with more than one batch')
}
const values = sourceColumn.toArray()
const vectors = await embeddings.embed(values as T[])
if (vectors.length !== values.length) {
throw new Error('Embedding function did not return an embedding for each input element')
}
const destType = newVectorType(vectors[0].length, innerDestType)
newColumns[destColumn] = makeVector(vectors, destType)
}
const newTable = new ArrowTable(newColumns)
if (schema != null) {
if (schema.fields.find(f => f.name === destColumn) === undefined) {
throw new Error(`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`)
}
return alignTable(newTable, schema)
}
return newTable
}
/*
* Convert an Array of records into an Arrow Table, optionally applying an
* embeddings function to it.
*
* This function calls `makeArrowTable` first to create the Arrow Table.
* Any provided `makeTableOptions` (e.g. a schema) will be passed on to
* that call.
*
* The embedding function will be passed a column of values (based on the
* `sourceColumn` of the embedding function) and expects to receive back
* number[][] which will be converted into a fixed size list column. By
* default this will be a fixed size list of Float32 but that can be
* customized by the `embeddingDataType` property of the embedding function.
*
* If a schema is provided in `makeTableOptions` then it should include the
* embedding columns. If no schema is provded then embedding columns will
* be placed at the end of the table, after all of the input columns.
*/
export async function convertToTable<T> (
data: Array<Record<string, unknown>>,
embeddings?: EmbeddingFunction<T>,
makeTableOptions?: Partial<MakeArrowTableOptions>
): Promise<ArrowTable> {
const table = makeArrowTable(data, makeTableOptions)
return await applyEmbeddings(table, embeddings, makeTableOptions?.schema)
}
// Creates the Arrow Type for a Vector column with dimension `dim`
function newVectorType <T extends Float> (dim: number, innerType: T): FixedSizeList<T> {
// in Lance we always default to have the elements nullable, so we need to set it to true
// otherwise we often get schema mismatches because the stored data always has schema with nullable elements
const children = new Field<T>('item', innerType, true)
return new FixedSizeList(dim, children)
}
/**
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
*
* This function will call `convertToTable` and pass on `embeddings` and `schema`
*
* `schema` is required if data is empty
*/
export async function fromRecordsToBuffer<T> (
data: Array<Record<string, unknown>>,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
const table = await convertToTable(data, embeddings, { schema })
const writer = RecordBatchFileWriter.writeAll(table)
return Buffer.from(await writer.toUint8Array())
}
/**
* Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
*
* This function will call `convertToTable` and pass on `embeddings` and `schema`
*
* `schema` is required if data is empty
*/
export async function fromRecordsToStreamBuffer<T> (
data: Array<Record<string, unknown>>,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
const table = await convertToTable(data, embeddings, { schema })
const writer = RecordBatchStreamWriter.writeAll(table)
return Buffer.from(await writer.toUint8Array())
}
/**
* Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
*
* This function will apply `embeddings` to the table in a manner similar to
* `convertToTable`.
*
* `schema` is required if the table is empty
*/
export async function fromTableToBuffer<T> (
table: ArrowTable,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema)
const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings)
return Buffer.from(await writer.toUint8Array())
}
export async function fromDataToBuffer<T> (
data: Data,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
if (data instanceof ArrowTable) {
return fromTableToBuffer(data, embeddings, schema)
} else {
const table = await convertToTable(data);
return fromTableToBuffer(table, embeddings, schema);
}
}
/**
* Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
*
* This function will apply `embeddings` to the table in a manner similar to
* `convertToTable`.
*
* `schema` is required if the table is empty
*/
export async function fromTableToStreamBuffer<T> (
table: ArrowTable,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema)
const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings)
return Buffer.from(await writer.toUint8Array())
}
function alignBatch (batch: RecordBatch, schema: Schema): RecordBatch {
const alignedChildren = []
for (const field of schema.fields) {
const indexInBatch = batch.schema.fields?.findIndex(
(f) => f.name === field.name
)
if (indexInBatch < 0) {
throw new Error(
`The column ${field.name} was not found in the Arrow Table`
)
}
alignedChildren.push(batch.data.children[indexInBatch])
}
const newData = makeData({
type: new Struct(schema.fields),
length: batch.numRows,
nullCount: batch.nullCount,
children: alignedChildren
})
return new RecordBatch(schema, newData)
}
function alignTable (table: ArrowTable, schema: Schema): ArrowTable {
const alignedBatches = table.batches.map((batch) =>
alignBatch(batch, schema)
)
return new ArrowTable(schema, alignedBatches)
}
// Creates an empty Arrow Table
export function createEmptyTable (schema: Schema): ArrowTable {
return new ArrowTable(schema)
}

View File

@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import { toBuffer } from "./arrow";
import { Connection as _NativeConnection } from "./native";
import { fromTableToBuffer, makeArrowTable, makeEmptyTable } from "./arrow";
import { Connection as LanceDbConnection } from "./native";
import { Table } from "./table";
import { Table as ArrowTable } from "apache-arrow";
import { Table as ArrowTable, Schema } from "apache-arrow";
export interface CreateTableOptions {
/**
@@ -39,14 +39,47 @@ export interface CreateTableOptions {
* A LanceDB Connection that allows you to open tables and create new ones.
*
* Connection could be local against filesystem or remote against a server.
*
* A Connection is intended to be a long lived object and may hold open
* resources such as HTTP connection pools. This is generally fine and
* a single connection should be shared if it is going to be used many
* times. However, if you are finished with a connection, you may call
* close to eagerly free these resources. Any call to a Connection
* method after it has been closed will result in an error.
*
* Closing a connection is optional. Connections will automatically
* be closed when they are garbage collected.
*
* Any created tables are independent and will continue to work even if
* the underlying connection has been closed.
*/
export class Connection {
readonly inner: _NativeConnection;
readonly inner: LanceDbConnection;
constructor(inner: _NativeConnection) {
constructor(inner: LanceDbConnection) {
this.inner = inner;
}
/** Return true if the connection has not been closed */
isOpen(): boolean {
return this.inner.isOpen();
}
/** Close the connection, releasing any underlying resources.
*
* It is safe to call this method multiple times.
*
* Any attempt to use the connection after it is closed will result in an error.
*/
close(): void {
this.inner.close();
}
/** Return a brief description of the connection */
display(): string {
return this.inner.display();
}
/** List all the table names in this database. */
async tableNames(): Promise<string[]> {
return this.inner.tableNames();
@@ -81,11 +114,41 @@ export class Connection {
mode = "exist_ok";
}
const buf = toBuffer(data);
let table: ArrowTable;
if (data instanceof ArrowTable) {
table = data;
} else {
table = makeArrowTable(data);
}
const buf = await fromTableToBuffer(table);
const innerTable = await this.inner.createTable(name, buf, mode);
return new Table(innerTable);
}
/**
* Creates a new empty Table
*
* @param {string} name - The name of the table.
* @param schema - The schema of the table
*/
async createEmptyTable(
name: string,
schema: Schema,
options?: Partial<CreateTableOptions>
): Promise<Table> {
let mode: string = options?.mode ?? "create";
const existOk = options?.existOk ?? false;
if (mode === "create" && existOk) {
mode = "exist_ok";
}
const table = makeEmptyTable(schema);
const buf = await fromTableToBuffer(table);
const innerTable = await this.inner.createEmptyTable(name, buf, mode);
return new Table(innerTable);
}
/**
* Drop an existing table.
* @param name The name of the table to drop.

View File

@@ -0,0 +1,68 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { type Float } from 'apache-arrow'
/**
* An embedding function that automatically creates vector representation for a given column.
*/
export interface EmbeddingFunction<T> {
/**
* The name of the column that will be used as input for the Embedding Function.
*/
sourceColumn: string
/**
* The data type of the embedding
*
* The embedding function should return `number`. This will be converted into
* an Arrow float array. By default this will be Float32 but this property can
* be used to control the conversion.
*/
embeddingDataType?: Float
/**
* The dimension of the embedding
*
* This is optional, normally this can be determined by looking at the results of
* `embed`. If this is not specified, and there is an attempt to apply the embedding
* to an empty table, then that process will fail.
*/
embeddingDimension?: number
/**
* The name of the column that will contain the embedding
*
* By default this is "vector"
*/
destColumn?: string
/**
* Should the source column be excluded from the resulting table
*
* By default the source column is included. Set this to true and
* only the embedding will be stored.
*/
excludeSource?: boolean
/**
* Creates a vector representation for the given values.
*/
embed: (data: T[]) => Promise<number[][]>
}
export function isEmbeddingFunction<T> (value: any): value is EmbeddingFunction<T> {
return typeof value.sourceColumn === 'string' &&
typeof value.embed === 'function'
}

View File

@@ -0,0 +1,57 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { type EmbeddingFunction } from './embedding_function'
import type OpenAI from 'openai'
export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
private readonly _openai: OpenAI
private readonly _modelName: string
constructor (sourceColumn: string, openAIKey: string, modelName: string = 'text-embedding-ada-002') {
/**
* @type {import("openai").default}
*/
let Openai
try {
// eslint-disable-next-line @typescript-eslint/no-var-requires
Openai = require('openai')
} catch {
throw new Error('please install openai@^4.24.1 using npm install openai')
}
this.sourceColumn = sourceColumn
const configuration = {
apiKey: openAIKey
}
this._openai = new Openai(configuration)
this._modelName = modelName
}
async embed (data: string[]): Promise<number[][]> {
const response = await this._openai.embeddings.create({
model: this._modelName,
input: data
})
const embeddings: number[][] = []
for (let i = 0; i < response.data.length; i++) {
embeddings.push(response.data[i].embedding)
}
return embeddings
}
sourceColumn: string
}

View File

@@ -13,7 +13,7 @@
// limitations under the License.
import { Connection } from "./connection";
import { Connection as NativeConnection, ConnectionOptions } from "./native.js";
import { Connection as LanceDbConnection, ConnectionOptions } from "./native.js";
export {
ConnectionOptions,
@@ -23,7 +23,6 @@ export {
} from "./native.js";
export { Connection } from "./connection";
export { Table } from "./table";
export { Data } from "./arrow";
export { IvfPQOptions, IndexBuilder } from "./indexer";
/**
@@ -39,26 +38,9 @@ export { IvfPQOptions, IndexBuilder } from "./indexer";
*
* @see {@link ConnectionOptions} for more details on the URI format.
*/
export async function connect(uri: string): Promise<Connection>;
export async function connect(
opts: Partial<ConnectionOptions>
): Promise<Connection>;
export async function connect(
args: string | Partial<ConnectionOptions>
): Promise<Connection> {
let opts: ConnectionOptions;
if (typeof args === "string") {
opts = { uri: args };
} else {
opts = Object.assign(
{
uri: "",
apiKey: undefined,
hostOverride: undefined,
},
args
);
}
const nativeConn = await NativeConnection.new(opts);
export async function connect(uri: string, opts?: Partial<ConnectionOptions>): Promise<Connection>
{
opts = opts ?? {};
const nativeConn = await LanceDbConnection.new(uri, opts);
return new Connection(nativeConn);
}

View File

@@ -45,7 +45,6 @@ export interface AddColumnsSql {
valueSql: string
}
export interface ConnectionOptions {
uri: string
apiKey?: string
hostOverride?: string
/**
@@ -71,10 +70,13 @@ export const enum WriteMode {
export interface WriteOptions {
mode?: WriteMode
}
export function connect(options: ConnectionOptions): Promise<Connection>
export function connect(uri: string, options: ConnectionOptions): Promise<Connection>
export class Connection {
/** Create a new Connection instance from the given URI. */
static new(options: ConnectionOptions): Promise<Connection>
static new(uri: string, options: ConnectionOptions): Promise<Connection>
display(): string
isOpen(): boolean
close(): void
/** List all tables in the dataset. */
tableNames(): Promise<Array<string>>
/**
@@ -86,6 +88,7 @@ export class Connection {
*
*/
createTable(name: string, buf: Buffer, mode: string): Promise<Table>
createEmptyTable(name: string, schemaBuf: Buffer, mode: string): Promise<Table>
openTable(name: string): Promise<Table>
/** Drop table with the name. Or raise an error if the table does not exist. */
dropTable(name: string): Promise<void>
@@ -114,9 +117,12 @@ export class Query {
executeStream(): Promise<RecordBatchIterator>
}
export class Table {
display(): string
isOpen(): boolean
close(): void
/** Return Schema as empty Arrow IPC file. */
schema(): Promise<Buffer>
add(buf: Buffer): Promise<void>
add(buf: Buffer, mode: string): Promise<void>
countRows(filter?: string | undefined | null): Promise<number>
delete(predicate: string): Promise<void>
createIndex(): IndexBuilder

View File

@@ -14,14 +14,32 @@
import { Schema, tableFromIPC } from "apache-arrow";
import { AddColumnsSql, ColumnAlteration, Table as _NativeTable } from "./native";
import { toBuffer, Data } from "./arrow";
import { Query } from "./query";
import { IndexBuilder } from "./indexer";
import { Data, fromDataToBuffer } from "./arrow";
/**
* A LanceDB Table is the collection of Records.
* Options for adding data to a table.
*/
export interface AddDataOptions {
/** If "append" (the default) then the new data will be added to the table
*
* If "overwrite" then the new data will replace the existing data in the table.
*/
mode: "append" | "overwrite";
}
/**
* A Table is a collection of Records in a LanceDB Database.
*
* Each Record has one or more vector fields.
* A Table object is expected to be long lived and reused for multiple operations.
* Table objects will cache a certain amount of index data in memory. This cache
* will be freed when the Table is garbage collected. To eagerly free the cache you
* can call the `close` method. Once the Table is closed, it cannot be used for any
* further operations.
*
* Closing a table is optional. It not closed, it will be closed when it is garbage
* collected.
*/
export class Table {
private readonly inner: _NativeTable;
@@ -31,6 +49,27 @@ export class Table {
this.inner = inner;
}
/** Return true if the table has not been closed */
isOpen(): boolean {
return this.inner.isOpen();
}
/** Close the table, releasing any underlying resources.
*
* It is safe to call this method multiple times.
*
* Any attempt to use the table after it is closed will result in an error.
*/
close(): void {
this.inner.close();
}
/** Return a brief description of the table */
display(): string {
return this.inner.display();
}
/** Get the schema of the table. */
async schema(): Promise<Schema> {
const schemaBuf = await this.inner.schema();
@@ -44,9 +83,11 @@ export class Table {
* @param {Data} data Records to be inserted into the Table
* @return The number of rows added to the table
*/
async add(data: Data): Promise<void> {
const buffer = toBuffer(data);
await this.inner.add(buffer);
async add(data: Data, options?: Partial<AddDataOptions>): Promise<void> {
let mode = options?.mode ?? "append";
const buffer = await fromDataToBuffer(data);
await this.inner.add(buffer, mode);
}
/** Count the total number of rows in the dataset. */