feat: make it possible to opt in to using the v2 format (#1352)

This also exposes the max_batch_length configuration option in
Python/Node (it was needed to verify whether we are actually in v2 mode
or not).
This commit is contained in:
Weston Pace
2024-06-04 21:52:14 -07:00
committed by GitHub
parent d39e7d23f4
commit d5586c9c32
17 changed files with 310 additions and 33 deletions

View File

@@ -71,6 +71,12 @@ export interface CreateTableOptions {
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
*/
storageOptions?: Record<string, string>;
/**
* If true then data files will be written with the legacy format
*
* The default is true while the new format is in beta
*/
useLegacyFormat?: boolean;
schema?: Schema;
embeddingFunction?: EmbeddingFunctionConfig;
}
@@ -221,6 +227,7 @@ export class Connection {
buf,
mode,
cleanseStorageOptions(options?.storageOptions),
options?.useLegacyFormat,
);
return new Table(innerTable);
@@ -256,6 +263,7 @@ export class Connection {
buf,
mode,
cleanseStorageOptions(options?.storageOptions),
options?.useLegacyFormat,
);
return new Table(innerTable);
}

View File

@@ -55,6 +55,39 @@ export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
}
/* eslint-enable */
/**
 * Async-iterable adapter around a native query.
 *
 * Every call to `[Symbol.asyncIterator]` invokes `execute` on the wrapped
 * native query, forwarding the `maxBatchLength` execution option (or
 * `undefined` when no options were supplied), and wraps the result in a
 * {@link RecordBatchIterator}.
 */
class RecordBatchIterable<
  NativeQueryType extends NativeQuery | NativeVectorQuery,
> implements AsyncIterable<RecordBatch>
{
  /** The native query that is executed on each iteration request. */
  private inner: NativeQueryType;
  /** Optional per-execution settings forwarded to the native query. */
  private options?: QueryExecutionOptions;

  /**
   * @param inner - the native query to execute
   * @param options - optional execution settings; may be omitted
   */
  constructor(inner: NativeQueryType, options?: QueryExecutionOptions) {
    this.inner = inner;
    this.options = options;
  }

  /**
   * Execute the wrapped query and expose its batches as an async iterator.
   *
   * @returns a {@link RecordBatchIterator} over the pending native result
   */
  // biome-ignore lint/suspicious/noExplicitAny: skip
  [Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>, any, undefined> {
    const pendingBatches = this.inner.execute(this.options?.maxBatchLength);
    return new RecordBatchIterator(pendingBatches);
  }
}
/**
 * Options that control the behavior of a particular query execution.
 *
 * Every field is optional; an omitted field is forwarded to the native
 * query as `undefined`.
 */
export interface QueryExecutionOptions {
/**
 * The maximum number of rows to return in a single batch.
 *
 * Batches may have fewer rows if the underlying data is stored
 * in smaller chunks.
 *
 * NOTE(review): the value used when this is omitted is chosen by the
 * native `execute` implementation, which is not visible here — confirm.
 */
maxBatchLength?: number;
}
/** Common methods supported by all query types */
export class QueryBase<
NativeQueryType extends NativeQuery | NativeVectorQuery,
@@ -141,8 +174,10 @@ export class QueryBase<
return this as unknown as QueryType;
}
protected nativeExecute(): Promise<NativeBatchIterator> {
return this.inner.execute();
protected nativeExecute(
options?: Partial<QueryExecutionOptions>,
): Promise<NativeBatchIterator> {
return this.inner.execute(options?.maxBatchLength);
}
/**
@@ -156,8 +191,10 @@ export class QueryBase<
* single query)
*
*/
protected execute(): RecordBatchIterator {
return new RecordBatchIterator(this.nativeExecute());
protected execute(
options?: Partial<QueryExecutionOptions>,
): RecordBatchIterator {
return new RecordBatchIterator(this.nativeExecute(options));
}
// biome-ignore lint/suspicious/noExplicitAny: skip
@@ -167,9 +204,9 @@ export class QueryBase<
}
/** Collect the results as an Arrow {@link ArrowTable}. */
async toArrow(): Promise<ArrowTable> {
async toArrow(options?: Partial<QueryExecutionOptions>): Promise<ArrowTable> {
const batches = [];
for await (const batch of this) {
for await (const batch of new RecordBatchIterable(this.inner, options)) {
batches.push(batch);
}
return new ArrowTable(batches);
@@ -177,9 +214,8 @@ export class QueryBase<
/** Collect the results as an array of objects. */
// biome-ignore lint/suspicious/noExplicitAny: arrow.toArrow() returns any[]
async toArray(): Promise<any[]> {
const tbl = await this.toArrow();
async toArray(options?: Partial<QueryExecutionOptions>): Promise<any[]> {
const tbl = await this.toArrow(options);
return tbl.toArray();
}
}