mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 07:09:57 +00:00
This will start publishing `@lancedb/lancedb` with the new nodejs package on our releases.
376 lines
14 KiB
TypeScript
376 lines
14 KiB
TypeScript
// Copyright 2024 Lance Developers.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
import { RecordBatch, tableFromIPC, Table as ArrowTable } from "apache-arrow";
|
|
import {
|
|
RecordBatchIterator as NativeBatchIterator,
|
|
Query as NativeQuery,
|
|
Table as NativeTable,
|
|
VectorQuery as NativeVectorQuery,
|
|
} from "./native";
|
|
import { type IvfPqOptions } from "./indices";
|
|
/**
 * Adapts a native batch iterator to the standard `AsyncIterator` protocol.
 *
 * The native iterator is supplied as a promise and is only awaited on the
 * first call to {@link RecordBatchIterator.next}. Every value it yields is
 * decoded with `tableFromIPC` and must contain exactly one record batch.
 */
export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
  private promisedInner?: Promise<NativeBatchIterator>;
  private inner?: NativeBatchIterator;

  /**
   * @param promise - Resolves to the native iterator to read from. When it is
   *   omitted there is nothing to iterate and `next()` rejects.
   */
  constructor(promise?: Promise<NativeBatchIterator>) {
    this.promisedInner = promise;
  }

  /**
   * Fetch the next record batch.
   *
   * @returns `{ done: false, value }` holding the decoded batch, or
   *   `{ done: true, value: null }` once the native iterator yields `null`
   *   or `undefined`.
   * @throws Error if no native iterator is available, or if a decoded message
   *   does not contain exactly one record batch.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  async next(): Promise<IteratorResult<RecordBatch<any>>> {
    // Resolve the native iterator lazily and cache it for subsequent calls.
    if (this.inner === undefined) {
      this.inner = await this.promisedInner;
    }
    if (this.inner === undefined) {
      throw new Error("Invalid iterator state");
    }

    const message = await this.inner.next();
    // `== null` deliberately matches both `null` and `undefined`.
    if (message == null) {
      return { done: true, value: null };
    }

    const tbl = tableFromIPC(message);
    if (tbl.batches.length !== 1) {
      throw new Error("Expected only one batch");
    }
    return { done: false, value: tbl.batches[0] };
  }
}
|
|
/* eslint-enable */
|
|
|
|
/**
 * Builder methods shared by every kind of query.
 *
 * Each builder method forwards its argument to the wrapped native query and
 * returns `this` (typed as `QueryType`) so that calls can be chained.
 * Iterating an instance with `for await` executes the query and yields
 * record batches.
 */
export class QueryBase<
    NativeQueryType extends NativeQuery | NativeVectorQuery,
    QueryType,
  >
  implements AsyncIterable<RecordBatch>
{
  protected constructor(protected inner: NativeQueryType) {}

  /**
   * Restrict the query to rows matching a filter.
   *
   * The filter is written as an SQL expression string. For example:
   * @example
   * x > 10
   * y > 0 AND y < 100
   * x > 5 OR y = 'test'
   *
   * A scalar index on the filtered column(s) can often make filtering faster.
   */
  where(predicate: string): QueryType {
    this.inner.onlyIf(predicate);
    return this as unknown as QueryType;
  }

  /**
   * Choose which columns the query returns.
   *
   * Without a selection, a query returns every column of the table, which can
   * have a very significant impact on latency. LanceDB stores data in a
   * columnar fashion, so I/O can be tuned to read exactly the columns that
   * are needed. As a best practice, always limit queries to the columns you
   * need. Passing an array of column names returns only those columns.
   *
   * This method can also create new "dynamic" columns computed from existing
   * ones. For example, you may not care about "a" or "b" but only want
   * "a + b", as is common in the SELECT clause of an SQL query
   * (e.g. `SELECT a+b FROM my_table`).
   *
   * To create dynamic columns pass a `Map<string, string>`. One column is
   * returned per entry: the key is the name of the column and the value is
   * the SQL string used to calculate it.
   *
   * For example, the SQL query `SELECT a + b AS combined, c` corresponds to:
   * @example
   * new Map([["combined", "a + b"], ["c", "c"]])
   *
   * Columns are always returned in the order given, even if that differs
   * from the order used when the data was added.
   *
   * A `Record<string, string>` (e.g. an object literal) is accepted as well.
   * It is read with `Object.entries`, which should preserve the insertion
   * order of the object. However, object insertion order is easy to get
   * wrong and `Map` is more foolproof.
   */
  select(
    columns: string[] | Map<string, string> | Record<string, string>,
  ): QueryType {
    // Normalize every accepted input shape into [outputName, expression] pairs.
    let projection: [string, string][];
    if (columns instanceof Map) {
      projection = Array.from(columns.entries());
    } else if (Array.isArray(columns)) {
      // A plain column name selects itself.
      projection = columns.map((name) => [name, name]);
    } else {
      projection = Object.entries(columns);
    }
    this.inner.select(projection);
    return this as unknown as QueryType;
  }

  /**
   * Cap the number of results the query returns.
   *
   * A plain search has no limit by default: if this method is never called,
   * every valid row of the table is returned.
   */
  limit(limit: number): QueryType {
    this.inner.limit(limit);
    return this as unknown as QueryType;
  }

  /** Start the wrapped native query and return its pending batch iterator. */
  protected nativeExecute(): Promise<NativeBatchIterator> {
    return this.inner.execute();
  }

  /**
   * Execute the query and return the results as a
   * {@link RecordBatchIterator} of record batches.
   *
   * By default, LanceDB uses many threads to calculate results and, when the
   * result set is large, processes multiple batches at one time. This
   * readahead is limited, however, and backpressure is applied if the stream
   * is consumed slowly (this constrains the maximum memory used by a single
   * query).
   */
  protected execute(): RecordBatchIterator {
    return new RecordBatchIterator(this.nativeExecute());
  }

  /** Execute the query; this is what makes `for await (... of query)` work. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  [Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>> {
    return new RecordBatchIterator(this.nativeExecute());
  }

  /** Run the query and gather every batch into a single Arrow table. */
  async toArrow(): Promise<ArrowTable> {
    const collected = [];
    for await (const recordBatch of this) {
      collected.push(recordBatch);
    }
    return new ArrowTable(collected);
  }

  /** Run the query and return the rows as an array of objects. */
  async toArray(): Promise<unknown[]> {
    const arrowTable = await this.toArrow();
    // eslint-disable-next-line @typescript-eslint/no-unsafe-return
    return arrowTable.toArray();
  }
}
|
|
|
|
/**
|
|
* An interface for a query that can be executed
|
|
*
|
|
* Supported by all query types
|
|
*/
|
|
// NOTE(review): this interface currently declares no members, so it places no
// compile-time requirements on implementers — confirm whether methods are
// still to be added here.
export interface ExecutableQuery {}
|
|
|
|
/**
 * Builder for a vector (nearest neighbor) search.
 *
 * The same builder can be executed any number of times.
 */
export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
  // The base class constructor is protected; this public constructor lets a
  // native vector query be wrapped directly.
  constructor(inner: NativeVectorQuery) {
    super(inner);
  }

  /**
   * Set how many partitions are searched (probed).
   *
   * Only used when the vector column has an IVF PQ index; without an index
   * the value is ignored.
   *
   * The IVF stage of IVF PQ divides the input into partitions (clusters) of
   * related values. The partitions whose centroids are closest to the query
   * vector are exhaustively searched for matches, and this parameter controls
   * how many partitions are searched.
   *
   * A larger value increases the recall of the query but also its latency.
   * The default value is 20, which is good for many cases, but the best value
   * depends on your data and the recall you need to achieve.
   *
   * For best results, tune this parameter with a benchmark against your
   * actual data to find the smallest value that still gives the desired
   * recall.
   */
  nprobes(nprobes: number): VectorQuery {
    this.inner.nprobes(nprobes);
    return this;
  }

  /**
   * Choose the vector column to search.
   *
   * This is the column compared against the query vector supplied in the
   * call to {@link Query#nearestTo}.
   *
   * It must be specified if the table has more than one column whose data
   * type is a fixed-size-list of floats.
   */
  column(column: string): VectorQuery {
    this.inner.column(column);
    return this;
  }

  /**
   * Choose the distance metric.
   *
   * A vector search looks for the "nearest" vectors according to some kind of
   * distance metric, and this parameter controls which metric is used. See
   * {@link IvfPqOptions.distanceType} for details on the available metrics.
   *
   * Note: if there is a vector index, the distance type used here MUST match
   * the distance type used to train the index. Otherwise the results will be
   * invalid.
   *
   * By default "l2" is used.
   */
  distanceType(distanceType: string): VectorQuery {
    this.inner.distanceType(distanceType);
    return this;
  }

  /**
   * Set a multiplier controlling how many additional rows are taken during
   * the refine step.
   *
   * Only used when the vector column has an IVF PQ index; without an index
   * the value is ignored.
   *
   * An IVF PQ index stores compressed (quantized) values. The query vector is
   * compared against these values and, since they are compressed, the
   * comparison is inaccurate.
   *
   * This parameter can be used to refine the results. It can both improve
   * recall and correct the ordering of the nearest results.
   *
   * To refine results, LanceDB first performs an ANN search to find the
   * nearest `limit` * `refine_factor` results. In other words, if
   * `refine_factor` is 3 and `limit` is the default (10), the first 30
   * results are selected. LanceDB then fetches the full, uncompressed values
   * for these 30 results, reorders them by true distance and keeps only the
   * nearest 10.
   *
   * Note: calling this method with a value of 1 is different from never
   * calling it. Calling it with any value has an impact on search latency:
   * even with a `refine_factor` of 1, LanceDB still needs to fetch the full,
   * uncompressed values so that it can potentially reorder the results.
   *
   * Note: if this method is NOT called, the distances returned in the
   * _distance column are approximate distances based on comparing the
   * quantized query vector with the quantized result vectors. They can differ
   * considerably from the true distance between the query vector and the
   * actual uncompressed vector.
   */
  refineFactor(refineFactor: number): VectorQuery {
    this.inner.refineFactor(refineFactor);
    return this;
  }

  /**
   * Apply the filter after the vector search instead of before it.
   *
   * By default filtering is performed before the vector search, which is how
   * filtering is typically understood to work. This prefilter step adds some
   * latency. A scalar index on the filter column(s) can often improve that
   * latency, but sometimes a filter is too complex or scalar indices cannot
   * be applied to the column. In those cases postfiltering can be used
   * instead of prefiltering to improve latency.
   *
   * Post filtering applies the filter to the results of the vector search, so
   * the filter runs on a much smaller set of data. However, the query may
   * then return fewer than `limit` results (or even none) if none of the
   * nearest results match the filter.
   *
   * Post filtering happens during the "refine stage" (described in more
   * detail in {@link VectorQuery#refineFactor}), so a higher refine factor
   * can often help restore some of the results lost by post filtering.
   */
  postfilter(): VectorQuery {
    this.inner.postfilter();
    return this;
  }

  /**
   * Skip any vector index.
   *
   * An exhaustive (flat) search is performed: the query vector is compared to
   * every vector in the table. At high scales this can be expensive, but it
   * is often still useful. For example, skipping the vector index gives
   * ground truth results that can be used to calculate recall and select an
   * appropriate value for nprobes.
   */
  bypassVectorIndex(): VectorQuery {
    this.inner.bypassVectorIndex();
    return this;
  }
}
|
|
|
|
/** A builder for LanceDB queries. */
export class Query extends QueryBase<NativeQuery, Query> {
  /**
   * @param tbl - The native table to query; the builder wraps the native
   *   query returned by `tbl.query()`.
   */
  constructor(tbl: NativeTable) {
    super(tbl.query());
  }

  /**
   * Find the nearest vectors to the given query vector.
   *
   * This converts the query from a plain query to a vector query.
   *
   * This method will attempt to convert the input to the query vector
   * expected by the embedding model. If the input cannot be converted
   * then an error will be thrown.
   *
   * By default, there is no embedding model, and the input should be
   * an array-like object of numbers (something that can be used as input
   * to Float32Array.from).
   *
   * If there is only one vector column (a column whose data type is a
   * fixed size list of floats) then the column does not need to be specified.
   * If there is more than one vector column you must use
   * {@link VectorQuery#column} to specify which column you would like
   * to compare with.
   *
   * If no index has been created on the vector column then a vector query
   * will perform a distance comparison between the query vector and every
   * vector in the database and then sort the results. This is sometimes
   * called a "flat search".
   *
   * For small databases, with a few hundred thousand vectors or less, this can
   * be reasonably fast. In larger databases you should create a vector index
   * on the column. If there is a vector index then an "approximate" nearest
   * neighbor search (frequently called an ANN search) will be performed. This
   * search is much faster, but the results will be approximate.
   *
   * The query can be further parameterized using the returned builder. There
   * are various ANN search parameters that will let you fine tune your recall
   * accuracy vs search latency.
   *
   * Vector searches always have a `limit`. If `limit` has not been called then
   * a default `limit` of 10 will be used. See {@link Query#limit}.
   *
   * @param vector - An iterable or array-like collection of numbers.
   * @returns A vector query builder wrapping the native vector query.
   * @throws Error if the input converts to an empty vector. `Float32Array.from`
   *   itself throws a `TypeError` for `null` and `undefined`.
   */
  nearestTo(vector: unknown): VectorQuery {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const queryVector = Float32Array.from(vector as any);
    // Float32Array.from silently produces an empty array for values that are
    // neither iterable nor array-like (e.g. a number or a plain object).
    // Reject that here so unconvertible input fails as documented instead of
    // sending an empty query vector to the native layer.
    if (queryVector.length === 0) {
      throw new Error(
        "nearestTo requires a non-empty array-like or iterable of numbers",
      );
    }
    return new VectorQuery(this.inner.nearestTo(queryVector));
  }
}
|