feat: refactor the query API and add query support to the python async API (#1113)

In addition, there are also a number of changes in nodejs to the
docstrings of existing methods because this PR adds a jsdoc linter.
This commit is contained in:
Weston Pace
2024-03-18 12:36:49 -07:00
parent 2db257ca29
commit 4180b44472
38 changed files with 2609 additions and 754 deletions

View File

@@ -129,11 +129,25 @@ describe("When creating an index", () => {
});
// Search without specifying the column
const rst = await tbl.query().nearestTo(queryVec).limit(2).toArrow();
let rst = await tbl
.query()
.limit(2)
.nearestTo(queryVec)
.distanceType("DoT")
.toArrow();
expect(rst.numRows).toBe(2);
// Search using `vectorSearch`
rst = await tbl.vectorSearch(queryVec).limit(2).toArrow();
expect(rst.numRows).toBe(2);
// Search with specifying the column
const rst2 = await tbl.search(queryVec, "vec").limit(2).toArrow();
const rst2 = await tbl
.query()
.limit(2)
.nearestTo(queryVec)
.column("vec")
.toArrow();
expect(rst2.numRows).toBe(2);
expect(rst.toString()).toEqual(rst2.toString());
});
@@ -163,7 +177,7 @@ describe("When creating an index", () => {
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
expect(fs.readdirSync(indexDir)).toHaveLength(1);
for await (const r of tbl.query().filter("id > 1").select(["id"])) {
for await (const r of tbl.query().where("id > 1").select(["id"])) {
expect(r.numRows).toBe(298);
}
});
@@ -205,33 +219,39 @@ describe("When creating an index", () => {
const rst = await tbl
.query()
.limit(2)
.nearestTo(
Array(32)
.fill(1)
.map(() => Math.random()),
)
.limit(2)
.toArrow();
expect(rst.numRows).toBe(2);
// Search with specifying the column
await expect(
tbl
.search(
.query()
.limit(2)
.nearestTo(
Array(64)
.fill(1)
.map(() => Math.random()),
"vec",
)
.limit(2)
.column("vec")
.toArrow(),
).rejects.toThrow(/.*does not match the dimension.*/);
const query64 = Array(64)
.fill(1)
.map(() => Math.random());
const rst64Query = await tbl.query().nearestTo(query64).limit(2).toArrow();
const rst64Search = await tbl.search(query64, "vec2").limit(2).toArrow();
const rst64Query = await tbl.query().limit(2).nearestTo(query64).toArrow();
const rst64Search = await tbl
.query()
.limit(2)
.nearestTo(query64)
.column("vec2")
.toArrow();
expect(rst64Query.toString()).toEqual(rst64Search.toString());
expect(rst64Query.numRows).toBe(2);
});

View File

@@ -4,14 +4,25 @@
const eslint = require("@eslint/js");
const tseslint = require("typescript-eslint");
const eslintConfigPrettier = require("eslint-config-prettier");
const jsdoc = require("eslint-plugin-jsdoc");
module.exports = tseslint.config(
eslint.configs.recommended,
jsdoc.configs["flat/recommended"],
eslintConfigPrettier,
...tseslint.configs.recommended,
{
rules: {
"@typescript-eslint/naming-convention": "error",
"jsdoc/require-returns": "off",
"jsdoc/require-param": "off",
"jsdoc/require-jsdoc": [
"error",
{
publicOnly: true,
},
],
},
plugins: jsdoc,
},
);

View File

@@ -31,6 +31,7 @@ import {
DataType,
Binary,
Float32,
type makeTable,
} from "apache-arrow";
import { type EmbeddingFunction } from "./embedding/embedding_function";
import { sanitizeSchema } from "./sanitize";
@@ -128,14 +129,7 @@ export class MakeArrowTableOptions {
* - Buffer => Binary
* - Record<String, any> => Struct
* - Array<any> => List
*
* @param data input data
* @param options options to control the makeArrowTable call.
*
* @example
*
* ```ts
*
* import { fromTableToBuffer, makeArrowTable } from "../arrow";
* import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
*
@@ -307,7 +301,9 @@ export function makeEmptyTable(schema: Schema): ArrowTable {
return makeArrowTable([], { schema });
}
// Helper function to convert Array<Array<any>> to a variable sized list array
/**
* Helper function to convert Array<Array<any>> to a variable sized list array
*/
// @ts-expect-error (Vector<unknown> is not assignable to Vector<any>)
function makeListVector(lists: unknown[][]): Vector<unknown> {
if (lists.length === 0 || lists[0].length === 0) {
@@ -333,7 +329,7 @@ function makeListVector(lists: unknown[][]): Vector<unknown> {
return listBuilder.finish().toVector();
}
// Helper function to convert an Array of JS values to an Arrow Vector
/** Helper function to convert an Array of JS values to an Arrow Vector */
function makeVector(
values: unknown[],
type?: DataType,
@@ -374,6 +370,7 @@ function makeVector(
}
}
/** Helper function to apply embeddings to an input table */
async function applyEmbeddings<T>(
table: ArrowTable,
embeddings?: EmbeddingFunction<T>,
@@ -466,7 +463,7 @@ async function applyEmbeddings<T>(
return newTable;
}
/*
/**
* Convert an Array of records into an Arrow Table, optionally applying an
* embeddings function to it.
*
@@ -493,7 +490,7 @@ export async function convertToTable<T>(
return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
}
// Creates the Arrow Type for a Vector column with dimension `dim`
/** Creates the Arrow Type for a Vector column with dimension `dim` */
function newVectorType<T extends Float>(
dim: number,
innerType: T,
@@ -565,6 +562,14 @@ export async function fromTableToBuffer<T>(
return Buffer.from(await writer.toUint8Array());
}
/**
* Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
*
* This function will apply `embeddings` to the table in a manner similar to
* `convertToTable`.
*
* `schema` is required if the table is empty
*/
export async function fromDataToBuffer<T>(
data: Data,
embeddings?: EmbeddingFunction<T>,
@@ -599,6 +604,9 @@ export async function fromTableToStreamBuffer<T>(
return Buffer.from(await writer.toUint8Array());
}
/**
* Reorder the columns in `batch` so that they agree with the field order in `schema`
*/
function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
const alignedChildren = [];
for (const field of schema.fields) {
@@ -621,6 +629,9 @@ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
return new RecordBatch(schema, newData);
}
/**
* Reorder the columns in `table` so that they agree with the field order in `schema`
*/
function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
const alignedBatches = table.batches.map((batch) =>
alignBatch(batch, schema),
@@ -628,7 +639,9 @@ function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
return new ArrowTable(schema, alignedBatches);
}
// Creates an empty Arrow Table
/**
* Create an empty table with the given schema
*/
export function createEmptyTable(schema: Schema): ArrowTable {
return new ArrowTable(sanitizeSchema(schema));
}

View File

@@ -78,7 +78,8 @@ export class Connection {
return this.inner.isOpen();
}
/** Close the connection, releasing any underlying resources.
/**
* Close the connection, releasing any underlying resources.
*
* It is safe to call this method multiple times.
*
@@ -93,11 +94,12 @@ export class Connection {
return this.inner.display();
}
/** List all the table names in this database.
/**
* List all the table names in this database.
*
* Tables will be returned in lexicographical order.
*
* @param options Optional parameters to control the listing.
* @param {Partial<TableNamesOptions>} options - options to control the
* paging / start point
*/
async tableNames(options?: Partial<TableNamesOptions>): Promise<string[]> {
return this.inner.tableNames(options?.startAfter, options?.limit);
@@ -105,9 +107,7 @@ export class Connection {
/**
* Open a table in the database.
*
* @param name The name of the table.
* @param embeddings An embedding function to use on this table
* @param {string} name - The name of the table
*/
async openTable(name: string): Promise<Table> {
const innerTable = await this.inner.openTable(name);
@@ -116,9 +116,9 @@ export class Connection {
/**
* Creates a new Table and initialize it with new data.
*
* @param {string} name - The name of the table.
* @param data - Non-empty Array of Records to be inserted into the table
* @param {Record<string, unknown>[] | ArrowTable} data - Non-empty Array of Records
* to be inserted into the table
*/
async createTable(
name: string,
@@ -145,9 +145,8 @@ export class Connection {
/**
* Creates a new empty Table
*
* @param {string} name - The name of the table.
* @param schema - The schema of the table
* @param {Schema} schema - The schema of the table
*/
async createEmptyTable(
name: string,
@@ -169,7 +168,7 @@ export class Connection {
/**
* Drop an existing table.
* @param name The name of the table to drop.
* @param {string} name The name of the table to drop.
*/
async dropTable(name: string): Promise<void> {
return this.inner.dropTable(name);

View File

@@ -62,6 +62,7 @@ export interface EmbeddingFunction<T> {
embed: (data: T[]) => Promise<number[][]>;
}
/** Test if the input seems to be an embedding function */
export function isEmbeddingFunction<T>(
value: unknown,
): value is EmbeddingFunction<T> {

View File

@@ -30,9 +30,8 @@ export { Table, AddDataOptions } from "./table";
* - `/path/to/database` - local database
* - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
* - `db://host:port` - remote database (LanceDB cloud)
*
* @param uri The uri of the database. If the database uri starts with `db://` then it connects to a remote database.
*
* @param {string} uri - The uri of the database. If the database uri starts
* with `db://` then it connects to a remote database.
* @see {@link ConnectionOptions} for more details on the URI format.
*/
export async function connect(

View File

@@ -18,7 +18,8 @@ import { Index as LanceDbIndex } from "./native";
* Options to create an `IVF_PQ` index
*/
export interface IvfPqOptions {
/** The number of IVF partitions to create.
/**
* The number of IVF partitions to create.
*
* This value should generally scale with the number of rows in the dataset.
* By default the number of partitions is the square root of the number of
@@ -30,7 +31,8 @@ export interface IvfPqOptions {
*/
numPartitions?: number;
/** Number of sub-vectors of PQ.
/**
* Number of sub-vectors of PQ.
*
* This value controls how much the vector is compressed during the quantization step.
* The more sub vectors there are the less the vector is compressed. The default is
@@ -45,9 +47,10 @@ export interface IvfPqOptions {
*/
numSubVectors?: number;
/** [DistanceType] to use to build the index.
/**
* Distance type to use to build the index.
*
* Default value is [DistanceType::L2].
* Default value is "l2".
*
* This is used when training the index to calculate the IVF partitions
* (vectors are grouped in partitions with similar vectors according to this
@@ -79,7 +82,8 @@ export interface IvfPqOptions {
*/
distanceType?: "l2" | "cosine" | "dot";
/** Max iteration to train IVF kmeans.
/**
* Max iteration to train IVF kmeans.
*
* When training an IVF PQ index we use kmeans to calculate the partitions. This parameter
* controls how many iterations of kmeans to run.
@@ -91,7 +95,8 @@ export interface IvfPqOptions {
*/
maxIterations?: number;
/** The number of vectors, per partition, to sample when training IVF kmeans.
/**
* The number of vectors, per partition, to sample when training IVF kmeans.
*
* When an IVF PQ index is trained, we need to calculate partitions. These are groups
* of vectors that are similar to each other. To do this we use an algorithm called kmeans.
@@ -148,7 +153,8 @@ export class Index {
);
}
/** Create a btree index
/**
* Create a btree index
*
* A btree index is an index on a scalar column. The index stores a copy of the column
* in sorted order. A header entry is created for each block of rows (currently the
@@ -172,7 +178,8 @@ export class Index {
}
export interface IndexOptions {
/** Advanced index configuration
/**
* Advanced index configuration
*
* This option allows you to specify a specific index to create and also
* allows you to pass in configuration for training the index.
@@ -183,7 +190,8 @@ export interface IndexOptions {
* will be used to determine the most useful kind of index to create.
*/
config?: Index;
/** Whether to replace the existing index
/**
* Whether to replace the existing index
*
* If this is false, and another index already exists on the same columns
* and the same name, then an error will be returned. This is true even if

View File

@@ -105,15 +105,23 @@ export class RecordBatchIterator {
next(): Promise<Buffer | null>
}
export class Query {
column(column: string): void
filter(filter: string): void
select(columns: Array<string>): void
onlyIf(predicate: string): void
select(columns: Array<[string, string]>): void
limit(limit: number): void
prefilter(prefilter: boolean): void
nearestTo(vector: Float32Array): void
nearestTo(vector: Float32Array): VectorQuery
execute(): Promise<RecordBatchIterator>
}
export class VectorQuery {
column(column: string): void
distanceType(distanceType: string): void
postfilter(): void
refineFactor(refineFactor: number): void
nprobes(nprobe: number): void
executeStream(): Promise<RecordBatchIterator>
bypassVectorIndex(): void
onlyIf(predicate: string): void
select(columns: Array<[string, string]>): void
limit(limit: number): void
execute(): Promise<RecordBatchIterator>
}
export class Table {
display(): string
@@ -127,6 +135,7 @@ export class Table {
createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise<void>
update(onlyIf: string | undefined | null, columns: Array<[string, string]>): Promise<void>
query(): Query
vectorSearch(vector: Float32Array): VectorQuery
addColumns(transforms: Array<AddColumnsSql>): Promise<void>
alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
dropColumns(columns: Array<string>): Promise<void>

View File

@@ -5,302 +5,325 @@
/* auto-generated by NAPI-RS */
const { existsSync, readFileSync } = require('fs')
const { join } = require('path')
const { join } = require("path");
const { platform, arch } = process
const { platform, arch } = process;
let nativeBinding = null
let localFileExisted = false
let loadError = null
let nativeBinding = null;
let localFileExisted = false;
let loadError = null;
function isMusl() {
// For Node 10
if (!process.report || typeof process.report.getReport !== 'function') {
if (!process.report || typeof process.report.getReport !== "function") {
try {
const lddPath = require('child_process').execSync('which ldd').toString().trim()
return readFileSync(lddPath, 'utf8').includes('musl')
const lddPath = require("child_process")
.execSync("which ldd")
.toString()
.trim();
return readFileSync(lddPath, "utf8").includes("musl");
} catch (e) {
return true
return true;
}
} else {
const { glibcVersionRuntime } = process.report.getReport().header
return !glibcVersionRuntime
const { glibcVersionRuntime } = process.report.getReport().header;
return !glibcVersionRuntime;
}
}
switch (platform) {
case 'android':
case "android":
switch (arch) {
case 'arm64':
localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.android-arm64.node'))
case "arm64":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.android-arm64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.android-arm64.node')
nativeBinding = require("./lancedb-nodejs.android-arm64.node");
} else {
nativeBinding = require('lancedb-android-arm64')
nativeBinding = require("lancedb-android-arm64");
}
} catch (e) {
loadError = e
loadError = e;
}
break
case 'arm':
localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.android-arm-eabi.node'))
break;
case "arm":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.android-arm-eabi.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.android-arm-eabi.node')
nativeBinding = require("./lancedb-nodejs.android-arm-eabi.node");
} else {
nativeBinding = require('lancedb-android-arm-eabi')
nativeBinding = require("lancedb-android-arm-eabi");
}
} catch (e) {
loadError = e
loadError = e;
}
break
break;
default:
throw new Error(`Unsupported architecture on Android ${arch}`)
throw new Error(`Unsupported architecture on Android ${arch}`);
}
break
case 'win32':
break;
case "win32":
switch (arch) {
case 'x64':
case "x64":
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.win32-x64-msvc.node')
)
join(__dirname, "lancedb-nodejs.win32-x64-msvc.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.win32-x64-msvc.node')
nativeBinding = require("./lancedb-nodejs.win32-x64-msvc.node");
} else {
nativeBinding = require('lancedb-win32-x64-msvc')
nativeBinding = require("lancedb-win32-x64-msvc");
}
} catch (e) {
loadError = e
loadError = e;
}
break
case 'ia32':
break;
case "ia32":
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.win32-ia32-msvc.node')
)
join(__dirname, "lancedb-nodejs.win32-ia32-msvc.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.win32-ia32-msvc.node')
nativeBinding = require("./lancedb-nodejs.win32-ia32-msvc.node");
} else {
nativeBinding = require('lancedb-win32-ia32-msvc')
nativeBinding = require("lancedb-win32-ia32-msvc");
}
} catch (e) {
loadError = e
loadError = e;
}
break
case 'arm64':
break;
case "arm64":
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.win32-arm64-msvc.node')
)
join(__dirname, "lancedb-nodejs.win32-arm64-msvc.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.win32-arm64-msvc.node')
nativeBinding = require("./lancedb-nodejs.win32-arm64-msvc.node");
} else {
nativeBinding = require('lancedb-win32-arm64-msvc')
nativeBinding = require("lancedb-win32-arm64-msvc");
}
} catch (e) {
loadError = e
loadError = e;
}
break
break;
default:
throw new Error(`Unsupported architecture on Windows: ${arch}`)
throw new Error(`Unsupported architecture on Windows: ${arch}`);
}
break
case 'darwin':
localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.darwin-universal.node'))
break;
case "darwin":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.darwin-universal.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.darwin-universal.node')
nativeBinding = require("./lancedb-nodejs.darwin-universal.node");
} else {
nativeBinding = require('lancedb-darwin-universal')
nativeBinding = require("lancedb-darwin-universal");
}
break
break;
} catch {}
switch (arch) {
case 'x64':
localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.darwin-x64.node'))
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.darwin-x64.node')
} else {
nativeBinding = require('lancedb-darwin-x64')
}
} catch (e) {
loadError = e
}
break
case 'arm64':
case "x64":
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.darwin-arm64.node')
)
join(__dirname, "lancedb-nodejs.darwin-x64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.darwin-arm64.node')
nativeBinding = require("./lancedb-nodejs.darwin-x64.node");
} else {
nativeBinding = require('lancedb-darwin-arm64')
nativeBinding = require("lancedb-darwin-x64");
}
} catch (e) {
loadError = e
loadError = e;
}
break
break;
case "arm64":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.darwin-arm64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.darwin-arm64.node");
} else {
nativeBinding = require("lancedb-darwin-arm64");
}
} catch (e) {
loadError = e;
}
break;
default:
throw new Error(`Unsupported architecture on macOS: ${arch}`)
throw new Error(`Unsupported architecture on macOS: ${arch}`);
}
break
case 'freebsd':
if (arch !== 'x64') {
throw new Error(`Unsupported architecture on FreeBSD: ${arch}`)
break;
case "freebsd":
if (arch !== "x64") {
throw new Error(`Unsupported architecture on FreeBSD: ${arch}`);
}
localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.freebsd-x64.node'))
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.freebsd-x64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.freebsd-x64.node')
nativeBinding = require("./lancedb-nodejs.freebsd-x64.node");
} else {
nativeBinding = require('lancedb-freebsd-x64')
nativeBinding = require("lancedb-freebsd-x64");
}
} catch (e) {
loadError = e
loadError = e;
}
break
case 'linux':
break;
case "linux":
switch (arch) {
case 'x64':
case "x64":
if (isMusl()) {
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-x64-musl.node')
)
join(__dirname, "lancedb-nodejs.linux-x64-musl.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-x64-musl.node')
nativeBinding = require("./lancedb-nodejs.linux-x64-musl.node");
} else {
nativeBinding = require('lancedb-linux-x64-musl')
nativeBinding = require("lancedb-linux-x64-musl");
}
} catch (e) {
loadError = e
loadError = e;
}
} else {
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-x64-gnu.node')
)
join(__dirname, "lancedb-nodejs.linux-x64-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-x64-gnu.node')
nativeBinding = require("./lancedb-nodejs.linux-x64-gnu.node");
} else {
nativeBinding = require('lancedb-linux-x64-gnu')
nativeBinding = require("lancedb-linux-x64-gnu");
}
} catch (e) {
loadError = e
loadError = e;
}
}
break
case 'arm64':
break;
case "arm64":
if (isMusl()) {
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-arm64-musl.node')
)
join(__dirname, "lancedb-nodejs.linux-arm64-musl.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-arm64-musl.node')
nativeBinding = require("./lancedb-nodejs.linux-arm64-musl.node");
} else {
nativeBinding = require('lancedb-linux-arm64-musl')
nativeBinding = require("lancedb-linux-arm64-musl");
}
} catch (e) {
loadError = e
loadError = e;
}
} else {
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-arm64-gnu.node')
)
join(__dirname, "lancedb-nodejs.linux-arm64-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-arm64-gnu.node')
nativeBinding = require("./lancedb-nodejs.linux-arm64-gnu.node");
} else {
nativeBinding = require('lancedb-linux-arm64-gnu')
nativeBinding = require("lancedb-linux-arm64-gnu");
}
} catch (e) {
loadError = e
loadError = e;
}
}
break
case 'arm':
break;
case "arm":
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-arm-gnueabihf.node')
)
join(__dirname, "lancedb-nodejs.linux-arm-gnueabihf.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-arm-gnueabihf.node')
nativeBinding = require("./lancedb-nodejs.linux-arm-gnueabihf.node");
} else {
nativeBinding = require('lancedb-linux-arm-gnueabihf')
nativeBinding = require("lancedb-linux-arm-gnueabihf");
}
} catch (e) {
loadError = e
loadError = e;
}
break
case 'riscv64':
break;
case "riscv64":
if (isMusl()) {
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-riscv64-musl.node')
)
join(__dirname, "lancedb-nodejs.linux-riscv64-musl.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-riscv64-musl.node')
nativeBinding = require("./lancedb-nodejs.linux-riscv64-musl.node");
} else {
nativeBinding = require('lancedb-linux-riscv64-musl')
nativeBinding = require("lancedb-linux-riscv64-musl");
}
} catch (e) {
loadError = e
loadError = e;
}
} else {
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-riscv64-gnu.node')
)
join(__dirname, "lancedb-nodejs.linux-riscv64-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-riscv64-gnu.node')
nativeBinding = require("./lancedb-nodejs.linux-riscv64-gnu.node");
} else {
nativeBinding = require('lancedb-linux-riscv64-gnu')
nativeBinding = require("lancedb-linux-riscv64-gnu");
}
} catch (e) {
loadError = e
loadError = e;
}
}
break
case 's390x':
break;
case "s390x":
localFileExisted = existsSync(
join(__dirname, 'lancedb-nodejs.linux-s390x-gnu.node')
)
join(__dirname, "lancedb-nodejs.linux-s390x-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require('./lancedb-nodejs.linux-s390x-gnu.node')
nativeBinding = require("./lancedb-nodejs.linux-s390x-gnu.node");
} else {
nativeBinding = require('lancedb-linux-s390x-gnu')
nativeBinding = require("lancedb-linux-s390x-gnu");
}
} catch (e) {
loadError = e
loadError = e;
}
break
break;
default:
throw new Error(`Unsupported architecture on Linux: ${arch}`)
throw new Error(`Unsupported architecture on Linux: ${arch}`);
}
break
break;
default:
throw new Error(`Unsupported OS: ${platform}, architecture: ${arch}`)
throw new Error(`Unsupported OS: ${platform}, architecture: ${arch}`);
}
if (!nativeBinding) {
if (loadError) {
throw loadError
throw loadError;
}
throw new Error(`Failed to load native binding`)
throw new Error(`Failed to load native binding`);
}
const { Connection, Index, RecordBatchIterator, Query, Table, WriteMode, connect } = nativeBinding
const {
Connection,
Index,
RecordBatchIterator,
Query,
VectorQuery,
Table,
WriteMode,
connect,
} = nativeBinding;
module.exports.Connection = Connection
module.exports.Index = Index
module.exports.RecordBatchIterator = RecordBatchIterator
module.exports.Query = Query
module.exports.Table = Table
module.exports.WriteMode = WriteMode
module.exports.connect = connect
module.exports.Connection = Connection;
module.exports.Index = Index;
module.exports.RecordBatchIterator = RecordBatchIterator;
module.exports.Query = Query;
module.exports.VectorQuery = VectorQuery;
module.exports.Table = Table;
module.exports.WriteMode = WriteMode;
module.exports.connect = connect;

View File

@@ -17,18 +17,15 @@ import {
RecordBatchIterator as NativeBatchIterator,
Query as NativeQuery,
Table as NativeTable,
VectorQuery as NativeVectorQuery,
} from "./native";
import { type IvfPqOptions } from "./indices";
class RecordBatchIterator implements AsyncIterator<RecordBatch> {
private promisedInner?: Promise<NativeBatchIterator>;
private inner?: NativeBatchIterator;
constructor(
inner?: NativeBatchIterator,
promise?: Promise<NativeBatchIterator>,
) {
constructor(promise?: Promise<NativeBatchIterator>) {
// TODO: check promise reliably so we don't need to pass two arguments.
this.inner = inner;
this.promisedInner = promise;
}
@@ -53,82 +50,113 @@ class RecordBatchIterator implements AsyncIterator<RecordBatch> {
}
/* eslint-enable */
/** Query executor */
export class Query implements AsyncIterable<RecordBatch> {
private readonly inner: NativeQuery;
/** Common methods supported by all query types */
export class QueryBase<
NativeQueryType extends NativeQuery | NativeVectorQuery,
QueryType,
> implements AsyncIterable<RecordBatch>
{
protected constructor(protected inner: NativeQueryType) {}
constructor(tbl: NativeTable) {
this.inner = tbl.query();
/**
* A filter statement to be applied to this query.
*
* The filter should be supplied as an SQL query string. For example:
* @example
* x > 10
* y > 0 AND y < 100
* x > 5 OR y = 'test'
*
* Filtering performance can often be improved by creating a scalar index
* on the filter column(s).
*/
where(predicate: string): QueryType {
this.inner.onlyIf(predicate);
return this as unknown as QueryType;
}
/** Set the column to run query. */
column(column: string): Query {
this.inner.column(column);
return this;
/**
* Return only the specified columns.
*
* By default a query will return all columns from the table. However, this can have
* a very significant impact on latency. LanceDb stores data in a columnar fashion. This
* means we can finely tune our I/O to select exactly the columns we need.
*
* As a best practice you should always limit queries to the columns that you need. If you
* pass in an array of column names then only those columns will be returned.
*
* You can also use this method to create new "dynamic" columns based on your existing columns.
* For example, you may not care about "a" or "b" but instead simply want "a + b". This is often
* seen in the SELECT clause of an SQL query (e.g. `SELECT a+b FROM my_table`).
*
* To create dynamic columns you can pass in a Map<string, string>. A column will be returned
* for each entry in the map. The key provides the name of the column. The value is
* an SQL string used to specify how the column is calculated.
*
* For example, an SQL query might state `SELECT a + b AS combined, c`. The equivalent
* input to this method would be:
* @example
* new Map([["combined", "a + b"], ["c", "c"]])
*
* Columns will always be returned in the order given, even if that order is different than
* the order used when adding the data.
*
* Note that you can pass in a `Record<string, string>` (e.g. an object literal). This method
* uses `Object.entries` which should preserve the insertion order of the object. However,
* object insertion order is easy to get wrong and `Map` is more foolproof.
*/
select(
columns: string[] | Map<string, string> | Record<string, string>,
): QueryType {
let columnTuples: [string, string][];
if (Array.isArray(columns)) {
columnTuples = columns.map((c) => [c, c]);
} else if (columns instanceof Map) {
columnTuples = Array.from(columns.entries());
} else {
columnTuples = Object.entries(columns);
}
this.inner.select(columnTuples);
return this as unknown as QueryType;
}
/** Set the filter predicate, only returns the results that satisfy the filter.
/**
* Set the maximum number of results to return.
*
* By default, a plain search has no limit. If this method is not
* called then every valid row from the table will be returned.
*/
limit(limit: number): QueryType {
this.inner.limit(limit);
return this as unknown as QueryType;
}
protected nativeExecute(): Promise<NativeBatchIterator> {
return this.inner.execute();
}
/**
* Execute the query and return the results as an @see {@link AsyncIterator}
* of @see {@link RecordBatch}.
*
* By default, LanceDb will use many threads to calculate results and, when
* the result set is large, multiple batches will be processed at one time.
* This readahead is limited, however, and backpressure will be applied if this
* stream is consumed slowly (this constrains the maximum memory used by a
* single query).
*
*/
filter(predicate: string): Query {
this.inner.filter(predicate);
return this;
protected execute(): RecordBatchIterator {
return new RecordBatchIterator(this.nativeExecute());
}
/**
* Select the columns to return. If not set, all columns are returned.
*/
select(columns: string[]): Query {
this.inner.select(columns);
return this;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>> {
const promise = this.nativeExecute();
return new RecordBatchIterator(promise);
}
/**
* Set the limit of rows to return.
*/
limit(limit: number): Query {
this.inner.limit(limit);
return this;
}
prefilter(prefilter: boolean): Query {
this.inner.prefilter(prefilter);
return this;
}
/**
* Set the query vector.
*/
nearestTo(vector: number[]): Query {
this.inner.nearestTo(Float32Array.from(vector));
return this;
}
/**
* Set the number of IVF partitions to use for the query.
*/
nprobes(nprobes: number): Query {
this.inner.nprobes(nprobes);
return this;
}
/**
* Set the refine factor for the query.
*/
refineFactor(refineFactor: number): Query {
this.inner.refineFactor(refineFactor);
return this;
}
/**
* Execute the query and return the results as an AsyncIterator.
*/
async executeStream(): Promise<RecordBatchIterator> {
const inner = await this.inner.executeStream();
return new RecordBatchIterator(inner);
}
/** Collect the results as an Arrow Table. */
/** Collect the results as an Arrow @see {@link ArrowTable}. */
async toArrow(): Promise<ArrowTable> {
const batches = [];
for await (const batch of this) {
@@ -137,18 +165,211 @@ export class Query implements AsyncIterable<RecordBatch> {
return new ArrowTable(batches);
}
/** Returns a JSON Array of All results.
*
*/
/** Collect the results as an array of objects. */
async toArray(): Promise<unknown[]> {
const tbl = await this.toArrow();
// eslint-disable-next-line @typescript-eslint/no-unsafe-return
return tbl.toArray();
}
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>> {
const promise = this.inner.executeStream();
return new RecordBatchIterator(undefined, promise);
/**
* An interface for a query that can be executed
*
* Supported by all query types
*/
export interface ExecutableQuery {}
/**
* A builder used to construct a vector search
*
* This builder can be reused to execute the query many times.
*
* Every setter forwards its argument to the wrapped native query and
* returns this same builder so that calls can be chained.
*/
export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
constructor(inner: NativeVectorQuery) {
super(inner);
}
/**
* Set the number of partitions to search (probe)
*
* This argument is only used when the vector column has an IVF PQ index.
* If there is no index then this value is ignored.
*
* The IVF stage of IVF PQ divides the input into partitions (clusters) of
* related values.
*
* The partitions whose centroids are closest to the query vector will be
* exhaustively searched to find matches. This parameter controls how many
* partitions should be searched.
*
* Increasing this value will increase the recall of your query but will
* also increase the latency of your query. The default value is 20. This
* default is good for many cases but the best value to use will depend on
* your data and the recall that you need to achieve.
*
* For best results we recommend tuning this parameter with a benchmark against
* your actual data to find the smallest possible value that will still give
* you the desired recall.
* @param {number} nprobes - the number of partitions to search
* @returns {VectorQuery} this builder, so that calls can be chained
*/
nprobes(nprobes: number): VectorQuery {
this.inner.nprobes(nprobes);
return this;
}
/**
* Set the vector column to query
*
* This controls which column is compared to the query vector supplied in
* the call to @see {@link Query#nearestTo}
*
* This parameter must be specified if the table has more than one column
* whose data type is a fixed-size-list of floats.
* @param {string} column - the name of the vector column to search
* @returns {VectorQuery} this builder, so that calls can be chained
*/
column(column: string): VectorQuery {
this.inner.column(column);
return this;
}
/**
* Set the distance metric to use
*
* When performing a vector search we try and find the "nearest" vectors according
* to some kind of distance metric. This parameter controls which distance metric to
* use. See @see {@link IvfPqOptions.distanceType} for more details on the different
* distance metrics available.
*
* Note: if there is a vector index then the distance type used MUST match the distance
* type used to train the vector index. If this is not done then the results will be
* invalid.
*
* By default "l2" is used.
* @param {string} distanceType - the name of the distance metric. The native
* binding recognises "l2", "cosine" and "dot" (compared case-insensitively)
* and rejects any other value with an error.
* @returns {VectorQuery} this builder, so that calls can be chained
*/
distanceType(distanceType: string): VectorQuery {
this.inner.distanceType(distanceType);
return this;
}
/**
* A multiplier to control how many additional rows are taken during the refine step
*
* This argument is only used when the vector column has an IVF PQ index.
* If there is no index then this value is ignored.
*
* An IVF PQ index stores compressed (quantized) values. The query vector is compared
* against these values and, since they are compressed, the comparison is inaccurate.
*
* This parameter can be used to refine the results. It can both improve recall
* and correct the ordering of the nearest results.
*
* To refine results LanceDb will first perform an ANN search to find the nearest
* `limit` * `refineFactor` results. In other words, if `refineFactor` is 3 and
* `limit` is the default (10) then the first 30 results will be selected. LanceDb
* then fetches the full, uncompressed, values for these 30 results. The results are
* then reordered by the true distance and only the nearest 10 are kept.
*
* Note: there is a difference between calling this method with a value of 1 and never
* calling this method at all. Calling this method with any value will have an impact
* on your search latency. When you call this method with a `refineFactor` of 1 then
* LanceDb still needs to fetch the full, uncompressed, values so that it can potentially
* reorder the results.
*
* Note: if this method is NOT called then the distances returned in the _distance column
* will be approximate distances based on the comparison of the quantized query vector
* and the quantized result vectors. This can be considerably different than the true
* distance between the query vector and the actual uncompressed vector.
* @param {number} refineFactor - the multiplier applied to `limit` when
* selecting candidates for the refine step
* @returns {VectorQuery} this builder, so that calls can be chained
*/
refineFactor(refineFactor: number): VectorQuery {
this.inner.refineFactor(refineFactor);
return this;
}
/**
* If this is called then filtering will happen after the vector search instead of
* before.
*
* By default filtering will be performed before the vector search. This is how
* filtering is typically understood to work. This prefilter step does add some
* additional latency. Creating a scalar index on the filter column(s) can
* often improve this latency. However, sometimes a filter is too complex or scalar
* indices cannot be applied to the column. In these cases postfiltering can be
* used instead of prefiltering to improve latency.
*
* Post filtering applies the filter to the results of the vector search. This means
* we only run the filter on a much smaller set of data. However, it can cause the
* query to return fewer than `limit` results (or even no results) if none of the nearest
* results match the filter.
*
* Post filtering happens during the "refine stage" (described in more detail in
* @see {@link VectorQuery#refineFactor}). This means that setting a higher refine
* factor can often help restore some of the results lost by post filtering.
* @returns {VectorQuery} this builder, so that calls can be chained
*/
postfilter(): VectorQuery {
this.inner.postfilter();
return this;
}
/**
* If this is called then any vector index is skipped
*
* An exhaustive (flat) search will be performed. The query vector will
* be compared to every vector in the table. At high scales this can be
* expensive. However, this is often still useful. For example, skipping
* the vector index can give you ground truth results which you can use to
* calculate your recall to select an appropriate value for nprobes.
* @returns {VectorQuery} this builder, so that calls can be chained
*/
bypassVectorIndex(): VectorQuery {
this.inner.bypassVectorIndex();
return this;
}
}
/**
* A builder for LanceDB queries.
*
* The builder wraps the native query object obtained from the table's
* `query()` method. Calling `nearestTo` turns it into a
* @see {@link VectorQuery}.
*/
export class Query extends QueryBase<NativeQuery, Query> {
constructor(tbl: NativeTable) {
super(tbl.query());
}
/**
* Find the nearest vectors to the given query vector.
*
* This converts the query from a plain query to a vector query.
*
* This method will attempt to convert the input to the query vector
* expected by the embedding model. If the input cannot be converted
* then an error will be thrown.
*
* By default, there is no embedding model, and the input should be
* an array-like object of numbers (something that can be used as input
* to Float32Array.from)
*
* If there is only one vector column (a column whose data type is a
* fixed size list of floats) then the column does not need to be specified.
* If there is more than one vector column you must use
* @see {@link VectorQuery#column} to specify which column you would like
* to compare with.
*
* If no index has been created on the vector column then a vector query
* will perform a distance comparison between the query vector and every
* vector in the database and then sort the results. This is sometimes
* called a "flat search"
*
* For small databases, with a few hundred thousand vectors or less, this can
* be reasonably fast. In larger databases you should create a vector index
* on the column. If there is a vector index then an "approximate" nearest
* neighbor search (frequently called an ANN search) will be performed. This
* search is much faster, but the results will be approximate.
*
* The query can be further parameterized using the returned builder. There
* are various ANN search parameters that will let you fine tune your recall
* accuracy vs search latency.
*
* Vector searches always have a `limit`. If `limit` has not been called then
* a default `limit` of 10 will be used. @see {@link Query#limit}
* @param {unknown} vector - the query vector; it is passed to
* `Float32Array.from` before being handed to the native query
* @returns {VectorQuery} a new builder wrapping the native vector query
*/
nearestTo(vector: unknown): VectorQuery {
// The parameter is typed `unknown` so any array-like input is accepted; the
// cast only satisfies the `Float32Array.from` signature at compile time.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const vectorQuery = this.inner.nearestTo(Float32Array.from(vector as any));
return new VectorQuery(vectorQuery);
}
}

View File

@@ -481,6 +481,13 @@ function sanitizeField(fieldLike: unknown): Field {
return new Field(name, type, nullable, metadata);
}
/**
* Convert something schemaLike into a Schema instance
*
* This method is often needed even when the caller is using a Schema
* instance because they might be using a different instance of apache-arrow
* than lancedb is using.
*/
export function sanitizeSchema(schemaLike: unknown): Schema {
if (schemaLike instanceof Schema) {
return schemaLike;

View File

@@ -19,7 +19,7 @@ import {
IndexConfig,
Table as _NativeTable,
} from "./native";
import { Query } from "./query";
import { Query, VectorQuery } from "./query";
import { IndexOptions } from "./indices";
import { Data, fromDataToBuffer } from "./arrow";
@@ -28,7 +28,8 @@ export { IndexConfig } from "./native";
* Options for adding data to a table.
*/
export interface AddDataOptions {
/** If "append" (the default) then the new data will be added to the table
/**
* If "append" (the default) then the new data will be added to the table
*
* If "overwrite" then the new data will replace the existing data in the table.
*/
@@ -74,7 +75,8 @@ export class Table {
return this.inner.isOpen();
}
/** Close the table, releasing any underlying resources.
/**
* Close the table, releasing any underlying resources.
*
* It is safe to call this method multiple times.
*
@@ -98,9 +100,7 @@ export class Table {
/**
* Insert records into this Table.
*
* @param {Data} data Records to be inserted into the Table
* @return The number of rows added to the table
*/
async add(data: Data, options?: Partial<AddDataOptions>): Promise<void> {
const mode = options?.mode ?? "append";
@@ -124,15 +124,15 @@ export class Table {
* you are updating many rows (with different ids) then you will get
* better performance with a single [`merge_insert`] call instead of
* repeatedly calling this method.
*
* @param updates the columns to update
* @param {Map<string, string> | Record<string, string>} updates - the
* columns to update
*
* Keys in the map should specify the name of the column to update.
* Values in the map provide the new value of the column. These can
* be SQL literal strings (e.g. "7" or "'foo'") or they can be expressions
* based on the row being updated (e.g. "my_col + 1")
*
* @param options additional options to control the update behavior
* @param {Partial<UpdateOptions>} options - additional options to control
* the update behavior
*/
async update(
updates: Map<string, string> | Record<string, string>,
@@ -158,37 +158,28 @@ export class Table {
await this.inner.delete(predicate);
}
/** Create an index to speed up queries.
/**
* Create an index to speed up queries.
*
* Indices can be created on vector columns or scalar columns.
* Indices on vector columns will speed up vector searches.
* Indices on scalar columns will speed up filtering (in both
* vector and non-vector searches)
*
* @example
*
* If the column has a vector (fixed size list) data type then
* an IvfPq vector index will be created.
*
* ```typescript
* // If the column has a vector (fixed size list) data type then
* // an IvfPq vector index will be created.
* const table = await conn.openTable("my_table");
* await table.createIndex(["vector"]);
* ```
*
* For advanced control over vector index creation you can specify
* the index type and options.
* ```typescript
* @example
* // For advanced control over vector index creation you can specify
* // the index type and options.
* const table = await conn.openTable("my_table");
* await table.createIndex(["vector"], I)
* .ivf_pq({ num_partitions: 128, num_sub_vectors: 16 })
* .build();
* ```
*
* Or create a Scalar index
*
* ```typescript
* @example
* // Or create a Scalar index
* await table.createIndex("my_float_col").build();
* ```
*/
async createIndex(column: string, options?: Partial<IndexOptions>) {
// Bit of a hack to get around the fact that TS has no package-scope.
@@ -198,69 +189,74 @@ export class Table {
}
/**
* Create a generic {@link Query} Builder.
* Create a {@link Query} Builder.
*
* Queries allow you to search your existing data. By default the query will
* return all the data in the table in no particular order. The builder
* returned by this method can be used to control the query using filtering,
* vector similarity, sorting, and more.
*
* Note: By default, all columns are returned. For best performance, you should
* only fetch the columns you need. See [`Query::select_with_projection`] for
* more details.
*
* When appropriate, various indices and statistics based pruning will be used to
* accelerate the query.
*
* @example
*
* ### Run a SQL-style query
* ```typescript
* // SQL-style filtering
* //
* // This query will return up to 20 rows whose value in the `id` column
* // is greater than 1. LanceDb supports a broad set of filtering functions.
* for await (const batch of table.query()
* .where("id > 1").select(["id"]).limit(20)) {
* console.log(batch);
* }
* ```
*
* ### Run Top-10 vector similarity search
* ```typescript
* @example
* // Vector Similarity Search
* //
* // This example will find the 10 rows whose value in the "vector" column are
* // closest to the query vector [1.0, 2.0, 3.0]. If an index has been created
* // on the "vector" column then this will perform an ANN search.
* //
* // The `refineFactor` and `nprobes` methods are used to control the recall /
* // latency tradeoff of the search.
* for await (const batch of table.query()
* .nearestTo([1, 2, 3])
* .refineFactor(5).nprobes(10)
* .limit(10)) {
* console.log(batch);
* }
*```
*
* ### Scan the full dataset
* ```typescript
* @example
* // Scan the full dataset
* //
* // This query will return everything in the table in no particular order.
* for await (const batch of table.query()) {
* console.log(batch);
* }
*
* ### Return the full dataset as Arrow Table
* ```typescript
* let arrowTbl = await table.query().nearestTo([1.0, 2.0, 0.5, 6.7]).toArrow();
* ```
*
* @returns {@link Query}
* @returns {Query} A builder that can be used to parameterize the query
*/
query(): Query {
return new Query(this.inner);
}
/** Search the table with a given query vector.
/**
* Search the table with a given query vector.
*
* This is a convenience method for preparing an ANN {@link Query}.
* This is a convenience method for preparing a vector query and
* is the same thing as calling `nearestTo` on the builder returned
* by `query`. @see {@link Query#nearestTo} for more details.
*/
search(vector: number[], column?: string): Query {
const q = this.query();
q.nearestTo(vector);
if (column !== undefined) {
q.column(column);
}
return q;
vectorSearch(vector: unknown): VectorQuery {
return this.query().nearestTo(vector);
}
// TODO: Support BatchUDF
/**
* Add new columns with defined values.
*
* @param newColumnTransforms pairs of column names and the SQL expression to use
* to calculate the value of the new column. These
* expressions will be evaluated for each row in the
* table, and can reference existing columns in the table.
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
* the SQL expression to use to calculate the value of the new column. These
* expressions will be evaluated for each row in the table, and can
* reference existing columns in the table.
*/
async addColumns(newColumnTransforms: AddColumnsSql[]): Promise<void> {
await this.inner.addColumns(newColumnTransforms);
@@ -268,8 +264,8 @@ export class Table {
/**
* Alter the name or nullability of columns.
*
* @param columnAlterations One or more alterations to apply to columns.
* @param {ColumnAlteration[]} columnAlterations One or more alterations to
* apply to columns.
*/
async alterColumns(columnAlterations: ColumnAlteration[]): Promise<void> {
await this.inner.alterColumns(columnAlterations);
@@ -282,16 +278,16 @@ export class Table {
* underlying storage. In order to remove the data, you must subsequently
* call ``compact_files`` to rewrite the data without the removed columns and
* then call ``cleanup_files`` to remove the old files.
*
* @param columnNames The names of the columns to drop. These can be nested
* column references (e.g. "a.b.c") or top-level column
* names (e.g. "a").
* @param {string[]} columnNames The names of the columns to drop. These can
* be nested column references (e.g. "a.b.c") or top-level column names
* (e.g. "a").
*/
async dropColumns(columnNames: string[]): Promise<void> {
await this.inner.dropColumns(columnNames);
}
/** Retrieve the version of the table
/**
* Retrieve the version of the table
*
* LanceDb supports versioning. Every operation that modifies the table increases
* version. As long as a version hasn't been deleted you can `[Self::checkout]` that
@@ -302,7 +298,8 @@ export class Table {
return await this.inner.version();
}
/** Checks out a specific version of the Table
/**
* Checks out a specific version of the Table
*
* Any read operation on the table will now access the data at the checked out version.
* As a consequence, calling this method will disable any read consistency interval
@@ -321,7 +318,8 @@ export class Table {
await this.inner.checkout(version);
}
/** Ensures the table is pointing at the latest version
/**
* Ensures the table is pointing at the latest version
*
* This can be used to manually update a table when the read_consistency_interval is None
* It can also be used to undo a `[Self::checkout]` operation
@@ -330,7 +328,8 @@ export class Table {
await this.inner.checkoutLatest();
}
/** Restore the table to the currently checked out version
/**
* Restore the table to the currently checked out version
*
* This operation will fail if checkout has not been called previously
*

120
nodejs/package-lock.json generated
View File

@@ -26,6 +26,7 @@
"apache-arrow-old": "npm:apache-arrow@13.0.0",
"eslint": "^8.57.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-jsdoc": "^48.2.1",
"jest": "^29.7.0",
"prettier": "^3.1.0",
"tmp": "^0.2.3",
@@ -755,6 +756,20 @@
"integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==",
"dev": true
},
"node_modules/@es-joy/jsdoccomment": {
"version": "0.42.0",
"resolved": "https://registry.npmjs.org/@es-joy/jsdoccomment/-/jsdoccomment-0.42.0.tgz",
"integrity": "sha512-R1w57YlVA6+YE01wch3GPYn6bCsrOV3YW/5oGGE2tmX6JcL9Nr+b5IikrjMPF+v9CV3ay+obImEdsDhovhJrzw==",
"dev": true,
"dependencies": {
"comment-parser": "1.4.1",
"esquery": "^1.5.0",
"jsdoc-type-pratt-parser": "~4.0.0"
},
"engines": {
"node": ">=16"
}
},
"node_modules/@eslint-community/eslint-utils": {
"version": "4.4.0",
"resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.0.tgz",
@@ -1948,6 +1963,15 @@
"integrity": "sha512-cumHmIAf6On83X7yP+LrsEyUOf/YlociZelmpRYaGFydoaPdxdt80MAbu6vWerQT2COCp2nPvHdsbD7tHn/YlQ==",
"dev": true
},
"node_modules/are-docs-informative": {
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/are-docs-informative/-/are-docs-informative-0.0.2.tgz",
"integrity": "sha512-ixiS0nLNNG5jNQzgZJNoUpBKdo9yTYZMGJ+QgT2jmjR7G7+QHRCc4v6LQ3NgE7EBJq+o0ams3waJwkrlBom8Ig==",
"dev": true,
"engines": {
"node": ">=14"
}
},
"node_modules/argparse": {
"version": "1.0.10",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz",
@@ -2189,6 +2213,18 @@
"integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
"dev": true
},
"node_modules/builtin-modules": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-3.3.0.tgz",
"integrity": "sha512-zhaCDicdLuWN5UbN5IMnFqNMhNfo919sH85y2/ea+5Yg9TsTkeZxpL+JLbp6cgYFS4sRLp3YV4S6yDuqVWHYOw==",
"dev": true,
"engines": {
"node": ">=6"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/camelcase": {
"version": "5.3.1",
"resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz",
@@ -2373,6 +2409,15 @@
"node": ">=12.17"
}
},
"node_modules/comment-parser": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.4.1.tgz",
"integrity": "sha512-buhp5kePrmda3vhc5B9t7pUQXAb2Tnd0qgpkIhPhkHXxJpiPJ11H0ZEU0oBpJ2QztSbzG/ZxMj/CHsYJqRHmyg==",
"dev": true,
"engines": {
"node": ">= 12.0.0"
}
},
"node_modules/concat-map": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@@ -2660,6 +2705,29 @@
"eslint": ">=7.0.0"
}
},
"node_modules/eslint-plugin-jsdoc": {
"version": "48.2.1",
"resolved": "https://registry.npmjs.org/eslint-plugin-jsdoc/-/eslint-plugin-jsdoc-48.2.1.tgz",
"integrity": "sha512-iUvbcyDZSO/9xSuRv2HQBw++8VkV/pt3UWtX9cpPH0l7GKPq78QC/6+PmyQHHvNZaTjAce6QVciEbnc6J/zH5g==",
"dev": true,
"dependencies": {
"@es-joy/jsdoccomment": "~0.42.0",
"are-docs-informative": "^0.0.2",
"comment-parser": "1.4.1",
"debug": "^4.3.4",
"escape-string-regexp": "^4.0.0",
"esquery": "^1.5.0",
"is-builtin-module": "^3.2.1",
"semver": "^7.6.0",
"spdx-expression-parse": "^4.0.0"
},
"engines": {
"node": ">=18"
},
"peerDependencies": {
"eslint": "^7.0.0 || ^8.0.0 || ^9.0.0"
}
},
"node_modules/eslint-scope": {
"version": "7.2.2",
"resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-7.2.2.tgz",
@@ -3299,6 +3367,21 @@
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==",
"optional": true
},
"node_modules/is-builtin-module": {
"version": "3.2.1",
"resolved": "https://registry.npmjs.org/is-builtin-module/-/is-builtin-module-3.2.1.tgz",
"integrity": "sha512-BSLE3HnV2syZ0FK0iMA/yUGplUeMmNz4AW5fnTunbCIqZi4vG3WjJT9FHMy5D69xmAYBHXQhJdALdpwVxV501A==",
"dev": true,
"dependencies": {
"builtin-modules": "^3.3.0"
},
"engines": {
"node": ">=6"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/is-core-module": {
"version": "2.13.1",
"resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.13.1.tgz",
@@ -4172,6 +4255,15 @@
"js-yaml": "bin/js-yaml.js"
}
},
"node_modules/jsdoc-type-pratt-parser": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/jsdoc-type-pratt-parser/-/jsdoc-type-pratt-parser-4.0.0.tgz",
"integrity": "sha512-YtOli5Cmzy3q4dP26GraSOeAhqecewG04hoO8DY56CH4KJ9Fvv5qKWUCCo3HZob7esJQHCv6/+bnTy72xZZaVQ==",
"dev": true,
"engines": {
"node": ">=12.0.0"
}
},
"node_modules/jsesc": {
"version": "2.5.2",
"resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
@@ -5018,9 +5110,9 @@
}
},
"node_modules/semver": {
"version": "7.5.4",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
"version": "7.6.0",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
"integrity": "sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg==",
"dev": true,
"dependencies": {
"lru-cache": "^6.0.0"
@@ -5105,6 +5197,28 @@
"source-map": "^0.6.0"
}
},
"node_modules/spdx-exceptions": {
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/spdx-exceptions/-/spdx-exceptions-2.5.0.tgz",
"integrity": "sha512-PiU42r+xO4UbUS1buo3LPJkjlO7430Xn5SVAhdpzzsPHsjbYVflnnFdATgabnLude+Cqu25p6N+g2lw/PFsa4w==",
"dev": true
},
"node_modules/spdx-expression-parse": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-4.0.0.tgz",
"integrity": "sha512-Clya5JIij/7C6bRR22+tnGXbc4VKlibKSVj2iHvVeX5iMW7s1SIQlqu699JkODJJIhh/pUu8L0/VLh8xflD+LQ==",
"dev": true,
"dependencies": {
"spdx-exceptions": "^2.1.0",
"spdx-license-ids": "^3.0.0"
}
},
"node_modules/spdx-license-ids": {
"version": "3.0.17",
"resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.17.tgz",
"integrity": "sha512-sh8PWc/ftMqAAdFiBu6Fy6JUOYjqDJBJvIhpfDMyHrr0Rbp5liZqd4TjtQ/RgfLjKFZb+LMx5hpml5qOWy0qvg==",
"dev": true
},
"node_modules/sprintf-js": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",

View File

@@ -25,6 +25,7 @@
"apache-arrow-old": "npm:apache-arrow@13.0.0",
"eslint": "^8.57.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-jsdoc": "^48.2.1",
"jest": "^29.7.0",
"prettier": "^3.1.0",
"tmp": "^0.2.3",

View File

@@ -17,9 +17,10 @@ use std::sync::Mutex;
use lancedb::index::scalar::BTreeIndexBuilder;
use lancedb::index::vector::IvfPqIndexBuilder;
use lancedb::index::Index as LanceDbIndex;
use lancedb::DistanceType;
use napi_derive::napi;
use crate::util::parse_distance_type;
#[napi]
pub struct Index {
inner: Mutex<Option<LanceDbIndex>>,
@@ -49,15 +50,7 @@ impl Index {
) -> napi::Result<Self> {
let mut ivf_pq_builder = IvfPqIndexBuilder::default();
if let Some(distance_type) = distance_type {
let distance_type = match distance_type.as_str() {
"l2" => Ok(DistanceType::L2),
"cosine" => Ok(DistanceType::Cosine),
"dot" => Ok(DistanceType::Dot),
_ => Err(napi::Error::from_reason(format!(
"Invalid distance type '{}'. Must be one of l2, cosine, or dot",
distance_type
))),
}?;
let distance_type = parse_distance_type(distance_type)?;
ivf_pq_builder = ivf_pq_builder.distance_type(distance_type);
}
if let Some(num_partitions) = num_partitions {

View File

@@ -21,6 +21,7 @@ mod index;
mod iterator;
mod query;
mod table;
mod util;
#[napi(object)]
#[derive(Debug)]

View File

@@ -12,36 +12,38 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use lancedb::query::Query as LanceDBQuery;
use lancedb::query::ExecutableQuery;
use lancedb::query::Query as LanceDbQuery;
use lancedb::query::QueryBase;
use lancedb::query::Select;
use lancedb::query::VectorQuery as LanceDbVectorQuery;
use napi::bindgen_prelude::*;
use napi_derive::napi;
use crate::error::NapiErrorExt;
use crate::iterator::RecordBatchIterator;
use crate::util::parse_distance_type;
#[napi]
pub struct Query {
inner: LanceDBQuery,
inner: LanceDbQuery,
}
#[napi]
impl Query {
pub fn new(query: LanceDBQuery) -> Self {
pub fn new(query: LanceDbQuery) -> Self {
Self { inner: query }
}
// We cannot call this r#where because NAPI gets confused by the r#
#[napi]
pub fn column(&mut self, column: String) {
self.inner = self.inner.clone().column(&column);
pub fn only_if(&mut self, predicate: String) {
self.inner = self.inner.clone().only_if(predicate);
}
#[napi]
pub fn filter(&mut self, filter: String) {
self.inner = self.inner.clone().filter(filter);
}
#[napi]
pub fn select(&mut self, columns: Vec<String>) {
self.inner = self.inner.clone().select(&columns);
pub fn select(&mut self, columns: Vec<(String, String)>) {
self.inner = self.inner.clone().select(Select::dynamic(&columns));
}
#[napi]
@@ -50,13 +52,46 @@ impl Query {
}
#[napi]
pub fn prefilter(&mut self, prefilter: bool) {
self.inner = self.inner.clone().prefilter(prefilter);
pub fn nearest_to(&mut self, vector: Float32Array) -> Result<VectorQuery> {
let inner = self
.inner
.clone()
.nearest_to(vector.as_ref())
.default_error()?;
Ok(VectorQuery { inner })
}
#[napi]
pub fn nearest_to(&mut self, vector: Float32Array) {
self.inner = self.inner.clone().nearest_to(&vector);
pub async fn execute(&self) -> napi::Result<RecordBatchIterator> {
let inner_stream = self.inner.execute().await.map_err(|e| {
napi::Error::from_reason(format!("Failed to execute query stream: {}", e))
})?;
Ok(RecordBatchIterator::new(inner_stream))
}
}
#[napi]
pub struct VectorQuery {
inner: LanceDbVectorQuery,
}
#[napi]
impl VectorQuery {
#[napi]
pub fn column(&mut self, column: String) {
self.inner = self.inner.clone().column(&column);
}
#[napi]
pub fn distance_type(&mut self, distance_type: String) -> napi::Result<()> {
let distance_type = parse_distance_type(distance_type)?;
self.inner = self.inner.clone().distance_type(distance_type);
Ok(())
}
#[napi]
pub fn postfilter(&mut self) {
self.inner = self.inner.clone().postfilter();
}
#[napi]
@@ -70,8 +105,28 @@ impl Query {
}
#[napi]
pub async fn execute_stream(&self) -> napi::Result<RecordBatchIterator> {
let inner_stream = self.inner.execute_stream().await.map_err(|e| {
pub fn bypass_vector_index(&mut self) {
self.inner = self.inner.clone().bypass_vector_index()
}
#[napi]
pub fn only_if(&mut self, predicate: String) {
self.inner = self.inner.clone().only_if(predicate);
}
#[napi]
pub fn select(&mut self, columns: Vec<(String, String)>) {
self.inner = self.inner.clone().select(Select::dynamic(&columns));
}
#[napi]
pub fn limit(&mut self, limit: u32) {
self.inner = self.inner.clone().limit(limit as usize);
}
#[napi]
pub async fn execute(&self) -> napi::Result<RecordBatchIterator> {
let inner_stream = self.inner.execute().await.map_err(|e| {
napi::Error::from_reason(format!("Failed to execute query stream: {}", e))
})?;
Ok(RecordBatchIterator::new(inner_stream))

View File

@@ -23,7 +23,7 @@ use napi_derive::napi;
use crate::error::NapiErrorExt;
use crate::index::Index;
use crate::query::Query;
use crate::query::{Query, VectorQuery};
#[napi]
pub struct Table {
@@ -171,6 +171,11 @@ impl Table {
Ok(Query::new(self.inner_ref()?.query()))
}
#[napi]
pub fn vector_search(&self, vector: Float32Array) -> napi::Result<VectorQuery> {
self.query()?.nearest_to(vector)
}
#[napi]
pub async fn add_columns(&self, transforms: Vec<AddColumnsSql>) -> napi::Result<()> {
let transforms = transforms

13
nodejs/src/util.rs Normal file
View File

@@ -0,0 +1,13 @@
use lancedb::DistanceType;
pub fn parse_distance_type(distance_type: impl AsRef<str>) -> napi::Result<DistanceType> {
match distance_type.as_ref().to_lowercase().as_str() {
"l2" => Ok(DistanceType::L2),
"cosine" => Ok(DistanceType::Cosine),
"dot" => Ok(DistanceType::Dot),
_ => Err(napi::Error::from_reason(format!(
"Invalid distance type '{}'. Must be one of l2, cosine, or dot",
distance_type.as_ref()
))),
}
}