Compare commits

..

1 Commits

Author SHA1 Message Date
rmeng
93b7ae61be feat: upgrade to 0.12.4 2024-06-20 15:57:57 -04:00
42 changed files with 143 additions and 612 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion] [tool.bumpversion]
current_version = "0.5.2-final.1" current_version = "0.5.2"
parse = """(?x) parse = """(?x)
(?P<major>0|[1-9]\\d*)\\. (?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\. (?P<minor>0|[1-9]\\d*)\\.

View File

@@ -28,7 +28,7 @@ runs:
args: ${{ inputs.args }} args: ${{ inputs.args }}
docker-options: "-e PIP_EXTRA_INDEX_URL=https://pypi.fury.io/lancedb/" docker-options: "-e PIP_EXTRA_INDEX_URL=https://pypi.fury.io/lancedb/"
working-directory: python working-directory: python
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v3
with: with:
name: windows-wheels name: windows-wheels
path: python\target\wheels path: python\target\wheels

View File

@@ -20,11 +20,13 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"] categories = ["database-implementations"]
[workspace.dependencies] [workspace.dependencies]
lance = { "version" = "=0.13.0", "features" = ["dynamodb"] } lance = { "version" = "=0.12.4", "features" = [
lance-index = { "version" = "=0.13.0" } "dynamodb",
lance-linalg = { "version" = "=0.13.0" } ]}
lance-testing = { "version" = "=0.13.0" } lance-index = { "version" = "=0.12.4" }
lance-datafusion = { "version" = "=0.13.0" } lance-linalg = { "version" = "=0.12.4" }
lance-testing = { "version" = "=0.12.4" }
lance-datafusion = { "version" = "=0.12.4" }
# Note that this one does not include pyarrow # Note that this one does not include pyarrow
arrow = { version = "51.0", optional = false } arrow = { version = "51.0", optional = false }
arrow-array = "51.0" arrow-array = "51.0"
@@ -35,7 +37,7 @@ arrow-schema = "51.0"
arrow-arith = "51.0" arrow-arith = "51.0"
arrow-cast = "51.0" arrow-cast = "51.0"
async-trait = "0" async-trait = "0"
chrono = "=0.4.39" chrono = "0.4.35"
datafusion-physical-plan = "37.1" datafusion-physical-plan = "37.1"
half = { "version" = "=2.4.1", default-features = false, features = [ half = { "version" = "=2.4.1", default-features = false, features = [
"num-traits", "num-traits",

View File

@@ -54,16 +54,6 @@ This returns the result as a list of dictionaries as follows.
!!! note !!! note
LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead. LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
## Tokenization
By default the text is tokenized by splitting on punctuation and whitespaces and then removing tokens that are longer than 40 chars. For more language specific tokenization then provide the argument tokenizer_name with the 2 letter language code followed by "_stem". So for english it would be "en_stem".
```python
table.create_fts_index("text", tokenizer_name="en_stem")
```
The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
## Index multiple columns ## Index multiple columns
If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`: If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
@@ -149,7 +139,6 @@ is treated as a phrase query.
In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
double quotes replaced by single quotes. double quotes replaced by single quotes.
## Configurations ## Configurations
By default, LanceDB configures a 1GB heap size limit for creating the index. You can By default, LanceDB configures a 1GB heap size limit for creating the index. You can

View File

@@ -1,12 +1,12 @@
{ {
"name": "vectordb", "name": "vectordb",
"version": "0.5.2-final.1", "version": "0.5.2",
"description": " Serverless, low-latency vector database for AI applications", "description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js", "main": "dist/index.js",
"types": "dist/index.d.ts", "types": "dist/index.d.ts",
"scripts": { "scripts": {
"tsc": "tsc -b", "tsc": "tsc -b",
"build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb_node index.node -- cargo build --message-format=json", "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release", "build-release": "npm run build -- --release",
"test": "npm run tsc && mocha -recursive dist/test", "test": "npm run tsc && mocha -recursive dist/test",
"integration-test": "npm run tsc && mocha -recursive dist/integration_test", "integration-test": "npm run tsc && mocha -recursive dist/integration_test",

View File

@@ -57,18 +57,6 @@ describe("given a connection", () => {
expect(db.isOpen()).toBe(false); expect(db.isOpen()).toBe(false);
await expect(db.tableNames()).rejects.toThrow("Connection is closed"); await expect(db.tableNames()).rejects.toThrow("Connection is closed");
}); });
it("should be able to create a table from an object arg `createTable(options)`, or args `createTable(name, data, options)`", async () => {
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
await expect(tbl.countRows()).resolves.toBe(2);
tbl = await db.createTable({
name: "test",
data: [{ id: 3 }],
mode: "overwrite",
});
await expect(tbl.countRows()).resolves.toBe(1);
});
it("should fail if creating table twice, unless overwrite is true", async () => { it("should fail if creating table twice, unless overwrite is true", async () => {
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]); let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);

View File

@@ -230,7 +230,7 @@ describe("embedding functions", () => {
}, },
); );
test.each([new Float16(), new Float32(), new Float64()])( test.only.each([new Float16(), new Float32(), new Float64()])(
"should be able to provide auto embeddings with multiple float datatypes", "should be able to provide auto embeddings with multiple float datatypes",
async (floatType) => { async (floatType) => {
@register("test1") @register("test1")

View File

@@ -305,7 +305,6 @@ describe("When creating an index", () => {
const indices = await tbl.listIndices(); const indices = await tbl.listIndices();
expect(indices.length).toBe(1); expect(indices.length).toBe(1);
expect(indices[0]).toEqual({ expect(indices[0]).toEqual({
name: "vec_idx",
indexType: "IvfPq", indexType: "IvfPq",
columns: ["vec"], columns: ["vec"],
}); });
@@ -362,24 +361,6 @@ describe("When creating an index", () => {
for await (const r of tbl.query().where("id > 1").select(["id"])) { for await (const r of tbl.query().where("id > 1").select(["id"])) {
expect(r.numRows).toBe(298); expect(r.numRows).toBe(298);
} }
// should also work with 'filter' alias
for await (const r of tbl.query().filter("id > 1").select(["id"])) {
expect(r.numRows).toBe(298);
}
});
test("should be able to get index stats", async () => {
await tbl.createIndex("id");
const stats = await tbl.indexStats("id_idx");
expect(stats).toBeDefined();
expect(stats?.numIndexedRows).toEqual(300);
expect(stats?.numUnindexedRows).toEqual(0);
});
test("when getting stats on non-existent index", async () => {
const stats = await tbl.indexStats("some non-existent index");
expect(stats).toBeUndefined();
}); });
// TODO: Move this test to the query API test (making sure we can reject queries // TODO: Move this test to the query API test (making sure we can reject queries

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
import { Table as ArrowTable, Data, Schema } from "./arrow"; import { Table as ArrowTable, Schema } from "./arrow";
import { fromTableToBuffer, makeEmptyTable } from "./arrow"; import { fromTableToBuffer, makeEmptyTable } from "./arrow";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry"; import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { Connection as LanceDbConnection } from "./native"; import { Connection as LanceDbConnection } from "./native";
@@ -151,19 +151,6 @@ export abstract class Connection {
options?: Partial<OpenTableOptions>, options?: Partial<OpenTableOptions>,
): Promise<Table>; ): Promise<Table>;
/**
* Creates a new Table and initialize it with new data.
* @param {object} options - The options object.
* @param {string} options.name - The name of the table.
* @param {Data} options.data - Non-empty Array of Records to be inserted into the table
*
*/
abstract createTable(
options: {
name: string;
data: Data;
} & Partial<CreateTableOptions>,
): Promise<Table>;
/** /**
* Creates a new Table and initialize it with new data. * Creates a new Table and initialize it with new data.
* @param {string} name - The name of the table. * @param {string} name - The name of the table.
@@ -232,22 +219,13 @@ export class LocalConnection extends Connection {
} }
async createTable( async createTable(
nameOrOptions: name: string,
| string data: Record<string, unknown>[] | ArrowTable,
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
data?: Record<string, unknown>[] | ArrowTable,
options?: Partial<CreateTableOptions>, options?: Partial<CreateTableOptions>,
): Promise<Table> { ): Promise<Table> {
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
const { name, data, ...options } = nameOrOptions;
return this.createTable(name, data, options);
}
if (data === undefined) {
throw new Error("data is required");
}
const { buf, mode } = await Table.parseTableData(data, options); const { buf, mode } = await Table.parseTableData(data, options);
const innerTable = await this.inner.createTable( const innerTable = await this.inner.createTable(
nameOrOptions, name,
buf, buf,
mode, mode,
cleanseStorageOptions(options?.storageOptions), cleanseStorageOptions(options?.storageOptions),

View File

@@ -31,9 +31,6 @@ export {
AddColumnsSql, AddColumnsSql,
ColumnAlteration, ColumnAlteration,
ConnectionOptions, ConnectionOptions,
IndexStatistics,
IndexMetadata,
IndexConfig,
} from "./native.js"; } from "./native.js";
export { export {
@@ -59,7 +56,12 @@ export {
export { Index, IndexOptions, IvfPqOptions } from "./indices"; export { Index, IndexOptions, IvfPqOptions } from "./indices";
export { Table, AddDataOptions, UpdateOptions } from "./table"; export {
Table,
AddDataOptions,
IndexConfig,
UpdateOptions,
} from "./table";
export * as embedding from "./embedding"; export * as embedding from "./embedding";
@@ -74,61 +76,15 @@ export * as embedding from "./embedding";
* @param {string} uri - The uri of the database. If the database uri starts * @param {string} uri - The uri of the database. If the database uri starts
* with `db://` then it connects to a remote database. * with `db://` then it connects to a remote database.
* @see {@link ConnectionOptions} for more details on the URI format. * @see {@link ConnectionOptions} for more details on the URI format.
* @example
* ```ts
* const conn = await connect("/path/to/database");
* ```
* @example
* ```ts
* const conn = await connect(
* "s3://bucket/path/to/database",
* {storageOptions: {timeout: "60s"}
* });
* ```
*/ */
export async function connect( export async function connect(
uri: string, uri: string,
opts?: Partial<ConnectionOptions | RemoteConnectionOptions>, opts?: Partial<ConnectionOptions | RemoteConnectionOptions>,
): Promise<Connection>;
/**
* Connect to a LanceDB instance at the given URI.
*
* Accepted formats:
*
* - `/path/to/database` - local database
* - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
* - `db://host:port` - remote database (LanceDB cloud)
* @param options - The options to use when connecting to the database
* @see {@link ConnectionOptions} for more details on the URI format.
* @example
* ```ts
* const conn = await connect({
* uri: "/path/to/database",
* storageOptions: {timeout: "60s"}
* });
* ```
*/
export async function connect(
opts: Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string },
): Promise<Connection>;
export async function connect(
uriOrOptions:
| string
| (Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string }),
opts: Partial<ConnectionOptions | RemoteConnectionOptions> = {},
): Promise<Connection> { ): Promise<Connection> {
let uri: string | undefined;
if (typeof uriOrOptions !== "string") {
const { uri: uri_, ...options } = uriOrOptions;
uri = uri_;
opts = options;
} else {
uri = uriOrOptions;
}
if (!uri) { if (!uri) {
throw new Error("uri is required"); throw new Error("uri is required");
} }
opts = opts ?? {};
if (uri?.startsWith("db://")) { if (uri?.startsWith("db://")) {
return new RemoteConnection(uri, opts as RemoteConnectionOptions); return new RemoteConnection(uri, opts as RemoteConnectionOptions);

View File

@@ -114,14 +114,6 @@ export class QueryBase<
this.inner.onlyIf(predicate); this.inner.onlyIf(predicate);
return this as unknown as QueryType; return this as unknown as QueryType;
} }
/**
* A filter statement to be applied to this query.
* @alias where
* @deprecated Use `where` instead
*/
filter(predicate: string): QueryType {
return this.where(predicate);
}
/** /**
* Return only the specified columns. * Return only the specified columns.

View File

@@ -106,19 +106,10 @@ export class RemoteConnection extends Connection {
} }
async createTable( async createTable(
nameOrOptions: tableName: string,
| string data: Data,
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
data?: Data,
options?: Partial<CreateTableOptions> | undefined, options?: Partial<CreateTableOptions> | undefined,
): Promise<Table> { ): Promise<Table> {
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
const { name, data, ...options } = nameOrOptions;
return this.createTable(name, data, options);
}
if (data === undefined) {
throw new Error("data is required");
}
if (options?.mode) { if (options?.mode) {
console.warn( console.warn(
"option 'mode' is not supported in LanceDB Cloud", "option 'mode' is not supported in LanceDB Cloud",
@@ -141,7 +132,7 @@ export class RemoteConnection extends Connection {
); );
await this.#client.post( await this.#client.post(
`/v1/table/${encodeURIComponent(nameOrOptions)}/create/`, `/v1/table/${encodeURIComponent(tableName)}/create/`,
buf, buf,
{ {
config: { config: {
@@ -150,8 +141,8 @@ export class RemoteConnection extends Connection {
headers: { "Content-Type": "application/vnd.apache.arrow.stream" }, headers: { "Content-Type": "application/vnd.apache.arrow.stream" },
}, },
); );
this.#tableCache.set(nameOrOptions, true); this.#tableCache.set(tableName, true);
return new RemoteTable(this.#client, nameOrOptions, this.#dbName); return new RemoteTable(this.#client, tableName, this.#dbName);
} }
async createEmptyTable( async createEmptyTable(

View File

@@ -16,7 +16,6 @@ import { Table as ArrowTable } from "apache-arrow";
import { Data, IntoVector } from "../arrow"; import { Data, IntoVector } from "../arrow";
import { IndexStatistics } from "..";
import { CreateTableOptions } from "../connection"; import { CreateTableOptions } from "../connection";
import { IndexOptions } from "../indices"; import { IndexOptions } from "../indices";
import { MergeInsertBuilder } from "../merge"; import { MergeInsertBuilder } from "../merge";
@@ -35,10 +34,6 @@ export class RemoteTable extends Table {
return `/v1/table/${encodeURIComponent(this.#name)}/`; return `/v1/table/${encodeURIComponent(this.#name)}/`;
} }
get name(): string {
return this.#name;
}
public constructor( public constructor(
client: RestfulLanceDBClient, client: RestfulLanceDBClient,
tableName: string, tableName: string,
@@ -166,7 +161,4 @@ export class RemoteTable extends Table {
mergeInsert(_on: string | string[]): MergeInsertBuilder { mergeInsert(_on: string | string[]): MergeInsertBuilder {
throw new Error("mergeInsert() is not yet supported on the LanceDB cloud"); throw new Error("mergeInsert() is not yet supported on the LanceDB cloud");
} }
async indexStats(_name: string): Promise<IndexStatistics | undefined> {
throw new Error("indexStats() is not yet supported on the LanceDB cloud");
}
} }

View File

@@ -33,11 +33,11 @@ import {
AddColumnsSql, AddColumnsSql,
ColumnAlteration, ColumnAlteration,
IndexConfig, IndexConfig,
IndexStatistics,
OptimizeStats, OptimizeStats,
Table as _NativeTable, Table as _NativeTable,
} from "./native"; } from "./native";
import { Query, VectorQuery } from "./query"; import { Query, VectorQuery } from "./query";
export { IndexConfig } from "./native";
/** /**
* Options for adding data to a table. * Options for adding data to a table.
@@ -98,8 +98,6 @@ export abstract class Table {
[Symbol.for("nodejs.util.inspect.custom")](): string { [Symbol.for("nodejs.util.inspect.custom")](): string {
return this.display(); return this.display();
} }
/** Returns the name of the table */
abstract get name(): string;
/** Return true if the table has not been closed */ /** Return true if the table has not been closed */
abstract isOpen(): boolean; abstract isOpen(): boolean;
@@ -160,9 +158,6 @@ export abstract class Table {
* Indices on vector columns will speed up vector searches. * Indices on vector columns will speed up vector searches.
* Indices on scalar columns will speed up filtering (in both * Indices on scalar columns will speed up filtering (in both
* vector and non-vector searches) * vector and non-vector searches)
*
* @note We currently don't support custom named indexes,
* The index name will always be `${column}_idx`
* @example * @example
* // If the column has a vector (fixed size list) data type then * // If the column has a vector (fixed size list) data type then
* // an IvfPq vector index will be created. * // an IvfPq vector index will be created.
@@ -373,13 +368,6 @@ export abstract class Table {
abstract mergeInsert(on: string | string[]): MergeInsertBuilder; abstract mergeInsert(on: string | string[]): MergeInsertBuilder;
/** List all the stats of a specified index
*
* @param {string} name The name of the index.
* @returns {IndexStatistics | undefined} The stats of the index. If the index does not exist, it will return undefined
*/
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
static async parseTableData( static async parseTableData(
// biome-ignore lint/suspicious/noExplicitAny: <explanation> // biome-ignore lint/suspicious/noExplicitAny: <explanation>
data: Record<string, unknown>[] | ArrowTable<any>, data: Record<string, unknown>[] | ArrowTable<any>,
@@ -424,9 +412,7 @@ export class LocalTable extends Table {
super(); super();
this.inner = inner; this.inner = inner;
} }
get name(): string {
return this.inner.name;
}
isOpen(): boolean { isOpen(): boolean {
return this.inner.isOpen(); return this.inner.isOpen();
} }
@@ -579,13 +565,6 @@ export class LocalTable extends Table {
return await this.query().toArrow(); return await this.query().toArrow();
} }
async indexStats(name: string): Promise<IndexStatistics | undefined> {
const stats = await this.inner.indexStats(name);
if (stats === null) {
return undefined;
}
return stats;
}
mergeInsert(on: string | string[]): MergeInsertBuilder { mergeInsert(on: string | string[]): MergeInsertBuilder {
on = Array.isArray(on) ? on : [on]; on = Array.isArray(on) ? on : [on];
return new MergeInsertBuilder(this.inner.mergeInsert(on)); return new MergeInsertBuilder(this.inner.mergeInsert(on));

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-darwin-arm64", "name": "@lancedb/lancedb-darwin-arm64",
"version": "0.5.2-final.1", "version": "0.5.2",
"os": ["darwin"], "os": ["darwin"],
"cpu": ["arm64"], "cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node", "main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-darwin-x64", "name": "@lancedb/lancedb-darwin-x64",
"version": "0.5.2-final.1", "version": "0.5.2",
"os": ["darwin"], "os": ["darwin"],
"cpu": ["x64"], "cpu": ["x64"],
"main": "lancedb.darwin-x64.node", "main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-linux-arm64-gnu", "name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.5.2-final.1", "version": "0.5.2",
"os": ["linux"], "os": ["linux"],
"cpu": ["arm64"], "cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node", "main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-linux-x64-gnu", "name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.5.2-final.1", "version": "0.5.2",
"os": ["linux"], "os": ["linux"],
"cpu": ["x64"], "cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node", "main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-win32-x64-msvc", "name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.5.2-final.1", "version": "0.5.2",
"os": ["win32"], "os": ["win32"],
"cpu": ["x64"], "cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node", "main": "lancedb.win32-x64-msvc.node",

View File

@@ -18,8 +18,10 @@
"win32" "win32"
], ],
"dependencies": { "dependencies": {
"@types/axios": "^0.14.0",
"apache-arrow": "^15.0.0", "apache-arrow": "^15.0.0",
"axios": "^1.7.2", "axios": "^1.7.2",
"memoize": "^10.0.0",
"openai": "^4.29.2", "openai": "^4.29.2",
"reflect-metadata": "^0.2.2" "reflect-metadata": "^0.2.2"
}, },
@@ -29,7 +31,6 @@
"@biomejs/biome": "^1.7.3", "@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0", "@jest/globals": "^29.7.0",
"@napi-rs/cli": "^2.18.0", "@napi-rs/cli": "^2.18.0",
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2", "@types/jest": "^29.1.2",
"@types/tmp": "^0.2.6", "@types/tmp": "^0.2.6",
"apache-arrow-old": "npm:apache-arrow@13.0.0", "apache-arrow-old": "npm:apache-arrow@13.0.0",
@@ -3130,7 +3131,6 @@
"resolved": "https://registry.npmjs.org/@types/axios/-/axios-0.14.0.tgz", "resolved": "https://registry.npmjs.org/@types/axios/-/axios-0.14.0.tgz",
"integrity": "sha512-KqQnQbdYE54D7oa/UmYVMZKq7CO4l8DEENzOKc4aBRwxCXSlJXGz83flFx5L7AWrOQnmuN3kVsRdt+GZPPjiVQ==", "integrity": "sha512-KqQnQbdYE54D7oa/UmYVMZKq7CO4l8DEENzOKc4aBRwxCXSlJXGz83flFx5L7AWrOQnmuN3kVsRdt+GZPPjiVQ==",
"deprecated": "This is a stub types definition for axios (https://github.com/mzabriskie/axios). axios provides its own type definitions, so you don't need @types/axios installed!", "deprecated": "This is a stub types definition for axios (https://github.com/mzabriskie/axios). axios provides its own type definitions, so you don't need @types/axios installed!",
"dev": true,
"dependencies": { "dependencies": {
"axios": "*" "axios": "*"
} }
@@ -5942,6 +5942,20 @@
"is-buffer": "~1.1.6" "is-buffer": "~1.1.6"
} }
}, },
"node_modules/memoize": {
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/memoize/-/memoize-10.0.0.tgz",
"integrity": "sha512-H6cBLgsi6vMWOcCpvVCdFFnl3kerEXbrYh9q+lY6VXvQSmM6CkmV08VOwT+WE2tzIEqRPFfAq3fm4v/UIW6mSA==",
"dependencies": {
"mimic-function": "^5.0.0"
},
"engines": {
"node": ">=18"
},
"funding": {
"url": "https://github.com/sindresorhus/memoize?sponsor=1"
}
},
"node_modules/merge-stream": { "node_modules/merge-stream": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
@@ -5989,6 +6003,17 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/mimic-function": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz",
"integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==",
"engines": {
"node": ">=18"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimatch": { "node_modules/minimatch": {
"version": "3.1.2", "version": "3.1.2",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",

View File

@@ -1,16 +1,6 @@
{ {
"name": "@lancedb/lancedb", "name": "@lancedb/lancedb",
"description": "LanceDB: A serverless, low-latency vector database for AI applications", "version": "0.5.2",
"keywords": [
"database",
"lance",
"lancedb",
"search",
"vector",
"vector database",
"ann"
],
"version": "0.5.2-final.1",
"main": "dist/index.js", "main": "dist/index.js",
"exports": { "exports": {
".": "./dist/index.js", ".": "./dist/index.js",
@@ -48,8 +38,7 @@
"typedoc": "^0.25.7", "typedoc": "^0.25.7",
"typedoc-plugin-markdown": "^3.17.1", "typedoc-plugin-markdown": "^3.17.1",
"typescript": "^5.3.3", "typescript": "^5.3.3",
"typescript-eslint": "^7.1.0", "typescript-eslint": "^7.1.0"
"@types/axios": "^0.14.0"
}, },
"ava": { "ava": {
"timeout": "3m" "timeout": "3m"
@@ -76,6 +65,7 @@
"version": "napi version" "version": "napi version"
}, },
"dependencies": { "dependencies": {
"@types/axios": "^0.14.0",
"apache-arrow": "^15.0.0", "apache-arrow": "^15.0.0",
"axios": "^1.7.2", "axios": "^1.7.2",
"openai": "^4.29.2", "openai": "^4.29.2",

View File

@@ -56,6 +56,12 @@ impl Connection {
#[napi(factory)] #[napi(factory)]
pub async fn new(uri: String, options: ConnectionOptions) -> napi::Result<Self> { pub async fn new(uri: String, options: ConnectionOptions) -> napi::Result<Self> {
let mut builder = ConnectBuilder::new(&uri); let mut builder = ConnectBuilder::new(&uri);
if let Some(api_key) = options.api_key {
builder = builder.api_key(&api_key);
}
if let Some(host_override) = options.host_override {
builder = builder.host_override(&host_override);
}
if let Some(interval) = options.read_consistency_interval { if let Some(interval) = options.read_consistency_interval {
builder = builder =
builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval)); builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));

View File

@@ -28,6 +28,8 @@ mod util;
#[napi(object)] #[napi(object)]
#[derive(Debug)] #[derive(Debug)]
pub struct ConnectionOptions { pub struct ConnectionOptions {
pub api_key: Option<String>,
pub host_override: Option<String>,
/// (For LanceDB OSS only): The interval, in seconds, at which to check for /// (For LanceDB OSS only): The interval, in seconds, at which to check for
/// updates to the table from other processes. If None, then consistency is not /// updates to the table from other processes. If None, then consistency is not
/// checked. For performance reasons, this is the default. For strong /// checked. For performance reasons, this is the default. For strong

View File

@@ -30,7 +30,7 @@ use crate::query::{Query, VectorQuery};
pub struct Table { pub struct Table {
// We keep a duplicate of the table name so we can use it for error // We keep a duplicate of the table name so we can use it for error
// messages even if the table has been closed // messages even if the table has been closed
pub name: String, name: String,
pub(crate) inner: Option<LanceDbTable>, pub(crate) inner: Option<LanceDbTable>,
} }
@@ -330,13 +330,6 @@ impl Table {
.collect::<Vec<_>>()) .collect::<Vec<_>>())
} }
#[napi]
pub async fn index_stats(&self, index_name: String) -> napi::Result<Option<IndexStatistics>> {
let tbl = self.inner_ref()?.as_native().unwrap();
let stats = tbl.index_stats(&index_name).await.default_error()?;
Ok(stats.map(IndexStatistics::from))
}
#[napi] #[napi]
pub fn merge_insert(&self, on: Vec<String>) -> napi::Result<NativeMergeInsertBuilder> { pub fn merge_insert(&self, on: Vec<String>) -> napi::Result<NativeMergeInsertBuilder> {
let on: Vec<_> = on.iter().map(String::as_str).collect(); let on: Vec<_> = on.iter().map(String::as_str).collect();
@@ -347,13 +340,11 @@ impl Table {
#[napi(object)] #[napi(object)]
/// A description of an index currently configured on a column /// A description of an index currently configured on a column
pub struct IndexConfig { pub struct IndexConfig {
/// The name of the index
pub name: String,
/// The type of the index /// The type of the index
pub index_type: String, pub index_type: String,
/// The columns in the index /// The columns in the index
/// ///
/// Currently this is always an array of size 1. In the future there may /// Currently this is always an array of size 1. In the future there may
/// be more columns to represent composite indices. /// be more columns to represent composite indices.
pub columns: Vec<String>, pub columns: Vec<String>,
} }
@@ -364,7 +355,6 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
Self { Self {
index_type, index_type,
columns: value.columns, columns: value.columns,
name: value.name,
} }
} }
} }
@@ -447,40 +437,3 @@ pub struct AddColumnsSql {
/// The expression can reference other columns in the table. /// The expression can reference other columns in the table.
pub value_sql: String, pub value_sql: String,
} }
#[napi(object)]
pub struct IndexStatistics {
/// The number of rows indexed by the index
pub num_indexed_rows: f64,
/// The number of rows not indexed
pub num_unindexed_rows: f64,
/// The type of the index
pub index_type: Option<String>,
/// The metadata for each index
pub indices: Vec<IndexMetadata>,
}
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
fn from(value: lancedb::index::IndexStatistics) -> Self {
Self {
num_indexed_rows: value.num_indexed_rows as f64,
num_unindexed_rows: value.num_unindexed_rows as f64,
index_type: value.index_type.map(|t| format!("{:?}", t)),
indices: value.indices.into_iter().map(Into::into).collect(),
}
}
}
#[napi(object)]
pub struct IndexMetadata {
pub metric_type: Option<String>,
pub index_type: Option<String>,
}
impl From<lancedb::index::IndexMetadata> for IndexMetadata {
fn from(value: lancedb::index::IndexMetadata) -> Self {
Self {
metric_type: value.metric_type,
index_type: value.index_type,
}
}
}

View File

@@ -1,5 +1,5 @@
[tool.bumpversion] [tool.bumpversion]
current_version = "0.9.0-beta.8" current_version = "0.8.2"
parse = """(?x) parse = """(?x)
(?P<major>0|[1-9]\\d*)\\. (?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\. (?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "lancedb-python" name = "lancedb-python"
version = "0.9.0-beta.8" version = "0.8.2"
edition.workspace = true edition.workspace = true
description = "Python bindings for LanceDB" description = "Python bindings for LanceDB"
license.workspace = true license.workspace = true
@@ -19,8 +19,6 @@ lancedb = { path = "../rust/lancedb" }
env_logger = "0.10" env_logger = "0.10"
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] } pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] } pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
base64ct = "=1.6.0" # workaround for https://github.com/RustCrypto/formats/issues/1684
chrono = "=0.4.39"
# Prevent dynamic linking of lzma, which comes from datafusion # Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] } lzma-sys = { version = "*", features = ["static"] }

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml # version in Cargo.toml
dependencies = [ dependencies = [
"deprecation", "deprecation",
"pylance==0.13.0", "pylance==0.12.2-beta.2",
"ratelimiter~=1.0", "ratelimiter~=1.0",
"requests>=2.31.0", "requests>=2.31.0",
"retry>=0.9.2", "retry>=0.9.2",
@@ -13,7 +13,6 @@ dependencies = [
"packaging", "packaging",
"cachetools", "cachetools",
"overrides>=0.7", "overrides>=0.7",
"urllib3==1.26.19"
] ]
description = "lancedb" description = "lancedb"
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }] authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]

View File

@@ -35,7 +35,6 @@ def connect(
host_override: Optional[str] = None, host_override: Optional[str] = None,
read_consistency_interval: Optional[timedelta] = None, read_consistency_interval: Optional[timedelta] = None,
request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None, request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
storage_options: Optional[Dict[str, str]] = None,
**kwargs, **kwargs,
) -> DBConnection: ) -> DBConnection:
"""Connect to a LanceDB database. """Connect to a LanceDB database.
@@ -71,9 +70,6 @@ def connect(
executor will be used for making requests. This is for LanceDB Cloud executor will be used for making requests. This is for LanceDB Cloud
only and is only used when making batch requests (i.e., passing in only and is only used when making batch requests (i.e., passing in
multiple queries to the search method at once). multiple queries to the search method at once).
storage_options: dict, optional
Additional options for the storage backend. See available options at
https://lancedb.github.io/lancedb/guides/storage/
Examples Examples
-------- --------
@@ -109,16 +105,12 @@ def connect(
region, region,
host_override, host_override,
request_thread_pool=request_thread_pool, request_thread_pool=request_thread_pool,
storage_options=storage_options,
**kwargs, **kwargs,
) )
if kwargs: if kwargs:
raise ValueError(f"Unknown keyword arguments: {kwargs}") raise ValueError(f"Unknown keyword arguments: {kwargs}")
return LanceDBConnection( return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)
uri,
read_consistency_interval=read_consistency_interval,
)
async def connect_async( async def connect_async(

View File

@@ -29,10 +29,7 @@ from .table import LanceTable
def create_index( def create_index(
index_path: str, index_path: str, text_fields: List[str], ordering_fields: List[str] = None
text_fields: List[str],
ordering_fields: List[str] = None,
tokenizer_name: str = "default",
) -> tantivy.Index: ) -> tantivy.Index:
""" """
Create a new Index (not populated) Create a new Index (not populated)
@@ -45,8 +42,6 @@ def create_index(
List of text fields to index List of text fields to index
ordering_fields: List[str] ordering_fields: List[str]
List of unsigned type fields to order by at search time List of unsigned type fields to order by at search time
tokenizer_name : str, default "default"
The tokenizer to use
Returns Returns
------- -------
@@ -61,7 +56,7 @@ def create_index(
schema_builder.add_integer_field("doc_id", stored=True) schema_builder.add_integer_field("doc_id", stored=True)
# data fields # data fields
for name in text_fields: for name in text_fields:
schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name) schema_builder.add_text_field(name, stored=True)
if ordering_fields: if ordering_fields:
for name in ordering_fields: for name in ordering_fields:
schema_builder.add_unsigned_field(name, fast=True) schema_builder.add_unsigned_field(name, fast=True)

View File

@@ -117,8 +117,6 @@ class Query(pydantic.BaseModel):
with_row_id: bool = False with_row_id: bool = False
fast_search: bool = False
class LanceQueryBuilder(ABC): class LanceQueryBuilder(ABC):
"""An abstract query builder. Subclasses are defined for vector search, """An abstract query builder. Subclasses are defined for vector search,
@@ -127,14 +125,12 @@ class LanceQueryBuilder(ABC):
@classmethod @classmethod
def create( def create(
cls, cls,
table: "Table", table: "Table",
query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]], query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
query_type: str, query_type: str,
vector_column_name: str, vector_column_name: str,
ordering_field_name: Optional[str] = None, ordering_field_name: str = None,
fts_columns: Union[str, List[str]] = [],
fast_search: bool = False,
) -> LanceQueryBuilder: ) -> LanceQueryBuilder:
""" """
Create a query builder based on the given query and query type. Create a query builder based on the given query and query type.
@@ -151,19 +147,14 @@ class LanceQueryBuilder(ABC):
If "auto", the query type is inferred based on the query. If "auto", the query type is inferred based on the query.
vector_column_name: str vector_column_name: str
The name of the vector column to use for vector search. The name of the vector column to use for vector search.
fast_search: bool
Skip flat search of unindexed data.
""" """
# Check hybrid search first as it supports empty query pattern
if query_type == "hybrid":
# hybrid fts and vector query
return LanceHybridQueryBuilder(
table, query, vector_column_name, fts_columns=fts_columns
)
if query is None: if query is None:
return LanceEmptyQueryBuilder(table) return LanceEmptyQueryBuilder(table)
if query_type == "hybrid":
# hybrid fts and vector query
return LanceHybridQueryBuilder(table, query, vector_column_name)
# remember the string query for reranking purpose # remember the string query for reranking purpose
str_query = query if isinstance(query, str) else None str_query = query if isinstance(query, str) else None
@@ -174,17 +165,12 @@ class LanceQueryBuilder(ABC):
) )
if query_type == "hybrid": if query_type == "hybrid":
return LanceHybridQueryBuilder( return LanceHybridQueryBuilder(table, query, vector_column_name)
table, query, vector_column_name, fts_columns=fts_columns
)
if isinstance(query, str): if isinstance(query, str):
# fts # fts
return LanceFtsQueryBuilder( return LanceFtsQueryBuilder(
table, table, query, ordering_field_name=ordering_field_name
query,
ordering_field_name=ordering_field_name,
fts_columns=fts_columns,
) )
if isinstance(query, list): if isinstance(query, list):
@@ -194,9 +180,7 @@ class LanceQueryBuilder(ABC):
else: else:
raise TypeError(f"Unsupported query type: {type(query)}") raise TypeError(f"Unsupported query type: {type(query)}")
return LanceVectorQueryBuilder( return LanceVectorQueryBuilder(table, query, vector_column_name, str_query)
table, query, vector_column_name, str_query, fast_search
)
@classmethod @classmethod
def _resolve_query(cls, table, query, query_type, vector_column_name): def _resolve_query(cls, table, query, query_type, vector_column_name):
@@ -212,6 +196,8 @@ class LanceQueryBuilder(ABC):
elif query_type == "auto": elif query_type == "auto":
if isinstance(query, (list, np.ndarray)): if isinstance(query, (list, np.ndarray)):
return query, "vector" return query, "vector"
if isinstance(query, tuple):
return query, "hybrid"
else: else:
conf = table.embedding_functions.get(vector_column_name) conf = table.embedding_functions.get(vector_column_name)
if conf is not None: if conf is not None:
@@ -238,14 +224,9 @@ class LanceQueryBuilder(ABC):
def __init__(self, table: "Table"): def __init__(self, table: "Table"):
self._table = table self._table = table
self._limit = 10 self._limit = 10
self._offset = 0
self._columns = None self._columns = None
self._where = None self._where = None
self._prefilter = False
self._with_row_id = False self._with_row_id = False
self._vector = None
self._text = None
self._ef = None
@deprecation.deprecated( @deprecation.deprecated(
deprecated_in="0.3.1", deprecated_in="0.3.1",
@@ -356,13 +337,11 @@ class LanceQueryBuilder(ABC):
---------- ----------
limit: int limit: int
The maximum number of results to return. The maximum number of results to return.
The default query limit is 10 results. By default the query is limited to the first 10.
For ANN/KNN queries, you must specify a limit. Call this method and pass 0, a negative value,
Entering 0, a negative number, or None will reset or None to remove the limit.
the limit to the default value of 10. *WARNING* if you have a large dataset, removing
*WARNING* if you have a large dataset, setting the limit can potentially result in reading a
the limit to a large number, e.g. the table size,
can potentially result in reading a
large amount of data into memory and cause large amount of data into memory and cause
out of memory issues. out of memory issues.
@@ -372,33 +351,11 @@ class LanceQueryBuilder(ABC):
The LanceQueryBuilder object. The LanceQueryBuilder object.
""" """
if limit is None or limit <= 0: if limit is None or limit <= 0:
if isinstance(self, LanceVectorQueryBuilder): self._limit = None
raise ValueError("Limit is required for ANN/KNN queries")
else:
self._limit = None
else: else:
self._limit = limit self._limit = limit
return self return self
def offset(self, offset: int) -> LanceQueryBuilder:
"""Set the offset for the results.
Parameters
----------
offset: int
The offset to start fetching results from.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
if offset is None or offset <= 0:
self._offset = 0
else:
self._offset = offset
return self
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder: def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
"""Set the columns to return. """Set the columns to return.
@@ -460,80 +417,6 @@ class LanceQueryBuilder(ABC):
self._with_row_id = with_row_id self._with_row_id = with_row_id
return self return self
def explain_plan(self, verbose: Optional[bool] = False) -> str:
"""Return the execution plan for this query.
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", [{"vector": [99, 99]}])
>>> query = [100, 100]
>>> plan = table.search(query).explain_plan(True)
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
Parameters
----------
verbose : bool, default False
Use a verbose output format.
Returns
-------
plan : str
""" # noqa: E501
ds = self._table.to_lance()
return ds.scanner(
nearest={
"column": self._vector_column,
"q": self._query,
"k": self._limit,
"metric": self._metric,
"nprobes": self._nprobes,
"refine_factor": self._refine_factor,
},
prefilter=self._prefilter,
filter=self._str_query,
limit=self._limit,
with_row_id=self._with_row_id,
offset=self._offset,
).explain_plan(verbose)
def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
"""Set the vector to search for.
Parameters
----------
vector: np.ndarray or list
The vector to search for.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError
def text(self, text: str) -> LanceQueryBuilder:
"""Set the text to search for.
Parameters
----------
text: str
The text to search for.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError
class LanceVectorQueryBuilder(LanceQueryBuilder): class LanceVectorQueryBuilder(LanceQueryBuilder):
""" """
@@ -557,12 +440,11 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
""" """
def __init__( def __init__(
self, self,
table: "Table", table: "Table",
query: Union[np.ndarray, list, "PIL.Image.Image"], query: Union[np.ndarray, list, "PIL.Image.Image"],
vector_column: str, vector_column: str,
str_query: Optional[str] = None, str_query: Optional[str] = None,
fast_search: bool = False,
): ):
super().__init__(table) super().__init__(table)
self._query = query self._query = query
@@ -573,14 +455,13 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
self._prefilter = False self._prefilter = False
self._reranker = None self._reranker = None
self._str_query = str_query self._str_query = str_query
self._fast_search = fast_search
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder: def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
"""Set the distance metric to use. """Set the distance metric to use.
Parameters Parameters
---------- ----------
metric: "L2" or "cosine" or "dot" metric: "L2" or "cosine"
The distance metric to use. By default "L2" is used. The distance metric to use. By default "L2" is used.
Returns Returns
@@ -588,7 +469,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
LanceVectorQueryBuilder LanceVectorQueryBuilder
The LanceQueryBuilder object. The LanceQueryBuilder object.
""" """
self._metric = metric.lower() self._metric = metric
return self return self
def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder: def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
@@ -613,28 +494,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
self._nprobes = nprobes self._nprobes = nprobes
return self return self
def ef(self, ef: int) -> LanceVectorQueryBuilder:
"""Set the number of candidates to consider during search.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
This only applies to the HNSW-related index.
The default value is 1.5 * limit.
Parameters
----------
ef: int
The number of candidates to consider during search.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._ef = ef
return self
def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder: def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
"""Set the refine factor to use, increasing the number of vectors sampled. """Set the refine factor to use, increasing the number of vectors sampled.
@@ -695,11 +554,15 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
refine_factor=self._refine_factor, refine_factor=self._refine_factor,
vector_column=self._vector_column, vector_column=self._vector_column,
with_row_id=self._with_row_id, with_row_id=self._with_row_id,
offset=self._offset,
fast_search=self._fast_search,
ef=self._ef,
) )
result_set = self._table._execute_query(query, batch_size) result_set = self._table._execute_query(query, batch_size)
if self._reranker is not None:
rs_table = result_set.read_all()
result_set = self._reranker.rerank_vector(self._str_query, rs_table)
# convert result_set back to RecordBatchReader
result_set = pa.RecordBatchReader.from_batches(
result_set.schema, result_set.to_batches()
)
return result_set return result_set
@@ -728,7 +591,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
return self return self
def rerank( def rerank(
self, reranker: Reranker, query_string: Optional[str] = None self, reranker: Reranker, query_string: Optional[str] = None
) -> LanceVectorQueryBuilder: ) -> LanceVectorQueryBuilder:
"""Rerank the results using the specified reranker. """Rerank the results using the specified reranker.
@@ -893,34 +756,12 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
class LanceEmptyQueryBuilder(LanceQueryBuilder): class LanceEmptyQueryBuilder(LanceQueryBuilder):
def to_arrow(self) -> pa.Table: def to_arrow(self) -> pa.Table:
return self.to_batches().read_all() ds = self._table.to_lance()
return ds.to_table(
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
query = Query(
columns=self._columns, columns=self._columns,
filter=self._where, filter=self._where,
k=self._limit or 10, limit=self._limit,
with_row_id=self._with_row_id,
vector=[],
# not actually respected in remote query
offset=self._offset or 0,
) )
return self._table._execute_query(query)
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
"""Rerank the results using the specified reranker.
Parameters
----------
reranker: Reranker
The reranker to use.
Returns
-------
LanceEmptyQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError("Reranking is not yet supported.")
class LanceHybridQueryBuilder(LanceQueryBuilder): class LanceHybridQueryBuilder(LanceQueryBuilder):

View File

@@ -55,13 +55,11 @@ class RestfulLanceDBClient:
region: str region: str
api_key: Credential api_key: Credential
host_override: Optional[str] = attrs.field(default=None) host_override: Optional[str] = attrs.field(default=None)
db_prefix: Optional[str] = attrs.field(default=None)
closed: bool = attrs.field(default=False, init=False) closed: bool = attrs.field(default=False, init=False)
connection_timeout: float = attrs.field(default=120.0, kw_only=True) connection_timeout: float = attrs.field(default=120.0, kw_only=True)
read_timeout: float = attrs.field(default=300.0, kw_only=True) read_timeout: float = attrs.field(default=300.0, kw_only=True)
storage_options: Optional[Dict[str, str]] = attrs.field(default=None, kw_only=True)
@functools.cached_property @functools.cached_property
def session(self) -> requests.Session: def session(self) -> requests.Session:
@@ -94,18 +92,6 @@ class RestfulLanceDBClient:
headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com" headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
if self.host_override: if self.host_override:
headers["x-lancedb-database"] = self.db_name headers["x-lancedb-database"] = self.db_name
if self.storage_options:
if self.storage_options.get("account_name") is not None:
headers["x-azure-storage-account-name"] = self.storage_options[
"account_name"
]
if self.storage_options.get("azure_storage_account_name") is not None:
headers["x-azure-storage-account-name"] = self.storage_options[
"azure_storage_account_name"
]
if self.db_prefix:
headers["x-lancedb-database-prefix"] = self.db_prefix
return headers return headers
@staticmethod @staticmethod
@@ -172,7 +158,6 @@ class RestfulLanceDBClient:
headers["content-type"] = content_type headers["content-type"] = content_type
if request_id is not None: if request_id is not None:
headers["x-request-id"] = request_id headers["x-request-id"] = request_id
with self.session.post( with self.session.post(
urljoin(self.url, uri), urljoin(self.url, uri),
headers=headers, headers=headers,
@@ -260,6 +245,7 @@ def retry_adapter(options: Dict[str, Any]) -> HTTPAdapter:
connect=connect_retries, connect=connect_retries,
read=read_retries, read=read_retries,
backoff_factor=backoff_factor, backoff_factor=backoff_factor,
backoff_jitter=backoff_jitter,
status_forcelist=statuses, status_forcelist=statuses,
allowed_methods=methods, allowed_methods=methods,
) )

View File

@@ -15,7 +15,7 @@ import inspect
import logging import logging
import uuid import uuid
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Iterable, List, Optional, Union from typing import Iterable, List, Optional, Union
from urllib.parse import urlparse from urllib.parse import urlparse
from cachetools import TTLCache from cachetools import TTLCache
@@ -44,25 +44,20 @@ class RemoteDBConnection(DBConnection):
request_thread_pool: Optional[ThreadPoolExecutor] = None, request_thread_pool: Optional[ThreadPoolExecutor] = None,
connection_timeout: float = 120.0, connection_timeout: float = 120.0,
read_timeout: float = 300.0, read_timeout: float = 300.0,
storage_options: Optional[Dict[str, str]] = None,
): ):
"""Connect to a remote LanceDB database.""" """Connect to a remote LanceDB database."""
parsed = urlparse(db_url) parsed = urlparse(db_url)
if parsed.scheme != "db": if parsed.scheme != "db":
raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://") raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
self.db_name = parsed.netloc self.db_name = parsed.netloc
prefix = parsed.path.lstrip("/")
self.db_prefix = None if not prefix else prefix
self.api_key = api_key self.api_key = api_key
self._client = RestfulLanceDBClient( self._client = RestfulLanceDBClient(
self.db_name, self.db_name,
region, region,
api_key, api_key,
host_override, host_override,
self.db_prefix,
connection_timeout=connection_timeout, connection_timeout=connection_timeout,
read_timeout=read_timeout, read_timeout=read_timeout,
storage_options=storage_options,
) )
self._request_thread_pool = request_thread_pool self._request_thread_pool = request_thread_pool
self._table_cache = TTLCache(maxsize=10000, ttl=300) self._table_cache = TTLCache(maxsize=10000, ttl=300)

View File

@@ -15,14 +15,13 @@ import logging
import uuid import uuid
from concurrent.futures import Future from concurrent.futures import Future
from functools import cached_property from functools import cached_property
from typing import Dict, Iterable, Optional, Union, Literal from typing import Dict, Iterable, Optional, Union
import pyarrow as pa import pyarrow as pa
from lance import json_to_schema from lance import json_to_schema
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
from lancedb.merge import LanceMergeInsertBuilder from lancedb.merge import LanceMergeInsertBuilder
from lancedb.query import LanceQueryBuilder
from ..query import LanceVectorQueryBuilder from ..query import LanceVectorQueryBuilder
from ..table import Query, Table, _sanitize_data from ..table import Query, Table, _sanitize_data
@@ -82,7 +81,6 @@ class RemoteTable(Table):
def create_scalar_index( def create_scalar_index(
self, self,
column: str, column: str,
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
): ):
"""Creates a scalar index """Creates a scalar index
Parameters Parameters
@@ -91,6 +89,8 @@ class RemoteTable(Table):
The column to be indexed. Must be a boolean, integer, float, The column to be indexed. Must be a boolean, integer, float,
or string column. or string column.
""" """
index_type = "scalar"
data = { data = {
"column": column, "column": column,
"index_type": index_type, "index_type": index_type,
@@ -228,21 +228,10 @@ class RemoteTable(Table):
content_type=ARROW_STREAM_CONTENT_TYPE, content_type=ARROW_STREAM_CONTENT_TYPE,
) )
def query(
self,
query: Union[VEC, str] = None,
query_type: str = "vector",
vector_column_name: Optional[str] = None,
fast_search: bool = False,
) -> LanceVectorQueryBuilder:
return self.search(query, query_type, vector_column_name, fast_search)
def search( def search(
self, self,
query: Union[VEC, str] = None, query: Union[VEC, str],
query_type: str = "vector",
vector_column_name: Optional[str] = None, vector_column_name: Optional[str] = None,
fast_search: bool = False,
) -> LanceVectorQueryBuilder: ) -> LanceVectorQueryBuilder:
"""Create a search query to find the nearest neighbors """Create a search query to find the nearest neighbors
of the given query vector. We currently support [vector search][search] of the given query vector. We currently support [vector search][search]
@@ -289,11 +278,6 @@ class RemoteTable(Table):
- If the table has multiple vector columns then the *vector_column_name* - If the table has multiple vector columns then the *vector_column_name*
needs to be specified. Otherwise, an error is raised. needs to be specified. Otherwise, an error is raised.
fast_search: bool, optional
Skip a flat search of unindexed data. This may improve
search performance but search results will not include unindexed data.
- *default False*.
Returns Returns
------- -------
LanceQueryBuilder LanceQueryBuilder
@@ -309,14 +293,7 @@ class RemoteTable(Table):
""" """
if vector_column_name is None: if vector_column_name is None:
vector_column_name = inf_vector_column_query(self.schema) vector_column_name = inf_vector_column_query(self.schema)
return LanceVectorQueryBuilder(self, query, vector_column_name)
return LanceQueryBuilder.create(
self,
query,
query_type,
vector_column_name=vector_column_name,
fast_search=fast_search,
)
def _execute_query( def _execute_query(
self, query: Query, batch_size: Optional[int] = None self, query: Query, batch_size: Optional[int] = None

View File

@@ -1171,7 +1171,6 @@ class LanceTable(Table):
*, *,
replace: bool = False, replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024, writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
): ):
"""Create a full-text search index on the table. """Create a full-text search index on the table.
@@ -1190,10 +1189,6 @@ class LanceTable(Table):
ordering_field_names: ordering_field_names:
A list of unsigned type fields to index to optionally order A list of unsigned type fields to index to optionally order
results on at search time results on at search time
tokenizer_name: str, default "default"
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
language code followed by "_stem". So for english it would be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
""" """
from .fts import create_index, populate_index from .fts import create_index, populate_index
@@ -1219,7 +1214,6 @@ class LanceTable(Table):
self._get_fts_index_path(), self._get_fts_index_path(),
field_names, field_names,
ordering_fields=ordering_field_names, ordering_fields=ordering_field_names,
tokenizer_name=tokenizer_name,
) )
populate_index( populate_index(
index, index,

View File

@@ -66,17 +66,6 @@ def test_create_index(tmp_path):
assert os.path.exists(str(tmp_path / "index")) assert os.path.exists(str(tmp_path / "index"))
def test_create_index_with_stemming(tmp_path, table):
index = ldb.fts.create_index(
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
)
assert isinstance(index, tantivy.Index)
assert os.path.exists(str(tmp_path / "index"))
# Check stemming by running tokenizer on non empty table
table.create_fts_index("text", tokenizer_name="en_stem")
def test_populate_index(tmp_path, table): def test_populate_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"]) index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert ldb.fts.populate_index(index, table, ["text"]) == len(table) assert ldb.fts.populate_index(index, table, ["text"]) == len(table)

View File

@@ -21,7 +21,6 @@ class FakeLanceDBClient:
pass pass
def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult: def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
print(f"{query=}")
assert table_name == "test" assert table_name == "test"
t = pa.schema([]).empty_table() t = pa.schema([]).empty_table()
return VectorQueryResult(t) return VectorQueryResult(t)
@@ -40,21 +39,3 @@ def test_remote_db():
table = conn["test"] table = conn["test"]
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))]) table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
table.search([1.0, 2.0]).to_pandas() table.search([1.0, 2.0]).to_pandas()
def test_empty_query_with_filter():
conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
setattr(conn, "_client", FakeLanceDBClient())
table = conn["test"]
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
print(table.query().select(["vector"]).where("foo == bar").to_arrow())
def test_fast_search_query_with_filter():
conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
setattr(conn, "_client", FakeLanceDBClient())
table = conn["test"]
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
print(table.query([0, 0], fast_search=True).select(["vector"]).where("foo == bar").to_arrow())

View File

@@ -735,7 +735,7 @@ def test_create_scalar_index(db):
indices = table.to_lance().list_indices() indices = table.to_lance().list_indices()
assert len(indices) == 1 assert len(indices) == 1
scalar_index = indices[0] scalar_index = indices[0]
assert scalar_index["type"] == "BTree" assert scalar_index["type"] == "Scalar"
# Confirm that prefiltering still works with the scalar index column # Confirm that prefiltering still works with the scalar index column
results = table.search().where("x = 'c'").to_arrow() results = table.search().where("x = 'c'").to_arrow()

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "lancedb-node" name = "lancedb-node"
version = "0.5.2-final.1" version = "0.5.2"
description = "Serverless, low-latency vector database for AI applications" description = "Serverless, low-latency vector database for AI applications"
license.workspace = true license.workspace = true
edition.workspace = true edition.workspace = true

View File

@@ -463,7 +463,6 @@ impl JsTable {
Ok(promise) Ok(promise)
} }
#[allow(deprecated)]
pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> { pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?; let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
let rt = runtime(&mut cx)?; let rt = runtime(&mut cx)?;

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "lancedb" name = "lancedb"
version = "0.5.2-final.1" version = "0.5.2"
edition.workspace = true edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications" description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true license.workspace = true

View File

@@ -80,8 +80,6 @@ pub enum IndexType {
/// A description of an index currently configured on a column /// A description of an index currently configured on a column
pub struct IndexConfig { pub struct IndexConfig {
/// The name of the index
pub name: String,
/// The type of the index /// The type of the index
pub index_type: IndexType, pub index_type: IndexType,
/// The columns in the index /// The columns in the index

View File

@@ -1206,36 +1206,28 @@ impl NativeTable {
.await) .await)
} }
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> { pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
#[allow(deprecated)]
match self.load_index_stats(index_uuid).await? { match self.load_index_stats(index_uuid).await? {
Some(stats) => Ok(Some(stats.num_indexed_rows)), Some(stats) => Ok(Some(stats.num_indexed_rows)),
None => Ok(None), None => Ok(None),
} }
} }
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> { pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
#[allow(deprecated)]
match self.load_index_stats(index_uuid).await? { match self.load_index_stats(index_uuid).await? {
Some(stats) => Ok(Some(stats.num_unindexed_rows)), Some(stats) => Ok(Some(stats.num_unindexed_rows)),
None => Ok(None), None => Ok(None),
} }
} }
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> { pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> {
#[allow(deprecated)]
match self.load_index_stats(index_uuid).await? { match self.load_index_stats(index_uuid).await? {
Some(stats) => Ok(Some(stats.index_type.unwrap_or_default())), Some(stats) => Ok(Some(stats.index_type.unwrap_or_default())),
None => Ok(None), None => Ok(None),
} }
} }
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn get_distance_type(&self, index_uuid: &str) -> Result<Option<String>> { pub async fn get_distance_type(&self, index_uuid: &str) -> Result<Option<String>> {
#[allow(deprecated)]
match self.load_index_stats(index_uuid).await? { match self.load_index_stats(index_uuid).await? {
Some(stats) => Ok(Some( Some(stats) => Ok(Some(
stats stats
@@ -1248,8 +1240,16 @@ impl NativeTable {
} }
} }
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")] pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
pub async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> { let dataset = self.dataset.get().await?;
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
Ok(indices
.iter()
.map(|i| VectorIndex::new_from_format(&mf, i))
.collect())
}
async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
let index = self let index = self
.load_indices() .load_indices()
.await? .await?
@@ -1268,35 +1268,6 @@ impl NativeTable {
Ok(Some(index_stats)) Ok(Some(index_stats))
} }
/// Get statistics about an index.
///
/// Asks the underlying dataset for the statistics of `index_name` and
/// deserializes the returned JSON string into [`IndexStatistics`].
///
/// # Returns
/// * `Ok(Some(stats))` when the statistics were fetched and parsed.
/// * `Ok(None)` when the `index_statistics` call fails. That error is
///   discarded by `.ok()`, so a failed lookup (presumably including an
///   unknown index name — confirm against lance) is reported as `None`,
///   not as an `Err`.
///
/// # Errors
/// * Propagates any error from `self.dataset.get()`.
/// * Returns `Error::InvalidInput` when the statistics string cannot be
///   deserialized.
pub async fn index_stats<S: AsRef<str>>(
&self,
index_name: S,
) -> Result<Option<IndexStatistics>> {
self.dataset
.get()
.await?
.index_statistics(index_name.as_ref())
.await
// Deliberately lossy: every lookup failure is collapsed into `None` here.
.ok()
.map(|stats| {
serde_json::from_str(&stats).map_err(|e| Error::InvalidInput {
message: format!("error deserializing index statistics: {}", e),
})
})
// Option<Result<_>> -> Result<Option<_>> so a parse failure surfaces as `Err`.
.transpose()
}
/// Load the indices recorded on this table's dataset.
///
/// The index list and the latest manifest are awaited together through
/// `futures::try_join!`; each index entry is then converted with
/// `VectorIndex::new_from_format` using that manifest.
///
/// # Errors
/// Propagates a failure to obtain the dataset, or the first error from
/// either of the two joined loads.
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
    let ds = self.dataset.get().await?;
    let (index_list, manifest) = futures::try_join!(ds.load_indices(), ds.latest_manifest())?;

    let mut loaded = Vec::new();
    for entry in index_list.iter() {
        loaded.push(VectorIndex::new_from_format(&manifest, entry));
    }
    Ok(loaded)
}
async fn create_ivf_pq_index( async fn create_ivf_pq_index(
&self, &self,
index: IvfPqIndexBuilder, index: IvfPqIndexBuilder,
@@ -1889,20 +1860,12 @@ impl TableInternal for NativeTable {
} }
columns.push(field.name.clone()); columns.push(field.name.clone());
} }
let index_type = if is_vector { Ok(IndexConfig { index_type: if is_vector { crate::index::IndexType::IvfPq } else { crate::index::IndexType::BTree }, columns })
crate::index::IndexType::IvfPq
} else {
crate::index::IndexType::BTree
};
let name = idx.name.clone();
Ok(IndexConfig { index_type, columns, name })
}).collect::<Result<Vec<_>>>() }).collect::<Result<Vec<_>>>()
} }
} }
#[cfg(test)] #[cfg(test)]
#[allow(deprecated)]
mod tests { mod tests {
use std::iter; use std::iter;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, Ordering};