Compare commits

...

23 Commits

Author SHA1 Message Date
Lance Release
204a075be9 Bump version: 0.9.0-beta.2 → 0.9.0-beta.3 2024-12-18 16:25:09 +00:00
Ryan Green
d6d7ad3b06 bump version 2024-12-18 10:21:04 -06:00
Ryan Green
e58d64c286 Remove unsupported Retry params 2024-12-18 10:08:38 -06:00
Ryan Green
76cbd18c46 bump version 2024-12-18 09:38:36 -06:00
Ryan Green
4abb38ac70 bump version 2024-12-18 09:37:58 -06:00
Ryan Green
cc7bc5011d Merge remote-tracking branch 'origin/python-v0.9.0-patch' into python-v0.9.0-patch
# Conflicts:
#	python/pyproject.toml
2024-12-18 08:59:35 -06:00
Ryan Green
8193183304 override urllib3 version 2024-12-18 08:59:24 -06:00
Ryan Green
cf28b58b7d override urllib3 version 2024-12-18 08:58:41 -06:00
Lance Release
e3b7ee47b9 Bump version: 0.9.0 → 0.9.0-final.1 2024-12-13 01:16:24 +00:00
Lu Qiu
97c9c906e4 Fix version test 2024-12-12 17:10:07 -08:00
Lu Qiu
358f86b9c6 fix 2024-12-12 16:44:24 -08:00
Lu Qiu
5489e215a3 Support storage options and folder prefix 2024-12-12 16:17:34 -08:00
Lance Release
bc0814767b Bump version: 0.9.0-beta.0 → 0.9.0 2024-06-25 00:25:27 +00:00
Lance Release
8960a8e535 Bump version: 0.8.2 → 0.9.0-beta.0 2024-06-25 00:25:27 +00:00
Weston Pace
a8568ddc72 feat: upgrade to lance 0.13.0 (#1404) 2024-06-24 17:22:57 -07:00
Cory Grinstead
55f88346d0 feat(nodejs): table.indexStats (#1361)
closes https://github.com/lancedb/lancedb/issues/1359
2024-06-21 17:06:52 -05:00
Will Jones
dfb9a28795 ci(node): add description and keywords for lancedb package (#1398) 2024-06-21 14:43:35 -07:00
Cory Grinstead
a797f5fe59 feat(nodejs): feature parity [5/N] - add query.filter() alias (#1391)
to make the transition from `vectordb` to `@lancedb/lancedb` as seamless
as possible, this adds `query.filter` with a deprecated tag.


depends on https://github.com/lancedb/lancedb/pull/1390
see actual diff here
https://github.com/universalmind303/lancedb/compare/list-indices-name...universalmind303:query-filter
2024-06-21 16:03:58 -05:00
Cory Grinstead
3cd84c9375 feat(nodejs): feature parity [4/N] - add 'name' to 'IndexConfig' for 'listIndices' (#1390)
depends on https://github.com/lancedb/lancedb/pull/1386

see actual diff here
https://github.com/universalmind303/lancedb/compare/create-table-args...universalmind303:list-indices-name
2024-06-21 15:45:02 -05:00
Cory Grinstead
5ca83fdc99 fix(node): node build (#1396)
i have no idea why this fixes the build.
2024-06-21 15:42:22 -05:00
Cory Grinstead
33cc9b682f feat(nodejs): feature parity [3/N] - createTable({name, data, ...options}) (#1386)
adds support for the `vectordb` syntax of `createTable({name, data,
...options})`.


depends on https://github.com/lancedb/lancedb/pull/1380
see actual diff here
https://github.com/universalmind303/lancedb/compare/table-name...universalmind303:create-table-args
2024-06-21 12:17:39 -05:00
Cory Grinstead
b3e5ac6d2a feat(nodejs): feature parity [2/N] - add table.name and lancedb.connect({args}) (#1380)
depends on https://github.com/lancedb/lancedb/pull/1378

see proper diff here
https://github.com/universalmind303/lancedb/compare/remote-table-node...universalmind303:lancedb:table-name
2024-06-21 11:38:26 -05:00
josca42
0fe844034d feat: enable stemming (#1356)
Added the ability to specify tokenizer_name, when creating a full text
search index using tantivy. This enables the use of language specific
stemming.

Also updated the [guide on full text
search](https://lancedb.github.io/lancedb/fts/) with a short section on
choosing tokenizer.

Fixes #1315
2024-06-20 14:23:55 -07:00
30 changed files with 351 additions and 86 deletions

View File

@@ -20,13 +20,11 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
[workspace.dependencies]
lance = { "version" = "=0.12.2", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-index = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-linalg = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-testing = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-datafusion = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.13.0" }
lance-linalg = { "version" = "=0.13.0" }
lance-testing = { "version" = "=0.13.0" }
lance-datafusion = { "version" = "=0.13.0" }
# Note that this one does not include pyarrow
arrow = { version = "51.0", optional = false }
arrow-array = "51.0"

View File

@@ -54,6 +54,16 @@ This returns the result as a list of dictionaries as follows.
!!! note
LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
## Tokenization
By default, the text is tokenized by splitting on punctuation and whitespace, and then removing tokens that are longer than 40 characters. For more language-specific tokenization, provide the argument `tokenizer_name` with the two-letter language code followed by "_stem". For English, this would be "en_stem".
```python
table.create_fts_index("text", tokenizer_name="en_stem")
```
The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
## Index multiple columns
If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
@@ -139,6 +149,7 @@ is treated as a phrase query.
In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
double quotes replaced by single quotes.
## Configurations
By default, LanceDB configures a 1GB heap size limit for creating the index. You can

View File

@@ -6,7 +6,7 @@
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb-node index.node -- cargo build --message-format=json",
"build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb_node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release",
"test": "npm run tsc && mocha -recursive dist/test",
"integration-test": "npm run tsc && mocha -recursive dist/integration_test",

View File

@@ -57,6 +57,18 @@ describe("given a connection", () => {
expect(db.isOpen()).toBe(false);
await expect(db.tableNames()).rejects.toThrow("Connection is closed");
});
it("should be able to create a table from an object arg `createTable(options)`, or args `createTable(name, data, options)`", async () => {
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
await expect(tbl.countRows()).resolves.toBe(2);
tbl = await db.createTable({
name: "test",
data: [{ id: 3 }],
mode: "overwrite",
});
await expect(tbl.countRows()).resolves.toBe(1);
});
it("should fail if creating table twice, unless overwrite is true", async () => {
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);

View File

@@ -230,7 +230,7 @@ describe("embedding functions", () => {
},
);
test.only.each([new Float16(), new Float32(), new Float64()])(
test.each([new Float16(), new Float32(), new Float64()])(
"should be able to provide auto embeddings with multiple float datatypes",
async (floatType) => {
@register("test1")

View File

@@ -305,6 +305,7 @@ describe("When creating an index", () => {
const indices = await tbl.listIndices();
expect(indices.length).toBe(1);
expect(indices[0]).toEqual({
name: "vec_idx",
indexType: "IvfPq",
columns: ["vec"],
});
@@ -361,6 +362,24 @@ describe("When creating an index", () => {
for await (const r of tbl.query().where("id > 1").select(["id"])) {
expect(r.numRows).toBe(298);
}
// should also work with 'filter' alias
for await (const r of tbl.query().filter("id > 1").select(["id"])) {
expect(r.numRows).toBe(298);
}
});
test("should be able to get index stats", async () => {
await tbl.createIndex("id");
const stats = await tbl.indexStats("id_idx");
expect(stats).toBeDefined();
expect(stats?.numIndexedRows).toEqual(300);
expect(stats?.numUnindexedRows).toEqual(0);
});
test("when getting stats on non-existent index", async () => {
const stats = await tbl.indexStats("some non-existent index");
expect(stats).toBeUndefined();
});
// TODO: Move this test to the query API test (making sure we can reject queries

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import { Table as ArrowTable, Schema } from "./arrow";
import { Table as ArrowTable, Data, Schema } from "./arrow";
import { fromTableToBuffer, makeEmptyTable } from "./arrow";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { Connection as LanceDbConnection } from "./native";
@@ -151,6 +151,19 @@ export abstract class Connection {
options?: Partial<OpenTableOptions>,
): Promise<Table>;
/**
* Creates a new Table and initialize it with new data.
* @param {object} options - The options object.
* @param {string} options.name - The name of the table.
* @param {Data} options.data - Non-empty Array of Records to be inserted into the table
*
*/
abstract createTable(
options: {
name: string;
data: Data;
} & Partial<CreateTableOptions>,
): Promise<Table>;
/**
* Creates a new Table and initialize it with new data.
* @param {string} name - The name of the table.
@@ -219,13 +232,22 @@ export class LocalConnection extends Connection {
}
async createTable(
name: string,
data: Record<string, unknown>[] | ArrowTable,
nameOrOptions:
| string
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
data?: Record<string, unknown>[] | ArrowTable,
options?: Partial<CreateTableOptions>,
): Promise<Table> {
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
const { name, data, ...options } = nameOrOptions;
return this.createTable(name, data, options);
}
if (data === undefined) {
throw new Error("data is required");
}
const { buf, mode } = await Table.parseTableData(data, options);
const innerTable = await this.inner.createTable(
name,
nameOrOptions,
buf,
mode,
cleanseStorageOptions(options?.storageOptions),

View File

@@ -31,6 +31,9 @@ export {
AddColumnsSql,
ColumnAlteration,
ConnectionOptions,
IndexStatistics,
IndexMetadata,
IndexConfig,
} from "./native.js";
export {
@@ -56,12 +59,7 @@ export {
export { Index, IndexOptions, IvfPqOptions } from "./indices";
export {
Table,
AddDataOptions,
IndexConfig,
UpdateOptions,
} from "./table";
export { Table, AddDataOptions, UpdateOptions } from "./table";
export * as embedding from "./embedding";
@@ -76,15 +74,61 @@ export * as embedding from "./embedding";
* @param {string} uri - The uri of the database. If the database uri starts
* with `db://` then it connects to a remote database.
* @see {@link ConnectionOptions} for more details on the URI format.
* @example
* ```ts
* const conn = await connect("/path/to/database");
* ```
* @example
* ```ts
* const conn = await connect(
* "s3://bucket/path/to/database",
* {storageOptions: {timeout: "60s"}
* });
* ```
*/
export async function connect(
uri: string,
opts?: Partial<ConnectionOptions | RemoteConnectionOptions>,
): Promise<Connection>;
/**
* Connect to a LanceDB instance at the given URI.
*
* Accepted formats:
*
* - `/path/to/database` - local database
* - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
* - `db://host:port` - remote database (LanceDB cloud)
* @param options - The options to use when connecting to the database
* @see {@link ConnectionOptions} for more details on the URI format.
* @example
* ```ts
* const conn = await connect({
* uri: "/path/to/database",
* storageOptions: {timeout: "60s"}
* });
* ```
*/
export async function connect(
opts: Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string },
): Promise<Connection>;
export async function connect(
uriOrOptions:
| string
| (Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string }),
opts: Partial<ConnectionOptions | RemoteConnectionOptions> = {},
): Promise<Connection> {
let uri: string | undefined;
if (typeof uriOrOptions !== "string") {
const { uri: uri_, ...options } = uriOrOptions;
uri = uri_;
opts = options;
} else {
uri = uriOrOptions;
}
if (!uri) {
throw new Error("uri is required");
}
opts = opts ?? {};
if (uri?.startsWith("db://")) {
return new RemoteConnection(uri, opts as RemoteConnectionOptions);

View File

@@ -114,6 +114,14 @@ export class QueryBase<
this.inner.onlyIf(predicate);
return this as unknown as QueryType;
}
/**
* A filter statement to be applied to this query.
* @alias where
* @deprecated Use `where` instead
*/
filter(predicate: string): QueryType {
return this.where(predicate);
}
/**
* Return only the specified columns.

View File

@@ -106,10 +106,19 @@ export class RemoteConnection extends Connection {
}
async createTable(
tableName: string,
data: Data,
nameOrOptions:
| string
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
data?: Data,
options?: Partial<CreateTableOptions> | undefined,
): Promise<Table> {
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
const { name, data, ...options } = nameOrOptions;
return this.createTable(name, data, options);
}
if (data === undefined) {
throw new Error("data is required");
}
if (options?.mode) {
console.warn(
"option 'mode' is not supported in LanceDB Cloud",
@@ -132,7 +141,7 @@ export class RemoteConnection extends Connection {
);
await this.#client.post(
`/v1/table/${encodeURIComponent(tableName)}/create/`,
`/v1/table/${encodeURIComponent(nameOrOptions)}/create/`,
buf,
{
config: {
@@ -141,8 +150,8 @@ export class RemoteConnection extends Connection {
headers: { "Content-Type": "application/vnd.apache.arrow.stream" },
},
);
this.#tableCache.set(tableName, true);
return new RemoteTable(this.#client, tableName, this.#dbName);
this.#tableCache.set(nameOrOptions, true);
return new RemoteTable(this.#client, nameOrOptions, this.#dbName);
}
async createEmptyTable(

View File

@@ -16,6 +16,7 @@ import { Table as ArrowTable } from "apache-arrow";
import { Data, IntoVector } from "../arrow";
import { IndexStatistics } from "..";
import { CreateTableOptions } from "../connection";
import { IndexOptions } from "../indices";
import { MergeInsertBuilder } from "../merge";
@@ -34,6 +35,10 @@ export class RemoteTable extends Table {
return `/v1/table/${encodeURIComponent(this.#name)}/`;
}
get name(): string {
return this.#name;
}
public constructor(
client: RestfulLanceDBClient,
tableName: string,
@@ -161,4 +166,7 @@ export class RemoteTable extends Table {
mergeInsert(_on: string | string[]): MergeInsertBuilder {
throw new Error("mergeInsert() is not yet supported on the LanceDB cloud");
}
async indexStats(_name: string): Promise<IndexStatistics | undefined> {
throw new Error("indexStats() is not yet supported on the LanceDB cloud");
}
}

View File

@@ -33,11 +33,11 @@ import {
AddColumnsSql,
ColumnAlteration,
IndexConfig,
IndexStatistics,
OptimizeStats,
Table as _NativeTable,
} from "./native";
import { Query, VectorQuery } from "./query";
export { IndexConfig } from "./native";
/**
* Options for adding data to a table.
@@ -98,6 +98,8 @@ export abstract class Table {
[Symbol.for("nodejs.util.inspect.custom")](): string {
return this.display();
}
/** Returns the name of the table */
abstract get name(): string;
/** Return true if the table has not been closed */
abstract isOpen(): boolean;
@@ -158,6 +160,9 @@ export abstract class Table {
* Indices on vector columns will speed up vector searches.
* Indices on scalar columns will speed up filtering (in both
* vector and non-vector searches)
*
* @note We currently don't support custom named indexes,
* The index name will always be `${column}_idx`
* @example
* // If the column has a vector (fixed size list) data type then
* // an IvfPq vector index will be created.
@@ -368,6 +373,13 @@ export abstract class Table {
abstract mergeInsert(on: string | string[]): MergeInsertBuilder;
/** List all the stats of a specified index
*
* @param {string} name The name of the index.
* @returns {IndexStatistics | undefined} The stats of the index. If the index does not exist, it will return undefined
*/
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
static async parseTableData(
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
data: Record<string, unknown>[] | ArrowTable<any>,
@@ -412,7 +424,9 @@ export class LocalTable extends Table {
super();
this.inner = inner;
}
get name(): string {
return this.inner.name;
}
isOpen(): boolean {
return this.inner.isOpen();
}
@@ -565,6 +579,13 @@ export class LocalTable extends Table {
return await this.query().toArrow();
}
/**
 * Look up the statistics of the index called `name` on the native table.
 *
 * The native binding reports a missing result as `null`; this wrapper
 * surfaces that case as `undefined` instead.
 */
async indexStats(name: string): Promise<IndexStatistics | undefined> {
  const found = await this.inner.indexStats(name);
  return found === null ? undefined : found;
}
mergeInsert(on: string | string[]): MergeInsertBuilder {
on = Array.isArray(on) ? on : [on];
return new MergeInsertBuilder(this.inner.mergeInsert(on));

View File

@@ -18,10 +18,8 @@
"win32"
],
"dependencies": {
"@types/axios": "^0.14.0",
"apache-arrow": "^15.0.0",
"axios": "^1.7.2",
"memoize": "^10.0.0",
"openai": "^4.29.2",
"reflect-metadata": "^0.2.2"
},
@@ -31,6 +29,7 @@
"@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0",
"@napi-rs/cli": "^2.18.0",
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2",
"@types/tmp": "^0.2.6",
"apache-arrow-old": "npm:apache-arrow@13.0.0",
@@ -3131,6 +3130,7 @@
"resolved": "https://registry.npmjs.org/@types/axios/-/axios-0.14.0.tgz",
"integrity": "sha512-KqQnQbdYE54D7oa/UmYVMZKq7CO4l8DEENzOKc4aBRwxCXSlJXGz83flFx5L7AWrOQnmuN3kVsRdt+GZPPjiVQ==",
"deprecated": "This is a stub types definition for axios (https://github.com/mzabriskie/axios). axios provides its own type definitions, so you don't need @types/axios installed!",
"dev": true,
"dependencies": {
"axios": "*"
}
@@ -5942,20 +5942,6 @@
"is-buffer": "~1.1.6"
}
},
"node_modules/memoize": {
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/memoize/-/memoize-10.0.0.tgz",
"integrity": "sha512-H6cBLgsi6vMWOcCpvVCdFFnl3kerEXbrYh9q+lY6VXvQSmM6CkmV08VOwT+WE2tzIEqRPFfAq3fm4v/UIW6mSA==",
"dependencies": {
"mimic-function": "^5.0.0"
},
"engines": {
"node": ">=18"
},
"funding": {
"url": "https://github.com/sindresorhus/memoize?sponsor=1"
}
},
"node_modules/merge-stream": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
@@ -6003,17 +5989,6 @@
"node": ">= 0.6"
}
},
"node_modules/mimic-function": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz",
"integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==",
"engines": {
"node": ">=18"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimatch": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",

View File

@@ -1,5 +1,15 @@
{
"name": "@lancedb/lancedb",
"description": "LanceDB: A serverless, low-latency vector database for AI applications",
"keywords": [
"database",
"lance",
"lancedb",
"search",
"vector",
"vector database",
"ann"
],
"version": "0.5.2",
"main": "dist/index.js",
"exports": {
@@ -38,7 +48,8 @@
"typedoc": "^0.25.7",
"typedoc-plugin-markdown": "^3.17.1",
"typescript": "^5.3.3",
"typescript-eslint": "^7.1.0"
"typescript-eslint": "^7.1.0",
"@types/axios": "^0.14.0"
},
"ava": {
"timeout": "3m"
@@ -65,7 +76,6 @@
"version": "napi version"
},
"dependencies": {
"@types/axios": "^0.14.0",
"apache-arrow": "^15.0.0",
"axios": "^1.7.2",
"openai": "^4.29.2",

View File

@@ -56,12 +56,6 @@ impl Connection {
#[napi(factory)]
pub async fn new(uri: String, options: ConnectionOptions) -> napi::Result<Self> {
let mut builder = ConnectBuilder::new(&uri);
if let Some(api_key) = options.api_key {
builder = builder.api_key(&api_key);
}
if let Some(host_override) = options.host_override {
builder = builder.host_override(&host_override);
}
if let Some(interval) = options.read_consistency_interval {
builder =
builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));

View File

@@ -28,8 +28,6 @@ mod util;
#[napi(object)]
#[derive(Debug)]
pub struct ConnectionOptions {
pub api_key: Option<String>,
pub host_override: Option<String>,
/// (For LanceDB OSS only): The interval, in seconds, at which to check for
/// updates to the table from other processes. If None, then consistency is not
/// checked. For performance reasons, this is the default. For strong

View File

@@ -30,7 +30,7 @@ use crate::query::{Query, VectorQuery};
pub struct Table {
// We keep a duplicate of the table name so we can use it for error
// messages even if the table has been closed
name: String,
pub name: String,
pub(crate) inner: Option<LanceDbTable>,
}
@@ -330,6 +330,13 @@ impl Table {
.collect::<Vec<_>>())
}
#[napi]
pub async fn index_stats(&self, index_name: String) -> napi::Result<Option<IndexStatistics>> {
let tbl = self.inner_ref()?.as_native().unwrap();
let stats = tbl.index_stats(&index_name).await.default_error()?;
Ok(stats.map(IndexStatistics::from))
}
#[napi]
pub fn merge_insert(&self, on: Vec<String>) -> napi::Result<NativeMergeInsertBuilder> {
let on: Vec<_> = on.iter().map(String::as_str).collect();
@@ -340,11 +347,13 @@ impl Table {
#[napi(object)]
/// A description of an index currently configured on a column
pub struct IndexConfig {
/// The name of the index
pub name: String,
/// The type of the index
pub index_type: String,
/// The columns in the index
///
/// Currently this is always an array of size 1. In the future there may
/// Currently this is always an array of size 1. In the future there may
/// be more columns to represent composite indices.
pub columns: Vec<String>,
}
@@ -355,6 +364,7 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
Self {
index_type,
columns: value.columns,
name: value.name,
}
}
}
@@ -437,3 +447,40 @@ pub struct AddColumnsSql {
/// The expression can reference other columns in the table.
pub value_sql: String,
}
#[napi(object)]
/// Statistics describing an index, exposed as a plain object.
///
/// Row counts are carried as `f64` values.
pub struct IndexStatistics {
/// The number of rows indexed by the index
pub num_indexed_rows: f64,
/// The number of rows not indexed
pub num_unindexed_rows: f64,
/// The type of the index
pub index_type: Option<String>,
/// The metadata for each index
pub indices: Vec<IndexMetadata>,
}
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
fn from(value: lancedb::index::IndexStatistics) -> Self {
Self {
num_indexed_rows: value.num_indexed_rows as f64,
num_unindexed_rows: value.num_unindexed_rows as f64,
index_type: value.index_type.map(|t| format!("{:?}", t)),
indices: value.indices.into_iter().map(Into::into).collect(),
}
}
}
#[napi(object)]
/// Metadata for a single entry in `IndexStatistics::indices`.
pub struct IndexMetadata {
/// The metric type recorded for the index, if any
pub metric_type: Option<String>,
/// The index type recorded for the index, if any
pub index_type: Option<String>,
}
/// Converts the core crate's per-index metadata into the object exposed
/// through the Node bindings; both fields are carried over unchanged.
impl From<lancedb::index::IndexMetadata> for IndexMetadata {
    fn from(value: lancedb::index::IndexMetadata) -> Self {
        let lancedb::index::IndexMetadata {
            metric_type,
            index_type,
            ..
        } = value;
        Self {
            metric_type,
            index_type,
        }
    }
}

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.8.2"
current_version = "0.9.0-beta.3"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.8.2"
version = "0.9.0-beta.3"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.12.2-beta.2",
"pylance==0.13.0",
"ratelimiter~=1.0",
"requests>=2.31.0",
"retry>=0.9.2",
@@ -13,6 +13,7 @@ dependencies = [
"packaging",
"cachetools",
"overrides>=0.7",
"urllib3==1.26.19"
]
description = "lancedb"
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]

View File

@@ -35,6 +35,7 @@ def connect(
host_override: Optional[str] = None,
read_consistency_interval: Optional[timedelta] = None,
request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
storage_options: Optional[Dict[str, str]] = None,
**kwargs,
) -> DBConnection:
"""Connect to a LanceDB database.
@@ -70,6 +71,9 @@ def connect(
executor will be used for making requests. This is for LanceDB Cloud
only and is only used when making batch requests (i.e., passing in
multiple queries to the search method at once).
storage_options: dict, optional
Additional options for the storage backend. See available options at
https://lancedb.github.io/lancedb/guides/storage/
Examples
--------
@@ -105,12 +109,16 @@ def connect(
region,
host_override,
request_thread_pool=request_thread_pool,
storage_options=storage_options,
**kwargs,
)
if kwargs:
raise ValueError(f"Unknown keyword arguments: {kwargs}")
return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)
return LanceDBConnection(
uri,
read_consistency_interval=read_consistency_interval,
)
async def connect_async(

View File

@@ -29,7 +29,10 @@ from .table import LanceTable
def create_index(
index_path: str, text_fields: List[str], ordering_fields: List[str] = None
index_path: str,
text_fields: List[str],
ordering_fields: List[str] = None,
tokenizer_name: str = "default",
) -> tantivy.Index:
"""
Create a new Index (not populated)
@@ -42,6 +45,8 @@ def create_index(
List of text fields to index
ordering_fields: List[str]
List of unsigned type fields to order by at search time
tokenizer_name : str, default "default"
The tokenizer to use
Returns
-------
@@ -56,7 +61,7 @@ def create_index(
schema_builder.add_integer_field("doc_id", stored=True)
# data fields
for name in text_fields:
schema_builder.add_text_field(name, stored=True)
schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
if ordering_fields:
for name in ordering_fields:
schema_builder.add_unsigned_field(name, fast=True)

View File

@@ -55,11 +55,13 @@ class RestfulLanceDBClient:
region: str
api_key: Credential
host_override: Optional[str] = attrs.field(default=None)
db_prefix: Optional[str] = attrs.field(default=None)
closed: bool = attrs.field(default=False, init=False)
connection_timeout: float = attrs.field(default=120.0, kw_only=True)
read_timeout: float = attrs.field(default=300.0, kw_only=True)
storage_options: Optional[Dict[str, str]] = attrs.field(default=None, kw_only=True)
@functools.cached_property
def session(self) -> requests.Session:
@@ -92,6 +94,18 @@ class RestfulLanceDBClient:
headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
if self.host_override:
headers["x-lancedb-database"] = self.db_name
if self.storage_options:
if self.storage_options.get("account_name") is not None:
headers["x-azure-storage-account-name"] = self.storage_options[
"account_name"
]
if self.storage_options.get("azure_storage_account_name") is not None:
headers["x-azure-storage-account-name"] = self.storage_options[
"azure_storage_account_name"
]
if self.db_prefix:
headers["x-lancedb-database-prefix"] = self.db_prefix
return headers
@staticmethod
@@ -245,7 +259,6 @@ def retry_adapter(options: Dict[str, Any]) -> HTTPAdapter:
connect=connect_retries,
read=read_retries,
backoff_factor=backoff_factor,
backoff_jitter=backoff_jitter,
status_forcelist=statuses,
allowed_methods=methods,
)

View File

@@ -15,7 +15,7 @@ import inspect
import logging
import uuid
from concurrent.futures import ThreadPoolExecutor
from typing import Iterable, List, Optional, Union
from typing import Dict, Iterable, List, Optional, Union
from urllib.parse import urlparse
from cachetools import TTLCache
@@ -44,20 +44,25 @@ class RemoteDBConnection(DBConnection):
request_thread_pool: Optional[ThreadPoolExecutor] = None,
connection_timeout: float = 120.0,
read_timeout: float = 300.0,
storage_options: Optional[Dict[str, str]] = None,
):
"""Connect to a remote LanceDB database."""
parsed = urlparse(db_url)
if parsed.scheme != "db":
raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
self.db_name = parsed.netloc
prefix = parsed.path.lstrip("/")
self.db_prefix = None if not prefix else prefix
self.api_key = api_key
self._client = RestfulLanceDBClient(
self.db_name,
region,
api_key,
host_override,
self.db_prefix,
connection_timeout=connection_timeout,
read_timeout=read_timeout,
storage_options=storage_options,
)
self._request_thread_pool = request_thread_pool
self._table_cache = TTLCache(maxsize=10000, ttl=300)

View File

@@ -1171,6 +1171,7 @@ class LanceTable(Table):
*,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
):
"""Create a full-text search index on the table.
@@ -1189,6 +1190,10 @@ class LanceTable(Table):
ordering_field_names:
A list of unsigned type fields to index to optionally order
results on at search time
tokenizer_name: str, default "default"
The tokenizer to use for the index. Can be "raw", "default", or the two-letter
language code followed by "_stem". For English, this would be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
"""
from .fts import create_index, populate_index
@@ -1214,6 +1219,7 @@ class LanceTable(Table):
self._get_fts_index_path(),
field_names,
ordering_fields=ordering_field_names,
tokenizer_name=tokenizer_name,
)
populate_index(
index,

View File

@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
assert os.path.exists(str(tmp_path / "index"))
def test_create_index_with_stemming(tmp_path, table):
index = ldb.fts.create_index(
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
)
assert isinstance(index, tantivy.Index)
assert os.path.exists(str(tmp_path / "index"))
# Check stemming by running tokenizer on non empty table
table.create_fts_index("text", tokenizer_name="en_stem")
def test_populate_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)

View File

@@ -735,7 +735,7 @@ def test_create_scalar_index(db):
indices = table.to_lance().list_indices()
assert len(indices) == 1
scalar_index = indices[0]
assert scalar_index["type"] == "Scalar"
assert scalar_index["type"] == "BTree"
# Confirm that prefiltering still works with the scalar index column
results = table.search().where("x = 'c'").to_arrow()

View File

@@ -463,6 +463,7 @@ impl JsTable {
Ok(promise)
}
#[allow(deprecated)]
pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;

View File

@@ -80,6 +80,8 @@ pub enum IndexType {
/// A description of an index currently configured on a column
pub struct IndexConfig {
/// The name of the index
pub name: String,
/// The type of the index
pub index_type: IndexType,
/// The columns in the index

View File

@@ -1206,28 +1206,36 @@ impl NativeTable {
.await)
}
/// Number of rows covered by the index identified by `index_uuid`.
///
/// Yields `None` when `load_index_stats` finds no statistics for that index.
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
    // Delegates to another deprecated method of this type.
    #[allow(deprecated)]
    let maybe_stats = self.load_index_stats(index_uuid).await?;
    Ok(maybe_stats.map(|stats| stats.num_indexed_rows))
}
/// Number of rows not yet covered by the index identified by `index_uuid`.
///
/// Yields `None` when `load_index_stats` finds no statistics for that index.
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
    // Delegates to another deprecated method of this type.
    #[allow(deprecated)]
    let maybe_stats = self.load_index_stats(index_uuid).await?;
    Ok(maybe_stats.map(|stats| stats.num_unindexed_rows))
}
/// Type of the index identified by `index_uuid`, as a string.
///
/// Yields `None` when `load_index_stats` finds no statistics for that index;
/// when statistics exist but carry no index type, the string's default
/// (empty) value is returned.
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> {
    // Delegates to another deprecated method of this type.
    #[allow(deprecated)]
    let maybe_stats = self.load_index_stats(index_uuid).await?;
    Ok(maybe_stats.map(|stats| stats.index_type.unwrap_or_default()))
}
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn get_distance_type(&self, index_uuid: &str) -> Result<Option<String>> {
#[allow(deprecated)]
match self.load_index_stats(index_uuid).await? {
Some(stats) => Ok(Some(
stats
@@ -1240,16 +1248,8 @@ impl NativeTable {
}
}
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
let dataset = self.dataset.get().await?;
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
Ok(indices
.iter()
.map(|i| VectorIndex::new_from_format(&mf, i))
.collect())
}
async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
pub async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
let index = self
.load_indices()
.await?
@@ -1268,6 +1268,35 @@ impl NativeTable {
Ok(Some(index_stats))
}
/// Get statistics about the index named `index_name`.
///
/// Returns `Ok(None)` when the dataset's `index_statistics` lookup fails for
/// any reason (presumably most often because no index with that name
/// exists), since the lookup error is discarded rather than propagated.
///
/// # Errors
///
/// Returns an error if the dataset itself cannot be obtained, or
/// `Error::InvalidInput` if the statistics string returned by the dataset
/// cannot be deserialized as JSON into `IndexStatistics`.
pub async fn index_stats<S: AsRef<str>>(
&self,
index_name: S,
) -> Result<Option<IndexStatistics>> {
self.dataset
.get()
.await?
.index_statistics(index_name.as_ref())
.await
// `.ok()` drops the lookup error, so every failure here becomes `None`.
.ok()
.map(|stats| {
serde_json::from_str(&stats).map_err(|e| Error::InvalidInput {
message: format!("error deserializing index statistics: {}", e),
})
})
// Turn `Option<Result<_>>` into `Result<Option<_>>` so only a
// deserialization failure surfaces as an error.
.transpose()
}
/// Load every index of the dataset and wrap each one as a `VectorIndex`.
///
/// The index list and the latest manifest are fetched concurrently; an
/// error from either fetch (or from obtaining the dataset) is returned.
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
    let ds = self.dataset.get().await?;
    let (index_metas, manifest) =
        futures::try_join!(ds.load_indices(), ds.latest_manifest())?;

    let mut loaded = Vec::with_capacity(index_metas.len());
    for meta in index_metas.iter() {
        loaded.push(VectorIndex::new_from_format(&manifest, meta));
    }
    Ok(loaded)
}
async fn create_ivf_pq_index(
&self,
index: IvfPqIndexBuilder,
@@ -1860,12 +1889,20 @@ impl TableInternal for NativeTable {
}
columns.push(field.name.clone());
}
Ok(IndexConfig { index_type: if is_vector { crate::index::IndexType::IvfPq } else { crate::index::IndexType::BTree }, columns })
let index_type = if is_vector {
crate::index::IndexType::IvfPq
} else {
crate::index::IndexType::BTree
};
let name = idx.name.clone();
Ok(IndexConfig { index_type, columns, name })
}).collect::<Result<Vec<_>>>()
}
}
#[cfg(test)]
#[allow(deprecated)]
mod tests {
use std::iter;
use std::sync::atomic::{AtomicBool, Ordering};