Mirror of https://github.com/lancedb/lancedb.git (synced 2026-01-09 05:12:58 +00:00)

Compare commits: rmeng/0124 ... lance-13.1 (18 commits)
Commits (18): dcfa17c9fc, 79a1667753, a866b78a31, c7d37b3e6e, 4b71552b73, 5ce5f64da3, c582b0fc63, bc0814767b, 8960a8e535, a8568ddc72, 55f88346d0, dfb9a28795, a797f5fe59, 3cd84c9375, 5ca83fdc99, 33cc9b682f, b3e5ac6d2a, 0fe844034d
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.5.2"
+current_version = "0.6.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
Cargo.toml (19 changed lines)

@@ -20,13 +20,18 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]
 
 [workspace.dependencies]
-lance = { "version" = "=0.12.4", "features" = [
-    "dynamodb",
-]}
-lance-index = { "version" = "=0.12.4" }
-lance-linalg = { "version" = "=0.12.4" }
-lance-testing = { "version" = "=0.12.4" }
-lance-datafusion = { "version" = "=0.12.4" }
+# lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
+# lance-index = { "version" = "=0.13.0" }
+# lance-linalg = { "version" = "=0.13.0" }
+# lance-testing = { "version" = "=0.13.0" }
+# lance-datafusion = { "version" = "=0.13.0" }
+
+lance = { path = "../lance/rust/lance" }
+lance-index = { path = "../lance/rust/lance-index" }
+lance-linalg = { path = "../lance/rust/lance-linalg" }
+lance-testing = { path = "../lance/rust/lance-testing" }
+lance-datafusion = { path = "../lance/rust/lance-datafusion" }
 
 # Note that this one does not include pyarrow
 arrow = { version = "51.0", optional = false }
 arrow-array = "51.0"
@@ -54,6 +54,16 @@ This returns the result as a list of dictionaries as follows.
 !!! note
     LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
 
+## Tokenization
+
+By default, the text is tokenized by splitting on punctuation and whitespace, and tokens longer than 40 characters are removed. For language-specific tokenization, pass the `tokenizer_name` argument with the two-letter language code followed by `"_stem"`; for English this is `"en_stem"`:
+
+```python
+table.create_fts_index("text", tokenizer_name="en_stem")
+```
+
+The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
+
 ## Index multiple columns
 
 If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:

@@ -139,6 +149,7 @@ is treated as a phrase query.
 In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
 double quotes replaced by single quotes.
 
+
 ## Configurations
 
 By default, LanceDB configures a 1GB heap size limit for creating the index. You can
@@ -116,21 +116,21 @@ This guide will show how to create tables, insert data into them, and update the
 
 ### From a Polars DataFrame
 
 LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
 written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
 under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
 is on the way.
 
 ```python
 import polars as pl
 
 data = pl.DataFrame({
     "vector": [[3.1, 4.1], [5.9, 26.5]],
     "item": ["foo", "bar"],
     "price": [10.0, 20.0]
 })
 table = db.create_table("pl_table", data=data)
 ```
 
 ### From an Arrow Table
 === "Python"
node/package-lock.json (generated, 4 changed lines)

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.5.2",
+      "version": "0.6.0",
       "cpu": [
         "x64",
         "arm64"
@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
   "scripts": {
     "tsc": "tsc -b",
-    "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb-node index.node -- cargo build --message-format=json",
+    "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb_node index.node -- cargo build --message-format=json",
     "build-release": "npm run build -- --release",
     "test": "npm run tsc && mocha -recursive dist/test",
     "integration-test": "npm run tsc && mocha -recursive dist/integration_test",
@@ -57,6 +57,18 @@ describe("given a connection", () => {
     expect(db.isOpen()).toBe(false);
     await expect(db.tableNames()).rejects.toThrow("Connection is closed");
   });
 
+  it("should be able to create a table from an object arg `createTable(options)`, or args `createTable(name, data, options)`", async () => {
+    let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
+    await expect(tbl.countRows()).resolves.toBe(2);
+
+    tbl = await db.createTable({
+      name: "test",
+      data: [{ id: 3 }],
+      mode: "overwrite",
+    });
+
+    await expect(tbl.countRows()).resolves.toBe(1);
+  });
+
   it("should fail if creating table twice, unless overwrite is true", async () => {
     let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
@@ -230,7 +230,7 @@ describe("embedding functions", () => {
     },
   );
 
-  test.only.each([new Float16(), new Float32(), new Float64()])(
+  test.each([new Float16(), new Float32(), new Float64()])(
     "should be able to provide auto embeddings with multiple float datatypes",
     async (floatType) => {
       @register("test1")
@@ -39,7 +39,9 @@ describe.each([arrow, arrowOld])("Given a table", (arrow: any) => {
   let tmpDir: tmp.DirResult;
   let table: Table;
 
-  const schema = new arrow.Schema([
+  const schema:
+    | import("apache-arrow").Schema
+    | import("apache-arrow-old").Schema = new arrow.Schema([
     new arrow.Field("id", new arrow.Float64(), true),
   ]);
 
@@ -305,6 +307,7 @@ describe("When creating an index", () => {
   const indices = await tbl.listIndices();
   expect(indices.length).toBe(1);
   expect(indices[0]).toEqual({
+    name: "vec_idx",
     indexType: "IvfPq",
     columns: ["vec"],
   });
@@ -314,7 +317,7 @@ describe("When creating an index", () => {
     .query()
     .limit(2)
     .nearestTo(queryVec)
-    .distanceType("DoT")
+    .distanceType("dot")
     .toArrow();
   expect(rst.numRows).toBe(2);
 
@@ -361,6 +364,24 @@ describe("When creating an index", () => {
     for await (const r of tbl.query().where("id > 1").select(["id"])) {
       expect(r.numRows).toBe(298);
     }
+    // should also work with 'filter' alias
+    for await (const r of tbl.query().filter("id > 1").select(["id"])) {
+      expect(r.numRows).toBe(298);
+    }
+  });
+
+  test("should be able to get index stats", async () => {
+    await tbl.createIndex("id");
+
+    const stats = await tbl.indexStats("id_idx");
+    expect(stats).toBeDefined();
+    expect(stats?.numIndexedRows).toEqual(300);
+    expect(stats?.numUnindexedRows).toEqual(0);
+  });
+
+  test("when getting stats on non-existent index", async () => {
+    const stats = await tbl.indexStats("some non-existent index");
+    expect(stats).toBeUndefined();
   });
 
 // TODO: Move this test to the query API test (making sure we can reject queries
@@ -15,6 +15,7 @@
 import {
   Table as ArrowTable,
   Binary,
+  BufferType,
   DataType,
   Field,
   FixedSizeBinary,
@@ -37,14 +38,68 @@ import {
   type makeTable,
   vectorFromArray,
 } from "apache-arrow";
+import { Buffers } from "apache-arrow/data";
 import { type EmbeddingFunction } from "./embedding/embedding_function";
 import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
-import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
+import {
+  sanitizeField,
+  sanitizeSchema,
+  sanitizeTable,
+  sanitizeType,
+} from "./sanitize";
 export * from "apache-arrow";
 
+export type SchemaLike =
+  | Schema
+  | {
+      fields: FieldLike[];
+      metadata: Map<string, string>;
+      get names(): unknown[];
+    };
+export type FieldLike =
+  | Field
+  | {
+      type: string;
+      name: string;
+      nullable?: boolean;
+      metadata?: Map<string, string>;
+    };
+
+export type DataLike =
+  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+  | import("apache-arrow").Data<Struct<any>>
+  | {
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      type: any;
+      length: number;
+      offset: number;
+      stride: number;
+      nullable: boolean;
+      children: DataLike[];
+      get nullCount(): number;
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      values: Buffers<any>[BufferType.DATA];
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      typeIds: Buffers<any>[BufferType.TYPE];
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      nullBitmap: Buffers<any>[BufferType.VALIDITY];
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      valueOffsets: Buffers<any>[BufferType.OFFSET];
+    };
+
+export type RecordBatchLike =
+  | RecordBatch
+  | {
+      schema: SchemaLike;
+      data: DataLike;
+    };
+
+export type TableLike =
+  | ArrowTable
+  | { schema: SchemaLike; batches: RecordBatchLike[] };
+
 export type IntoVector = Float32Array | Float64Array | number[];
 
-export function isArrowTable(value: object): value is ArrowTable {
+export function isArrowTable(value: object): value is TableLike {
   if (value instanceof ArrowTable) return true;
   return "schema" in value && "batches" in value;
 }
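The new `SchemaLike`/`TableLike` unions above are structural ("duck-typed"), so Arrow objects built with the caller's own copy of `apache-arrow` can cross into the SDK even when an `instanceof` check against the SDK's bundled copy would fail. A minimal sketch of what that enables; the path and table name are illustrative, not from the diff:

```ts
import { tableFromArrays } from "apache-arrow"; // the caller's own copy
import * as lancedb from "@lancedb/lancedb";

async function main() {
  // Built with the caller's apache-arrow; its class identity may differ from
  // the SDK's bundled apache-arrow, but it is structurally TableLike.
  const data = tableFromArrays({ id: Int32Array.from([1, 2, 3]) });
  const db = await lancedb.connect("/tmp/lancedb-demo");
  // The SDK sanitizes it (see sanitizeTable below), rebuilding the table with
  // its own Arrow classes before serializing.
  const tbl = await db.createTable("items", data);
  console.log(await tbl.countRows()); // 3
}
main();
```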
@@ -135,7 +190,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
 }
 
 /** Data type accepted by NodeJS SDK */
-export type Data = Record<string, unknown>[] | ArrowTable;
+export type Data = Record<string, unknown>[] | TableLike;
 
 /*
  * Options to control how a column should be converted to a vector array
@@ -162,7 +217,7 @@ export class MakeArrowTableOptions {
    * The schema must be specified if there are no records (e.g. to make
    * an empty table)
    */
-  schema?: Schema;
+  schema?: SchemaLike;
 
   /*
    * Mapping from vector column name to expected type
@@ -310,7 +365,7 @@ export function makeArrowTable(
   if (opt.schema !== undefined && opt.schema !== null) {
     opt.schema = sanitizeSchema(opt.schema);
     opt.schema = validateSchemaEmbeddings(
-      opt.schema,
+      opt.schema as Schema,
       data,
       options?.embeddingFunction,
     );
@@ -394,7 +449,7 @@ export function makeArrowTable(
   // `new ArrowTable(schema, batches)` which does not do any schema inference
   const firstTable = new ArrowTable(columns);
   const batchesFixed = firstTable.batches.map(
-    (batch) => new RecordBatch(opt.schema!, batch.data),
+    (batch) => new RecordBatch(opt.schema as Schema, batch.data),
   );
   let schema: Schema;
   if (metadata !== undefined) {
@@ -407,9 +462,9 @@ export function makeArrowTable(
       }
     }
 
-    schema = new Schema(opt.schema.fields, schemaMetadata);
+    schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
   } else {
-    schema = opt.schema;
+    schema = opt.schema as Schema;
   }
   return new ArrowTable(schema, batchesFixed);
 }
@@ -425,7 +480,7 @@ export function makeArrowTable(
  * Create an empty Arrow table with the provided schema
  */
 export function makeEmptyTable(
-  schema: Schema,
+  schema: SchemaLike,
   metadata?: Map<string, string>,
 ): ArrowTable {
   return makeArrowTable([], { schema }, metadata);
@@ -563,17 +618,16 @@ async function applyEmbeddingsFromMetadata(
 async function applyEmbeddings<T>(
   table: ArrowTable,
   embeddings?: EmbeddingFunctionConfig,
-  schema?: Schema,
+  schema?: SchemaLike,
 ): Promise<ArrowTable> {
-  if (schema?.metadata.has("embedding_functions")) {
-    return applyEmbeddingsFromMetadata(table, schema!);
-  } else if (embeddings == null || embeddings === undefined) {
-    return table;
-  }
-
   if (schema !== undefined && schema !== null) {
     schema = sanitizeSchema(schema);
   }
+  if (schema?.metadata.has("embedding_functions")) {
+    return applyEmbeddingsFromMetadata(table, schema! as Schema);
+  } else if (embeddings == null || embeddings === undefined) {
+    return table;
+  }
 
   // Convert from ArrowTable to Record<String, Vector>
   const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
@@ -650,7 +704,7 @@ async function applyEmbeddings<T>(
         `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
       );
     }
-    return alignTable(newTable, schema);
+    return alignTable(newTable, schema as Schema);
   }
   return newTable;
 }
@@ -744,7 +798,7 @@ export async function fromRecordsToStreamBuffer(
 export async function fromTableToBuffer(
   table: ArrowTable,
   embeddings?: EmbeddingFunctionConfig,
-  schema?: Schema,
+  schema?: SchemaLike,
 ): Promise<Buffer> {
   if (schema !== undefined && schema !== null) {
     schema = sanitizeSchema(schema);
@@ -771,7 +825,7 @@ export async function fromDataToBuffer(
     schema = sanitizeSchema(schema);
   }
   if (isArrowTable(data)) {
-    return fromTableToBuffer(data, embeddings, schema);
+    return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
   } else {
     const table = await convertToTable(data, embeddings, { schema });
     return fromTableToBuffer(table);
@@ -789,7 +843,7 @@ export async function fromDataToBuffer(
 export async function fromTableToStreamBuffer(
   table: ArrowTable,
   embeddings?: EmbeddingFunctionConfig,
-  schema?: Schema,
+  schema?: SchemaLike,
 ): Promise<Buffer> {
   const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
   const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
@@ -854,7 +908,6 @@ function validateSchemaEmbeddings(
   for (let field of schema.fields) {
     if (isFixedSizeList(field.type)) {
       field = sanitizeField(field);
-
       if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
         if (schema.metadata.has("embedding_functions")) {
           const embeddings = JSON.parse(
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-import { Table as ArrowTable, Schema } from "./arrow";
+import { Data, Schema, SchemaLike, TableLike } from "./arrow";
 import { fromTableToBuffer, makeEmptyTable } from "./arrow";
 import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
 import { Connection as LanceDbConnection } from "./native";
@@ -50,7 +50,7 @@ export interface CreateTableOptions {
    * The default is true while the new format is in beta
    */
   useLegacyFormat?: boolean;
-  schema?: Schema;
+  schema?: SchemaLike;
   embeddingFunction?: EmbeddingFunctionConfig;
 }
 
@@ -151,15 +151,28 @@ export abstract class Connection {
     options?: Partial<OpenTableOptions>,
   ): Promise<Table>;
 
+  /**
+   * Creates a new Table and initialize it with new data.
+   * @param {object} options - The options object.
+   * @param {string} options.name - The name of the table.
+   * @param {Data} options.data - Non-empty Array of Records to be inserted into the table
+   *
+   */
+  abstract createTable(
+    options: {
+      name: string;
+      data: Data;
+    } & Partial<CreateTableOptions>,
+  ): Promise<Table>;
   /**
    * Creates a new Table and initialize it with new data.
    * @param {string} name - The name of the table.
-   * @param {Record<string, unknown>[] | ArrowTable} data - Non-empty Array of Records
+   * @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
    * to be inserted into the table
    */
   abstract createTable(
     name: string,
-    data: Record<string, unknown>[] | ArrowTable,
+    data: Record<string, unknown>[] | TableLike,
     options?: Partial<CreateTableOptions>,
   ): Promise<Table>;
 
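For reference, a sketch of the two call shapes this overload pair permits, matching the new tests above; `db` is assumed to be an open `Connection` and the rows are illustrative:

```ts
// Positional form: createTable(name, data, options?)
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);

// Object form: createTable(options), with name and data folded into one object
tbl = await db.createTable({
  name: "test",
  data: [{ id: 3 }],
  mode: "overwrite",
});
```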
@@ -170,7 +183,7 @@ export abstract class Connection {
    */
   abstract createEmptyTable(
     name: string,
-    schema: Schema,
+    schema: import("./arrow").SchemaLike,
     options?: Partial<CreateTableOptions>,
   ): Promise<Table>;
 
@@ -219,13 +232,22 @@ export class LocalConnection extends Connection {
   }
 
   async createTable(
-    name: string,
-    data: Record<string, unknown>[] | ArrowTable,
+    nameOrOptions:
+      | string
+      | ({ name: string; data: Data } & Partial<CreateTableOptions>),
+    data?: Record<string, unknown>[] | TableLike,
     options?: Partial<CreateTableOptions>,
   ): Promise<Table> {
+    if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
+      const { name, data, ...options } = nameOrOptions;
+      return this.createTable(name, data, options);
+    }
+    if (data === undefined) {
+      throw new Error("data is required");
+    }
     const { buf, mode } = await Table.parseTableData(data, options);
     const innerTable = await this.inner.createTable(
-      name,
+      nameOrOptions,
       buf,
       mode,
       cleanseStorageOptions(options?.storageOptions),
@@ -31,6 +31,9 @@ export {
   AddColumnsSql,
   ColumnAlteration,
   ConnectionOptions,
+  IndexStatistics,
+  IndexMetadata,
+  IndexConfig,
 } from "./native.js";
 
 export {
@@ -56,12 +59,7 @@ export {
 
 export { Index, IndexOptions, IvfPqOptions } from "./indices";
 
-export {
-  Table,
-  AddDataOptions,
-  IndexConfig,
-  UpdateOptions,
-} from "./table";
+export { Table, AddDataOptions, UpdateOptions } from "./table";
 
 export * as embedding from "./embedding";
 
@@ -76,15 +74,61 @@ export * as embedding from "./embedding";
  * @param {string} uri - The uri of the database. If the database uri starts
  * with `db://` then it connects to a remote database.
  * @see {@link ConnectionOptions} for more details on the URI format.
+ * @example
+ * ```ts
+ * const conn = await connect("/path/to/database");
+ * ```
+ * @example
+ * ```ts
+ * const conn = await connect(
+ *   "s3://bucket/path/to/database",
+ *   { storageOptions: { timeout: "60s" } },
+ * );
+ * ```
  */
 export async function connect(
   uri: string,
   opts?: Partial<ConnectionOptions | RemoteConnectionOptions>,
+): Promise<Connection>;
+/**
+ * Connect to a LanceDB instance at the given URI.
+ *
+ * Accepted formats:
+ *
+ * - `/path/to/database` - local database
+ * - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
+ * - `db://host:port` - remote database (LanceDB cloud)
+ * @param options - The options to use when connecting to the database
+ * @see {@link ConnectionOptions} for more details on the URI format.
+ * @example
+ * ```ts
+ * const conn = await connect({
+ *   uri: "/path/to/database",
+ *   storageOptions: { timeout: "60s" },
+ * });
+ * ```
+ */
+export async function connect(
+  opts: Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string },
+): Promise<Connection>;
+export async function connect(
+  uriOrOptions:
+    | string
+    | (Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string }),
+  opts: Partial<ConnectionOptions | RemoteConnectionOptions> = {},
 ): Promise<Connection> {
+  let uri: string | undefined;
+  if (typeof uriOrOptions !== "string") {
+    const { uri: uri_, ...options } = uriOrOptions;
+    uri = uri_;
+    opts = options;
+  } else {
+    uri = uriOrOptions;
+  }
 
   if (!uri) {
     throw new Error("uri is required");
   }
-  opts = opts ?? {};
 
   if (uri?.startsWith("db://")) {
     return new RemoteConnection(uri, opts as RemoteConnectionOptions);
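The string and object forms are now interchangeable; a short sketch combining the examples from the new doc comments (paths and the timeout value are illustrative):

```ts
import { connect } from "@lancedb/lancedb";

// Classic positional form
const a = await connect("/path/to/database");

// New object form: the uri rides along with the other connection options
const b = await connect({
  uri: "s3://bucket/path/to/database",
  storageOptions: { timeout: "60s" },
});
```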
@@ -114,6 +114,14 @@ export class QueryBase<
     this.inner.onlyIf(predicate);
     return this as unknown as QueryType;
   }
+  /**
+   * A filter statement to be applied to this query.
+   * @alias where
+   * @deprecated Use `where` instead
+   */
+  filter(predicate: string): QueryType {
+    return this.where(predicate);
+  }
 
   /**
    * Return only the specified columns.
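Usage-wise the alias is a drop-in, but new code should prefer `where`; a sketch assuming `tbl` is an open table with an `id` column:

```ts
const viaWhere = await tbl.query().where("id > 1").select(["id"]).toArrow();
// Deprecated spelling, identical result:
const viaFilter = await tbl.query().filter("id > 1").select(["id"]).toArrow();
```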
@@ -292,7 +300,9 @@ export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
    *
    * By default "l2" is used.
    */
-  distanceType(distanceType: string): VectorQuery {
+  distanceType(
+    distanceType: Required<IvfPqOptions>["distanceType"],
+  ): VectorQuery {
     this.inner.distanceType(distanceType);
     return this;
   }
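With the narrowed parameter type, a typo such as `"DoT"` (fixed in the test above) now fails at compile time rather than at runtime. A sketch, assuming `tbl` and `queryVec` as in the tests; the accepted strings come from `IvfPqOptions["distanceType"]` (e.g. `"l2"` and `"dot"` appear in this change set):

```ts
const rst = await tbl
  .query()
  .limit(2)
  .nearestTo(queryVec)
  .distanceType("dot") // must be a valid distance type; "DoT" no longer compiles
  .toArrow();
```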
@@ -1,5 +1,10 @@
 import { Schema } from "apache-arrow";
-import { Data, fromTableToStreamBuffer, makeEmptyTable } from "../arrow";
+import {
+  Data,
+  SchemaLike,
+  fromTableToStreamBuffer,
+  makeEmptyTable,
+} from "../arrow";
 import {
   Connection,
   CreateTableOptions,
@@ -106,10 +111,19 @@ export class RemoteConnection extends Connection {
   }
 
   async createTable(
-    tableName: string,
-    data: Data,
+    nameOrOptions:
+      | string
+      | ({ name: string; data: Data } & Partial<CreateTableOptions>),
+    data?: Data,
     options?: Partial<CreateTableOptions> | undefined,
   ): Promise<Table> {
+    if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
+      const { name, data, ...options } = nameOrOptions;
+      return this.createTable(name, data, options);
+    }
+    if (data === undefined) {
+      throw new Error("data is required");
+    }
     if (options?.mode) {
       console.warn(
         "option 'mode' is not supported in LanceDB Cloud",
@@ -132,7 +146,7 @@ export class RemoteConnection extends Connection {
     );
 
     await this.#client.post(
-      `/v1/table/${encodeURIComponent(tableName)}/create/`,
+      `/v1/table/${encodeURIComponent(nameOrOptions)}/create/`,
       buf,
       {
         config: {
@@ -141,13 +155,13 @@ export class RemoteConnection extends Connection {
         headers: { "Content-Type": "application/vnd.apache.arrow.stream" },
       },
     );
-    this.#tableCache.set(tableName, true);
-    return new RemoteTable(this.#client, tableName, this.#dbName);
+    this.#tableCache.set(nameOrOptions, true);
+    return new RemoteTable(this.#client, nameOrOptions, this.#dbName);
   }
 
   async createEmptyTable(
     name: string,
-    schema: Schema,
+    schema: SchemaLike,
     options?: Partial<CreateTableOptions> | undefined,
   ): Promise<Table> {
     if (options?.mode) {
@@ -16,6 +16,7 @@ import { Table as ArrowTable } from "apache-arrow";
 
 import { Data, IntoVector } from "../arrow";
 
+import { IndexStatistics } from "..";
 import { CreateTableOptions } from "../connection";
 import { IndexOptions } from "../indices";
 import { MergeInsertBuilder } from "../merge";
@@ -34,6 +35,10 @@ export class RemoteTable extends Table {
     return `/v1/table/${encodeURIComponent(this.#name)}/`;
   }
 
+  get name(): string {
+    return this.#name;
+  }
+
   public constructor(
     client: RestfulLanceDBClient,
     tableName: string,
@@ -161,4 +166,7 @@ export class RemoteTable extends Table {
   mergeInsert(_on: string | string[]): MergeInsertBuilder {
     throw new Error("mergeInsert() is not yet supported on the LanceDB cloud");
   }
+  async indexStats(_name: string): Promise<IndexStatistics | undefined> {
+    throw new Error("indexStats() is not yet supported on the LanceDB cloud");
+  }
 }
@@ -20,10 +20,12 @@
 // comes from the exact same library instance. This is not always the case
 // and so we must sanitize the input to ensure that it is compatible.
 
+import { BufferType, Data } from "apache-arrow";
 import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
 import {
   Binary,
   Bool,
+  DataLike,
   DataType,
   DateDay,
   DateMillisecond,
@@ -56,9 +58,14 @@ import {
   Map_,
   Null,
   type Precision,
+  RecordBatch,
+  RecordBatchLike,
   Schema,
+  SchemaLike,
   SparseUnion,
   Struct,
+  Table,
+  TableLike,
   Time,
   TimeMicrosecond,
   TimeMillisecond,
@@ -488,7 +495,7 @@ export function sanitizeField(fieldLike: unknown): Field {
  * instance because they might be using a different instance of apache-arrow
  * than lancedb is using.
  */
-export function sanitizeSchema(schemaLike: unknown): Schema {
+export function sanitizeSchema(schemaLike: SchemaLike): Schema {
   if (schemaLike instanceof Schema) {
     return schemaLike;
   }
@@ -514,3 +521,68 @@ export function sanitizeSchema(schemaLike: SchemaLike): Schema {
   );
   return new Schema(sanitizedFields, metadata);
 }
+
+export function sanitizeTable(tableLike: TableLike): Table {
+  if (tableLike instanceof Table) {
+    return tableLike;
+  }
+  if (typeof tableLike !== "object" || tableLike === null) {
+    throw Error("Expected a Table but object was null/undefined");
+  }
+  if (!("schema" in tableLike)) {
+    throw Error(
+      "The table passed in does not appear to be a table (no 'schema' property)",
+    );
+  }
+  if (!("batches" in tableLike)) {
+    throw Error(
+      "The table passed in does not appear to be a table (no 'batches' property)",
+    );
+  }
+  const schema = sanitizeSchema(tableLike.schema);
+  const batches = tableLike.batches.map(sanitizeRecordBatch);
+  return new Table(schema, batches);
+}
+
+function sanitizeRecordBatch(batchLike: RecordBatchLike): RecordBatch {
+  if (batchLike instanceof RecordBatch) {
+    return batchLike;
+  }
+  if (typeof batchLike !== "object" || batchLike === null) {
+    throw Error("Expected a RecordBatch but object was null/undefined");
+  }
+  if (!("schema" in batchLike)) {
+    throw Error(
+      "The record batch passed in does not appear to be a record batch (no 'schema' property)",
+    );
+  }
+  if (!("data" in batchLike)) {
+    throw Error(
+      "The record batch passed in does not appear to be a record batch (no 'data' property)",
+    );
+  }
+  const schema = sanitizeSchema(batchLike.schema);
+  const data = sanitizeData(batchLike.data);
+  return new RecordBatch(schema, data);
+}
+
+function sanitizeData(
+  dataLike: DataLike,
+  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+): import("apache-arrow").Data<Struct<any>> {
+  if (dataLike instanceof Data) {
+    return dataLike;
+  }
+  return new Data(
+    dataLike.type,
+    dataLike.offset,
+    dataLike.length,
+    dataLike.nullCount,
+    {
+      [BufferType.OFFSET]: dataLike.valueOffsets,
+      [BufferType.DATA]: dataLike.values,
+      [BufferType.VALIDITY]: dataLike.nullBitmap,
+      [BufferType.TYPE]: dataLike.typeIds,
+    },
+  );
+}
@@ -17,6 +17,7 @@ import {
   Data,
   IntoVector,
   Schema,
+  TableLike,
   fromDataToBuffer,
   fromTableToBuffer,
   fromTableToStreamBuffer,
@@ -33,10 +34,12 @@ import {
   AddColumnsSql,
   ColumnAlteration,
   IndexConfig,
+  IndexStatistics,
   OptimizeStats,
   Table as _NativeTable,
 } from "./native";
 import { Query, VectorQuery } from "./query";
+import { sanitizeTable } from "./sanitize";
 export { IndexConfig } from "./native";
 
 /**
@@ -98,6 +101,8 @@ export abstract class Table {
   [Symbol.for("nodejs.util.inspect.custom")](): string {
     return this.display();
   }
+  /** Returns the name of the table */
+  abstract get name(): string;
 
   /** Return true if the table has not been closed */
   abstract isOpen(): boolean;
@@ -158,6 +163,9 @@ export abstract class Table {
    * Indices on vector columns will speed up vector searches.
    * Indices on scalar columns will speed up filtering (in both
    * vector and non-vector searches)
+   *
+   * @note We currently don't support custom named indexes;
+   * the index name will always be `${column}_idx`.
    * @example
    * // If the column has a vector (fixed size list) data type then
    * // an IvfPq vector index will be created.
@@ -368,9 +376,15 @@ export abstract class Table {
 
   abstract mergeInsert(on: string | string[]): MergeInsertBuilder;
 
+  /** List all the stats of a specified index
+   *
+   * @param {string} name The name of the index.
+   * @returns {IndexStatistics | undefined} The stats of the index. If the index does not exist, it will return undefined.
+   */
+  abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
+
   static async parseTableData(
-    // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-    data: Record<string, unknown>[] | ArrowTable<any>,
+    data: Record<string, unknown>[] | TableLike,
     options?: Partial<CreateTableOptions>,
     streaming = false,
   ) {
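A sketch of the new stats API under the naming rule noted above (index names are always `${column}_idx`); `tbl` is assumed to be an open table with an `id` column:

```ts
await tbl.createIndex("id");

const stats = await tbl.indexStats("id_idx");
console.log(stats?.numIndexedRows, stats?.numUnindexedRows);

// A missing index resolves to undefined rather than throwing:
const missing = await tbl.indexStats("no_such_idx"); // undefined
```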
@@ -383,9 +397,9 @@ export abstract class Table {
 
     let table: ArrowTable;
     if (isArrowTable(data)) {
-      table = data;
+      table = sanitizeTable(data);
     } else {
-      table = makeArrowTable(data, options);
+      table = makeArrowTable(data as Record<string, unknown>[], options);
     }
     if (streaming) {
       const buf = await fromTableToStreamBuffer(
@@ -412,7 +426,9 @@ export class LocalTable extends Table {
     super();
     this.inner = inner;
   }
+  get name(): string {
+    return this.inner.name;
+  }
   isOpen(): boolean {
     return this.inner.isOpen();
   }
@@ -565,6 +581,13 @@ export class LocalTable extends Table {
     return await this.query().toArrow();
   }
 
+  async indexStats(name: string): Promise<IndexStatistics | undefined> {
+    const stats = await this.inner.indexStats(name);
+    if (stats === null) {
+      return undefined;
+    }
+    return stats;
+  }
   mergeInsert(on: string | string[]): MergeInsertBuilder {
     on = Array.isArray(on) ? on : [on];
     return new MergeInsertBuilder(this.inner.mergeInsert(on));
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",
nodejs/package-lock.json (generated, 29 changed lines)

@@ -18,10 +18,8 @@
       "win32"
     ],
     "dependencies": {
-      "@types/axios": "^0.14.0",
       "apache-arrow": "^15.0.0",
       "axios": "^1.7.2",
-      "memoize": "^10.0.0",
       "openai": "^4.29.2",
       "reflect-metadata": "^0.2.2"
     },
@@ -31,6 +29,7 @@
       "@biomejs/biome": "^1.7.3",
       "@jest/globals": "^29.7.0",
       "@napi-rs/cli": "^2.18.0",
+      "@types/axios": "^0.14.0",
       "@types/jest": "^29.1.2",
       "@types/tmp": "^0.2.6",
       "apache-arrow-old": "npm:apache-arrow@13.0.0",
@@ -3131,6 +3130,7 @@
       "resolved": "https://registry.npmjs.org/@types/axios/-/axios-0.14.0.tgz",
       "integrity": "sha512-KqQnQbdYE54D7oa/UmYVMZKq7CO4l8DEENzOKc4aBRwxCXSlJXGz83flFx5L7AWrOQnmuN3kVsRdt+GZPPjiVQ==",
       "deprecated": "This is a stub types definition for axios (https://github.com/mzabriskie/axios). axios provides its own type definitions, so you don't need @types/axios installed!",
+      "dev": true,
       "dependencies": {
         "axios": "*"
       }
@@ -5942,20 +5942,6 @@
         "is-buffer": "~1.1.6"
       }
     },
-    "node_modules/memoize": {
-      "version": "10.0.0",
-      "resolved": "https://registry.npmjs.org/memoize/-/memoize-10.0.0.tgz",
-      "integrity": "sha512-H6cBLgsi6vMWOcCpvVCdFFnl3kerEXbrYh9q+lY6VXvQSmM6CkmV08VOwT+WE2tzIEqRPFfAq3fm4v/UIW6mSA==",
-      "dependencies": {
-        "mimic-function": "^5.0.0"
-      },
-      "engines": {
-        "node": ">=18"
-      },
-      "funding": {
-        "url": "https://github.com/sindresorhus/memoize?sponsor=1"
-      }
-    },
     "node_modules/merge-stream": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
@@ -6003,17 +5989,6 @@
         "node": ">= 0.6"
       }
     },
-    "node_modules/mimic-function": {
-      "version": "5.0.1",
-      "resolved": "https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz",
-      "integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==",
-      "engines": {
-        "node": ">=18"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/sindresorhus"
-      }
-    },
     "node_modules/minimatch": {
       "version": "3.1.2",
       "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
@@ -1,6 +1,16 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.5.2",
+  "description": "LanceDB: A serverless, low-latency vector database for AI applications",
+  "keywords": [
+    "database",
+    "lance",
+    "lancedb",
+    "search",
+    "vector",
+    "vector database",
+    "ann"
+  ],
+  "version": "0.6.0",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
@@ -38,7 +48,8 @@
     "typedoc": "^0.25.7",
     "typedoc-plugin-markdown": "^3.17.1",
     "typescript": "^5.3.3",
-    "typescript-eslint": "^7.1.0"
+    "typescript-eslint": "^7.1.0",
+    "@types/axios": "^0.14.0"
   },
   "ava": {
     "timeout": "3m"
@@ -65,7 +76,6 @@
     "version": "napi version"
   },
   "dependencies": {
-    "@types/axios": "^0.14.0",
     "apache-arrow": "^15.0.0",
     "axios": "^1.7.2",
     "openai": "^4.29.2",
@@ -56,12 +56,6 @@ impl Connection {
     #[napi(factory)]
     pub async fn new(uri: String, options: ConnectionOptions) -> napi::Result<Self> {
         let mut builder = ConnectBuilder::new(&uri);
-        if let Some(api_key) = options.api_key {
-            builder = builder.api_key(&api_key);
-        }
-        if let Some(host_override) = options.host_override {
-            builder = builder.host_override(&host_override);
-        }
         if let Some(interval) = options.read_consistency_interval {
             builder =
                 builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));
@@ -28,8 +28,6 @@ mod util;
 #[napi(object)]
 #[derive(Debug)]
 pub struct ConnectionOptions {
-    pub api_key: Option<String>,
-    pub host_override: Option<String>,
     /// (For LanceDB OSS only): The interval, in seconds, at which to check for
     /// updates to the table from other processes. If None, then consistency is not
     /// checked. For performance reasons, this is the default. For strong
@@ -30,7 +30,7 @@ use crate::query::{Query, VectorQuery};
 pub struct Table {
     // We keep a duplicate of the table name so we can use it for error
     // messages even if the table has been closed
-    name: String,
+    pub name: String,
     pub(crate) inner: Option<LanceDbTable>,
 }
 

@@ -330,6 +330,13 @@ impl Table {
             .collect::<Vec<_>>())
     }
 
+    #[napi]
+    pub async fn index_stats(&self, index_name: String) -> napi::Result<Option<IndexStatistics>> {
+        let tbl = self.inner_ref()?.as_native().unwrap();
+        let stats = tbl.index_stats(&index_name).await.default_error()?;
+        Ok(stats.map(IndexStatistics::from))
+    }
+
     #[napi]
     pub fn merge_insert(&self, on: Vec<String>) -> napi::Result<NativeMergeInsertBuilder> {
         let on: Vec<_> = on.iter().map(String::as_str).collect();
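
The new `index_stats` binding unwraps the native table and delegates to `NativeTable::index_stats` (introduced later in this changeset), then maps the result into the napi `IndexStatistics` object defined below.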

@@ -340,11 +347,13 @@ impl Table {
 #[napi(object)]
 /// A description of an index currently configured on a column
 pub struct IndexConfig {
+    /// The name of the index
+    pub name: String,
     /// The type of the index
     pub index_type: String,
     /// The columns in the index
     ///
     /// Currently this is always an array of size 1. In the future there may
     /// be more columns to represent composite indices.
     pub columns: Vec<String>,
 }

@@ -355,6 +364,7 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
         Self {
             index_type,
             columns: value.columns,
+            name: value.name,
         }
     }
 }

@@ -437,3 +447,40 @@ pub struct AddColumnsSql {
     /// The expression can reference other columns in the table.
     pub value_sql: String,
 }
+
+#[napi(object)]
+pub struct IndexStatistics {
+    /// The number of rows indexed by the index
+    pub num_indexed_rows: f64,
+    /// The number of rows not indexed
+    pub num_unindexed_rows: f64,
+    /// The type of the index
+    pub index_type: Option<String>,
+    /// The metadata for each index
+    pub indices: Vec<IndexMetadata>,
+}
+impl From<lancedb::index::IndexStatistics> for IndexStatistics {
+    fn from(value: lancedb::index::IndexStatistics) -> Self {
+        Self {
+            num_indexed_rows: value.num_indexed_rows as f64,
+            num_unindexed_rows: value.num_unindexed_rows as f64,
+            index_type: value.index_type.map(|t| format!("{:?}", t)),
+            indices: value.indices.into_iter().map(Into::into).collect(),
+        }
+    }
+}
+
+#[napi(object)]
+pub struct IndexMetadata {
+    pub metric_type: Option<String>,
+    pub index_type: Option<String>,
+}
+
+impl From<lancedb::index::IndexMetadata> for IndexMetadata {
+    fn from(value: lancedb::index::IndexMetadata) -> Self {
+        Self {
+            metric_type: value.metric_type,
+            index_type: value.index_type,
+        }
+    }
+}
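
Note that `num_indexed_rows` and `num_unindexed_rows` are exposed as `f64` although the Rust side counts in `usize`: JavaScript numbers are IEEE-754 doubles, so the `From` impl widens the counts with `as f64` rather than risking a 64-bit integer JavaScript cannot represent exactly.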

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.8.2"
+current_version = "0.9.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.8.2"
+version = "0.9.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true

@@ -3,7 +3,7 @@ name = "lancedb"
 # version in Cargo.toml
 dependencies = [
     "deprecation",
-    "pylance==0.12.2-beta.2",
+    "pylance==0.13.0",
     "ratelimiter~=1.0",
     "requests>=2.31.0",
     "retry>=0.9.2",

@@ -29,7 +29,10 @@ from .table import LanceTable
 
 
 def create_index(
-    index_path: str, text_fields: List[str], ordering_fields: List[str] = None
+    index_path: str,
+    text_fields: List[str],
+    ordering_fields: List[str] = None,
+    tokenizer_name: str = "default",
 ) -> tantivy.Index:
     """
     Create a new Index (not populated)

@@ -42,6 +45,8 @@ def create_index(
         List of text fields to index
     ordering_fields: List[str]
         List of unsigned type fields to order by at search time
+    tokenizer_name : str, default "default"
+        The tokenizer to use
 
     Returns
     -------

@@ -56,7 +61,7 @@ def create_index(
     schema_builder.add_integer_field("doc_id", stored=True)
     # data fields
     for name in text_fields:
-        schema_builder.add_text_field(name, stored=True)
+        schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
     if ordering_fields:
         for name in ordering_fields:
             schema_builder.add_unsigned_field(name, fast=True)
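
`tokenizer_name` is handed straight through to tantivy's `SchemaBuilder.add_text_field`, so the accepted values are the tokenizers tantivy registers: `"default"`, `"raw"`, and the per-language stemmers such as `"en_stem"`.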

@@ -1171,6 +1171,7 @@ class LanceTable(Table):
         *,
         replace: bool = False,
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
+        tokenizer_name: str = "default",
     ):
         """Create a full-text search index on the table.
 

@@ -1189,6 +1190,10 @@ class LanceTable(Table):
         ordering_field_names:
             A list of unsigned type fields to index to optionally order
             results on at search time
+        tokenizer_name: str, default "default"
+            The tokenizer to use for the index. Can be "raw", "default" or the 2-letter
+            language code followed by "_stem". So for English it would be "en_stem".
+            For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
         """
         from .fts import create_index, populate_index
 

@@ -1214,6 +1219,7 @@ class LanceTable(Table):
             self._get_fts_index_path(),
             field_names,
             ordering_fields=ordering_field_names,
+            tokenizer_name=tokenizer_name,
         )
         populate_index(
             index,
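
With this, the option flows end to end: `LanceTable.create_fts_index(..., tokenizer_name=...)` forwards it to `fts.create_index`, which applies it to every indexed text field; the new test below exercises the `en_stem` path.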

@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
     assert os.path.exists(str(tmp_path / "index"))
 
 
+def test_create_index_with_stemming(tmp_path, table):
+    index = ldb.fts.create_index(
+        str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
+    )
+    assert isinstance(index, tantivy.Index)
+    assert os.path.exists(str(tmp_path / "index"))
+
+    # Check stemming by running the tokenizer on a non-empty table
+    table.create_fts_index("text", tokenizer_name="en_stem")
+
+
 def test_populate_index(tmp_path, table):
     index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
     assert ldb.fts.populate_index(index, table, ["text"]) == len(table)

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.5.2"
+version = "0.6.0"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true

@@ -463,6 +463,7 @@ impl JsTable {
         Ok(promise)
     }
 
+    #[allow(deprecated)]
     pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
         let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
         let rt = runtime(&mut cx)?;

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.5.2"
+version = "0.6.0"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true

@@ -6,3 +6,12 @@
 LanceDB Rust SDK, a serverless vector database.
 
 Read more at: https://lancedb.com/
+
+> [!TIP]
+> A transitive dependency of `lancedb` is `lzma-sys`, which uses dynamic linking
+> by default. If you want to statically link `lzma-sys`, you should activate its
+> `static` feature by adding the following to your dependencies:
+>
+> ```toml
+> lzma-sys = { version = "*", features = ["static"] }
+> ```
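
This tip matters mainly when distributing binaries: without the `static` feature, `lzma-sys` links against the system `liblzma`, which then has to be present on every machine the executable runs on.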

@@ -80,6 +80,8 @@ pub enum IndexType {
 
 /// A description of an index currently configured on a column
 pub struct IndexConfig {
+    /// The name of the index
+    pub name: String,
     /// The type of the index
     pub index_type: IndexType,
     /// The columns in the index

@@ -1206,28 +1206,36 @@ impl NativeTable {
         .await)
     }
 
+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
     pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
+        #[allow(deprecated)]
         match self.load_index_stats(index_uuid).await? {
             Some(stats) => Ok(Some(stats.num_indexed_rows)),
             None => Ok(None),
         }
     }
 
+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
     pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
+        #[allow(deprecated)]
         match self.load_index_stats(index_uuid).await? {
             Some(stats) => Ok(Some(stats.num_unindexed_rows)),
             None => Ok(None),
         }
     }
 
+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
     pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> {
+        #[allow(deprecated)]
         match self.load_index_stats(index_uuid).await? {
             Some(stats) => Ok(Some(stats.index_type.unwrap_or_default())),
             None => Ok(None),
         }
     }
 
+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
     pub async fn get_distance_type(&self, index_uuid: &str) -> Result<Option<String>> {
+        #[allow(deprecated)]
         match self.load_index_stats(index_uuid).await? {
             Some(stats) => Ok(Some(
                 stats
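
All four per-UUID getters (`count_indexed_rows`, `count_unindexed_rows`, `get_index_type`, `get_distance_type`) are deprecated in favour of the single `index_stats` call introduced below. A hedged migration sketch, assuming `NativeTable` is reachable at `lancedb::table::NativeTable` and that `"my_idx"` is an index name previously obtained from `list_indices`:

```rust
use lancedb::table::NativeTable;

// Hedged migration sketch: one `index_stats` call replaces the four
// deprecated getters. The import path and the "my_idx" index name are
// illustrative assumptions, not taken from this diff.
async fn report(table: &NativeTable) -> lancedb::error::Result<()> {
    if let Some(stats) = table.index_stats("my_idx").await? {
        println!(
            "type={:?} indexed={} unindexed={}",
            stats.index_type, stats.num_indexed_rows, stats.num_unindexed_rows
        );
    }
    Ok(())
}
```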

@@ -1240,16 +1248,8 @@ impl NativeTable {
         }
     }
 
-    pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
-        let dataset = self.dataset.get().await?;
-        let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
-        Ok(indices
-            .iter()
-            .map(|i| VectorIndex::new_from_format(&mf, i))
-            .collect())
-    }
-
-    async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
+    pub async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
         let index = self
             .load_indices()
             .await?

@@ -1268,6 +1268,35 @@ impl NativeTable {
         Ok(Some(index_stats))
     }
 
+    /// Get statistics about an index.
+    /// Returns an error if the index does not exist.
+    pub async fn index_stats<S: AsRef<str>>(
+        &self,
+        index_name: S,
+    ) -> Result<Option<IndexStatistics>> {
+        self.dataset
+            .get()
+            .await?
+            .index_statistics(index_name.as_ref())
+            .await
+            .ok()
+            .map(|stats| {
+                serde_json::from_str(&stats).map_err(|e| Error::InvalidInput {
+                    message: format!("error deserializing index statistics: {}", e),
+                })
+            })
+            .transpose()
+    }
+
+    pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
+        let dataset = self.dataset.get().await?;
+        let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
+        Ok(indices
+            .iter()
+            .map(|i| VectorIndex::new_from_format(&mf, i))
+            .collect())
+    }
 
     async fn create_ivf_pq_index(
         &self,
         index: IvfPqIndexBuilder,
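
One subtlety in the new `index_stats`: despite the doc comment, an error from `index_statistics` (including a missing index) is swallowed by `.ok()` and comes back as `Ok(None)`; only a JSON deserialization failure is surfaced, as `Error::InvalidInput`, via the `.map(...).transpose()` chain.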

@@ -1860,12 +1889,21 @@ impl TableInternal for NativeTable {
                 }
                 columns.push(field.name.clone());
             }
-            Ok(IndexConfig { index_type: if is_vector { crate::index::IndexType::IvfPq } else { crate::index::IndexType::BTree }, columns })
+            let index_type = if is_vector {
+                crate::index::IndexType::IvfPq
+            } else {
+                crate::index::IndexType::BTree
+            };
+
+            let name = idx.name.clone();
+            Ok(IndexConfig { index_type, columns, name })
         }).collect::<Result<Vec<_>>>()
     }
 }
 
 #[cfg(test)]
+#[allow(deprecated)]
 mod tests {
     use std::iter;
     use std::sync::atomic::{AtomicBool, Ordering};
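
With `list_indices` now cloning `idx.name` into each result, the `name` field added to `IndexConfig` earlier in this changeset is populated end to end, which is what lets callers feed it into the new `index_stats(index_name)` lookups.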