mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 22:59:57 +00:00
Compare commits
7 Commits
python-v0.
...
lance-13.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dcfa17c9fc | ||
|
|
79a1667753 | ||
|
|
a866b78a31 | ||
|
|
c7d37b3e6e | ||
|
|
4b71552b73 | ||
|
|
5ce5f64da3 | ||
|
|
c582b0fc63 |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.5.2-final.1"
|
current_version = "0.6.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ runs:
|
|||||||
args: ${{ inputs.args }}
|
args: ${{ inputs.args }}
|
||||||
docker-options: "-e PIP_EXTRA_INDEX_URL=https://pypi.fury.io/lancedb/"
|
docker-options: "-e PIP_EXTRA_INDEX_URL=https://pypi.fury.io/lancedb/"
|
||||||
working-directory: python
|
working-directory: python
|
||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v3
|
||||||
with:
|
with:
|
||||||
name: windows-wheels
|
name: windows-wheels
|
||||||
path: python\target\wheels
|
path: python\target\wheels
|
||||||
|
|||||||
19
Cargo.toml
19
Cargo.toml
@@ -20,11 +20,18 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
|
|||||||
categories = ["database-implementations"]
|
categories = ["database-implementations"]
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
|
# lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
|
||||||
lance-index = { "version" = "=0.13.0" }
|
# lance-index = { "version" = "=0.13.0" }
|
||||||
lance-linalg = { "version" = "=0.13.0" }
|
# lance-linalg = { "version" = "=0.13.0" }
|
||||||
lance-testing = { "version" = "=0.13.0" }
|
# lance-testing = { "version" = "=0.13.0" }
|
||||||
lance-datafusion = { "version" = "=0.13.0" }
|
# lance-datafusion = { "version" = "=0.13.0" }
|
||||||
|
|
||||||
|
lance = { path = "../lance/rust/lance" }
|
||||||
|
lance-index = { path = "../lance/rust/lance-index" }
|
||||||
|
lance-linalg= { path = "../lance/rust/lance-linalg" }
|
||||||
|
lance-testing = { path = "../lance/rust/lance-testing" }
|
||||||
|
lance-datafusion = { path = "../lance/rust/lance-datafusion" }
|
||||||
|
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "51.0", optional = false }
|
arrow = { version = "51.0", optional = false }
|
||||||
arrow-array = "51.0"
|
arrow-array = "51.0"
|
||||||
@@ -35,7 +42,7 @@ arrow-schema = "51.0"
|
|||||||
arrow-arith = "51.0"
|
arrow-arith = "51.0"
|
||||||
arrow-cast = "51.0"
|
arrow-cast = "51.0"
|
||||||
async-trait = "0"
|
async-trait = "0"
|
||||||
chrono = "=0.4.39"
|
chrono = "0.4.35"
|
||||||
datafusion-physical-plan = "37.1"
|
datafusion-physical-plan = "37.1"
|
||||||
half = { "version" = "=2.4.1", default-features = false, features = [
|
half = { "version" = "=2.4.1", default-features = false, features = [
|
||||||
"num-traits",
|
"num-traits",
|
||||||
|
|||||||
@@ -116,21 +116,21 @@ This guide will show how to create tables, insert data into them, and update the
|
|||||||
|
|
||||||
### From a Polars DataFrame
|
### From a Polars DataFrame
|
||||||
|
|
||||||
LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
|
LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
|
||||||
written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
|
written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
|
||||||
under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
|
under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
|
||||||
is on the way.
|
is on the way.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
data = pl.DataFrame({
|
data = pl.DataFrame({
|
||||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||||
"item": ["foo", "bar"],
|
"item": ["foo", "bar"],
|
||||||
"price": [10.0, 20.0]
|
"price": [10.0, 20.0]
|
||||||
})
|
})
|
||||||
table = db.create_table("pl_table", data=data)
|
table = db.create_table("pl_table", data=data)
|
||||||
```
|
```
|
||||||
|
|
||||||
### From an Arrow Table
|
### From an Arrow Table
|
||||||
=== "Python"
|
=== "Python"
|
||||||
|
|||||||
4
node/package-lock.json
generated
4
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.5.2",
|
"version": "0.6.0",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.5.2",
|
"version": "0.6.0",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.5.2-final.1",
|
"version": "0.6.0",
|
||||||
"description": " Serverless, low-latency vector database for AI applications",
|
"description": " Serverless, low-latency vector database for AI applications",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
|
|||||||
@@ -39,7 +39,9 @@ describe.each([arrow, arrowOld])("Given a table", (arrow: any) => {
|
|||||||
let tmpDir: tmp.DirResult;
|
let tmpDir: tmp.DirResult;
|
||||||
let table: Table;
|
let table: Table;
|
||||||
|
|
||||||
const schema = new arrow.Schema([
|
const schema:
|
||||||
|
| import("apache-arrow").Schema
|
||||||
|
| import("apache-arrow-old").Schema = new arrow.Schema([
|
||||||
new arrow.Field("id", new arrow.Float64(), true),
|
new arrow.Field("id", new arrow.Float64(), true),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
@@ -315,7 +317,7 @@ describe("When creating an index", () => {
|
|||||||
.query()
|
.query()
|
||||||
.limit(2)
|
.limit(2)
|
||||||
.nearestTo(queryVec)
|
.nearestTo(queryVec)
|
||||||
.distanceType("DoT")
|
.distanceType("dot")
|
||||||
.toArrow();
|
.toArrow();
|
||||||
expect(rst.numRows).toBe(2);
|
expect(rst.numRows).toBe(2);
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
import {
|
import {
|
||||||
Table as ArrowTable,
|
Table as ArrowTable,
|
||||||
Binary,
|
Binary,
|
||||||
|
BufferType,
|
||||||
DataType,
|
DataType,
|
||||||
Field,
|
Field,
|
||||||
FixedSizeBinary,
|
FixedSizeBinary,
|
||||||
@@ -37,14 +38,68 @@ import {
|
|||||||
type makeTable,
|
type makeTable,
|
||||||
vectorFromArray,
|
vectorFromArray,
|
||||||
} from "apache-arrow";
|
} from "apache-arrow";
|
||||||
|
import { Buffers } from "apache-arrow/data";
|
||||||
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
||||||
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
||||||
import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
|
import {
|
||||||
|
sanitizeField,
|
||||||
|
sanitizeSchema,
|
||||||
|
sanitizeTable,
|
||||||
|
sanitizeType,
|
||||||
|
} from "./sanitize";
|
||||||
export * from "apache-arrow";
|
export * from "apache-arrow";
|
||||||
|
/**
 * A value that can stand in for an Arrow `Schema`.
 *
 * Either a real `Schema` instance, or a plain object exposing the same
 * `fields`, `metadata` and `names` members.
 */
export type SchemaLike =
  | Schema
  | {
      /** The fields of the schema, each itself a real or look-alike field. */
      fields: FieldLike[];
      /** Schema-level key/value metadata. */
      metadata: Map<string, string>;
      /** Read-only accessor for the schema's names. */
      get names(): unknown[];
    };
|
||||||
|
/**
 * A value that can stand in for an Arrow `Field`.
 *
 * Either a real `Field` instance, or a plain object describing a field by
 * `type` and `name`, with optional nullability and metadata.
 */
export type FieldLike =
  | Field
  | {
      /** The field's data type, given as a string. */
      type: string;
      /** The field (column) name. */
      name: string;
      /** Whether the field may hold nulls; may be omitted. */
      nullable?: boolean;
      /** Field-level key/value metadata; may be omitted. */
      metadata?: Map<string, string>;
    };
|
||||||
|
|
||||||
|
/**
 * A value that can stand in for an Arrow `Data<Struct>`.
 *
 * Either a real `Data` instance, or a plain object exposing the same layout
 * information (type, length, offset, stride, nullability, children) together
 * with the four underlying buffers.
 */
export type DataLike =
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  | import("apache-arrow").Data<Struct<any>>
  | {
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      type: any;
      length: number;
      offset: number;
      stride: number;
      nullable: boolean;
      /** Nested data, each entry itself a real or look-alike `Data`. */
      children: DataLike[];
      /** Read-only accessor for the number of nulls. */
      get nullCount(): number;
      /** The buffer stored under `BufferType.DATA`. */
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      values: Buffers<any>[BufferType.DATA];
      /** The buffer stored under `BufferType.TYPE`. */
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      typeIds: Buffers<any>[BufferType.TYPE];
      /** The buffer stored under `BufferType.VALIDITY`. */
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      nullBitmap: Buffers<any>[BufferType.VALIDITY];
      /** The buffer stored under `BufferType.OFFSET`. */
      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
      valueOffsets: Buffers<any>[BufferType.OFFSET];
    };
|
||||||
|
|
||||||
|
/**
 * A value that can stand in for an Arrow `RecordBatch`.
 *
 * Either a real `RecordBatch` instance, or a plain object carrying a
 * schema-like value and a data-like value.
 */
export type RecordBatchLike =
  | RecordBatch
  | {
      /** The schema of the batch. */
      schema: SchemaLike;
      /** The data held by the batch. */
      data: DataLike;
    };
|
||||||
|
|
||||||
|
/**
 * A value that can stand in for an Arrow table.
 *
 * Either a real `ArrowTable` instance, or a plain object carrying a
 * schema-like value and an array of record-batch-like values.
 */
export type TableLike =
  | ArrowTable
  | {
      schema: SchemaLike;
      batches: RecordBatchLike[];
    };
|
||||||
|
|
||||||
export type IntoVector = Float32Array | Float64Array | number[];
|
export type IntoVector = Float32Array | Float64Array | number[];
|
||||||
|
|
||||||
export function isArrowTable(value: object): value is ArrowTable {
|
export function isArrowTable(value: object): value is TableLike {
|
||||||
if (value instanceof ArrowTable) return true;
|
if (value instanceof ArrowTable) return true;
|
||||||
return "schema" in value && "batches" in value;
|
return "schema" in value && "batches" in value;
|
||||||
}
|
}
|
||||||
@@ -135,7 +190,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Data type accepted by NodeJS SDK */
|
/** Data type accepted by NodeJS SDK */
|
||||||
export type Data = Record<string, unknown>[] | ArrowTable;
|
export type Data = Record<string, unknown>[] | TableLike;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Options to control how a column should be converted to a vector array
|
* Options to control how a column should be converted to a vector array
|
||||||
@@ -162,7 +217,7 @@ export class MakeArrowTableOptions {
|
|||||||
* The schema must be specified if there are no records (e.g. to make
|
* The schema must be specified if there are no records (e.g. to make
|
||||||
* an empty table)
|
* an empty table)
|
||||||
*/
|
*/
|
||||||
schema?: Schema;
|
schema?: SchemaLike;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Mapping from vector column name to expected type
|
* Mapping from vector column name to expected type
|
||||||
@@ -310,7 +365,7 @@ export function makeArrowTable(
|
|||||||
if (opt.schema !== undefined && opt.schema !== null) {
|
if (opt.schema !== undefined && opt.schema !== null) {
|
||||||
opt.schema = sanitizeSchema(opt.schema);
|
opt.schema = sanitizeSchema(opt.schema);
|
||||||
opt.schema = validateSchemaEmbeddings(
|
opt.schema = validateSchemaEmbeddings(
|
||||||
opt.schema,
|
opt.schema as Schema,
|
||||||
data,
|
data,
|
||||||
options?.embeddingFunction,
|
options?.embeddingFunction,
|
||||||
);
|
);
|
||||||
@@ -394,7 +449,7 @@ export function makeArrowTable(
|
|||||||
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
||||||
const firstTable = new ArrowTable(columns);
|
const firstTable = new ArrowTable(columns);
|
||||||
const batchesFixed = firstTable.batches.map(
|
const batchesFixed = firstTable.batches.map(
|
||||||
(batch) => new RecordBatch(opt.schema!, batch.data),
|
(batch) => new RecordBatch(opt.schema as Schema, batch.data),
|
||||||
);
|
);
|
||||||
let schema: Schema;
|
let schema: Schema;
|
||||||
if (metadata !== undefined) {
|
if (metadata !== undefined) {
|
||||||
@@ -407,9 +462,9 @@ export function makeArrowTable(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
schema = new Schema(opt.schema.fields, schemaMetadata);
|
schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
|
||||||
} else {
|
} else {
|
||||||
schema = opt.schema;
|
schema = opt.schema as Schema;
|
||||||
}
|
}
|
||||||
return new ArrowTable(schema, batchesFixed);
|
return new ArrowTable(schema, batchesFixed);
|
||||||
}
|
}
|
||||||
@@ -425,7 +480,7 @@ export function makeArrowTable(
|
|||||||
* Create an empty Arrow table with the provided schema
|
* Create an empty Arrow table with the provided schema
|
||||||
*/
|
*/
|
||||||
export function makeEmptyTable(
|
export function makeEmptyTable(
|
||||||
schema: Schema,
|
schema: SchemaLike,
|
||||||
metadata?: Map<string, string>,
|
metadata?: Map<string, string>,
|
||||||
): ArrowTable {
|
): ArrowTable {
|
||||||
return makeArrowTable([], { schema }, metadata);
|
return makeArrowTable([], { schema }, metadata);
|
||||||
@@ -563,17 +618,16 @@ async function applyEmbeddingsFromMetadata(
|
|||||||
async function applyEmbeddings<T>(
|
async function applyEmbeddings<T>(
|
||||||
table: ArrowTable,
|
table: ArrowTable,
|
||||||
embeddings?: EmbeddingFunctionConfig,
|
embeddings?: EmbeddingFunctionConfig,
|
||||||
schema?: Schema,
|
schema?: SchemaLike,
|
||||||
): Promise<ArrowTable> {
|
): Promise<ArrowTable> {
|
||||||
if (schema?.metadata.has("embedding_functions")) {
|
|
||||||
return applyEmbeddingsFromMetadata(table, schema!);
|
|
||||||
} else if (embeddings == null || embeddings === undefined) {
|
|
||||||
return table;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (schema !== undefined && schema !== null) {
|
if (schema !== undefined && schema !== null) {
|
||||||
schema = sanitizeSchema(schema);
|
schema = sanitizeSchema(schema);
|
||||||
}
|
}
|
||||||
|
if (schema?.metadata.has("embedding_functions")) {
|
||||||
|
return applyEmbeddingsFromMetadata(table, schema! as Schema);
|
||||||
|
} else if (embeddings == null || embeddings === undefined) {
|
||||||
|
return table;
|
||||||
|
}
|
||||||
|
|
||||||
// Convert from ArrowTable to Record<String, Vector>
|
// Convert from ArrowTable to Record<String, Vector>
|
||||||
const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
|
const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
|
||||||
@@ -650,7 +704,7 @@ async function applyEmbeddings<T>(
|
|||||||
`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
|
`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return alignTable(newTable, schema);
|
return alignTable(newTable, schema as Schema);
|
||||||
}
|
}
|
||||||
return newTable;
|
return newTable;
|
||||||
}
|
}
|
||||||
@@ -744,7 +798,7 @@ export async function fromRecordsToStreamBuffer(
|
|||||||
export async function fromTableToBuffer(
|
export async function fromTableToBuffer(
|
||||||
table: ArrowTable,
|
table: ArrowTable,
|
||||||
embeddings?: EmbeddingFunctionConfig,
|
embeddings?: EmbeddingFunctionConfig,
|
||||||
schema?: Schema,
|
schema?: SchemaLike,
|
||||||
): Promise<Buffer> {
|
): Promise<Buffer> {
|
||||||
if (schema !== undefined && schema !== null) {
|
if (schema !== undefined && schema !== null) {
|
||||||
schema = sanitizeSchema(schema);
|
schema = sanitizeSchema(schema);
|
||||||
@@ -771,7 +825,7 @@ export async function fromDataToBuffer(
|
|||||||
schema = sanitizeSchema(schema);
|
schema = sanitizeSchema(schema);
|
||||||
}
|
}
|
||||||
if (isArrowTable(data)) {
|
if (isArrowTable(data)) {
|
||||||
return fromTableToBuffer(data, embeddings, schema);
|
return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
|
||||||
} else {
|
} else {
|
||||||
const table = await convertToTable(data, embeddings, { schema });
|
const table = await convertToTable(data, embeddings, { schema });
|
||||||
return fromTableToBuffer(table);
|
return fromTableToBuffer(table);
|
||||||
@@ -789,7 +843,7 @@ export async function fromDataToBuffer(
|
|||||||
export async function fromTableToStreamBuffer(
|
export async function fromTableToStreamBuffer(
|
||||||
table: ArrowTable,
|
table: ArrowTable,
|
||||||
embeddings?: EmbeddingFunctionConfig,
|
embeddings?: EmbeddingFunctionConfig,
|
||||||
schema?: Schema,
|
schema?: SchemaLike,
|
||||||
): Promise<Buffer> {
|
): Promise<Buffer> {
|
||||||
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
|
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
|
||||||
const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
|
const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
|
||||||
@@ -854,7 +908,6 @@ function validateSchemaEmbeddings(
|
|||||||
for (let field of schema.fields) {
|
for (let field of schema.fields) {
|
||||||
if (isFixedSizeList(field.type)) {
|
if (isFixedSizeList(field.type)) {
|
||||||
field = sanitizeField(field);
|
field = sanitizeField(field);
|
||||||
|
|
||||||
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
||||||
if (schema.metadata.has("embedding_functions")) {
|
if (schema.metadata.has("embedding_functions")) {
|
||||||
const embeddings = JSON.parse(
|
const embeddings = JSON.parse(
|
||||||
|
|||||||
@@ -12,7 +12,7 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
import { Table as ArrowTable, Data, Schema } from "./arrow";
|
import { Data, Schema, SchemaLike, TableLike } from "./arrow";
|
||||||
import { fromTableToBuffer, makeEmptyTable } from "./arrow";
|
import { fromTableToBuffer, makeEmptyTable } from "./arrow";
|
||||||
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
||||||
import { Connection as LanceDbConnection } from "./native";
|
import { Connection as LanceDbConnection } from "./native";
|
||||||
@@ -50,7 +50,7 @@ export interface CreateTableOptions {
|
|||||||
* The default is true while the new format is in beta
|
* The default is true while the new format is in beta
|
||||||
*/
|
*/
|
||||||
useLegacyFormat?: boolean;
|
useLegacyFormat?: boolean;
|
||||||
schema?: Schema;
|
schema?: SchemaLike;
|
||||||
embeddingFunction?: EmbeddingFunctionConfig;
|
embeddingFunction?: EmbeddingFunctionConfig;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -167,12 +167,12 @@ export abstract class Connection {
|
|||||||
/**
|
/**
|
||||||
* Creates a new Table and initialize it with new data.
|
* Creates a new Table and initialize it with new data.
|
||||||
* @param {string} name - The name of the table.
|
* @param {string} name - The name of the table.
|
||||||
* @param {Record<string, unknown>[] | ArrowTable} data - Non-empty Array of Records
|
* @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
|
||||||
* to be inserted into the table
|
* to be inserted into the table
|
||||||
*/
|
*/
|
||||||
abstract createTable(
|
abstract createTable(
|
||||||
name: string,
|
name: string,
|
||||||
data: Record<string, unknown>[] | ArrowTable,
|
data: Record<string, unknown>[] | TableLike,
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table>;
|
): Promise<Table>;
|
||||||
|
|
||||||
@@ -183,7 +183,7 @@ export abstract class Connection {
|
|||||||
*/
|
*/
|
||||||
abstract createEmptyTable(
|
abstract createEmptyTable(
|
||||||
name: string,
|
name: string,
|
||||||
schema: Schema,
|
schema: import("./arrow").SchemaLike,
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table>;
|
): Promise<Table>;
|
||||||
|
|
||||||
@@ -235,7 +235,7 @@ export class LocalConnection extends Connection {
|
|||||||
nameOrOptions:
|
nameOrOptions:
|
||||||
| string
|
| string
|
||||||
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
|
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
|
||||||
data?: Record<string, unknown>[] | ArrowTable,
|
data?: Record<string, unknown>[] | TableLike,
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table> {
|
): Promise<Table> {
|
||||||
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
|
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
|
||||||
@@ -259,7 +259,7 @@ export class LocalConnection extends Connection {
|
|||||||
|
|
||||||
async createEmptyTable(
|
async createEmptyTable(
|
||||||
name: string,
|
name: string,
|
||||||
schema: Schema,
|
schema: import("./arrow").SchemaLike,
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table> {
|
): Promise<Table> {
|
||||||
let mode: string = options?.mode ?? "create";
|
let mode: string = options?.mode ?? "create";
|
||||||
|
|||||||
@@ -300,7 +300,9 @@ export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
|
|||||||
*
|
*
|
||||||
* By default "l2" is used.
|
* By default "l2" is used.
|
||||||
*/
|
*/
|
||||||
distanceType(distanceType: string): VectorQuery {
|
distanceType(
|
||||||
|
distanceType: Required<IvfPqOptions>["distanceType"],
|
||||||
|
): VectorQuery {
|
||||||
this.inner.distanceType(distanceType);
|
this.inner.distanceType(distanceType);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,10 @@
|
|||||||
import { Schema } from "apache-arrow";
|
import { Schema } from "apache-arrow";
|
||||||
import { Data, fromTableToStreamBuffer, makeEmptyTable } from "../arrow";
|
import {
|
||||||
|
Data,
|
||||||
|
SchemaLike,
|
||||||
|
fromTableToStreamBuffer,
|
||||||
|
makeEmptyTable,
|
||||||
|
} from "../arrow";
|
||||||
import {
|
import {
|
||||||
Connection,
|
Connection,
|
||||||
CreateTableOptions,
|
CreateTableOptions,
|
||||||
@@ -156,7 +161,7 @@ export class RemoteConnection extends Connection {
|
|||||||
|
|
||||||
async createEmptyTable(
|
async createEmptyTable(
|
||||||
name: string,
|
name: string,
|
||||||
schema: Schema,
|
schema: SchemaLike,
|
||||||
options?: Partial<CreateTableOptions> | undefined,
|
options?: Partial<CreateTableOptions> | undefined,
|
||||||
): Promise<Table> {
|
): Promise<Table> {
|
||||||
if (options?.mode) {
|
if (options?.mode) {
|
||||||
|
|||||||
@@ -20,10 +20,12 @@
|
|||||||
// comes from the exact same library instance. This is not always the case
|
// comes from the exact same library instance. This is not always the case
|
||||||
// and so we must sanitize the input to ensure that it is compatible.
|
// and so we must sanitize the input to ensure that it is compatible.
|
||||||
|
|
||||||
|
import { BufferType, Data } from "apache-arrow";
|
||||||
import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
|
import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
|
||||||
import {
|
import {
|
||||||
Binary,
|
Binary,
|
||||||
Bool,
|
Bool,
|
||||||
|
DataLike,
|
||||||
DataType,
|
DataType,
|
||||||
DateDay,
|
DateDay,
|
||||||
DateMillisecond,
|
DateMillisecond,
|
||||||
@@ -56,9 +58,14 @@ import {
|
|||||||
Map_,
|
Map_,
|
||||||
Null,
|
Null,
|
||||||
type Precision,
|
type Precision,
|
||||||
|
RecordBatch,
|
||||||
|
RecordBatchLike,
|
||||||
Schema,
|
Schema,
|
||||||
|
SchemaLike,
|
||||||
SparseUnion,
|
SparseUnion,
|
||||||
Struct,
|
Struct,
|
||||||
|
Table,
|
||||||
|
TableLike,
|
||||||
Time,
|
Time,
|
||||||
TimeMicrosecond,
|
TimeMicrosecond,
|
||||||
TimeMillisecond,
|
TimeMillisecond,
|
||||||
@@ -488,7 +495,7 @@ export function sanitizeField(fieldLike: unknown): Field {
|
|||||||
* instance because they might be using a different instance of apache-arrow
|
* instance because they might be using a different instance of apache-arrow
|
||||||
* than lancedb is using.
|
* than lancedb is using.
|
||||||
*/
|
*/
|
||||||
export function sanitizeSchema(schemaLike: unknown): Schema {
|
export function sanitizeSchema(schemaLike: SchemaLike): Schema {
|
||||||
if (schemaLike instanceof Schema) {
|
if (schemaLike instanceof Schema) {
|
||||||
return schemaLike;
|
return schemaLike;
|
||||||
}
|
}
|
||||||
@@ -514,3 +521,68 @@ export function sanitizeSchema(schemaLike: unknown): Schema {
|
|||||||
);
|
);
|
||||||
return new Schema(sanitizedFields, metadata);
|
return new Schema(sanitizedFields, metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Convert a table-like value into a `Table` instance.
 *
 * A value that is already a `Table` is returned unchanged. Otherwise the value
 * must be a non-null object with `schema` and `batches` properties; the schema
 * is passed through `sanitizeSchema`, every batch through
 * `sanitizeRecordBatch`, and a new `Table` is built from the results.
 *
 * @param tableLike - A `Table` or an object shaped like one.
 * @returns A `Table` instance.
 * @throws Error if the value is not an object, is null, or lacks a `schema`
 *   or `batches` property.
 */
export function sanitizeTable(tableLike: TableLike): Table {
  if (tableLike instanceof Table) {
    return tableLike;
  }
  if (typeof tableLike !== "object" || tableLike === null) {
    throw Error("Expected a Table but object was null/undefined");
  }
  if (!("schema" in tableLike)) {
    throw Error(
      "The table passed in does not appear to be a table (no 'schema' property)",
    );
  }
  if (!("batches" in tableLike)) {
    // The property being checked is 'batches'; the message names it so the
    // caller can see which member is actually missing.
    throw Error(
      "The table passed in does not appear to be a table (no 'batches' property)",
    );
  }
  const schema = sanitizeSchema(tableLike.schema);

  const batches = tableLike.batches.map(sanitizeRecordBatch);
  return new Table(schema, batches);
}
|
||||||
|
|
||||||
|
/**
 * Convert a record-batch-like value into a `RecordBatch` instance.
 *
 * A value that is already a `RecordBatch` is returned unchanged. Otherwise
 * the value must be a non-null object with `schema` and `data` properties,
 * which are sanitized and used to build a new `RecordBatch`.
 *
 * @param batchLike - A `RecordBatch` or an object shaped like one.
 * @returns A `RecordBatch` instance.
 * @throws Error if the value is not an object, is null, or lacks a `schema`
 *   or `data` property.
 */
function sanitizeRecordBatch(batchLike: RecordBatchLike): RecordBatch {
  // Fast path: nothing to rebuild.
  if (batchLike instanceof RecordBatch) {
    return batchLike;
  }

  if (batchLike === null || typeof batchLike !== "object") {
    throw Error("Expected a RecordBatch but object was null/undefined");
  }

  const hasSchema = "schema" in batchLike;
  if (!hasSchema) {
    throw Error(
      "The record batch passed in does not appear to be a record batch (no 'schema' property)",
    );
  }

  const hasData = "data" in batchLike;
  if (!hasData) {
    throw Error(
      "The record batch passed in does not appear to be a record batch (no 'data' property)",
    );
  }

  // Schema is sanitized before data, matching argument order.
  return new RecordBatch(
    sanitizeSchema(batchLike.schema),
    sanitizeData(batchLike.data),
  );
}
|
||||||
|
/**
 * Convert a data-like value into a `Data` instance.
 *
 * A value that is already a `Data` is returned unchanged. Otherwise a new
 * `Data` is constructed from the value's type, offset, length, null count and
 * its four buffers.
 *
 * NOTE(review): `children` is not forwarded to the constructor here — confirm
 * whether that is intended.
 *
 * @param dataLike - A `Data` or an object shaped like one.
 * @returns A `Data` instance.
 */
function sanitizeData(
  dataLike: DataLike,
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
): import("apache-arrow").Data<Struct<any>> {
  if (dataLike instanceof Data) {
    return dataLike;
  }

  // Read the scalar layout members first, then the buffers, in the same
  // order the constructor arguments are listed.
  const { type, offset, length, nullCount } = dataLike;
  const buffers = {
    [BufferType.OFFSET]: dataLike.valueOffsets,
    [BufferType.DATA]: dataLike.values,
    [BufferType.VALIDITY]: dataLike.nullBitmap,
    [BufferType.TYPE]: dataLike.typeIds,
  };

  return new Data(type, offset, length, nullCount, buffers);
}
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import {
|
|||||||
Data,
|
Data,
|
||||||
IntoVector,
|
IntoVector,
|
||||||
Schema,
|
Schema,
|
||||||
|
TableLike,
|
||||||
fromDataToBuffer,
|
fromDataToBuffer,
|
||||||
fromTableToBuffer,
|
fromTableToBuffer,
|
||||||
fromTableToStreamBuffer,
|
fromTableToStreamBuffer,
|
||||||
@@ -38,6 +39,8 @@ import {
|
|||||||
Table as _NativeTable,
|
Table as _NativeTable,
|
||||||
} from "./native";
|
} from "./native";
|
||||||
import { Query, VectorQuery } from "./query";
|
import { Query, VectorQuery } from "./query";
|
||||||
|
import { sanitizeTable } from "./sanitize";
|
||||||
|
export { IndexConfig } from "./native";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Options for adding data to a table.
|
* Options for adding data to a table.
|
||||||
@@ -381,8 +384,7 @@ export abstract class Table {
|
|||||||
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
|
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
|
||||||
|
|
||||||
static async parseTableData(
|
static async parseTableData(
|
||||||
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
data: Record<string, unknown>[] | TableLike,
|
||||||
data: Record<string, unknown>[] | ArrowTable<any>,
|
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
streaming = false,
|
streaming = false,
|
||||||
) {
|
) {
|
||||||
@@ -395,9 +397,9 @@ export abstract class Table {
|
|||||||
|
|
||||||
let table: ArrowTable;
|
let table: ArrowTable;
|
||||||
if (isArrowTable(data)) {
|
if (isArrowTable(data)) {
|
||||||
table = data;
|
table = sanitizeTable(data);
|
||||||
} else {
|
} else {
|
||||||
table = makeArrowTable(data, options);
|
table = makeArrowTable(data as Record<string, unknown>[], options);
|
||||||
}
|
}
|
||||||
if (streaming) {
|
if (streaming) {
|
||||||
const buf = await fromTableToStreamBuffer(
|
const buf = await fromTableToStreamBuffer(
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.5.2-final.1",
|
"version": "0.6.0",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.5.2-final.1",
|
"version": "0.6.0",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.5.2-final.1",
|
"version": "0.6.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.5.2-final.1",
|
"version": "0.6.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.5.2-final.1",
|
"version": "0.6.0",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
"vector database",
|
"vector database",
|
||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"version": "0.5.2-final.1",
|
"version": "0.6.0",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.9.0-beta.8"
|
current_version = "0.9.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.9.0-beta.8"
|
version = "0.9.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -19,8 +19,6 @@ lancedb = { path = "../rust/lancedb" }
|
|||||||
env_logger = "0.10"
|
env_logger = "0.10"
|
||||||
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
|
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
|
||||||
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
|
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
|
||||||
base64ct = "=1.6.0" # workaround for https://github.com/RustCrypto/formats/issues/1684
|
|
||||||
chrono = "=0.4.39"
|
|
||||||
|
|
||||||
# Prevent dynamic linking of lzma, which comes from datafusion
|
# Prevent dynamic linking of lzma, which comes from datafusion
|
||||||
lzma-sys = { version = "*", features = ["static"] }
|
lzma-sys = { version = "*", features = ["static"] }
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ dependencies = [
|
|||||||
"packaging",
|
"packaging",
|
||||||
"cachetools",
|
"cachetools",
|
||||||
"overrides>=0.7",
|
"overrides>=0.7",
|
||||||
"urllib3==1.26.19"
|
|
||||||
]
|
]
|
||||||
description = "lancedb"
|
description = "lancedb"
|
||||||
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ def connect(
|
|||||||
host_override: Optional[str] = None,
|
host_override: Optional[str] = None,
|
||||||
read_consistency_interval: Optional[timedelta] = None,
|
read_consistency_interval: Optional[timedelta] = None,
|
||||||
request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
|
request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> DBConnection:
|
) -> DBConnection:
|
||||||
"""Connect to a LanceDB database.
|
"""Connect to a LanceDB database.
|
||||||
@@ -71,9 +70,6 @@ def connect(
|
|||||||
executor will be used for making requests. This is for LanceDB Cloud
|
executor will be used for making requests. This is for LanceDB Cloud
|
||||||
only and is only used when making batch requests (i.e., passing in
|
only and is only used when making batch requests (i.e., passing in
|
||||||
multiple queries to the search method at once).
|
multiple queries to the search method at once).
|
||||||
storage_options: dict, optional
|
|
||||||
Additional options for the storage backend. See available options at
|
|
||||||
https://lancedb.github.io/lancedb/guides/storage/
|
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
@@ -109,16 +105,12 @@ def connect(
|
|||||||
region,
|
region,
|
||||||
host_override,
|
host_override,
|
||||||
request_thread_pool=request_thread_pool,
|
request_thread_pool=request_thread_pool,
|
||||||
storage_options=storage_options,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
if kwargs:
|
if kwargs:
|
||||||
raise ValueError(f"Unknown keyword arguments: {kwargs}")
|
raise ValueError(f"Unknown keyword arguments: {kwargs}")
|
||||||
return LanceDBConnection(
|
return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)
|
||||||
uri,
|
|
||||||
read_consistency_interval=read_consistency_interval,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
async def connect_async(
|
async def connect_async(
|
||||||
|
|||||||
@@ -117,8 +117,6 @@ class Query(pydantic.BaseModel):
|
|||||||
|
|
||||||
with_row_id: bool = False
|
with_row_id: bool = False
|
||||||
|
|
||||||
fast_search: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
class LanceQueryBuilder(ABC):
|
class LanceQueryBuilder(ABC):
|
||||||
"""An abstract query builder. Subclasses are defined for vector search,
|
"""An abstract query builder. Subclasses are defined for vector search,
|
||||||
@@ -127,14 +125,12 @@ class LanceQueryBuilder(ABC):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create(
|
def create(
|
||||||
cls,
|
cls,
|
||||||
table: "Table",
|
table: "Table",
|
||||||
query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
|
query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
|
||||||
query_type: str,
|
query_type: str,
|
||||||
vector_column_name: str,
|
vector_column_name: str,
|
||||||
ordering_field_name: Optional[str] = None,
|
ordering_field_name: str = None,
|
||||||
fts_columns: Union[str, List[str]] = [],
|
|
||||||
fast_search: bool = False,
|
|
||||||
) -> LanceQueryBuilder:
|
) -> LanceQueryBuilder:
|
||||||
"""
|
"""
|
||||||
Create a query builder based on the given query and query type.
|
Create a query builder based on the given query and query type.
|
||||||
@@ -151,19 +147,14 @@ class LanceQueryBuilder(ABC):
|
|||||||
If "auto", the query type is inferred based on the query.
|
If "auto", the query type is inferred based on the query.
|
||||||
vector_column_name: str
|
vector_column_name: str
|
||||||
The name of the vector column to use for vector search.
|
The name of the vector column to use for vector search.
|
||||||
fast_search: bool
|
|
||||||
Skip flat search of unindexed data.
|
|
||||||
"""
|
"""
|
||||||
# Check hybrid search first as it supports empty query pattern
|
|
||||||
if query_type == "hybrid":
|
|
||||||
# hybrid fts and vector query
|
|
||||||
return LanceHybridQueryBuilder(
|
|
||||||
table, query, vector_column_name, fts_columns=fts_columns
|
|
||||||
)
|
|
||||||
|
|
||||||
if query is None:
|
if query is None:
|
||||||
return LanceEmptyQueryBuilder(table)
|
return LanceEmptyQueryBuilder(table)
|
||||||
|
|
||||||
|
if query_type == "hybrid":
|
||||||
|
# hybrid fts and vector query
|
||||||
|
return LanceHybridQueryBuilder(table, query, vector_column_name)
|
||||||
|
|
||||||
# remember the string query for reranking purpose
|
# remember the string query for reranking purpose
|
||||||
str_query = query if isinstance(query, str) else None
|
str_query = query if isinstance(query, str) else None
|
||||||
|
|
||||||
@@ -174,17 +165,12 @@ class LanceQueryBuilder(ABC):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if query_type == "hybrid":
|
if query_type == "hybrid":
|
||||||
return LanceHybridQueryBuilder(
|
return LanceHybridQueryBuilder(table, query, vector_column_name)
|
||||||
table, query, vector_column_name, fts_columns=fts_columns
|
|
||||||
)
|
|
||||||
|
|
||||||
if isinstance(query, str):
|
if isinstance(query, str):
|
||||||
# fts
|
# fts
|
||||||
return LanceFtsQueryBuilder(
|
return LanceFtsQueryBuilder(
|
||||||
table,
|
table, query, ordering_field_name=ordering_field_name
|
||||||
query,
|
|
||||||
ordering_field_name=ordering_field_name,
|
|
||||||
fts_columns=fts_columns,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if isinstance(query, list):
|
if isinstance(query, list):
|
||||||
@@ -194,9 +180,7 @@ class LanceQueryBuilder(ABC):
|
|||||||
else:
|
else:
|
||||||
raise TypeError(f"Unsupported query type: {type(query)}")
|
raise TypeError(f"Unsupported query type: {type(query)}")
|
||||||
|
|
||||||
return LanceVectorQueryBuilder(
|
return LanceVectorQueryBuilder(table, query, vector_column_name, str_query)
|
||||||
table, query, vector_column_name, str_query, fast_search
|
|
||||||
)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _resolve_query(cls, table, query, query_type, vector_column_name):
|
def _resolve_query(cls, table, query, query_type, vector_column_name):
|
||||||
@@ -212,6 +196,8 @@ class LanceQueryBuilder(ABC):
|
|||||||
elif query_type == "auto":
|
elif query_type == "auto":
|
||||||
if isinstance(query, (list, np.ndarray)):
|
if isinstance(query, (list, np.ndarray)):
|
||||||
return query, "vector"
|
return query, "vector"
|
||||||
|
if isinstance(query, tuple):
|
||||||
|
return query, "hybrid"
|
||||||
else:
|
else:
|
||||||
conf = table.embedding_functions.get(vector_column_name)
|
conf = table.embedding_functions.get(vector_column_name)
|
||||||
if conf is not None:
|
if conf is not None:
|
||||||
@@ -238,14 +224,9 @@ class LanceQueryBuilder(ABC):
|
|||||||
def __init__(self, table: "Table"):
|
def __init__(self, table: "Table"):
|
||||||
self._table = table
|
self._table = table
|
||||||
self._limit = 10
|
self._limit = 10
|
||||||
self._offset = 0
|
|
||||||
self._columns = None
|
self._columns = None
|
||||||
self._where = None
|
self._where = None
|
||||||
self._prefilter = False
|
|
||||||
self._with_row_id = False
|
self._with_row_id = False
|
||||||
self._vector = None
|
|
||||||
self._text = None
|
|
||||||
self._ef = None
|
|
||||||
|
|
||||||
@deprecation.deprecated(
|
@deprecation.deprecated(
|
||||||
deprecated_in="0.3.1",
|
deprecated_in="0.3.1",
|
||||||
@@ -356,13 +337,11 @@ class LanceQueryBuilder(ABC):
|
|||||||
----------
|
----------
|
||||||
limit: int
|
limit: int
|
||||||
The maximum number of results to return.
|
The maximum number of results to return.
|
||||||
The default query limit is 10 results.
|
By default the query is limited to the first 10.
|
||||||
For ANN/KNN queries, you must specify a limit.
|
Call this method and pass 0, a negative value,
|
||||||
Entering 0, a negative number, or None will reset
|
or None to remove the limit.
|
||||||
the limit to the default value of 10.
|
*WARNING* if you have a large dataset, removing
|
||||||
*WARNING* if you have a large dataset, setting
|
the limit can potentially result in reading a
|
||||||
the limit to a large number, e.g. the table size,
|
|
||||||
can potentially result in reading a
|
|
||||||
large amount of data into memory and cause
|
large amount of data into memory and cause
|
||||||
out of memory issues.
|
out of memory issues.
|
||||||
|
|
||||||
@@ -372,33 +351,11 @@ class LanceQueryBuilder(ABC):
|
|||||||
The LanceQueryBuilder object.
|
The LanceQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
if limit is None or limit <= 0:
|
if limit is None or limit <= 0:
|
||||||
if isinstance(self, LanceVectorQueryBuilder):
|
self._limit = None
|
||||||
raise ValueError("Limit is required for ANN/KNN queries")
|
|
||||||
else:
|
|
||||||
self._limit = None
|
|
||||||
else:
|
else:
|
||||||
self._limit = limit
|
self._limit = limit
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def offset(self, offset: int) -> LanceQueryBuilder:
|
|
||||||
"""Set the offset for the results.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
offset: int
|
|
||||||
The offset to start fetching results from.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
LanceQueryBuilder
|
|
||||||
The LanceQueryBuilder object.
|
|
||||||
"""
|
|
||||||
if offset is None or offset <= 0:
|
|
||||||
self._offset = 0
|
|
||||||
else:
|
|
||||||
self._offset = offset
|
|
||||||
return self
|
|
||||||
|
|
||||||
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
|
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
|
||||||
"""Set the columns to return.
|
"""Set the columns to return.
|
||||||
|
|
||||||
@@ -460,80 +417,6 @@ class LanceQueryBuilder(ABC):
|
|||||||
self._with_row_id = with_row_id
|
self._with_row_id = with_row_id
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def explain_plan(self, verbose: Optional[bool] = False) -> str:
|
|
||||||
"""Return the execution plan for this query.
|
|
||||||
|
|
||||||
Examples
|
|
||||||
--------
|
|
||||||
>>> import lancedb
|
|
||||||
>>> db = lancedb.connect("./.lancedb")
|
|
||||||
>>> table = db.create_table("my_table", [{"vector": [99, 99]}])
|
|
||||||
>>> query = [100, 100]
|
|
||||||
>>> plan = table.search(query).explain_plan(True)
|
|
||||||
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
|
||||||
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
|
|
||||||
GlobalLimitExec: skip=0, fetch=10
|
|
||||||
FilterExec: _distance@2 IS NOT NULL
|
|
||||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
|
||||||
KNNVectorDistance: metric=l2
|
|
||||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
verbose : bool, default False
|
|
||||||
Use a verbose output format.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
plan : str
|
|
||||||
""" # noqa: E501
|
|
||||||
ds = self._table.to_lance()
|
|
||||||
return ds.scanner(
|
|
||||||
nearest={
|
|
||||||
"column": self._vector_column,
|
|
||||||
"q": self._query,
|
|
||||||
"k": self._limit,
|
|
||||||
"metric": self._metric,
|
|
||||||
"nprobes": self._nprobes,
|
|
||||||
"refine_factor": self._refine_factor,
|
|
||||||
},
|
|
||||||
prefilter=self._prefilter,
|
|
||||||
filter=self._str_query,
|
|
||||||
limit=self._limit,
|
|
||||||
with_row_id=self._with_row_id,
|
|
||||||
offset=self._offset,
|
|
||||||
).explain_plan(verbose)
|
|
||||||
|
|
||||||
def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
|
|
||||||
"""Set the vector to search for.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
vector: np.ndarray or list
|
|
||||||
The vector to search for.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
LanceQueryBuilder
|
|
||||||
The LanceQueryBuilder object.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def text(self, text: str) -> LanceQueryBuilder:
|
|
||||||
"""Set the text to search for.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
text: str
|
|
||||||
The text to search for.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
LanceQueryBuilder
|
|
||||||
The LanceQueryBuilder object.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
|
|
||||||
class LanceVectorQueryBuilder(LanceQueryBuilder):
|
class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||||
"""
|
"""
|
||||||
@@ -557,12 +440,11 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
table: "Table",
|
table: "Table",
|
||||||
query: Union[np.ndarray, list, "PIL.Image.Image"],
|
query: Union[np.ndarray, list, "PIL.Image.Image"],
|
||||||
vector_column: str,
|
vector_column: str,
|
||||||
str_query: Optional[str] = None,
|
str_query: Optional[str] = None,
|
||||||
fast_search: bool = False,
|
|
||||||
):
|
):
|
||||||
super().__init__(table)
|
super().__init__(table)
|
||||||
self._query = query
|
self._query = query
|
||||||
@@ -573,14 +455,13 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
self._prefilter = False
|
self._prefilter = False
|
||||||
self._reranker = None
|
self._reranker = None
|
||||||
self._str_query = str_query
|
self._str_query = str_query
|
||||||
self._fast_search = fast_search
|
|
||||||
|
|
||||||
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
|
def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
|
||||||
"""Set the distance metric to use.
|
"""Set the distance metric to use.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
metric: "L2" or "cosine" or "dot"
|
metric: "L2" or "cosine"
|
||||||
The distance metric to use. By default "L2" is used.
|
The distance metric to use. By default "L2" is used.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -588,7 +469,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
LanceVectorQueryBuilder
|
LanceVectorQueryBuilder
|
||||||
The LanceQueryBuilder object.
|
The LanceQueryBuilder object.
|
||||||
"""
|
"""
|
||||||
self._metric = metric.lower()
|
self._metric = metric
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
|
def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
|
||||||
@@ -613,28 +494,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
self._nprobes = nprobes
|
self._nprobes = nprobes
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def ef(self, ef: int) -> LanceVectorQueryBuilder:
|
|
||||||
"""Set the number of candidates to consider during search.
|
|
||||||
|
|
||||||
Higher values will yield better recall (more likely to find vectors if
|
|
||||||
they exist) at the expense of latency.
|
|
||||||
|
|
||||||
This only applies to the HNSW-related index.
|
|
||||||
The default value is 1.5 * limit.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
ef: int
|
|
||||||
The number of candidates to consider during search.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
LanceVectorQueryBuilder
|
|
||||||
The LanceQueryBuilder object.
|
|
||||||
"""
|
|
||||||
self._ef = ef
|
|
||||||
return self
|
|
||||||
|
|
||||||
def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
|
def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
|
||||||
"""Set the refine factor to use, increasing the number of vectors sampled.
|
"""Set the refine factor to use, increasing the number of vectors sampled.
|
||||||
|
|
||||||
@@ -695,11 +554,15 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
refine_factor=self._refine_factor,
|
refine_factor=self._refine_factor,
|
||||||
vector_column=self._vector_column,
|
vector_column=self._vector_column,
|
||||||
with_row_id=self._with_row_id,
|
with_row_id=self._with_row_id,
|
||||||
offset=self._offset,
|
|
||||||
fast_search=self._fast_search,
|
|
||||||
ef=self._ef,
|
|
||||||
)
|
)
|
||||||
result_set = self._table._execute_query(query, batch_size)
|
result_set = self._table._execute_query(query, batch_size)
|
||||||
|
if self._reranker is not None:
|
||||||
|
rs_table = result_set.read_all()
|
||||||
|
result_set = self._reranker.rerank_vector(self._str_query, rs_table)
|
||||||
|
# convert result_set back to RecordBatchReader
|
||||||
|
result_set = pa.RecordBatchReader.from_batches(
|
||||||
|
result_set.schema, result_set.to_batches()
|
||||||
|
)
|
||||||
|
|
||||||
return result_set
|
return result_set
|
||||||
|
|
||||||
@@ -728,7 +591,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def rerank(
|
def rerank(
|
||||||
self, reranker: Reranker, query_string: Optional[str] = None
|
self, reranker: Reranker, query_string: Optional[str] = None
|
||||||
) -> LanceVectorQueryBuilder:
|
) -> LanceVectorQueryBuilder:
|
||||||
"""Rerank the results using the specified reranker.
|
"""Rerank the results using the specified reranker.
|
||||||
|
|
||||||
@@ -893,34 +756,12 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
|||||||
|
|
||||||
class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||||
def to_arrow(self) -> pa.Table:
|
def to_arrow(self) -> pa.Table:
|
||||||
return self.to_batches().read_all()
|
ds = self._table.to_lance()
|
||||||
|
return ds.to_table(
|
||||||
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
|
|
||||||
query = Query(
|
|
||||||
columns=self._columns,
|
columns=self._columns,
|
||||||
filter=self._where,
|
filter=self._where,
|
||||||
k=self._limit or 10,
|
limit=self._limit,
|
||||||
with_row_id=self._with_row_id,
|
|
||||||
vector=[],
|
|
||||||
# not actually respected in remote query
|
|
||||||
offset=self._offset or 0,
|
|
||||||
)
|
)
|
||||||
return self._table._execute_query(query)
|
|
||||||
|
|
||||||
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
|
|
||||||
"""Rerank the results using the specified reranker.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
reranker: Reranker
|
|
||||||
The reranker to use.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
LanceEmptyQueryBuilder
|
|
||||||
The LanceQueryBuilder object.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError("Reranking is not yet supported.")
|
|
||||||
|
|
||||||
|
|
||||||
class LanceHybridQueryBuilder(LanceQueryBuilder):
|
class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||||
|
|||||||
@@ -55,13 +55,11 @@ class RestfulLanceDBClient:
|
|||||||
region: str
|
region: str
|
||||||
api_key: Credential
|
api_key: Credential
|
||||||
host_override: Optional[str] = attrs.field(default=None)
|
host_override: Optional[str] = attrs.field(default=None)
|
||||||
db_prefix: Optional[str] = attrs.field(default=None)
|
|
||||||
|
|
||||||
closed: bool = attrs.field(default=False, init=False)
|
closed: bool = attrs.field(default=False, init=False)
|
||||||
|
|
||||||
connection_timeout: float = attrs.field(default=120.0, kw_only=True)
|
connection_timeout: float = attrs.field(default=120.0, kw_only=True)
|
||||||
read_timeout: float = attrs.field(default=300.0, kw_only=True)
|
read_timeout: float = attrs.field(default=300.0, kw_only=True)
|
||||||
storage_options: Optional[Dict[str, str]] = attrs.field(default=None, kw_only=True)
|
|
||||||
|
|
||||||
@functools.cached_property
|
@functools.cached_property
|
||||||
def session(self) -> requests.Session:
|
def session(self) -> requests.Session:
|
||||||
@@ -94,18 +92,6 @@ class RestfulLanceDBClient:
|
|||||||
headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
|
headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
|
||||||
if self.host_override:
|
if self.host_override:
|
||||||
headers["x-lancedb-database"] = self.db_name
|
headers["x-lancedb-database"] = self.db_name
|
||||||
if self.storage_options:
|
|
||||||
if self.storage_options.get("account_name") is not None:
|
|
||||||
headers["x-azure-storage-account-name"] = self.storage_options[
|
|
||||||
"account_name"
|
|
||||||
]
|
|
||||||
if self.storage_options.get("azure_storage_account_name") is not None:
|
|
||||||
headers["x-azure-storage-account-name"] = self.storage_options[
|
|
||||||
"azure_storage_account_name"
|
|
||||||
]
|
|
||||||
if self.db_prefix:
|
|
||||||
headers["x-lancedb-database-prefix"] = self.db_prefix
|
|
||||||
|
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -172,7 +158,6 @@ class RestfulLanceDBClient:
|
|||||||
headers["content-type"] = content_type
|
headers["content-type"] = content_type
|
||||||
if request_id is not None:
|
if request_id is not None:
|
||||||
headers["x-request-id"] = request_id
|
headers["x-request-id"] = request_id
|
||||||
|
|
||||||
with self.session.post(
|
with self.session.post(
|
||||||
urljoin(self.url, uri),
|
urljoin(self.url, uri),
|
||||||
headers=headers,
|
headers=headers,
|
||||||
@@ -260,6 +245,7 @@ def retry_adapter(options: Dict[str, Any]) -> HTTPAdapter:
|
|||||||
connect=connect_retries,
|
connect=connect_retries,
|
||||||
read=read_retries,
|
read=read_retries,
|
||||||
backoff_factor=backoff_factor,
|
backoff_factor=backoff_factor,
|
||||||
|
backoff_jitter=backoff_jitter,
|
||||||
status_forcelist=statuses,
|
status_forcelist=statuses,
|
||||||
allowed_methods=methods,
|
allowed_methods=methods,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ import inspect
|
|||||||
import logging
|
import logging
|
||||||
import uuid
|
import uuid
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from typing import Dict, Iterable, List, Optional, Union
|
from typing import Iterable, List, Optional, Union
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from cachetools import TTLCache
|
from cachetools import TTLCache
|
||||||
@@ -44,25 +44,20 @@ class RemoteDBConnection(DBConnection):
|
|||||||
request_thread_pool: Optional[ThreadPoolExecutor] = None,
|
request_thread_pool: Optional[ThreadPoolExecutor] = None,
|
||||||
connection_timeout: float = 120.0,
|
connection_timeout: float = 120.0,
|
||||||
read_timeout: float = 300.0,
|
read_timeout: float = 300.0,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
|
||||||
):
|
):
|
||||||
"""Connect to a remote LanceDB database."""
|
"""Connect to a remote LanceDB database."""
|
||||||
parsed = urlparse(db_url)
|
parsed = urlparse(db_url)
|
||||||
if parsed.scheme != "db":
|
if parsed.scheme != "db":
|
||||||
raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
|
raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
|
||||||
self.db_name = parsed.netloc
|
self.db_name = parsed.netloc
|
||||||
prefix = parsed.path.lstrip("/")
|
|
||||||
self.db_prefix = None if not prefix else prefix
|
|
||||||
self.api_key = api_key
|
self.api_key = api_key
|
||||||
self._client = RestfulLanceDBClient(
|
self._client = RestfulLanceDBClient(
|
||||||
self.db_name,
|
self.db_name,
|
||||||
region,
|
region,
|
||||||
api_key,
|
api_key,
|
||||||
host_override,
|
host_override,
|
||||||
self.db_prefix,
|
|
||||||
connection_timeout=connection_timeout,
|
connection_timeout=connection_timeout,
|
||||||
read_timeout=read_timeout,
|
read_timeout=read_timeout,
|
||||||
storage_options=storage_options,
|
|
||||||
)
|
)
|
||||||
self._request_thread_pool = request_thread_pool
|
self._request_thread_pool = request_thread_pool
|
||||||
self._table_cache = TTLCache(maxsize=10000, ttl=300)
|
self._table_cache = TTLCache(maxsize=10000, ttl=300)
|
||||||
|
|||||||
@@ -15,14 +15,13 @@ import logging
|
|||||||
import uuid
|
import uuid
|
||||||
from concurrent.futures import Future
|
from concurrent.futures import Future
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import Dict, Iterable, Optional, Union, Literal
|
from typing import Dict, Iterable, Optional, Union
|
||||||
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
from lance import json_to_schema
|
from lance import json_to_schema
|
||||||
|
|
||||||
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
|
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||||
from lancedb.merge import LanceMergeInsertBuilder
|
from lancedb.merge import LanceMergeInsertBuilder
|
||||||
from lancedb.query import LanceQueryBuilder
|
|
||||||
|
|
||||||
from ..query import LanceVectorQueryBuilder
|
from ..query import LanceVectorQueryBuilder
|
||||||
from ..table import Query, Table, _sanitize_data
|
from ..table import Query, Table, _sanitize_data
|
||||||
@@ -82,7 +81,6 @@ class RemoteTable(Table):
|
|||||||
def create_scalar_index(
|
def create_scalar_index(
|
||||||
self,
|
self,
|
||||||
column: str,
|
column: str,
|
||||||
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
|
|
||||||
):
|
):
|
||||||
"""Creates a scalar index
|
"""Creates a scalar index
|
||||||
Parameters
|
Parameters
|
||||||
@@ -91,6 +89,8 @@ class RemoteTable(Table):
|
|||||||
The column to be indexed. Must be a boolean, integer, float,
|
The column to be indexed. Must be a boolean, integer, float,
|
||||||
or string column.
|
or string column.
|
||||||
"""
|
"""
|
||||||
|
index_type = "scalar"
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"column": column,
|
"column": column,
|
||||||
"index_type": index_type,
|
"index_type": index_type,
|
||||||
@@ -228,21 +228,10 @@ class RemoteTable(Table):
|
|||||||
content_type=ARROW_STREAM_CONTENT_TYPE,
|
content_type=ARROW_STREAM_CONTENT_TYPE,
|
||||||
)
|
)
|
||||||
|
|
||||||
def query(
|
|
||||||
self,
|
|
||||||
query: Union[VEC, str] = None,
|
|
||||||
query_type: str = "vector",
|
|
||||||
vector_column_name: Optional[str] = None,
|
|
||||||
fast_search: bool = False,
|
|
||||||
) -> LanceVectorQueryBuilder:
|
|
||||||
return self.search(query, query_type, vector_column_name, fast_search)
|
|
||||||
|
|
||||||
def search(
|
def search(
|
||||||
self,
|
self,
|
||||||
query: Union[VEC, str] = None,
|
query: Union[VEC, str],
|
||||||
query_type: str = "vector",
|
|
||||||
vector_column_name: Optional[str] = None,
|
vector_column_name: Optional[str] = None,
|
||||||
fast_search: bool = False,
|
|
||||||
) -> LanceVectorQueryBuilder:
|
) -> LanceVectorQueryBuilder:
|
||||||
"""Create a search query to find the nearest neighbors
|
"""Create a search query to find the nearest neighbors
|
||||||
of the given query vector. We currently support [vector search][search]
|
of the given query vector. We currently support [vector search][search]
|
||||||
@@ -289,11 +278,6 @@ class RemoteTable(Table):
|
|||||||
- If the table has multiple vector columns then the *vector_column_name*
|
- If the table has multiple vector columns then the *vector_column_name*
|
||||||
needs to be specified. Otherwise, an error is raised.
|
needs to be specified. Otherwise, an error is raised.
|
||||||
|
|
||||||
fast_search: bool, optional
|
|
||||||
Skip a flat search of unindexed data. This may improve
|
|
||||||
search performance but search results will not include unindexed data.
|
|
||||||
|
|
||||||
- *default False*.
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
LanceQueryBuilder
|
LanceQueryBuilder
|
||||||
@@ -309,14 +293,7 @@ class RemoteTable(Table):
|
|||||||
"""
|
"""
|
||||||
if vector_column_name is None:
|
if vector_column_name is None:
|
||||||
vector_column_name = inf_vector_column_query(self.schema)
|
vector_column_name = inf_vector_column_query(self.schema)
|
||||||
|
return LanceVectorQueryBuilder(self, query, vector_column_name)
|
||||||
return LanceQueryBuilder.create(
|
|
||||||
self,
|
|
||||||
query,
|
|
||||||
query_type,
|
|
||||||
vector_column_name=vector_column_name,
|
|
||||||
fast_search=fast_search,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _execute_query(
|
def _execute_query(
|
||||||
self, query: Query, batch_size: Optional[int] = None
|
self, query: Query, batch_size: Optional[int] = None
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ class FakeLanceDBClient:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
|
def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
|
||||||
print(f"{query=}")
|
|
||||||
assert table_name == "test"
|
assert table_name == "test"
|
||||||
t = pa.schema([]).empty_table()
|
t = pa.schema([]).empty_table()
|
||||||
return VectorQueryResult(t)
|
return VectorQueryResult(t)
|
||||||
@@ -40,21 +39,3 @@ def test_remote_db():
|
|||||||
table = conn["test"]
|
table = conn["test"]
|
||||||
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
|
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
|
||||||
table.search([1.0, 2.0]).to_pandas()
|
table.search([1.0, 2.0]).to_pandas()
|
||||||
|
|
||||||
|
|
||||||
def test_empty_query_with_filter():
|
|
||||||
conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
|
|
||||||
setattr(conn, "_client", FakeLanceDBClient())
|
|
||||||
|
|
||||||
table = conn["test"]
|
|
||||||
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
|
|
||||||
print(table.query().select(["vector"]).where("foo == bar").to_arrow())
|
|
||||||
|
|
||||||
|
|
||||||
def test_fast_search_query_with_filter():
|
|
||||||
conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
|
|
||||||
setattr(conn, "_client", FakeLanceDBClient())
|
|
||||||
|
|
||||||
table = conn["test"]
|
|
||||||
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
|
|
||||||
print(table.query([0, 0], fast_search=True).select(["vector"]).where("foo == bar").to_arrow())
|
|
||||||
|
|||||||
@@ -735,7 +735,7 @@ def test_create_scalar_index(db):
|
|||||||
indices = table.to_lance().list_indices()
|
indices = table.to_lance().list_indices()
|
||||||
assert len(indices) == 1
|
assert len(indices) == 1
|
||||||
scalar_index = indices[0]
|
scalar_index = indices[0]
|
||||||
assert scalar_index["type"] == "BTree"
|
assert scalar_index["type"] == "Scalar"
|
||||||
|
|
||||||
# Confirm that prefiltering still works with the scalar index column
|
# Confirm that prefiltering still works with the scalar index column
|
||||||
results = table.search().where("x = 'c'").to_arrow()
|
results = table.search().where("x = 'c'").to_arrow()
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-node"
|
name = "lancedb-node"
|
||||||
version = "0.5.2-final.1"
|
version = "0.6.0"
|
||||||
description = "Serverless, low-latency vector database for AI applications"
|
description = "Serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.5.2-final.1"
|
version = "0.6.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -6,3 +6,12 @@
|
|||||||
LanceDB Rust SDK, a serverless vector database.
|
LanceDB Rust SDK, a serverless vector database.
|
||||||
|
|
||||||
Read more at: https://lancedb.com/
|
Read more at: https://lancedb.com/
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
> A transitive dependency of `lancedb` is `lzma-sys`, which uses dynamic linking
|
||||||
|
> by default. If you want to statically link `lzma-sys`, you should activate its
|
||||||
|
> `static` feature by adding the following to your dependencies:
|
||||||
|
>
|
||||||
|
> ```toml
|
||||||
|
> lzma-sys = { version = "*", features = ["static"] }
|
||||||
|
> ```
|
||||||
|
|||||||
@@ -1889,6 +1889,7 @@ impl TableInternal for NativeTable {
|
|||||||
}
|
}
|
||||||
columns.push(field.name.clone());
|
columns.push(field.name.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
let index_type = if is_vector {
|
let index_type = if is_vector {
|
||||||
crate::index::IndexType::IvfPq
|
crate::index::IndexType::IvfPq
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
Reference in New Issue
Block a user