Compare commits


7 Commits

Author SHA1 Message Date
albertlockett
dcfa17c9fc temporarily use local dependencies 2024-06-26 15:28:30 -03:00
Cory Grinstead
79a1667753 feat(nodejs): feature parity [6/N] - make public interface work with multiple arrow versions (#1392)
Previously we didn't have great compatibility with other versions of
Apache Arrow. This should bridge that gap a bit.


Depends on https://github.com/lancedb/lancedb/pull/1391.
See the actual diff here:
https://github.com/universalmind303/lancedb/compare/query-filter...universalmind303:arrow-compatibility
2024-06-25 11:10:08 -05:00
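
The change is easiest to see from the caller's side. A minimal sketch of what it enables (the `apache-arrow-old` package alias mirrors the one used in this PR's tests; the database path and data are illustrative):

```ts
// A table built with a *different* installed copy of apache-arrow than the
// one bundled with lancedb. `instanceof` checks against lancedb's own Arrow
// classes would fail for objects like this.
import { tableFromArrays } from "apache-arrow-old";
import { connect } from "@lancedb/lancedb";

const db = await connect("/tmp/arrow-compat-demo");

const table = tableFromArrays({
  id: Float64Array.from([1, 2, 3]),
});

// The public interface now accepts any TableLike (an object with `schema`
// and `batches` properties); sanitizeTable() rebuilds it with lancedb's own
// Arrow instance before the data is written.
await db.createTable("demo", table);
```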
Thomas J. Fan
a866b78a31 docs: fixes polars formatting in docs (#1400)
Currently, the whole Polars section is formatted as a code block:
https://lancedb.github.io/lancedb/guides/tables/#from-a-polars-dataframe

This PR fixes the formatting.
2024-06-25 08:46:16 -07:00
Will Jones
c7d37b3e6e docs: add tip about lzma linking (#1397)
Similar to https://github.com/lancedb/lance/pull/2505
2024-06-25 08:20:31 -07:00
Lance Release
4b71552b73 Updating package-lock.json 2024-06-25 00:26:08 +00:00
Lance Release
5ce5f64da3 Bump version: 0.6.0-beta.0 → 0.6.0 2024-06-25 00:25:45 +00:00
Lance Release
c582b0fc63 Bump version: 0.5.2 → 0.6.0-beta.0 2024-06-25 00:25:45 +00:00
33 changed files with 274 additions and 352 deletions

View File

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.5.2-final.1"
+current_version = "0.6.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.

View File

@@ -28,7 +28,7 @@ runs:
         args: ${{ inputs.args }}
         docker-options: "-e PIP_EXTRA_INDEX_URL=https://pypi.fury.io/lancedb/"
         working-directory: python
-    - uses: actions/upload-artifact@v4
+    - uses: actions/upload-artifact@v3
       with:
         name: windows-wheels
        path: python\target\wheels

View File

@@ -20,11 +20,18 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]

 [workspace.dependencies]
-lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
-lance-index = { "version" = "=0.13.0" }
-lance-linalg = { "version" = "=0.13.0" }
-lance-testing = { "version" = "=0.13.0" }
-lance-datafusion = { "version" = "=0.13.0" }
+# lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
+# lance-index = { "version" = "=0.13.0" }
+# lance-linalg = { "version" = "=0.13.0" }
+# lance-testing = { "version" = "=0.13.0" }
+# lance-datafusion = { "version" = "=0.13.0" }
+lance = { path = "../lance/rust/lance" }
+lance-index = { path = "../lance/rust/lance-index" }
+lance-linalg= { path = "../lance/rust/lance-linalg" }
+lance-testing = { path = "../lance/rust/lance-testing" }
+lance-datafusion = { path = "../lance/rust/lance-datafusion" }
 # Note that this one does not include pyarrow
 arrow = { version = "51.0", optional = false }
 arrow-array = "51.0"
@@ -35,7 +42,7 @@ arrow-schema = "51.0"
 arrow-arith = "51.0"
 arrow-cast = "51.0"
 async-trait = "0"
-chrono = "=0.4.39"
+chrono = "0.4.35"
 datafusion-physical-plan = "37.1"
 half = { "version" = "=2.4.1", default-features = false, features = [
     "num-traits",

View File

@@ -116,21 +116,21 @@ This guide will show how to create tables, insert data into them, and update the
 ### From a Polars DataFrame

 LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
 written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
 under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
 is on the way.

 ```python
 import polars as pl

 data = pl.DataFrame({
     "vector": [[3.1, 4.1], [5.9, 26.5]],
     "item": ["foo", "bar"],
     "price": [10.0, 20.0]
 })
 table = db.create_table("pl_table", data=data)
 ```

 ### From an Arrow Table

 === "Python"

View File

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.5.2",
+      "version": "0.6.0",
       "cpu": [
         "x64",
         "arm64"

View File

@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.5.2-final.1",
+  "version": "0.6.0",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

View File

@@ -39,7 +39,9 @@ describe.each([arrow, arrowOld])("Given a table", (arrow: any) => {
   let tmpDir: tmp.DirResult;
   let table: Table;

-  const schema = new arrow.Schema([
+  const schema:
+    | import("apache-arrow").Schema
+    | import("apache-arrow-old").Schema = new arrow.Schema([
     new arrow.Field("id", new arrow.Float64(), true),
   ]);
@@ -315,7 +317,7 @@ describe("When creating an index", () => {
       .query()
       .limit(2)
       .nearestTo(queryVec)
-      .distanceType("DoT")
+      .distanceType("dot")
       .toArrow();
     expect(rst.numRows).toBe(2);

View File

@@ -15,6 +15,7 @@
 import {
   Table as ArrowTable,
   Binary,
+  BufferType,
   DataType,
   Field,
   FixedSizeBinary,
@@ -37,14 +38,68 @@ import {
   type makeTable,
   vectorFromArray,
 } from "apache-arrow";
+import { Buffers } from "apache-arrow/data";
 import { type EmbeddingFunction } from "./embedding/embedding_function";
 import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
-import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
+import {
+  sanitizeField,
+  sanitizeSchema,
+  sanitizeTable,
+  sanitizeType,
+} from "./sanitize";

 export * from "apache-arrow";

+export type SchemaLike =
+  | Schema
+  | {
+      fields: FieldLike[];
+      metadata: Map<string, string>;
+      get names(): unknown[];
+    };
+export type FieldLike =
+  | Field
+  | {
+      type: string;
+      name: string;
+      nullable?: boolean;
+      metadata?: Map<string, string>;
+    };
+export type DataLike =
+  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+  | import("apache-arrow").Data<Struct<any>>
+  | {
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      type: any;
+      length: number;
+      offset: number;
+      stride: number;
+      nullable: boolean;
+      children: DataLike[];
+      get nullCount(): number;
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      values: Buffers<any>[BufferType.DATA];
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      typeIds: Buffers<any>[BufferType.TYPE];
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      nullBitmap: Buffers<any>[BufferType.VALIDITY];
+      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+      valueOffsets: Buffers<any>[BufferType.OFFSET];
+    };
+export type RecordBatchLike =
+  | RecordBatch
+  | {
+      schema: SchemaLike;
+      data: DataLike;
+    };
+export type TableLike =
+  | ArrowTable
+  | { schema: SchemaLike; batches: RecordBatchLike[] };
+
 export type IntoVector = Float32Array | Float64Array | number[];

-export function isArrowTable(value: object): value is ArrowTable {
+export function isArrowTable(value: object): value is TableLike {
   if (value instanceof ArrowTable) return true;
   return "schema" in value && "batches" in value;
 }
@@ -135,7 +190,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
 }

 /** Data type accepted by NodeJS SDK */
-export type Data = Record<string, unknown>[] | ArrowTable;
+export type Data = Record<string, unknown>[] | TableLike;

 /*
  * Options to control how a column should be converted to a vector array
@@ -162,7 +217,7 @@
    * The schema must be specified if there are no records (e.g. to make
    * an empty table)
    */
-  schema?: Schema;
+  schema?: SchemaLike;

   /*
    * Mapping from vector column name to expected type
@@ -310,7 +365,7 @@ export function makeArrowTable(
   if (opt.schema !== undefined && opt.schema !== null) {
     opt.schema = sanitizeSchema(opt.schema);
     opt.schema = validateSchemaEmbeddings(
-      opt.schema,
+      opt.schema as Schema,
       data,
       options?.embeddingFunction,
     );
@@ -394,7 +449,7 @@ export function makeArrowTable(
   // `new ArrowTable(schema, batches)` which does not do any schema inference
   const firstTable = new ArrowTable(columns);
   const batchesFixed = firstTable.batches.map(
-    (batch) => new RecordBatch(opt.schema!, batch.data),
+    (batch) => new RecordBatch(opt.schema as Schema, batch.data),
   );

   let schema: Schema;
   if (metadata !== undefined) {
@@ -407,9 +462,9 @@
       }
     }
-    schema = new Schema(opt.schema.fields, schemaMetadata);
+    schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
   } else {
-    schema = opt.schema;
+    schema = opt.schema as Schema;
   }
   return new ArrowTable(schema, batchesFixed);
 }
@@ -425,7 +480,7 @@ export function makeArrowTable(
  * Create an empty Arrow table with the provided schema
  */
 export function makeEmptyTable(
-  schema: Schema,
+  schema: SchemaLike,
   metadata?: Map<string, string>,
 ): ArrowTable {
   return makeArrowTable([], { schema }, metadata);
@@ -563,17 +618,16 @@ async function applyEmbeddingsFromMetadata(
 async function applyEmbeddings<T>(
   table: ArrowTable,
   embeddings?: EmbeddingFunctionConfig,
-  schema?: Schema,
+  schema?: SchemaLike,
 ): Promise<ArrowTable> {
-  if (schema?.metadata.has("embedding_functions")) {
-    return applyEmbeddingsFromMetadata(table, schema!);
-  } else if (embeddings == null || embeddings === undefined) {
-    return table;
-  }
   if (schema !== undefined && schema !== null) {
     schema = sanitizeSchema(schema);
   }
+  if (schema?.metadata.has("embedding_functions")) {
+    return applyEmbeddingsFromMetadata(table, schema! as Schema);
+  } else if (embeddings == null || embeddings === undefined) {
+    return table;
+  }

   // Convert from ArrowTable to Record<String, Vector>
   const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
@@ -650,7 +704,7 @@ async function applyEmbeddings<T>(
         `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
       );
     }
-    return alignTable(newTable, schema);
+    return alignTable(newTable, schema as Schema);
   }
   return newTable;
 }
@@ -744,7 +798,7 @@ export async function fromRecordsToStreamBuffer(
 export async function fromTableToBuffer(
   table: ArrowTable,
   embeddings?: EmbeddingFunctionConfig,
-  schema?: Schema,
+  schema?: SchemaLike,
 ): Promise<Buffer> {
   if (schema !== undefined && schema !== null) {
     schema = sanitizeSchema(schema);
@@ -771,7 +825,7 @@ export async function fromDataToBuffer(
     schema = sanitizeSchema(schema);
   }
   if (isArrowTable(data)) {
-    return fromTableToBuffer(data, embeddings, schema);
+    return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
   } else {
     const table = await convertToTable(data, embeddings, { schema });
     return fromTableToBuffer(table);
@@ -789,7 +843,7 @@ export async function fromDataToBuffer(
 export async function fromTableToStreamBuffer(
   table: ArrowTable,
   embeddings?: EmbeddingFunctionConfig,
-  schema?: Schema,
+  schema?: SchemaLike,
 ): Promise<Buffer> {
   const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
   const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
@@ -854,7 +908,6 @@ function validateSchemaEmbeddings(
   for (let field of schema.fields) {
     if (isFixedSizeList(field.type)) {
       field = sanitizeField(field);
-
       if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
         if (schema.metadata.has("embedding_functions")) {
           const embeddings = JSON.parse(

View File

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-import { Table as ArrowTable, Data, Schema } from "./arrow";
+import { Data, Schema, SchemaLike, TableLike } from "./arrow";
 import { fromTableToBuffer, makeEmptyTable } from "./arrow";
 import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
 import { Connection as LanceDbConnection } from "./native";
@@ -50,7 +50,7 @@ export interface CreateTableOptions {
    * The default is true while the new format is in beta
    */
   useLegacyFormat?: boolean;
-  schema?: Schema;
+  schema?: SchemaLike;
   embeddingFunction?: EmbeddingFunctionConfig;
 }
@@ -167,12 +167,12 @@ export abstract class Connection {
   /**
    * Creates a new Table and initialize it with new data.
    * @param {string} name - The name of the table.
-   * @param {Record<string, unknown>[] | ArrowTable} data - Non-empty Array of Records
+   * @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
    *   to be inserted into the table
    */
   abstract createTable(
     name: string,
-    data: Record<string, unknown>[] | ArrowTable,
+    data: Record<string, unknown>[] | TableLike,
     options?: Partial<CreateTableOptions>,
   ): Promise<Table>;
@@ -183,7 +183,7 @@ export abstract class Connection {
    */
   abstract createEmptyTable(
     name: string,
-    schema: Schema,
+    schema: import("./arrow").SchemaLike,
     options?: Partial<CreateTableOptions>,
   ): Promise<Table>;
@@ -235,7 +235,7 @@ export class LocalConnection extends Connection {
     nameOrOptions:
       | string
       | ({ name: string; data: Data } & Partial<CreateTableOptions>),
-    data?: Record<string, unknown>[] | ArrowTable,
+    data?: Record<string, unknown>[] | TableLike,
     options?: Partial<CreateTableOptions>,
   ): Promise<Table> {
     if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
@@ -259,7 +259,7 @@ export class LocalConnection extends Connection {
   async createEmptyTable(
     name: string,
-    schema: Schema,
+    schema: import("./arrow").SchemaLike,
     options?: Partial<CreateTableOptions>,
   ): Promise<Table> {
     let mode: string = options?.mode ?? "create";

View File

@@ -300,7 +300,9 @@ export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
    *
    * By default "l2" is used.
    */
-  distanceType(distanceType: string): VectorQuery {
+  distanceType(
+    distanceType: Required<IvfPqOptions>["distanceType"],
+  ): VectorQuery {
     this.inner.distanceType(distanceType);
     return this;
   }

View File

@@ -1,5 +1,10 @@
 import { Schema } from "apache-arrow";
-import { Data, fromTableToStreamBuffer, makeEmptyTable } from "../arrow";
+import {
+  Data,
+  SchemaLike,
+  fromTableToStreamBuffer,
+  makeEmptyTable,
+} from "../arrow";
 import {
   Connection,
   CreateTableOptions,
@@ -156,7 +161,7 @@ export class RemoteConnection extends Connection {
   async createEmptyTable(
     name: string,
-    schema: Schema,
+    schema: SchemaLike,
     options?: Partial<CreateTableOptions> | undefined,
   ): Promise<Table> {
     if (options?.mode) {

View File

@@ -20,10 +20,12 @@
 // comes from the exact same library instance. This is not always the case
 // and so we must sanitize the input to ensure that it is compatible.

+import { BufferType, Data } from "apache-arrow";
 import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
 import {
   Binary,
   Bool,
+  DataLike,
   DataType,
   DateDay,
   DateMillisecond,
@@ -56,9 +58,14 @@ import {
   Map_,
   Null,
   type Precision,
+  RecordBatch,
+  RecordBatchLike,
   Schema,
+  SchemaLike,
   SparseUnion,
   Struct,
+  Table,
+  TableLike,
   Time,
   TimeMicrosecond,
   TimeMillisecond,
@@ -488,7 +495,7 @@ export function sanitizeField(fieldLike: unknown): Field {
  * instance because they might be using a different instance of apache-arrow
  * than lancedb is using.
  */
-export function sanitizeSchema(schemaLike: unknown): Schema {
+export function sanitizeSchema(schemaLike: SchemaLike): Schema {
   if (schemaLike instanceof Schema) {
     return schemaLike;
   }
@@ -514,3 +521,68 @@ export function sanitizeSchema(schemaLike: unknown): Schema {
   );
   return new Schema(sanitizedFields, metadata);
 }
+
+export function sanitizeTable(tableLike: TableLike): Table {
+  if (tableLike instanceof Table) {
+    return tableLike;
+  }
+  if (typeof tableLike !== "object" || tableLike === null) {
+    throw Error("Expected a Table but object was null/undefined");
+  }
+  if (!("schema" in tableLike)) {
+    throw Error(
+      "The table passed in does not appear to be a table (no 'schema' property)",
+    );
+  }
+  if (!("batches" in tableLike)) {
+    throw Error(
+      "The table passed in does not appear to be a table (no 'columns' property)",
+    );
+  }
+  const schema = sanitizeSchema(tableLike.schema);
+  const batches = tableLike.batches.map(sanitizeRecordBatch);
+  return new Table(schema, batches);
+}
+
+function sanitizeRecordBatch(batchLike: RecordBatchLike): RecordBatch {
+  if (batchLike instanceof RecordBatch) {
+    return batchLike;
+  }
+  if (typeof batchLike !== "object" || batchLike === null) {
+    throw Error("Expected a RecordBatch but object was null/undefined");
+  }
+  if (!("schema" in batchLike)) {
+    throw Error(
+      "The record batch passed in does not appear to be a record batch (no 'schema' property)",
+    );
+  }
+  if (!("data" in batchLike)) {
+    throw Error(
+      "The record batch passed in does not appear to be a record batch (no 'data' property)",
+    );
+  }
+  const schema = sanitizeSchema(batchLike.schema);
+  const data = sanitizeData(batchLike.data);
+  return new RecordBatch(schema, data);
+}
+
+function sanitizeData(
+  dataLike: DataLike,
+  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+): import("apache-arrow").Data<Struct<any>> {
+  if (dataLike instanceof Data) {
+    return dataLike;
+  }
+  return new Data(
+    dataLike.type,
+    dataLike.offset,
+    dataLike.length,
+    dataLike.nullCount,
+    {
+      [BufferType.OFFSET]: dataLike.valueOffsets,
+      [BufferType.DATA]: dataLike.values,
+      [BufferType.VALIDITY]: dataLike.nullBitmap,
+      [BufferType.TYPE]: dataLike.typeIds,
+    },
+  );
+}

View File

@@ -17,6 +17,7 @@ import {
   Data,
   IntoVector,
   Schema,
+  TableLike,
   fromDataToBuffer,
   fromTableToBuffer,
   fromTableToStreamBuffer,
@@ -38,6 +39,8 @@ import {
   Table as _NativeTable,
 } from "./native";
 import { Query, VectorQuery } from "./query";
+import { sanitizeTable } from "./sanitize";
+export { IndexConfig } from "./native";

 /**
  * Options for adding data to a table.
@@ -381,8 +384,7 @@ export abstract class Table {
   abstract indexStats(name: string): Promise<IndexStatistics | undefined>;

   static async parseTableData(
-    // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-    data: Record<string, unknown>[] | ArrowTable<any>,
+    data: Record<string, unknown>[] | TableLike,
     options?: Partial<CreateTableOptions>,
     streaming = false,
   ) {
@@ -395,9 +397,9 @@
     let table: ArrowTable;
     if (isArrowTable(data)) {
-      table = data;
+      table = sanitizeTable(data);
     } else {
-      table = makeArrowTable(data, options);
+      table = makeArrowTable(data as Record<string, unknown>[], options);
     }
     if (streaming) {
       const buf = await fromTableToStreamBuffer(

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.5.2-final.1",
+  "version": "0.6.0",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.5.2-final.1",
+  "version": "0.6.0",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.5.2-final.1",
+  "version": "0.6.0",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.5.2-final.1",
+  "version": "0.6.0",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.5.2-final.1",
+  "version": "0.6.0",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",

View File

@@ -10,7 +10,7 @@
     "vector database",
     "ann"
   ],
-  "version": "0.5.2-final.1",
+  "version": "0.6.0",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.9.0-beta.8"
+current_version = "0.9.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.9.0-beta.8"
+version = "0.9.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -19,8 +19,6 @@ lancedb = { path = "../rust/lancedb" }
 env_logger = "0.10"
 pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
 pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
-base64ct = "=1.6.0" # workaround for https://github.com/RustCrypto/formats/issues/1684
-chrono = "=0.4.39"

 # Prevent dynamic linking of lzma, which comes from datafusion
 lzma-sys = { version = "*", features = ["static"] }

View File

@@ -13,7 +13,6 @@ dependencies = [
     "packaging",
     "cachetools",
     "overrides>=0.7",
-    "urllib3==1.26.19"
 ]
 description = "lancedb"
 authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]

View File

@@ -35,7 +35,6 @@ def connect(
     host_override: Optional[str] = None,
     read_consistency_interval: Optional[timedelta] = None,
     request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
-    storage_options: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> DBConnection:
     """Connect to a LanceDB database.
@@ -71,9 +70,6 @@ def connect(
         executor will be used for making requests. This is for LanceDB Cloud
         only and is only used when making batch requests (i.e., passing in
         multiple queries to the search method at once).
-    storage_options: dict, optional
-        Additional options for the storage backend. See available options at
-        https://lancedb.github.io/lancedb/guides/storage/

     Examples
     --------
@@ -109,16 +105,12 @@ def connect(
             region,
             host_override,
             request_thread_pool=request_thread_pool,
-            storage_options=storage_options,
             **kwargs,
         )
     if kwargs:
         raise ValueError(f"Unknown keyword arguments: {kwargs}")
-    return LanceDBConnection(
-        uri,
-        read_consistency_interval=read_consistency_interval,
-    )
+    return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)


 async def connect_async(
async def connect_async( async def connect_async(

View File

@@ -117,8 +117,6 @@ class Query(pydantic.BaseModel):
     with_row_id: bool = False

-    fast_search: bool = False
-

 class LanceQueryBuilder(ABC):
     """An abstract query builder. Subclasses are defined for vector search,
@@ -127,14 +125,12 @@ class LanceQueryBuilder(ABC):
     @classmethod
     def create(
         cls,
         table: "Table",
         query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
         query_type: str,
         vector_column_name: str,
-        ordering_field_name: Optional[str] = None,
-        fts_columns: Union[str, List[str]] = [],
-        fast_search: bool = False,
+        ordering_field_name: str = None,
     ) -> LanceQueryBuilder:
         """
         Create a query builder based on the given query and query type.
@@ -151,19 +147,14 @@ class LanceQueryBuilder(ABC):
             If "auto", the query type is inferred based on the query.
         vector_column_name: str
             The name of the vector column to use for vector search.
-        fast_search: bool
-            Skip flat search of unindexed data.
         """
-        # Check hybrid search first as it supports empty query pattern
-        if query_type == "hybrid":
-            # hybrid fts and vector query
-            return LanceHybridQueryBuilder(
-                table, query, vector_column_name, fts_columns=fts_columns
-            )
-
         if query is None:
             return LanceEmptyQueryBuilder(table)

+        if query_type == "hybrid":
+            # hybrid fts and vector query
+            return LanceHybridQueryBuilder(table, query, vector_column_name)
+
         # remember the string query for reranking purpose
         str_query = query if isinstance(query, str) else None
@@ -174,17 +165,12 @@ class LanceQueryBuilder(ABC):
         )

         if query_type == "hybrid":
-            return LanceHybridQueryBuilder(
-                table, query, vector_column_name, fts_columns=fts_columns
-            )
+            return LanceHybridQueryBuilder(table, query, vector_column_name)

         if isinstance(query, str):
             # fts
             return LanceFtsQueryBuilder(
-                table,
-                query,
-                ordering_field_name=ordering_field_name,
-                fts_columns=fts_columns,
+                table, query, ordering_field_name=ordering_field_name
             )

         if isinstance(query, list):
@@ -194,9 +180,7 @@ class LanceQueryBuilder(ABC):
         else:
             raise TypeError(f"Unsupported query type: {type(query)}")

-        return LanceVectorQueryBuilder(
-            table, query, vector_column_name, str_query, fast_search
-        )
+        return LanceVectorQueryBuilder(table, query, vector_column_name, str_query)

     @classmethod
     def _resolve_query(cls, table, query, query_type, vector_column_name):
@@ -212,6 +196,8 @@ class LanceQueryBuilder(ABC):
         elif query_type == "auto":
             if isinstance(query, (list, np.ndarray)):
                 return query, "vector"
+            if isinstance(query, tuple):
+                return query, "hybrid"
             else:
                 conf = table.embedding_functions.get(vector_column_name)
                 if conf is not None:
@@ -238,14 +224,9 @@ class LanceQueryBuilder(ABC):
     def __init__(self, table: "Table"):
         self._table = table
         self._limit = 10
-        self._offset = 0
         self._columns = None
         self._where = None
-        self._prefilter = False
         self._with_row_id = False
-        self._vector = None
-        self._text = None
-        self._ef = None

     @deprecation.deprecated(
         deprecated_in="0.3.1",
@@ -356,13 +337,11 @@
         ----------
         limit: int
             The maximum number of results to return.
-            The default query limit is 10 results.
-            For ANN/KNN queries, you must specify a limit.
-            Entering 0, a negative number, or None will reset
-            the limit to the default value of 10.
-            *WARNING* if you have a large dataset, setting
-            the limit to a large number, e.g. the table size,
-            can potentially result in reading a
+            By default the query is limited to the first 10.
+            Call this method and pass 0, a negative value,
+            or None to remove the limit.
+            *WARNING* if you have a large dataset, removing
+            the limit can potentially result in reading a
             large amount of data into memory and cause
             out of memory issues.
@@ -372,33 +351,11 @@
             The LanceQueryBuilder object.
         """
         if limit is None or limit <= 0:
-            if isinstance(self, LanceVectorQueryBuilder):
-                raise ValueError("Limit is required for ANN/KNN queries")
-            else:
-                self._limit = None
+            self._limit = None
         else:
             self._limit = limit
         return self

-    def offset(self, offset: int) -> LanceQueryBuilder:
-        """Set the offset for the results.
-
-        Parameters
-        ----------
-        offset: int
-            The offset to start fetching results from.
-
-        Returns
-        -------
-        LanceQueryBuilder
-            The LanceQueryBuilder object.
-        """
-        if offset is None or offset <= 0:
-            self._offset = 0
-        else:
-            self._offset = offset
-        return self
-
     def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
         """Set the columns to return.
@@ -460,80 +417,6 @@
         self._with_row_id = with_row_id
         return self

-    def explain_plan(self, verbose: Optional[bool] = False) -> str:
-        """Return the execution plan for this query.
-
-        Examples
-        --------
-        >>> import lancedb
-        >>> db = lancedb.connect("./.lancedb")
-        >>> table = db.create_table("my_table", [{"vector": [99, 99]}])
-        >>> query = [100, 100]
-        >>> plan = table.search(query).explain_plan(True)
-        >>> print(plan)  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
-        ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
-          GlobalLimitExec: skip=0, fetch=10
-            FilterExec: _distance@2 IS NOT NULL
-              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
-                KNNVectorDistance: metric=l2
-                  LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
-
-        Parameters
-        ----------
-        verbose : bool, default False
-            Use a verbose output format.
-
-        Returns
-        -------
-        plan : str
-        """  # noqa: E501
-        ds = self._table.to_lance()
-        return ds.scanner(
-            nearest={
-                "column": self._vector_column,
-                "q": self._query,
-                "k": self._limit,
-                "metric": self._metric,
-                "nprobes": self._nprobes,
-                "refine_factor": self._refine_factor,
-            },
-            prefilter=self._prefilter,
-            filter=self._str_query,
-            limit=self._limit,
-            with_row_id=self._with_row_id,
-            offset=self._offset,
-        ).explain_plan(verbose)
-
-    def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
-        """Set the vector to search for.
-
-        Parameters
-        ----------
-        vector: np.ndarray or list
-            The vector to search for.
-
-        Returns
-        -------
-        LanceQueryBuilder
-            The LanceQueryBuilder object.
-        """
-        raise NotImplementedError
-
-    def text(self, text: str) -> LanceQueryBuilder:
-        """Set the text to search for.
-
-        Parameters
-        ----------
-        text: str
-            The text to search for.
-
-        Returns
-        -------
-        LanceQueryBuilder
-            The LanceQueryBuilder object.
-        """
-        raise NotImplementedError


 class LanceVectorQueryBuilder(LanceQueryBuilder):
     """
@@ -557,12 +440,11 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
     """

     def __init__(
         self,
         table: "Table",
         query: Union[np.ndarray, list, "PIL.Image.Image"],
         vector_column: str,
         str_query: Optional[str] = None,
-        fast_search: bool = False,
     ):
         super().__init__(table)
         self._query = query
@@ -573,14 +455,13 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         self._prefilter = False
         self._reranker = None
         self._str_query = str_query
-        self._fast_search = fast_search

-    def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
+    def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
         """Set the distance metric to use.

         Parameters
         ----------
-        metric: "L2" or "cosine" or "dot"
+        metric: "L2" or "cosine"
             The distance metric to use. By default "L2" is used.

         Returns
@@ -588,7 +469,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         LanceVectorQueryBuilder
             The LanceQueryBuilder object.
         """
-        self._metric = metric.lower()
+        self._metric = metric
         return self

     def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
@@ -613,28 +494,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         self._nprobes = nprobes
         return self

-    def ef(self, ef: int) -> LanceVectorQueryBuilder:
-        """Set the number of candidates to consider during search.
-
-        Higher values will yield better recall (more likely to find vectors if
-        they exist) at the expense of latency.
-
-        This only applies to the HNSW-related index.
-        The default value is 1.5 * limit.
-
-        Parameters
-        ----------
-        ef: int
-            The number of candidates to consider during search.
-
-        Returns
-        -------
-        LanceVectorQueryBuilder
-            The LanceQueryBuilder object.
-        """
-        self._ef = ef
-        return self
-
     def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
         """Set the refine factor to use, increasing the number of vectors sampled.
@@ -695,11 +554,15 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
             refine_factor=self._refine_factor,
             vector_column=self._vector_column,
             with_row_id=self._with_row_id,
-            offset=self._offset,
-            fast_search=self._fast_search,
-            ef=self._ef,
         )
         result_set = self._table._execute_query(query, batch_size)
+        if self._reranker is not None:
+            rs_table = result_set.read_all()
+            result_set = self._reranker.rerank_vector(self._str_query, rs_table)
+            # convert result_set back to RecordBatchReader
+            result_set = pa.RecordBatchReader.from_batches(
+                result_set.schema, result_set.to_batches()
+            )

         return result_set
@@ -728,7 +591,7 @@
         return self

     def rerank(
         self, reranker: Reranker, query_string: Optional[str] = None
     ) -> LanceVectorQueryBuilder:
         """Rerank the results using the specified reranker.
@@ -893,34 +756,12 @@
 class LanceEmptyQueryBuilder(LanceQueryBuilder):
     def to_arrow(self) -> pa.Table:
-        return self.to_batches().read_all()
-
-    def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
-        query = Query(
+        ds = self._table.to_lance()
+        return ds.to_table(
             columns=self._columns,
             filter=self._where,
-            k=self._limit or 10,
-            with_row_id=self._with_row_id,
-            vector=[],
-            # not actually respected in remote query
-            offset=self._offset or 0,
+            limit=self._limit,
         )
-        return self._table._execute_query(query)
-
-    def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
-        """Rerank the results using the specified reranker.
-
-        Parameters
-        ----------
-        reranker: Reranker
-            The reranker to use.
-
-        Returns
-        -------
-        LanceEmptyQueryBuilder
-            The LanceQueryBuilder object.
-        """
-        raise NotImplementedError("Reranking is not yet supported.")


 class LanceHybridQueryBuilder(LanceQueryBuilder):

View File

@@ -55,13 +55,11 @@ class RestfulLanceDBClient:
     region: str
     api_key: Credential
     host_override: Optional[str] = attrs.field(default=None)
-    db_prefix: Optional[str] = attrs.field(default=None)

     closed: bool = attrs.field(default=False, init=False)

     connection_timeout: float = attrs.field(default=120.0, kw_only=True)
     read_timeout: float = attrs.field(default=300.0, kw_only=True)
-    storage_options: Optional[Dict[str, str]] = attrs.field(default=None, kw_only=True)

     @functools.cached_property
     def session(self) -> requests.Session:
@@ -94,18 +92,6 @@ class RestfulLanceDBClient:
             headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
         if self.host_override:
             headers["x-lancedb-database"] = self.db_name
-        if self.storage_options:
-            if self.storage_options.get("account_name") is not None:
-                headers["x-azure-storage-account-name"] = self.storage_options[
-                    "account_name"
-                ]
-            if self.storage_options.get("azure_storage_account_name") is not None:
-                headers["x-azure-storage-account-name"] = self.storage_options[
-                    "azure_storage_account_name"
-                ]
-        if self.db_prefix:
-            headers["x-lancedb-database-prefix"] = self.db_prefix
         return headers

     @staticmethod
@@ -172,7 +158,6 @@ class RestfulLanceDBClient:
             headers["content-type"] = content_type
         if request_id is not None:
             headers["x-request-id"] = request_id
-
         with self.session.post(
             urljoin(self.url, uri),
             headers=headers,
@@ -260,6 +245,7 @@ def retry_adapter(options: Dict[str, Any]) -> HTTPAdapter:
         connect=connect_retries,
         read=read_retries,
         backoff_factor=backoff_factor,
+        backoff_jitter=backoff_jitter,
         status_forcelist=statuses,
         allowed_methods=methods,
     )

View File

@@ -15,7 +15,7 @@ import inspect
 import logging
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Iterable, List, Optional, Union
 from urllib.parse import urlparse

 from cachetools import TTLCache
@@ -44,25 +44,20 @@ class RemoteDBConnection(DBConnection):
         request_thread_pool: Optional[ThreadPoolExecutor] = None,
         connection_timeout: float = 120.0,
         read_timeout: float = 300.0,
-        storage_options: Optional[Dict[str, str]] = None,
     ):
         """Connect to a remote LanceDB database."""
         parsed = urlparse(db_url)
         if parsed.scheme != "db":
             raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
         self.db_name = parsed.netloc
-        prefix = parsed.path.lstrip("/")
-        self.db_prefix = None if not prefix else prefix
         self.api_key = api_key
         self._client = RestfulLanceDBClient(
             self.db_name,
             region,
             api_key,
             host_override,
-            self.db_prefix,
             connection_timeout=connection_timeout,
             read_timeout=read_timeout,
-            storage_options=storage_options,
         )
         self._request_thread_pool = request_thread_pool
         self._table_cache = TTLCache(maxsize=10000, ttl=300)

View File

@@ -15,14 +15,13 @@ import logging
 import uuid
 from concurrent.futures import Future
 from functools import cached_property
-from typing import Dict, Iterable, Optional, Union, Literal
+from typing import Dict, Iterable, Optional, Union

 import pyarrow as pa
 from lance import json_to_schema

 from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
 from lancedb.merge import LanceMergeInsertBuilder
-from lancedb.query import LanceQueryBuilder

 from ..query import LanceVectorQueryBuilder
 from ..table import Query, Table, _sanitize_data
@@ -82,7 +81,6 @@ class RemoteTable(Table):
     def create_scalar_index(
         self,
         column: str,
-        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
     ):
         """Creates a scalar index

         Parameters
@@ -91,6 +89,8 @@ class RemoteTable(Table):
             The column to be indexed. Must be a boolean, integer, float,
             or string column.
         """
+        index_type = "scalar"
+
         data = {
             "column": column,
             "index_type": index_type,
@@ -228,21 +228,10 @@ class RemoteTable(Table):
             content_type=ARROW_STREAM_CONTENT_TYPE,
         )

-    def query(
-        self,
-        query: Union[VEC, str] = None,
-        query_type: str = "vector",
-        vector_column_name: Optional[str] = None,
-        fast_search: bool = False,
-    ) -> LanceVectorQueryBuilder:
-        return self.search(query, query_type, vector_column_name, fast_search)
-
     def search(
         self,
-        query: Union[VEC, str] = None,
-        query_type: str = "vector",
+        query: Union[VEC, str],
         vector_column_name: Optional[str] = None,
-        fast_search: bool = False,
     ) -> LanceVectorQueryBuilder:
         """Create a search query to find the nearest neighbors
         of the given query vector. We currently support [vector search][search]
@@ -289,11 +278,6 @@ class RemoteTable(Table):
         - If the table has multiple vector columns then the *vector_column_name*
           needs to be specified. Otherwise, an error is raised.

-        fast_search: bool, optional
-            Skip a flat search of unindexed data. This may improve
-            search performance but search results will not include unindexed data.
-
-            - *default False*.

         Returns
         -------
         LanceQueryBuilder
@@ -309,14 +293,7 @@ class RemoteTable(Table):
         """
         if vector_column_name is None:
             vector_column_name = inf_vector_column_query(self.schema)
-
-        return LanceQueryBuilder.create(
-            self,
-            query,
-            query_type,
-            vector_column_name=vector_column_name,
-            fast_search=fast_search,
-        )
+        return LanceVectorQueryBuilder(self, query, vector_column_name)

     def _execute_query(
         self, query: Query, batch_size: Optional[int] = None

View File

@@ -21,7 +21,6 @@ class FakeLanceDBClient:
         pass

     def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
-        print(f"{query=}")
         assert table_name == "test"
         t = pa.schema([]).empty_table()
         return VectorQueryResult(t)
@@ -40,21 +39,3 @@ def test_remote_db():
     table = conn["test"]
     table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
     table.search([1.0, 2.0]).to_pandas()
-
-
-def test_empty_query_with_filter():
-    conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
-    setattr(conn, "_client", FakeLanceDBClient())
-    table = conn["test"]
-    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
-    print(table.query().select(["vector"]).where("foo == bar").to_arrow())
-
-
-def test_fast_search_query_with_filter():
-    conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
-    setattr(conn, "_client", FakeLanceDBClient())
-    table = conn["test"]
-    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
-    print(table.query([0, 0], fast_search=True).select(["vector"]).where("foo == bar").to_arrow())

View File

@@ -735,7 +735,7 @@ def test_create_scalar_index(db):
     indices = table.to_lance().list_indices()
     assert len(indices) == 1
     scalar_index = indices[0]
-    assert scalar_index["type"] == "BTree"
+    assert scalar_index["type"] == "Scalar"

     # Confirm that prefiltering still works with the scalar index column
     results = table.search().where("x = 'c'").to_arrow()

View File

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.5.2-final.1"
+version = "0.6.0"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true

View File

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.5.2-final.1"
+version = "0.6.0"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true

View File

@@ -6,3 +6,12 @@
 LanceDB Rust SDK, a serverless vector database.

 Read more at: https://lancedb.com/
+
+> [!TIP]
+> A transitive dependency of `lancedb` is `lzma-sys`, which uses dynamic linking
+> by default. If you want to statically link `lzma-sys`, you should activate it's
+> `static` feature by adding the following to your dependencies:
+>
+> ```toml
+> lzma-sys = { version = "*", features = ["static"] }
+> ```

View File

@@ -1889,6 +1889,7 @@ impl TableInternal for NativeTable {
             }
             columns.push(field.name.clone());
         }
+
         let index_type = if is_vector {
             crate::index::IndexType::IvfPq
         } else {