Compare commits

..

3 Commits

Author SHA1 Message Date
ayush chaurasia
40ffe03cc8 format 2024-06-24 16:57:51 +05:30
ayush chaurasia
617ce3139b lint 2024-06-24 16:55:23 +05:30
ayush chaurasia
242bbe1897 use promote_options with concat_tables 2024-06-24 16:39:03 +05:30
26 changed files with 75 additions and 224 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.6.0"
current_version = "0.5.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -20,18 +20,13 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
[workspace.dependencies]
# lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
# lance-index = { "version" = "=0.13.0" }
# lance-linalg = { "version" = "=0.13.0" }
# lance-testing = { "version" = "=0.13.0" }
# lance-datafusion = { "version" = "=0.13.0" }
lance = { path = "../lance/rust/lance" }
lance-index = { path = "../lance/rust/lance-index" }
lance-linalg= { path = "../lance/rust/lance-linalg" }
lance-testing = { path = "../lance/rust/lance-testing" }
lance-datafusion = { path = "../lance/rust/lance-datafusion" }
lance = { "version" = "=0.12.2", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-index = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-linalg = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-testing = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
lance-datafusion = { "version" = "=0.12.2", git = "https://github.com/lancedb/lance.git", tag = "v0.12.2-beta.2" }
# Note that this one does not include pyarrow
arrow = { version = "51.0", optional = false }
arrow-array = "51.0"

View File

@@ -116,21 +116,21 @@ This guide will show how to create tables, insert data into them, and update the
### From a Polars DataFrame
LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
is on the way.
LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
is on the way.
```python
import polars as pl
```python
import polars as pl
data = pl.DataFrame({
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0]
})
table = db.create_table("pl_table", data=data)
```
data = pl.DataFrame({
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0]
})
table = db.create_table("pl_table", data=data)
```
### From an Arrow Table
=== "Python"

View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.6.0",
"version": "0.5.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.6.0",
"version": "0.5.2",
"cpu": [
"x64",
"arm64"

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.6.0",
"version": "0.5.2",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -39,9 +39,7 @@ describe.each([arrow, arrowOld])("Given a table", (arrow: any) => {
let tmpDir: tmp.DirResult;
let table: Table;
const schema:
| import("apache-arrow").Schema
| import("apache-arrow-old").Schema = new arrow.Schema([
const schema = new arrow.Schema([
new arrow.Field("id", new arrow.Float64(), true),
]);
@@ -317,7 +315,7 @@ describe("When creating an index", () => {
.query()
.limit(2)
.nearestTo(queryVec)
.distanceType("dot")
.distanceType("DoT")
.toArrow();
expect(rst.numRows).toBe(2);

View File

@@ -15,7 +15,6 @@
import {
Table as ArrowTable,
Binary,
BufferType,
DataType,
Field,
FixedSizeBinary,
@@ -38,68 +37,14 @@ import {
type makeTable,
vectorFromArray,
} from "apache-arrow";
import { Buffers } from "apache-arrow/data";
import { type EmbeddingFunction } from "./embedding/embedding_function";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import {
sanitizeField,
sanitizeSchema,
sanitizeTable,
sanitizeType,
} from "./sanitize";
import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
export * from "apache-arrow";
export type SchemaLike =
| Schema
| {
fields: FieldLike[];
metadata: Map<string, string>;
get names(): unknown[];
};
export type FieldLike =
| Field
| {
type: string;
name: string;
nullable?: boolean;
metadata?: Map<string, string>;
};
export type DataLike =
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
| import("apache-arrow").Data<Struct<any>>
| {
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
type: any;
length: number;
offset: number;
stride: number;
nullable: boolean;
children: DataLike[];
get nullCount(): number;
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
values: Buffers<any>[BufferType.DATA];
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
typeIds: Buffers<any>[BufferType.TYPE];
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
nullBitmap: Buffers<any>[BufferType.VALIDITY];
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
valueOffsets: Buffers<any>[BufferType.OFFSET];
};
export type RecordBatchLike =
| RecordBatch
| {
schema: SchemaLike;
data: DataLike;
};
export type TableLike =
| ArrowTable
| { schema: SchemaLike; batches: RecordBatchLike[] };
export type IntoVector = Float32Array | Float64Array | number[];
export function isArrowTable(value: object): value is TableLike {
export function isArrowTable(value: object): value is ArrowTable {
if (value instanceof ArrowTable) return true;
return "schema" in value && "batches" in value;
}
@@ -190,7 +135,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
}
/** Data type accepted by NodeJS SDK */
export type Data = Record<string, unknown>[] | TableLike;
export type Data = Record<string, unknown>[] | ArrowTable;
/*
* Options to control how a column should be converted to a vector array
@@ -217,7 +162,7 @@ export class MakeArrowTableOptions {
* The schema must be specified if there are no records (e.g. to make
* an empty table)
*/
schema?: SchemaLike;
schema?: Schema;
/*
* Mapping from vector column name to expected type
@@ -365,7 +310,7 @@ export function makeArrowTable(
if (opt.schema !== undefined && opt.schema !== null) {
opt.schema = sanitizeSchema(opt.schema);
opt.schema = validateSchemaEmbeddings(
opt.schema as Schema,
opt.schema,
data,
options?.embeddingFunction,
);
@@ -449,7 +394,7 @@ export function makeArrowTable(
// `new ArrowTable(schema, batches)` which does not do any schema inference
const firstTable = new ArrowTable(columns);
const batchesFixed = firstTable.batches.map(
(batch) => new RecordBatch(opt.schema as Schema, batch.data),
(batch) => new RecordBatch(opt.schema!, batch.data),
);
let schema: Schema;
if (metadata !== undefined) {
@@ -462,9 +407,9 @@ export function makeArrowTable(
}
}
schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
schema = new Schema(opt.schema.fields, schemaMetadata);
} else {
schema = opt.schema as Schema;
schema = opt.schema;
}
return new ArrowTable(schema, batchesFixed);
}
@@ -480,7 +425,7 @@ export function makeArrowTable(
* Create an empty Arrow table with the provided schema
*/
export function makeEmptyTable(
schema: SchemaLike,
schema: Schema,
metadata?: Map<string, string>,
): ArrowTable {
return makeArrowTable([], { schema }, metadata);
@@ -618,17 +563,18 @@ async function applyEmbeddingsFromMetadata(
async function applyEmbeddings<T>(
table: ArrowTable,
embeddings?: EmbeddingFunctionConfig,
schema?: SchemaLike,
schema?: Schema,
): Promise<ArrowTable> {
if (schema !== undefined && schema !== null) {
schema = sanitizeSchema(schema);
}
if (schema?.metadata.has("embedding_functions")) {
return applyEmbeddingsFromMetadata(table, schema! as Schema);
return applyEmbeddingsFromMetadata(table, schema!);
} else if (embeddings == null || embeddings === undefined) {
return table;
}
if (schema !== undefined && schema !== null) {
schema = sanitizeSchema(schema);
}
// Convert from ArrowTable to Record<String, Vector>
const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
const name = table.schema.fields[idx].name;
@@ -704,7 +650,7 @@ async function applyEmbeddings<T>(
`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
);
}
return alignTable(newTable, schema as Schema);
return alignTable(newTable, schema);
}
return newTable;
}
@@ -798,7 +744,7 @@ export async function fromRecordsToStreamBuffer(
export async function fromTableToBuffer(
table: ArrowTable,
embeddings?: EmbeddingFunctionConfig,
schema?: SchemaLike,
schema?: Schema,
): Promise<Buffer> {
if (schema !== undefined && schema !== null) {
schema = sanitizeSchema(schema);
@@ -825,7 +771,7 @@ export async function fromDataToBuffer(
schema = sanitizeSchema(schema);
}
if (isArrowTable(data)) {
return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
return fromTableToBuffer(data, embeddings, schema);
} else {
const table = await convertToTable(data, embeddings, { schema });
return fromTableToBuffer(table);
@@ -843,7 +789,7 @@ export async function fromDataToBuffer(
export async function fromTableToStreamBuffer(
table: ArrowTable,
embeddings?: EmbeddingFunctionConfig,
schema?: SchemaLike,
schema?: Schema,
): Promise<Buffer> {
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
@@ -908,6 +854,7 @@ function validateSchemaEmbeddings(
for (let field of schema.fields) {
if (isFixedSizeList(field.type)) {
field = sanitizeField(field);
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
if (schema.metadata.has("embedding_functions")) {
const embeddings = JSON.parse(

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import { Data, Schema, SchemaLike, TableLike } from "./arrow";
import { Table as ArrowTable, Data, Schema } from "./arrow";
import { fromTableToBuffer, makeEmptyTable } from "./arrow";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { Connection as LanceDbConnection } from "./native";
@@ -50,7 +50,7 @@ export interface CreateTableOptions {
* The default is true while the new format is in beta
*/
useLegacyFormat?: boolean;
schema?: SchemaLike;
schema?: Schema;
embeddingFunction?: EmbeddingFunctionConfig;
}
@@ -167,12 +167,12 @@ export abstract class Connection {
/**
* Creates a new Table and initializes it with new data.
* @param {string} name - The name of the table.
* @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
* @param {Record<string, unknown>[] | ArrowTable} data - Non-empty Array of Records
* to be inserted into the table
*/
abstract createTable(
name: string,
data: Record<string, unknown>[] | TableLike,
data: Record<string, unknown>[] | ArrowTable,
options?: Partial<CreateTableOptions>,
): Promise<Table>;
@@ -183,7 +183,7 @@ export abstract class Connection {
*/
abstract createEmptyTable(
name: string,
schema: import("./arrow").SchemaLike,
schema: Schema,
options?: Partial<CreateTableOptions>,
): Promise<Table>;
@@ -235,7 +235,7 @@ export class LocalConnection extends Connection {
nameOrOptions:
| string
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
data?: Record<string, unknown>[] | TableLike,
data?: Record<string, unknown>[] | ArrowTable,
options?: Partial<CreateTableOptions>,
): Promise<Table> {
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
@@ -259,7 +259,7 @@ export class LocalConnection extends Connection {
async createEmptyTable(
name: string,
schema: import("./arrow").SchemaLike,
schema: Schema,
options?: Partial<CreateTableOptions>,
): Promise<Table> {
let mode: string = options?.mode ?? "create";

View File

@@ -300,9 +300,7 @@ export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
*
* By default "l2" is used.
*/
distanceType(
distanceType: Required<IvfPqOptions>["distanceType"],
): VectorQuery {
distanceType(distanceType: string): VectorQuery {
this.inner.distanceType(distanceType);
return this;
}

View File

@@ -1,10 +1,5 @@
import { Schema } from "apache-arrow";
import {
Data,
SchemaLike,
fromTableToStreamBuffer,
makeEmptyTable,
} from "../arrow";
import { Data, fromTableToStreamBuffer, makeEmptyTable } from "../arrow";
import {
Connection,
CreateTableOptions,
@@ -161,7 +156,7 @@ export class RemoteConnection extends Connection {
async createEmptyTable(
name: string,
schema: SchemaLike,
schema: Schema,
options?: Partial<CreateTableOptions> | undefined,
): Promise<Table> {
if (options?.mode) {

View File

@@ -20,12 +20,10 @@
// comes from the exact same library instance. This is not always the case
// and so we must sanitize the input to ensure that it is compatible.
import { BufferType, Data } from "apache-arrow";
import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
import {
Binary,
Bool,
DataLike,
DataType,
DateDay,
DateMillisecond,
@@ -58,14 +56,9 @@ import {
Map_,
Null,
type Precision,
RecordBatch,
RecordBatchLike,
Schema,
SchemaLike,
SparseUnion,
Struct,
Table,
TableLike,
Time,
TimeMicrosecond,
TimeMillisecond,
@@ -495,7 +488,7 @@ export function sanitizeField(fieldLike: unknown): Field {
* instance because they might be using a different instance of apache-arrow
* than lancedb is using.
*/
export function sanitizeSchema(schemaLike: SchemaLike): Schema {
export function sanitizeSchema(schemaLike: unknown): Schema {
if (schemaLike instanceof Schema) {
return schemaLike;
}
@@ -521,68 +514,3 @@ export function sanitizeSchema(schemaLike: SchemaLike): Schema {
);
return new Schema(sanitizedFields, metadata);
}
/**
 * Convert a table-like value into a `Table` from the apache-arrow instance
 * this module imports.
 *
 * A value that is already a `Table` is returned unchanged. Any other value
 * must be a non-null object exposing `schema` and `batches`; the schema is
 * passed through `sanitizeSchema`, every batch through `sanitizeRecordBatch`,
 * and a new `Table` is built from the results.
 *
 * @param tableLike - A `Table`, or an object with `schema` and `batches`.
 * @returns A `Table` built from the sanitized schema and batches.
 * @throws Error when the value is not an object, is null, or is missing the
 *   `schema` or `batches` property.
 */
export function sanitizeTable(tableLike: TableLike): Table {
  if (tableLike instanceof Table) {
    return tableLike;
  }
  if (typeof tableLike !== "object" || tableLike === null) {
    throw Error("Expected a Table but object was null/undefined");
  }
  if (!("schema" in tableLike)) {
    throw Error(
      "The table passed in does not appear to be a table (no 'schema' property)",
    );
  }
  if (!("batches" in tableLike)) {
    // The message names the property that is actually checked here.
    throw Error(
      "The table passed in does not appear to be a table (no 'batches' property)",
    );
  }
  const schema = sanitizeSchema(tableLike.schema);
  const batches = tableLike.batches.map(sanitizeRecordBatch);
  return new Table(schema, batches);
}
/**
 * Convert a record-batch-like value into a `RecordBatch` from the
 * apache-arrow instance this module imports.
 *
 * An existing `RecordBatch` is returned as-is. Otherwise the value must be a
 * non-null object with `schema` and `data` properties, which are run through
 * `sanitizeSchema` and `sanitizeData` respectively.
 *
 * @param batchLike - A `RecordBatch`, or an object with `schema` and `data`.
 * @returns A `RecordBatch` built from the sanitized schema and data.
 * @throws Error when the value is not an object, is null, or is missing the
 *   `schema` or `data` property.
 */
function sanitizeRecordBatch(batchLike: RecordBatchLike): RecordBatch {
  if (batchLike instanceof RecordBatch) {
    return batchLike;
  }

  if (!(typeof batchLike === "object" && batchLike !== null)) {
    throw Error("Expected a RecordBatch but object was null/undefined");
  }

  const hasSchema = "schema" in batchLike;
  if (!hasSchema) {
    throw Error(
      "The record batch passed in does not appear to be a record batch (no 'schema' property)",
    );
  }

  const hasData = "data" in batchLike;
  if (!hasData) {
    throw Error(
      "The record batch passed in does not appear to be a record batch (no 'data' property)",
    );
  }

  // Arguments evaluate left to right: schema first, then data.
  return new RecordBatch(
    sanitizeSchema(batchLike.schema),
    sanitizeData(batchLike.data),
  );
}
/**
 * Convert a data-like value into a `Data` from the apache-arrow instance this
 * module imports.
 *
 * An existing `Data` is returned unchanged. Otherwise a new `Data` is
 * constructed from the value's `type`, `offset`, `length`, `nullCount` and its
 * four buffers (`valueOffsets`, `values`, `nullBitmap`, `typeIds`).
 *
 * NOTE(review): `dataLike.children` is not forwarded to the `Data`
 * constructor, and `dataLike.type` is passed through without sanitizing —
 * confirm whether that is intended for struct/nested data.
 *
 * @param dataLike - A `Data`, or an object exposing the same fields.
 * @returns A `Data` built from the fields listed above.
 */
function sanitizeData(
  dataLike: DataLike,
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
): import("apache-arrow").Data<Struct<any>> {
  if (dataLike instanceof Data) {
    return dataLike;
  }

  // Properties are read in the same order the constructor call consumes them.
  const {
    type,
    offset,
    length,
    nullCount,
    valueOffsets,
    values,
    nullBitmap,
    typeIds,
  } = dataLike;

  const buffers = {
    [BufferType.OFFSET]: valueOffsets,
    [BufferType.DATA]: values,
    [BufferType.VALIDITY]: nullBitmap,
    [BufferType.TYPE]: typeIds,
  };

  return new Data(type, offset, length, nullCount, buffers);
}

View File

@@ -17,7 +17,6 @@ import {
Data,
IntoVector,
Schema,
TableLike,
fromDataToBuffer,
fromTableToBuffer,
fromTableToStreamBuffer,
@@ -39,8 +38,6 @@ import {
Table as _NativeTable,
} from "./native";
import { Query, VectorQuery } from "./query";
import { sanitizeTable } from "./sanitize";
export { IndexConfig } from "./native";
/**
* Options for adding data to a table.
@@ -384,7 +381,8 @@ export abstract class Table {
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
static async parseTableData(
data: Record<string, unknown>[] | TableLike,
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
data: Record<string, unknown>[] | ArrowTable<any>,
options?: Partial<CreateTableOptions>,
streaming = false,
) {
@@ -397,9 +395,9 @@ export abstract class Table {
let table: ArrowTable;
if (isArrowTable(data)) {
table = sanitizeTable(data);
table = data;
} else {
table = makeArrowTable(data as Record<string, unknown>[], options);
table = makeArrowTable(data, options);
}
if (streaming) {
const buf = await fromTableToStreamBuffer(

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.6.0",
"version": "0.5.2",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.6.0",
"version": "0.5.2",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.6.0",
"version": "0.5.2",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.6.0",
"version": "0.5.2",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.6.0",
"version": "0.5.2",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -10,7 +10,7 @@
"vector database",
"ann"
],
"version": "0.6.0",
"version": "0.5.2",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.9.0"
current_version = "0.8.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.9.0"
version = "0.8.2"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.13.0",
"pylance==0.12.2-beta.2",
"ratelimiter~=1.0",
"requests>=2.31.0",
"retry>=0.9.2",

View File

@@ -119,7 +119,9 @@ class Reranker(ABC):
fts_results : pa.Table
The results from the FTS search
"""
combined = pa.concat_tables([vector_results, fts_results], promote=True)
combined = pa.concat_tables(
[vector_results, fts_results], promote_options="default"
)
row_id = combined.column("_rowid")
# deduplicate

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.6.0"
version = "0.5.2"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.6.0"
version = "0.5.2"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -6,12 +6,3 @@
LanceDB Rust SDK, a serverless vector database.
Read more at: https://lancedb.com/
> [!TIP]
> A transitive dependency of `lancedb` is `lzma-sys`, which uses dynamic linking
> by default. If you want to statically link `lzma-sys`, you should activate its
> `static` feature by adding the following to your dependencies:
>
> ```toml
> lzma-sys = { version = "*", features = ["static"] }
> ```

View File

@@ -1889,7 +1889,6 @@ impl TableInternal for NativeTable {
}
columns.push(field.name.clone());
}
let index_type = if is_vector {
crate::index::IndexType::IvfPq
} else {