Compare commits

...

7 Commits

Author SHA1 Message Date
albertlockett
dcfa17c9fc temporarily use local dependencies 2024-06-26 15:28:30 -03:00
Cory Grinstead
79a1667753 feat(nodejs): feature parity [6/N] - make public interface work with multiple arrow versions (#1392)
Previously we didn't have great compatibility with other versions of
apache arrow. This should bridge that gap a bit.


depends on https://github.com/lancedb/lancedb/pull/1391
see actual diff here
https://github.com/universalmind303/lancedb/compare/query-filter...universalmind303:arrow-compatibility
2024-06-25 11:10:08 -05:00
Thomas J. Fan
a866b78a31 docs: fixes polars formatting in docs (#1400)
Currently, the whole polars section is formatted as a code block:
https://lancedb.github.io/lancedb/guides/tables/#from-a-polars-dataframe

This PR fixes the formatting.
2024-06-25 08:46:16 -07:00
Will Jones
c7d37b3e6e docs: add tip about lzma linking (#1397)
Similar to https://github.com/lancedb/lance/pull/2505
2024-06-25 08:20:31 -07:00
Lance Release
4b71552b73 Updating package-lock.json 2024-06-25 00:26:08 +00:00
Lance Release
5ce5f64da3 Bump version: 0.6.0-beta.0 → 0.6.0 2024-06-25 00:25:45 +00:00
Lance Release
c582b0fc63 Bump version: 0.5.2 → 0.6.0-beta.0 2024-06-25 00:25:45 +00:00
22 changed files with 221 additions and 68 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.5.2"
current_version = "0.6.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -20,11 +20,18 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
[workspace.dependencies]
lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.13.0" }
lance-linalg = { "version" = "=0.13.0" }
lance-testing = { "version" = "=0.13.0" }
lance-datafusion = { "version" = "=0.13.0" }
# lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
# lance-index = { "version" = "=0.13.0" }
# lance-linalg = { "version" = "=0.13.0" }
# lance-testing = { "version" = "=0.13.0" }
# lance-datafusion = { "version" = "=0.13.0" }
lance = { path = "../lance/rust/lance" }
lance-index = { path = "../lance/rust/lance-index" }
lance-linalg= { path = "../lance/rust/lance-linalg" }
lance-testing = { path = "../lance/rust/lance-testing" }
lance-datafusion = { path = "../lance/rust/lance-datafusion" }
# Note that this one does not include pyarrow
arrow = { version = "51.0", optional = false }
arrow-array = "51.0"

View File

@@ -116,21 +116,21 @@ This guide will show how to create tables, insert data into them, and update the
### From a Polars DataFrame
LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
is on the way.
LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
is on the way.
```python
import polars as pl
```python
import polars as pl
data = pl.DataFrame({
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0]
})
table = db.create_table("pl_table", data=data)
```
data = pl.DataFrame({
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0]
})
table = db.create_table("pl_table", data=data)
```
### From an Arrow Table
=== "Python"

View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.5.2",
"version": "0.6.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.5.2",
"version": "0.6.0",
"cpu": [
"x64",
"arm64"

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.5.2",
"version": "0.6.0",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -39,7 +39,9 @@ describe.each([arrow, arrowOld])("Given a table", (arrow: any) => {
let tmpDir: tmp.DirResult;
let table: Table;
const schema = new arrow.Schema([
const schema:
| import("apache-arrow").Schema
| import("apache-arrow-old").Schema = new arrow.Schema([
new arrow.Field("id", new arrow.Float64(), true),
]);
@@ -315,7 +317,7 @@ describe("When creating an index", () => {
.query()
.limit(2)
.nearestTo(queryVec)
.distanceType("DoT")
.distanceType("dot")
.toArrow();
expect(rst.numRows).toBe(2);

View File

@@ -15,6 +15,7 @@
import {
Table as ArrowTable,
Binary,
BufferType,
DataType,
Field,
FixedSizeBinary,
@@ -37,14 +38,68 @@ import {
type makeTable,
vectorFromArray,
} from "apache-arrow";
import { Buffers } from "apache-arrow/data";
import { type EmbeddingFunction } from "./embedding/embedding_function";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
import {
sanitizeField,
sanitizeSchema,
sanitizeTable,
sanitizeType,
} from "./sanitize";
export * from "apache-arrow";
/**
 * A schema accepted by the public API: either a `Schema` from the
 * apache-arrow instance imported by this module, or a structurally similar
 * object exposing `fields`, `metadata` and a `names` getter.
 */
export type SchemaLike =
| Schema
| {
fields: FieldLike[];
metadata: Map<string, string>;
get names(): unknown[];
};
/**
 * A field accepted by the public API: either a `Field` from the apache-arrow
 * instance imported by this module, or a plain object with a `name`, a `type`
 * given as a string, and optional `nullable` / `metadata` entries.
 */
export type FieldLike =
| Field
| {
type: string;
name: string;
nullable?: boolean;
metadata?: Map<string, string>;
};
/**
 * Column data accepted by the public API: either a struct-typed `Data` from
 * the apache-arrow instance imported by this module, or an object exposing
 * the same descriptive fields (`type`, `length`, `offset`, `stride`,
 * `nullable`, `nullCount`), its `children`, and the four underlying buffers
 * (`values`, `typeIds`, `nullBitmap`, `valueOffsets`).
 */
export type DataLike =
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
| import("apache-arrow").Data<Struct<any>>
| {
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
type: any;
length: number;
offset: number;
stride: number;
nullable: boolean;
children: DataLike[];
get nullCount(): number;
// Buffer slot typed as BufferType.DATA.
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
values: Buffers<any>[BufferType.DATA];
// Buffer slot typed as BufferType.TYPE.
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
typeIds: Buffers<any>[BufferType.TYPE];
// Buffer slot typed as BufferType.VALIDITY.
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
nullBitmap: Buffers<any>[BufferType.VALIDITY];
// Buffer slot typed as BufferType.OFFSET.
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
valueOffsets: Buffers<any>[BufferType.OFFSET];
};
/**
 * A record batch accepted by the public API: either a `RecordBatch` from the
 * apache-arrow instance imported by this module, or any object carrying a
 * `schema` (SchemaLike) and `data` (DataLike).
 */
export type RecordBatchLike =
| RecordBatch
| {
schema: SchemaLike;
data: DataLike;
};
/**
 * A table accepted by the public API: either an `ArrowTable` from the
 * apache-arrow instance imported by this module, or any object carrying a
 * `schema` (SchemaLike) and a list of `batches` (RecordBatchLike).
 */
export type TableLike =
| ArrowTable
| { schema: SchemaLike; batches: RecordBatchLike[] };
/**
 * Numeric array forms (32-bit or 64-bit float typed arrays, or a plain
 * number array). Presumably the inputs accepted wherever a vector value is
 * expected; confirm against callers.
 */
export type IntoVector = Float32Array | Float64Array | number[];
export function isArrowTable(value: object): value is ArrowTable {
export function isArrowTable(value: object): value is TableLike {
if (value instanceof ArrowTable) return true;
return "schema" in value && "batches" in value;
}
@@ -135,7 +190,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
}
/** Data type accepted by NodeJS SDK */
export type Data = Record<string, unknown>[] | ArrowTable;
export type Data = Record<string, unknown>[] | TableLike;
/*
* Options to control how a column should be converted to a vector array
@@ -162,7 +217,7 @@ export class MakeArrowTableOptions {
* The schema must be specified if there are no records (e.g. to make
* an empty table)
*/
schema?: Schema;
schema?: SchemaLike;
/*
* Mapping from vector column name to expected type
@@ -310,7 +365,7 @@ export function makeArrowTable(
if (opt.schema !== undefined && opt.schema !== null) {
opt.schema = sanitizeSchema(opt.schema);
opt.schema = validateSchemaEmbeddings(
opt.schema,
opt.schema as Schema,
data,
options?.embeddingFunction,
);
@@ -394,7 +449,7 @@ export function makeArrowTable(
// `new ArrowTable(schema, batches)` which does not do any schema inference
const firstTable = new ArrowTable(columns);
const batchesFixed = firstTable.batches.map(
(batch) => new RecordBatch(opt.schema!, batch.data),
(batch) => new RecordBatch(opt.schema as Schema, batch.data),
);
let schema: Schema;
if (metadata !== undefined) {
@@ -407,9 +462,9 @@ export function makeArrowTable(
}
}
schema = new Schema(opt.schema.fields, schemaMetadata);
schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
} else {
schema = opt.schema;
schema = opt.schema as Schema;
}
return new ArrowTable(schema, batchesFixed);
}
@@ -425,7 +480,7 @@ export function makeArrowTable(
* Create an empty Arrow table with the provided schema
*/
export function makeEmptyTable(
schema: Schema,
schema: SchemaLike,
metadata?: Map<string, string>,
): ArrowTable {
return makeArrowTable([], { schema }, metadata);
@@ -563,17 +618,16 @@ async function applyEmbeddingsFromMetadata(
async function applyEmbeddings<T>(
table: ArrowTable,
embeddings?: EmbeddingFunctionConfig,
schema?: Schema,
schema?: SchemaLike,
): Promise<ArrowTable> {
if (schema?.metadata.has("embedding_functions")) {
return applyEmbeddingsFromMetadata(table, schema!);
} else if (embeddings == null || embeddings === undefined) {
return table;
}
if (schema !== undefined && schema !== null) {
schema = sanitizeSchema(schema);
}
if (schema?.metadata.has("embedding_functions")) {
return applyEmbeddingsFromMetadata(table, schema! as Schema);
} else if (embeddings == null || embeddings === undefined) {
return table;
}
// Convert from ArrowTable to Record<String, Vector>
const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
@@ -650,7 +704,7 @@ async function applyEmbeddings<T>(
`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
);
}
return alignTable(newTable, schema);
return alignTable(newTable, schema as Schema);
}
return newTable;
}
@@ -744,7 +798,7 @@ export async function fromRecordsToStreamBuffer(
export async function fromTableToBuffer(
table: ArrowTable,
embeddings?: EmbeddingFunctionConfig,
schema?: Schema,
schema?: SchemaLike,
): Promise<Buffer> {
if (schema !== undefined && schema !== null) {
schema = sanitizeSchema(schema);
@@ -771,7 +825,7 @@ export async function fromDataToBuffer(
schema = sanitizeSchema(schema);
}
if (isArrowTable(data)) {
return fromTableToBuffer(data, embeddings, schema);
return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
} else {
const table = await convertToTable(data, embeddings, { schema });
return fromTableToBuffer(table);
@@ -789,7 +843,7 @@ export async function fromDataToBuffer(
export async function fromTableToStreamBuffer(
table: ArrowTable,
embeddings?: EmbeddingFunctionConfig,
schema?: Schema,
schema?: SchemaLike,
): Promise<Buffer> {
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
@@ -854,7 +908,6 @@ function validateSchemaEmbeddings(
for (let field of schema.fields) {
if (isFixedSizeList(field.type)) {
field = sanitizeField(field);
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
if (schema.metadata.has("embedding_functions")) {
const embeddings = JSON.parse(

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import { Table as ArrowTable, Data, Schema } from "./arrow";
import { Data, Schema, SchemaLike, TableLike } from "./arrow";
import { fromTableToBuffer, makeEmptyTable } from "./arrow";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { Connection as LanceDbConnection } from "./native";
@@ -50,7 +50,7 @@ export interface CreateTableOptions {
* The default is true while the new format is in beta
*/
useLegacyFormat?: boolean;
schema?: Schema;
schema?: SchemaLike;
embeddingFunction?: EmbeddingFunctionConfig;
}
@@ -167,12 +167,12 @@ export abstract class Connection {
/**
* Creates a new Table and initialize it with new data.
* @param {string} name - The name of the table.
* @param {Record<string, unknown>[] | ArrowTable} data - Non-empty Array of Records
* @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
* to be inserted into the table
*/
abstract createTable(
name: string,
data: Record<string, unknown>[] | ArrowTable,
data: Record<string, unknown>[] | TableLike,
options?: Partial<CreateTableOptions>,
): Promise<Table>;
@@ -183,7 +183,7 @@ export abstract class Connection {
*/
abstract createEmptyTable(
name: string,
schema: Schema,
schema: import("./arrow").SchemaLike,
options?: Partial<CreateTableOptions>,
): Promise<Table>;
@@ -235,7 +235,7 @@ export class LocalConnection extends Connection {
nameOrOptions:
| string
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
data?: Record<string, unknown>[] | ArrowTable,
data?: Record<string, unknown>[] | TableLike,
options?: Partial<CreateTableOptions>,
): Promise<Table> {
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
@@ -259,7 +259,7 @@ export class LocalConnection extends Connection {
async createEmptyTable(
name: string,
schema: Schema,
schema: import("./arrow").SchemaLike,
options?: Partial<CreateTableOptions>,
): Promise<Table> {
let mode: string = options?.mode ?? "create";

View File

@@ -300,7 +300,9 @@ export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
*
* By default "l2" is used.
*/
distanceType(distanceType: string): VectorQuery {
distanceType(
distanceType: Required<IvfPqOptions>["distanceType"],
): VectorQuery {
this.inner.distanceType(distanceType);
return this;
}

View File

@@ -1,5 +1,10 @@
import { Schema } from "apache-arrow";
import { Data, fromTableToStreamBuffer, makeEmptyTable } from "../arrow";
import {
Data,
SchemaLike,
fromTableToStreamBuffer,
makeEmptyTable,
} from "../arrow";
import {
Connection,
CreateTableOptions,
@@ -156,7 +161,7 @@ export class RemoteConnection extends Connection {
async createEmptyTable(
name: string,
schema: Schema,
schema: SchemaLike,
options?: Partial<CreateTableOptions> | undefined,
): Promise<Table> {
if (options?.mode) {

View File

@@ -20,10 +20,12 @@
// comes from the exact same library instance. This is not always the case
// and so we must sanitize the input to ensure that it is compatible.
import { BufferType, Data } from "apache-arrow";
import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
import {
Binary,
Bool,
DataLike,
DataType,
DateDay,
DateMillisecond,
@@ -56,9 +58,14 @@ import {
Map_,
Null,
type Precision,
RecordBatch,
RecordBatchLike,
Schema,
SchemaLike,
SparseUnion,
Struct,
Table,
TableLike,
Time,
TimeMicrosecond,
TimeMillisecond,
@@ -488,7 +495,7 @@ export function sanitizeField(fieldLike: unknown): Field {
* instance because they might be using a different instance of apache-arrow
* than lancedb is using.
*/
export function sanitizeSchema(schemaLike: unknown): Schema {
export function sanitizeSchema(schemaLike: SchemaLike): Schema {
if (schemaLike instanceof Schema) {
return schemaLike;
}
@@ -514,3 +521,68 @@ export function sanitizeSchema(schemaLike: unknown): Schema {
);
return new Schema(sanitizedFields, metadata);
}
/**
 * Convert a table-like object into a `Table` built with the apache-arrow
 * instance imported by this module.
 *
 * A value that is already an instance of this module's `Table` is returned
 * unchanged. Otherwise the value must be a non-null object with `schema` and
 * `batches` properties; the schema is passed through `sanitizeSchema` and
 * every batch through `sanitizeRecordBatch` before a new `Table` is built.
 *
 * @param tableLike - a `Table` or an object exposing `schema` and `batches`.
 * @returns a `Table` from this module's apache-arrow instance.
 * @throws Error if the value is not an object, or lacks `schema` or `batches`.
 */
export function sanitizeTable(tableLike: TableLike): Table {
  if (tableLike instanceof Table) {
    return tableLike;
  }
  if (typeof tableLike !== "object" || tableLike === null) {
    throw Error("Expected a Table but object was null/undefined");
  }
  if (!("schema" in tableLike)) {
    throw Error(
      "The table passed in does not appear to be a table (no 'schema' property)",
    );
  }
  if (!("batches" in tableLike)) {
    // The message names the property actually checked ('batches'); it
    // previously said 'columns', which this function never inspects.
    throw Error(
      "The table passed in does not appear to be a table (no 'batches' property)",
    );
  }
  const schema = sanitizeSchema(tableLike.schema);
  const batches = tableLike.batches.map(sanitizeRecordBatch);
  return new Table(schema, batches);
}
/**
 * Rebuild a record-batch-like object as a `RecordBatch` from the apache-arrow
 * instance imported by this module.
 *
 * Values that are already instances of this module's `RecordBatch` pass
 * through untouched. Anything else must be a non-null object exposing
 * `schema` and `data`, which are sanitized individually and combined into a
 * new `RecordBatch`.
 *
 * @param batchLike - a `RecordBatch` or an object exposing `schema` and `data`.
 * @returns a `RecordBatch` from this module's apache-arrow instance.
 * @throws Error if the value is not an object, or lacks `schema` or `data`.
 */
function sanitizeRecordBatch(batchLike: RecordBatchLike): RecordBatch {
  if (batchLike instanceof RecordBatch) {
    return batchLike;
  }

  const isNonNullObject = typeof batchLike === "object" && batchLike !== null;
  if (!isNonNullObject) {
    throw new Error("Expected a RecordBatch but object was null/undefined");
  }

  const hasSchema = "schema" in batchLike;
  if (!hasSchema) {
    throw new Error(
      "The record batch passed in does not appear to be a record batch (no 'schema' property)",
    );
  }

  const hasData = "data" in batchLike;
  if (!hasData) {
    throw new Error(
      "The record batch passed in does not appear to be a record batch (no 'data' property)",
    );
  }

  // Schema is sanitized before data, matching the order the properties are read.
  const cleanSchema = sanitizeSchema(batchLike.schema);
  const cleanData = sanitizeData(batchLike.data);
  return new RecordBatch(cleanSchema, cleanData);
}
/**
 * Rebuild a data-like object as a `Data` from the apache-arrow instance
 * imported by this module.
 *
 * Values that are already instances of this module's `Data` are returned
 * as-is. Otherwise a new `Data` is constructed from the input's `type`,
 * `offset`, `length`, `nullCount` and its four buffers.
 *
 * NOTE(review): `children` is declared on `DataLike` but is not forwarded to
 * the `Data` constructor here, and `type` is passed through without
 * sanitizing. Confirm this is intended for struct-typed input.
 *
 * @param dataLike - a `Data` or an object exposing the same fields and buffers.
 * @returns a `Data` from this module's apache-arrow instance.
 */
function sanitizeData(
  dataLike: DataLike,
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
): import("apache-arrow").Data<Struct<any>> {
  if (dataLike instanceof Data) {
    return dataLike;
  }

  // Properties are read in the same order the constructor arguments use them.
  const {
    type,
    offset,
    length,
    nullCount,
    valueOffsets,
    values,
    nullBitmap,
    typeIds,
  } = dataLike;

  const buffers = {
    [BufferType.OFFSET]: valueOffsets,
    [BufferType.DATA]: values,
    [BufferType.VALIDITY]: nullBitmap,
    [BufferType.TYPE]: typeIds,
  };

  return new Data(type, offset, length, nullCount, buffers);
}

View File

@@ -17,6 +17,7 @@ import {
Data,
IntoVector,
Schema,
TableLike,
fromDataToBuffer,
fromTableToBuffer,
fromTableToStreamBuffer,
@@ -38,6 +39,8 @@ import {
Table as _NativeTable,
} from "./native";
import { Query, VectorQuery } from "./query";
import { sanitizeTable } from "./sanitize";
export { IndexConfig } from "./native";
/**
* Options for adding data to a table.
@@ -381,8 +384,7 @@ export abstract class Table {
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
static async parseTableData(
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
data: Record<string, unknown>[] | ArrowTable<any>,
data: Record<string, unknown>[] | TableLike,
options?: Partial<CreateTableOptions>,
streaming = false,
) {
@@ -395,9 +397,9 @@ export abstract class Table {
let table: ArrowTable;
if (isArrowTable(data)) {
table = data;
table = sanitizeTable(data);
} else {
table = makeArrowTable(data, options);
table = makeArrowTable(data as Record<string, unknown>[], options);
}
if (streaming) {
const buf = await fromTableToStreamBuffer(

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.5.2",
"version": "0.6.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.5.2",
"version": "0.6.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.5.2",
"version": "0.6.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.5.2",
"version": "0.6.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.5.2",
"version": "0.6.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -10,7 +10,7 @@
"vector database",
"ann"
],
"version": "0.5.2",
"version": "0.6.0",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.5.2"
version = "0.6.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.5.2"
version = "0.6.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -6,3 +6,12 @@
LanceDB Rust SDK, a serverless vector database.
Read more at: https://lancedb.com/
> [!TIP]
> A transitive dependency of `lancedb` is `lzma-sys`, which uses dynamic linking
> by default. If you want to statically link `lzma-sys`, you should activate its
> `static` feature by adding the following to your dependencies:
>
> ```toml
> lzma-sys = { version = "*", features = ["static"] }
> ```

View File

@@ -1889,6 +1889,7 @@ impl TableInternal for NativeTable {
}
columns.push(field.name.clone());
}
let index_type = if is_vector {
crate::index::IndexType::IvfPq
} else {