Bump version: 0.9.0-beta.3 → 0.9.0-beta.4

bump version
Fix fast_search
2025-12-24 13:59:58 +00:00 · 2024-12-20 16:02:53 +00:00 · 2024-12-20 09:44:46 -06:00 · 2024-12-20 09:43:39 -06:00 · 2024-12-20 09:21:05 -06:00 · 2024-12-18 10:21:04 -06:00
33 changed files with 341 additions and 261 deletions
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.6.0"
+current_version = "0.5.2"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -13,7 +13,6 @@ theme:
    # Palette toggle for light mode
    - scheme: lancedb
      primary: custom
-      accent: custom
      toggle:
        icon: material/weather-night
        name: Switch to dark mode
--- a/docs/src/guides/tables.md
+++ b/docs/src/guides/tables.md
@@ -116,21 +116,21 @@ This guide will show how to create tables, insert data into them, and update the

 ### From a Polars DataFrame

-LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
-written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
-under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
-is on the way.
+    LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library
+    written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow
+    under the hood. A deeper integration between LanceDB Tables and Polars DataFrames
+    is on the way.

-```python
-import polars as pl
+    ```python
+    import polars as pl

-data = pl.DataFrame({
-    "vector": [[3.1, 4.1], [5.9, 26.5]],
-    "item": ["foo", "bar"],
-    "price": [10.0, 20.0]
-})
-table = db.create_table("pl_table", data=data)
-```
+    data = pl.DataFrame({
+        "vector": [[3.1, 4.1], [5.9, 26.5]],
+        "item": ["foo", "bar"],
+        "price": [10.0, 20.0]
+    })
+    table = db.create_table("pl_table", data=data)
+    ```

 ### From an Arrow Table
 === "Python"
--- a/docs/src/styles/global.css
+++ b/docs/src/styles/global.css
@@ -1,16 +1,13 @@
 :root {
-    --md-primary-fg-color: #241F21;
-    --md-default-bg-color: #FAF5F0;
+    --md-primary-fg-color: #625eff;
    --md-text-font: "IBMPlexSans", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
    --md-code-font: "IBMPlexMono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
 }

 [data-md-color-scheme="slate"] {
    --md-hue: 210;
-    
 }

-
@font-face {
    font-family: "IBMPlexSans";
    src: local("IBMPlexSans"), url("fonts/IBMPlexSans-Regular.woff2");
--- a/node/package-lock.json
+++ b/node/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "vectordb",
-  "version": "0.6.0",
+  "version": "0.5.2",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "vectordb",
-      "version": "0.6.0",
+      "version": "0.5.2",
      "cpu": [
        "x64",
        "arm64"
--- a/node/package.json
+++ b/node/package.json
@@ -1,6 +1,6 @@
 {
  "name": "vectordb",
-  "version": "0.6.0",
+  "version": "0.5.2",
  "description": " Serverless, low-latency vector database for AI applications",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
--- a/nodejs/test/table.test.ts
+++ b/nodejs/test/table.test.ts
@@ -39,9 +39,7 @@ describe.each([arrow, arrowOld])("Given a table", (arrow: any) => {
  let tmpDir: tmp.DirResult;
  let table: Table;

-  const schema:
-    | import("apache-arrow").Schema
-    | import("apache-arrow-old").Schema = new arrow.Schema([
+  const schema = new arrow.Schema([
    new arrow.Field("id", new arrow.Float64(), true),
  ]);

@@ -317,7 +315,7 @@ describe("When creating an index", () => {
      .query()
      .limit(2)
      .nearestTo(queryVec)
-      .distanceType("dot")
+      .distanceType("DoT")
      .toArrow();
    expect(rst.numRows).toBe(2);

--- a/nodejs/lancedb/arrow.ts
+++ b/nodejs/lancedb/arrow.ts
@@ -15,7 +15,6 @@
 import {
  Table as ArrowTable,
  Binary,
-  BufferType,
  DataType,
  Field,
  FixedSizeBinary,
@@ -38,68 +37,14 @@ import {
  type makeTable,
  vectorFromArray,
 } from "apache-arrow";
-import { Buffers } from "apache-arrow/data";
 import { type EmbeddingFunction } from "./embedding/embedding_function";
 import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
-import {
-  sanitizeField,
-  sanitizeSchema,
-  sanitizeTable,
-  sanitizeType,
-} from "./sanitize";
+import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
 export * from "apache-arrow";
-export type SchemaLike =
-  | Schema
-  | {
-      fields: FieldLike[];
-      metadata: Map<string, string>;
-      get names(): unknown[];
-    };
-export type FieldLike =
-  | Field
-  | {
-      type: string;
-      name: string;
-      nullable?: boolean;
-      metadata?: Map<string, string>;
-    };
-
-export type DataLike =
-  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-  | import("apache-arrow").Data<Struct<any>>
-  | {
-      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-      type: any;
-      length: number;
-      offset: number;
-      stride: number;
-      nullable: boolean;
-      children: DataLike[];
-      get nullCount(): number;
-      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-      values: Buffers<any>[BufferType.DATA];
-      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-      typeIds: Buffers<any>[BufferType.TYPE];
-      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-      nullBitmap: Buffers<any>[BufferType.VALIDITY];
-      // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-      valueOffsets: Buffers<any>[BufferType.OFFSET];
-    };
-
-export type RecordBatchLike =
-  | RecordBatch
-  | {
-      schema: SchemaLike;
-      data: DataLike;
-    };
-
-export type TableLike =
-  | ArrowTable
-  | { schema: SchemaLike; batches: RecordBatchLike[] };

 export type IntoVector = Float32Array | Float64Array | number[];

-export function isArrowTable(value: object): value is TableLike {
+export function isArrowTable(value: object): value is ArrowTable {
  if (value instanceof ArrowTable) return true;
  return "schema" in value && "batches" in value;
 }
@@ -190,7 +135,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
 }

 /** Data type accepted by NodeJS SDK */
-export type Data = Record<string, unknown>[] | TableLike;
+export type Data = Record<string, unknown>[] | ArrowTable;

 /*
 * Options to control how a column should be converted to a vector array
@@ -217,7 +162,7 @@ export class MakeArrowTableOptions {
   * The schema must be specified if there are no records (e.g. to make
   * an empty table)
   */
-  schema?: SchemaLike;
+  schema?: Schema;

  /*
   * Mapping from vector column name to expected type
@@ -365,7 +310,7 @@ export function makeArrowTable(
  if (opt.schema !== undefined && opt.schema !== null) {
    opt.schema = sanitizeSchema(opt.schema);
    opt.schema = validateSchemaEmbeddings(
-      opt.schema as Schema,
+      opt.schema,
      data,
      options?.embeddingFunction,
    );
@@ -449,7 +394,7 @@ export function makeArrowTable(
    // `new ArrowTable(schema, batches)` which does not do any schema inference
    const firstTable = new ArrowTable(columns);
    const batchesFixed = firstTable.batches.map(
-      (batch) => new RecordBatch(opt.schema as Schema, batch.data),
+      (batch) => new RecordBatch(opt.schema!, batch.data),
    );
    let schema: Schema;
    if (metadata !== undefined) {
@@ -462,9 +407,9 @@ export function makeArrowTable(
        }
      }

-      schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
+      schema = new Schema(opt.schema.fields, schemaMetadata);
    } else {
-      schema = opt.schema as Schema;
+      schema = opt.schema;
    }
    return new ArrowTable(schema, batchesFixed);
  }
@@ -480,7 +425,7 @@ export function makeArrowTable(
 * Create an empty Arrow table with the provided schema
 */
 export function makeEmptyTable(
-  schema: SchemaLike,
+  schema: Schema,
  metadata?: Map<string, string>,
 ): ArrowTable {
  return makeArrowTable([], { schema }, metadata);
@@ -618,17 +563,18 @@ async function applyEmbeddingsFromMetadata(
 async function applyEmbeddings<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunctionConfig,
-  schema?: SchemaLike,
+  schema?: Schema,
 ): Promise<ArrowTable> {
-  if (schema !== undefined && schema !== null) {
-    schema = sanitizeSchema(schema);
-  }
  if (schema?.metadata.has("embedding_functions")) {
-    return applyEmbeddingsFromMetadata(table, schema! as Schema);
+    return applyEmbeddingsFromMetadata(table, schema!);
  } else if (embeddings == null || embeddings === undefined) {
    return table;
  }

+  if (schema !== undefined && schema !== null) {
+    schema = sanitizeSchema(schema);
+  }
+
  // Convert from ArrowTable to Record<String, Vector>
  const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
    const name = table.schema.fields[idx].name;
@@ -704,7 +650,7 @@ async function applyEmbeddings<T>(
        `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
      );
    }
-    return alignTable(newTable, schema as Schema);
+    return alignTable(newTable, schema);
  }
  return newTable;
 }
@@ -798,7 +744,7 @@ export async function fromRecordsToStreamBuffer(
 export async function fromTableToBuffer(
  table: ArrowTable,
  embeddings?: EmbeddingFunctionConfig,
-  schema?: SchemaLike,
+  schema?: Schema,
 ): Promise<Buffer> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
@@ -825,7 +771,7 @@ export async function fromDataToBuffer(
    schema = sanitizeSchema(schema);
  }
  if (isArrowTable(data)) {
-    return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
+    return fromTableToBuffer(data, embeddings, schema);
  } else {
    const table = await convertToTable(data, embeddings, { schema });
    return fromTableToBuffer(table);
@@ -843,7 +789,7 @@ export async function fromDataToBuffer(
 export async function fromTableToStreamBuffer(
  table: ArrowTable,
  embeddings?: EmbeddingFunctionConfig,
-  schema?: SchemaLike,
+  schema?: Schema,
 ): Promise<Buffer> {
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
  const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
@@ -908,6 +854,7 @@ function validateSchemaEmbeddings(
  for (let field of schema.fields) {
    if (isFixedSizeList(field.type)) {
      field = sanitizeField(field);
+
      if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
        if (schema.metadata.has("embedding_functions")) {
          const embeddings = JSON.parse(
--- a/nodejs/lancedb/connection.ts
+++ b/nodejs/lancedb/connection.ts
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-import { Data, Schema, SchemaLike, TableLike } from "./arrow";
+import { Table as ArrowTable, Data, Schema } from "./arrow";
 import { fromTableToBuffer, makeEmptyTable } from "./arrow";
 import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
 import { Connection as LanceDbConnection } from "./native";
@@ -50,7 +50,7 @@ export interface CreateTableOptions {
   * The default is true while the new format is in beta
   */
  useLegacyFormat?: boolean;
-  schema?: SchemaLike;
+  schema?: Schema;
  embeddingFunction?: EmbeddingFunctionConfig;
 }

@@ -167,12 +167,12 @@ export abstract class Connection {
  /**
   * Creates a new Table and initialize it with new data.
   * @param {string} name - The name of the table.
-   * @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
+   * @param {Record<string, unknown>[] | ArrowTable} data - Non-empty Array of Records
   * to be inserted into the table
   */
  abstract createTable(
    name: string,
-    data: Record<string, unknown>[] | TableLike,
+    data: Record<string, unknown>[] | ArrowTable,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table>;

@@ -183,7 +183,7 @@ export abstract class Connection {
   */
  abstract createEmptyTable(
    name: string,
-    schema: import("./arrow").SchemaLike,
+    schema: Schema,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table>;

@@ -235,7 +235,7 @@ export class LocalConnection extends Connection {
    nameOrOptions:
      | string
      | ({ name: string; data: Data } & Partial<CreateTableOptions>),
-    data?: Record<string, unknown>[] | TableLike,
+    data?: Record<string, unknown>[] | ArrowTable,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table> {
    if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
@@ -259,7 +259,7 @@ export class LocalConnection extends Connection {

  async createEmptyTable(
    name: string,
-    schema: import("./arrow").SchemaLike,
+    schema: Schema,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table> {
    let mode: string = options?.mode ?? "create";
--- a/nodejs/lancedb/query.ts
+++ b/nodejs/lancedb/query.ts
@@ -300,9 +300,7 @@ export class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
   *
   * By default "l2" is used.
   */
-  distanceType(
-    distanceType: Required<IvfPqOptions>["distanceType"],
-  ): VectorQuery {
+  distanceType(distanceType: string): VectorQuery {
    this.inner.distanceType(distanceType);
    return this;
  }
--- a/nodejs/lancedb/remote/connection.ts
+++ b/nodejs/lancedb/remote/connection.ts
@@ -1,10 +1,5 @@
 import { Schema } from "apache-arrow";
-import {
-  Data,
-  SchemaLike,
-  fromTableToStreamBuffer,
-  makeEmptyTable,
-} from "../arrow";
+import { Data, fromTableToStreamBuffer, makeEmptyTable } from "../arrow";
 import {
  Connection,
  CreateTableOptions,
@@ -161,7 +156,7 @@ export class RemoteConnection extends Connection {

  async createEmptyTable(
    name: string,
-    schema: SchemaLike,
+    schema: Schema,
    options?: Partial<CreateTableOptions> | undefined,
  ): Promise<Table> {
    if (options?.mode) {
--- a/nodejs/lancedb/sanitize.ts
+++ b/nodejs/lancedb/sanitize.ts
@@ -20,12 +20,10 @@
 // comes from the exact same library instance.  This is not always the case
 // and so we must sanitize the input to ensure that it is compatible.

-import { BufferType, Data } from "apache-arrow";
 import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
 import {
  Binary,
  Bool,
-  DataLike,
  DataType,
  DateDay,
  DateMillisecond,
@@ -58,14 +56,9 @@ import {
  Map_,
  Null,
  type Precision,
-  RecordBatch,
-  RecordBatchLike,
  Schema,
-  SchemaLike,
  SparseUnion,
  Struct,
-  Table,
-  TableLike,
  Time,
  TimeMicrosecond,
  TimeMillisecond,
@@ -495,7 +488,7 @@ export function sanitizeField(fieldLike: unknown): Field {
 * instance because they might be using a different instance of apache-arrow
 * than lancedb is using.
 */
-export function sanitizeSchema(schemaLike: SchemaLike): Schema {
+export function sanitizeSchema(schemaLike: unknown): Schema {
  if (schemaLike instanceof Schema) {
    return schemaLike;
  }
@@ -521,68 +514,3 @@ export function sanitizeSchema(schemaLike: SchemaLike): Schema {
  );
  return new Schema(sanitizedFields, metadata);
 }
-
-export function sanitizeTable(tableLike: TableLike): Table {
-  if (tableLike instanceof Table) {
-    return tableLike;
-  }
-  if (typeof tableLike !== "object" || tableLike === null) {
-    throw Error("Expected a Table but object was null/undefined");
-  }
-  if (!("schema" in tableLike)) {
-    throw Error(
-      "The table passed in does not appear to be a table (no 'schema' property)",
-    );
-  }
-  if (!("batches" in tableLike)) {
-    throw Error(
-      "The table passed in does not appear to be a table (no 'columns' property)",
-    );
-  }
-  const schema = sanitizeSchema(tableLike.schema);
-
-  const batches = tableLike.batches.map(sanitizeRecordBatch);
-  return new Table(schema, batches);
-}
-
-function sanitizeRecordBatch(batchLike: RecordBatchLike): RecordBatch {
-  if (batchLike instanceof RecordBatch) {
-    return batchLike;
-  }
-  if (typeof batchLike !== "object" || batchLike === null) {
-    throw Error("Expected a RecordBatch but object was null/undefined");
-  }
-  if (!("schema" in batchLike)) {
-    throw Error(
-      "The record batch passed in does not appear to be a record batch (no 'schema' property)",
-    );
-  }
-  if (!("data" in batchLike)) {
-    throw Error(
-      "The record batch passed in does not appear to be a record batch (no 'data' property)",
-    );
-  }
-  const schema = sanitizeSchema(batchLike.schema);
-  const data = sanitizeData(batchLike.data);
-  return new RecordBatch(schema, data);
-}
-function sanitizeData(
-  dataLike: DataLike,
-  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
-): import("apache-arrow").Data<Struct<any>> {
-  if (dataLike instanceof Data) {
-    return dataLike;
-  }
-  return new Data(
-    dataLike.type,
-    dataLike.offset,
-    dataLike.length,
-    dataLike.nullCount,
-    {
-      [BufferType.OFFSET]: dataLike.valueOffsets,
-      [BufferType.DATA]: dataLike.values,
-      [BufferType.VALIDITY]: dataLike.nullBitmap,
-      [BufferType.TYPE]: dataLike.typeIds,
-    },
-  );
-}
--- a/nodejs/lancedb/table.ts
+++ b/nodejs/lancedb/table.ts
@@ -17,7 +17,6 @@ import {
  Data,
  IntoVector,
  Schema,
-  TableLike,
  fromDataToBuffer,
  fromTableToBuffer,
  fromTableToStreamBuffer,
@@ -39,8 +38,6 @@ import {
  Table as _NativeTable,
 } from "./native";
 import { Query, VectorQuery } from "./query";
-import { sanitizeTable } from "./sanitize";
-export { IndexConfig } from "./native";

 /**
 * Options for adding data to a table.
@@ -384,7 +381,8 @@ export abstract class Table {
  abstract indexStats(name: string): Promise<IndexStatistics | undefined>;

  static async parseTableData(
-    data: Record<string, unknown>[] | TableLike,
+    // biome-ignore lint/suspicious/noExplicitAny: <explanation>
+    data: Record<string, unknown>[] | ArrowTable<any>,
    options?: Partial<CreateTableOptions>,
    streaming = false,
  ) {
@@ -397,9 +395,9 @@ export abstract class Table {

    let table: ArrowTable;
    if (isArrowTable(data)) {
-      table = sanitizeTable(data);
+      table = data;
    } else {
-      table = makeArrowTable(data as Record<string, unknown>[], options);
+      table = makeArrowTable(data, options);
    }
    if (streaming) {
      const buf = await fromTableToStreamBuffer(
--- a/nodejs/npm/darwin-arm64/package.json
+++ b/nodejs/npm/darwin-arm64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-arm64",
-	"version": "0.6.0",
+	"version": "0.5.2",
 	"os": ["darwin"],
 	"cpu": ["arm64"],
 	"main": "lancedb.darwin-arm64.node",
--- a/nodejs/npm/darwin-x64/package.json
+++ b/nodejs/npm/darwin-x64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-x64",
-	"version": "0.6.0",
+	"version": "0.5.2",
 	"os": ["darwin"],
 	"cpu": ["x64"],
 	"main": "lancedb.darwin-x64.node",
--- a/nodejs/npm/linux-arm64-gnu/package.json
+++ b/nodejs/npm/linux-arm64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-arm64-gnu",
-	"version": "0.6.0",
+	"version": "0.5.2",
 	"os": ["linux"],
 	"cpu": ["arm64"],
 	"main": "lancedb.linux-arm64-gnu.node",
--- a/nodejs/npm/linux-x64-gnu/package.json
+++ b/nodejs/npm/linux-x64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-x64-gnu",
-	"version": "0.6.0",
+	"version": "0.5.2",
 	"os": ["linux"],
 	"cpu": ["x64"],
 	"main": "lancedb.linux-x64-gnu.node",
--- a/nodejs/npm/win32-x64-msvc/package.json
+++ b/nodejs/npm/win32-x64-msvc/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-win32-x64-msvc",
-	"version": "0.6.0",
+	"version": "0.5.2",
 	"os": ["win32"],
 	"cpu": ["x64"],
 	"main": "lancedb.win32-x64-msvc.node",
--- a/nodejs/package.json
+++ b/nodejs/package.json
@@ -10,7 +10,7 @@
    "vector database",
    "ann"
  ],
-  "version": "0.6.0",
+  "version": "0.5.2",
  "main": "dist/index.js",
  "exports": {
    ".": "./dist/index.js",
--- a/python/.bumpversion.toml
+++ b/python/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.9.0"
+current_version = "0.9.0-beta.4"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.9.0"
+version = "0.9.0-beta.4"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
    "packaging",
    "cachetools",
    "overrides>=0.7",
+    "urllib3==1.26.19"
 ]
 description = "lancedb"
 authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
--- a/python/python/lancedb/init.py
+++ b/python/python/lancedb/init.py
@@ -35,6 +35,7 @@ def connect(
    host_override: Optional[str] = None,
    read_consistency_interval: Optional[timedelta] = None,
    request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
+    storage_options: Optional[Dict[str, str]] = None,
    **kwargs,
 ) -> DBConnection:
    """Connect to a LanceDB database.
@@ -70,6 +71,9 @@ def connect(
        executor will be used for making requests. This is for LanceDB Cloud
        only and is only used when making batch requests (i.e., passing in
        multiple queries to the search method at once).
+    storage_options: dict, optional
+        Additional options for the storage backend. See available options at
+        https://lancedb.github.io/lancedb/guides/storage/

    Examples
    --------
@@ -105,12 +109,16 @@ def connect(
            region,
            host_override,
            request_thread_pool=request_thread_pool,
+            storage_options=storage_options,
            **kwargs,
        )

    if kwargs:
        raise ValueError(f"Unknown keyword arguments: {kwargs}")
-    return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)
+    return LanceDBConnection(
+        uri,
+        read_consistency_interval=read_consistency_interval,
+    )


 async def connect_async(
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -117,6 +117,8 @@ class Query(pydantic.BaseModel):

    with_row_id: bool = False

+    fast_search: bool = False
+

 class LanceQueryBuilder(ABC):
    """An abstract query builder. Subclasses are defined for vector search,
@@ -125,12 +127,14 @@ class LanceQueryBuilder(ABC):

    @classmethod
    def create(
-        cls,
-        table: "Table",
-        query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
-        query_type: str,
-        vector_column_name: str,
-        ordering_field_name: str = None,
+            cls,
+            table: "Table",
+            query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
+            query_type: str,
+            vector_column_name: str,
+            ordering_field_name: Optional[str] = None,
+            fts_columns: Union[str, List[str]] = [],
+            fast_search: bool = False,
    ) -> LanceQueryBuilder:
        """
        Create a query builder based on the given query and query type.
@@ -147,13 +151,18 @@ class LanceQueryBuilder(ABC):
            If "auto", the query type is inferred based on the query.
        vector_column_name: str
            The name of the vector column to use for vector search.
+        fast_search: bool
+            Skip flat search of unindexed data.
        """
-        if query is None:
-            return LanceEmptyQueryBuilder(table)
-
+        # Check hybrid search first as it supports empty query pattern
        if query_type == "hybrid":
            # hybrid fts and vector query
-            return LanceHybridQueryBuilder(table, query, vector_column_name)
+            return LanceHybridQueryBuilder(
+                table, query, vector_column_name, fts_columns=fts_columns
+            )
+
+        if query is None:
+            return LanceEmptyQueryBuilder(table)

        # remember the string query for reranking purpose
        str_query = query if isinstance(query, str) else None
@@ -165,12 +174,17 @@ class LanceQueryBuilder(ABC):
        )

        if query_type == "hybrid":
-            return LanceHybridQueryBuilder(table, query, vector_column_name)
+            return LanceHybridQueryBuilder(
+                table, query, vector_column_name, fts_columns=fts_columns
+            )

        if isinstance(query, str):
            # fts
            return LanceFtsQueryBuilder(
-                table, query, ordering_field_name=ordering_field_name
+                table,
+                query,
+                ordering_field_name=ordering_field_name,
+                fts_columns=fts_columns,
            )

        if isinstance(query, list):
@@ -180,7 +194,9 @@ class LanceQueryBuilder(ABC):
        else:
            raise TypeError(f"Unsupported query type: {type(query)}")

-        return LanceVectorQueryBuilder(table, query, vector_column_name, str_query)
+        return LanceVectorQueryBuilder(
+            table, query, vector_column_name, str_query, fast_search
+        )

    @classmethod
    def _resolve_query(cls, table, query, query_type, vector_column_name):
@@ -196,8 +212,6 @@ class LanceQueryBuilder(ABC):
        elif query_type == "auto":
            if isinstance(query, (list, np.ndarray)):
                return query, "vector"
-            if isinstance(query, tuple):
-                return query, "hybrid"
            else:
                conf = table.embedding_functions.get(vector_column_name)
                if conf is not None:
@@ -224,9 +238,14 @@ class LanceQueryBuilder(ABC):
    def __init__(self, table: "Table"):
        self._table = table
        self._limit = 10
+        self._offset = 0
        self._columns = None
        self._where = None
+        self._prefilter = False
        self._with_row_id = False
+        self._vector = None
+        self._text = None
+        self._ef = None

    @deprecation.deprecated(
        deprecated_in="0.3.1",
@@ -337,11 +356,13 @@ class LanceQueryBuilder(ABC):
        ----------
        limit: int
            The maximum number of results to return.
-            By default the query is limited to the first 10.
-            Call this method and pass 0, a negative value,
-            or None to remove the limit.
-            *WARNING* if you have a large dataset, removing
-            the limit can potentially result in reading a
+            The default query limit is 10 results.
+            For ANN/KNN queries, you must specify a limit.
+            Passing 0, a negative number, or None raises ValueError
+            for ANN/KNN queries and removes the limit otherwise.
+            *WARNING* if you have a large dataset, setting
+            the limit to a large number, e.g. the table size,
+            can potentially result in reading a
            large amount of data into memory and cause
            out of memory issues.

@@ -351,11 +372,33 @@ class LanceQueryBuilder(ABC):
            The LanceQueryBuilder object.
        """
        if limit is None or limit <= 0:
-            self._limit = None
+            if isinstance(self, LanceVectorQueryBuilder):
+                raise ValueError("Limit is required for ANN/KNN queries")
+            else:
+                self._limit = None
        else:
            self._limit = limit
        return self

+    def offset(self, offset: int) -> LanceQueryBuilder:
+        """Set the offset at which the returned results start.
+
+        Parameters
+        ----------
+        offset: int
+            Position of the first result to fetch. None, zero, or a
+            negative value is stored as 0.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            This builder, so calls can be chained.
+        """
+        # Anything missing or non-positive collapses to "no offset".
+        no_offset = offset is None or offset <= 0
+        self._offset = 0 if no_offset else offset
+        return self
+
    def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
        """Set the columns to return.

@@ -417,6 +460,80 @@ class LanceQueryBuilder(ABC):
        self._with_row_id = with_row_id
        return self

+    def explain_plan(self, verbose: Optional[bool] = False) -> str:
+        """Return the execution plan for this query.
+
+        Examples
+        --------
+        >>> import lancedb
+        >>> db = lancedb.connect("./.lancedb")
+        >>> table = db.create_table("my_table", [{"vector": [99, 99]}])
+        >>> query = [100, 100]
+        >>> plan = table.search(query).explain_plan(True)
+        >>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+        ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
+        GlobalLimitExec: skip=0, fetch=10
+          FilterExec: _distance@2 IS NOT NULL
+            SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
+              KNNVectorDistance: metric=l2
+                LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
+
+        Parameters
+        ----------
+        verbose : bool, default False
+            Use a verbose output format.
+
+        Returns
+        -------
+        plan : str
+        """  # noqa: E501
+        ds = self._table.to_lance()
+        return ds.scanner(
+            nearest={
+                "column": self._vector_column,
+                "q": self._query,
+                "k": self._limit,
+                "metric": self._metric,
+                "nprobes": self._nprobes,
+                "refine_factor": self._refine_factor,
+            },
+            prefilter=self._prefilter,
+            filter=self._where,  # the filter; _str_query is only the rerank text
+            limit=self._limit,
+            with_row_id=self._with_row_id,
+            offset=self._offset,
+        ).explain_plan(verbose)
+
+    def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
+        """Set the vector to search for.
+
+        Parameters
+        ----------
+        vector: np.ndarray or list
+            The vector to search for.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            Nothing is returned here; NotImplementedError is always raised.
+        """
+        raise NotImplementedError
+
+    def text(self, text: str) -> LanceQueryBuilder:
+        """Set the text to search for.
+
+        Parameters
+        ----------
+        text: str
+            The text to search for.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            Nothing is returned here; NotImplementedError is always raised.
+        """
+        raise NotImplementedError
+

 class LanceVectorQueryBuilder(LanceQueryBuilder):
    """
@@ -440,11 +557,12 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
    """

    def __init__(
-        self,
-        table: "Table",
-        query: Union[np.ndarray, list, "PIL.Image.Image"],
-        vector_column: str,
-        str_query: Optional[str] = None,
+            self,
+            table: "Table",
+            query: Union[np.ndarray, list, "PIL.Image.Image"],
+            vector_column: str,
+            str_query: Optional[str] = None,
+            fast_search: bool = False,
    ):
        super().__init__(table)
        self._query = query
@@ -455,13 +573,14 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        self._prefilter = False
        self._reranker = None
        self._str_query = str_query
+        self._fast_search = fast_search

-    def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
+    def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
        """Set the distance metric to use.

        Parameters
        ----------
-        metric: "L2" or "cosine"
+        metric: "L2" or "cosine" or "dot"
            The distance metric to use. By default "L2" is used.

        Returns
@@ -469,7 +588,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
-        self._metric = metric
+        self._metric = metric.lower()
        return self

    def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
@@ -494,6 +613,28 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        self._nprobes = nprobes
        return self

+    def ef(self, ef: int) -> LanceVectorQueryBuilder:
+        """Set the number of candidates to consider during search.
+
+        Higher values will yield better recall (more likely to find vectors if
+        they exist) at the expense of latency.
+
+        This only applies to HNSW-related indexes.
+        The default value is 1.5 * limit.
+
+        Parameters
+        ----------
+        ef: int
+            The number of candidates to consider during search.
+
+        Returns
+        -------
+        LanceVectorQueryBuilder
+            This builder, so that calls can be chained.
+        """
+        self._ef = ef  # later passed as `ef=` when the query is built
+        return self
+
    def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
        """Set the refine factor to use, increasing the number of vectors sampled.

@@ -554,15 +695,11 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
            refine_factor=self._refine_factor,
            vector_column=self._vector_column,
            with_row_id=self._with_row_id,
+            offset=self._offset,
+            fast_search=self._fast_search,
+            ef=self._ef,
        )
        result_set = self._table._execute_query(query, batch_size)
-        if self._reranker is not None:
-            rs_table = result_set.read_all()
-            result_set = self._reranker.rerank_vector(self._str_query, rs_table)
-            # convert result_set back to RecordBatchReader
-            result_set = pa.RecordBatchReader.from_batches(
-                result_set.schema, result_set.to_batches()
-            )

        return result_set

@@ -591,7 +728,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        return self

    def rerank(
-        self, reranker: Reranker, query_string: Optional[str] = None
+            self, reranker: Reranker, query_string: Optional[str] = None
    ) -> LanceVectorQueryBuilder:
        """Rerank the results using the specified reranker.

@@ -756,12 +893,60 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):

 class LanceEmptyQueryBuilder(LanceQueryBuilder):
+    """Query builder for a search that has no vector or text query.
+
+    Builds a `Query` from the selected columns, filter, limit, offset and
+    row-id flag, with an empty query vector.
+    """
+
    def to_arrow(self) -> pa.Table:
-        ds = self._table.to_lance()
-        return ds.to_table(
+        """Run the query and read every result batch into one Arrow table."""
+        return self.to_batches().read_all()
+
+    def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
+        """Run the query and return the results as a record batch reader.
+
+        Parameters
+        ----------
+        batch_size: int, optional
+            Passed through to the table's `_execute_query`.
+
+        Returns
+        -------
+        pa.RecordBatchReader
+            Whatever the table's `_execute_query` returns for this query.
+        """
+        query = Query(
            columns=self._columns,
            filter=self._where,
-            limit=self._limit,
+            # A limit of None (or 0) falls back to 10 rows.
+            k=self._limit or 10,
+            with_row_id=self._with_row_id,
+            vector=[],
+            # not actually respected in remote query
+            offset=self._offset or 0,
        )
+        # Forward batch_size instead of silently dropping the caller's value.
+        return self._table._execute_query(query, batch_size)
+
+    def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
+        """Rerank the results using the specified reranker.
+
+        Parameters
+        ----------
+        reranker: Reranker
+            The reranker to use.
+
+        Returns
+        -------
+        LanceEmptyQueryBuilder
+            The LanceQueryBuilder object.
+
+        Raises
+        ------
+        NotImplementedError
+            Always; reranking is not supported by this builder.
+        """
+        raise NotImplementedError("Reranking is not yet supported.")


 class LanceHybridQueryBuilder(LanceQueryBuilder):
--- a/python/python/lancedb/remote/client.py
+++ b/python/python/lancedb/remote/client.py
@@ -55,11 +55,13 @@ class RestfulLanceDBClient:
    region: str
    api_key: Credential
    host_override: Optional[str] = attrs.field(default=None)
+    db_prefix: Optional[str] = attrs.field(default=None)

    closed: bool = attrs.field(default=False, init=False)

    connection_timeout: float = attrs.field(default=120.0, kw_only=True)
    read_timeout: float = attrs.field(default=300.0, kw_only=True)
+    storage_options: Optional[Dict[str, str]] = attrs.field(default=None, kw_only=True)

    @functools.cached_property
    def session(self) -> requests.Session:
@@ -92,6 +94,18 @@ class RestfulLanceDBClient:
            headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
        if self.host_override:
            headers["x-lancedb-database"] = self.db_name
+        if self.storage_options:
+            if self.storage_options.get("account_name") is not None:
+                headers["x-azure-storage-account-name"] = self.storage_options[
+                    "account_name"
+                ]
+            if self.storage_options.get("azure_storage_account_name") is not None:
+                headers["x-azure-storage-account-name"] = self.storage_options[
+                    "azure_storage_account_name"
+                ]
+        if self.db_prefix:
+            headers["x-lancedb-database-prefix"] = self.db_prefix
+
        return headers

    @staticmethod
@@ -158,6 +172,7 @@ class RestfulLanceDBClient:
            headers["content-type"] = content_type
        if request_id is not None:
            headers["x-request-id"] = request_id
+
        with self.session.post(
            urljoin(self.url, uri),
            headers=headers,
@@ -245,7 +260,6 @@ def retry_adapter(options: Dict[str, Any]) -> HTTPAdapter:
            connect=connect_retries,
            read=read_retries,
            backoff_factor=backoff_factor,
-            backoff_jitter=backoff_jitter,
            status_forcelist=statuses,
            allowed_methods=methods,
        )
--- a/python/python/lancedb/remote/db.py
+++ b/python/python/lancedb/remote/db.py
@@ -15,7 +15,7 @@ import inspect
 import logging
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Union
 from urllib.parse import urlparse

 from cachetools import TTLCache
@@ -44,20 +44,25 @@ class RemoteDBConnection(DBConnection):
        request_thread_pool: Optional[ThreadPoolExecutor] = None,
        connection_timeout: float = 120.0,
        read_timeout: float = 300.0,
+        storage_options: Optional[Dict[str, str]] = None,
    ):
        """Connect to a remote LanceDB database."""
        parsed = urlparse(db_url)
        if parsed.scheme != "db":
            raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
        self.db_name = parsed.netloc
+        prefix = parsed.path.lstrip("/")
+        self.db_prefix = None if not prefix else prefix
        self.api_key = api_key
        self._client = RestfulLanceDBClient(
            self.db_name,
            region,
            api_key,
            host_override,
+            self.db_prefix,
            connection_timeout=connection_timeout,
            read_timeout=read_timeout,
+            storage_options=storage_options,
        )
        self._request_thread_pool = request_thread_pool
        self._table_cache = TTLCache(maxsize=10000, ttl=300)
--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -22,6 +22,7 @@ from lance import json_to_schema

 from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
 from lancedb.merge import LanceMergeInsertBuilder
+from lancedb.query import LanceQueryBuilder

 from ..query import LanceVectorQueryBuilder
 from ..table import Query, Table, _sanitize_data
@@ -228,10 +229,21 @@ class RemoteTable(Table):
            content_type=ARROW_STREAM_CONTENT_TYPE,
        )

+    def query(  # alias: forwards all four arguments to search()
+        self,
+        query: Union[VEC, str] = None,
+        query_type: str = "vector",
+        vector_column_name: Optional[str] = None,
+        fast_search: bool = False,
+    ) -> LanceVectorQueryBuilder:
+        return self.search(query, query_type, vector_column_name, fast_search)
+
    def search(
        self,
-        query: Union[VEC, str],
+        query: Union[VEC, str] = None,
+        query_type: str = "vector",
        vector_column_name: Optional[str] = None,
+        fast_search: bool = False,
    ) -> LanceVectorQueryBuilder:
        """Create a search query to find the nearest neighbors
        of the given query vector. We currently support [vector search][search]
@@ -278,6 +290,11 @@ class RemoteTable(Table):
            - If the table has multiple vector columns then the *vector_column_name*
            needs to be specified. Otherwise, an error is raised.

+        fast_search: bool, optional
+            Skip a flat search of unindexed data. This may improve
+            search performance but search results will not include unindexed data.
+
+            - *default False*.
        Returns
        -------
        LanceQueryBuilder
@@ -293,7 +310,14 @@ class RemoteTable(Table):
        """
        if vector_column_name is None:
            vector_column_name = inf_vector_column_query(self.schema)
-        return LanceVectorQueryBuilder(self, query, vector_column_name)
+
+        return LanceQueryBuilder.create(
+            self,
+            query,
+            query_type,
+            vector_column_name=vector_column_name,
+            fast_search=fast_search,
+        )

    def _execute_query(
        self, query: Query, batch_size: Optional[int] = None
--- a/python/python/tests/test_remote_db.py
+++ b/python/python/tests/test_remote_db.py
@@ -21,6 +21,7 @@ class FakeLanceDBClient:
        pass

    def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
+        print(f"{query=}")
        assert table_name == "test"
        t = pa.schema([]).empty_table()
        return VectorQueryResult(t)
@@ -39,3 +40,21 @@ def test_remote_db():
    table = conn["test"]
    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
    table.search([1.0, 2.0]).to_pandas()
+
+
+def test_empty_query_with_filter():
+    conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
+    setattr(conn, "_client", FakeLanceDBClient())  # swap in the in-test fake client
+
+    table = conn["test"]  # FakeLanceDBClient.query asserts this table name
+    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
+    print(table.query().select(["vector"]).where("foo == bar").to_arrow())
+
+
+def test_fast_search_query_with_filter():
+    conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
+    setattr(conn, "_client", FakeLanceDBClient())  # swap in the in-test fake client
+
+    table = conn["test"]  # FakeLanceDBClient.query asserts this table name
+    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
+    print(table.query([0, 0], fast_search=True).select(["vector"]).where("foo == bar").to_arrow())
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -735,7 +735,7 @@ def test_create_scalar_index(db):
    indices = table.to_lance().list_indices()
    assert len(indices) == 1
    scalar_index = indices[0]
-    assert scalar_index["type"] == "Scalar"
+    assert scalar_index["type"] == "BTree"

    # Confirm that prefiltering still works with the scalar index column
    results = table.search().where("x = 'c'").to_arrow()
--- a/rust/ffi/node/Cargo.toml
+++ b/rust/ffi/node/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.6.0"
+version = "0.5.2"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
--- a/rust/lancedb/Cargo.toml
+++ b/rust/lancedb/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.6.0"
+version = "0.5.2"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
--- a/rust/lancedb/README.md
+++ b/rust/lancedb/README.md
@@ -6,12 +6,3 @@
 LanceDB Rust SDK, a serverless vector database.

 Read more at: https://lancedb.com/
-
-> [!TIP]
-> A transitive dependency of `lancedb` is `lzma-sys`, which uses dynamic linking
-> by default. If you want to statically link `lzma-sys`, you should activate it's
-> `static` feature by adding the following to your dependencies:
->
-> ```toml
-> lzma-sys = { version = "*", features = ["static"] }
-> ```
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -1889,7 +1889,6 @@ impl TableInternal for NativeTable {
                }
                columns.push(field.name.clone());
            }
-
            let index_type = if is_vector {
                crate::index::IndexType::IvfPq
            } else {
Author	SHA1	Message	Date
Lance Release	7a3ef68306	Bump version: 0.9.0-beta.3 → 0.9.0-beta.4	2024-12-20 16:02:53 +00:00
Ryan Green	43952e01d7	bump version	2024-12-20 09:44:46 -06:00
Ryan Green	495c335831	Fix fast_search	2024-12-20 09:43:39 -06:00
Ryan Green	77707db543	Backport fast_search and empty query builder for remote table	2024-12-20 09:21:05 -06:00
Ryan Green	d6d7ad3b06	bump version	2024-12-18 10:21:04 -06:00
Ryan Green	e58d64c286	Remove unsupported Retry params	2024-12-18 10:08:38 -06:00
Ryan Green	76cbd18c46	bump version	2024-12-18 09:38:36 -06:00
Ryan Green	4abb38ac70	bump version	2024-12-18 09:37:58 -06:00
Ryan Green	cc7bc5011d	Merge remote-tracking branch 'origin/python-v0.9.0-patch' into python-v0.9.0-patch # Conflicts: # python/pyproject.toml	2024-12-18 08:59:35 -06:00
Ryan Green	8193183304	override urllib3 version	2024-12-18 08:59:24 -06:00
Ryan Green	cf28b58b7d	override urllib3 version	2024-12-18 08:58:41 -06:00
Lance Release	e3b7ee47b9	Bump version: 0.9.0 → 0.9.0-final.1	2024-12-13 01:16:24 +00:00
Lu Qiu	97c9c906e4	Fix version test	2024-12-12 17:10:07 -08:00
Lu Qiu	358f86b9c6	fix	2024-12-12 16:44:24 -08:00
Lu Qiu	5489e215a3	Support storage options and folder prefix	2024-12-12 16:17:34 -08:00