Bump version: 0.9.0-beta.7 → 0.9.0-beta.8

Merge remote-tracking branch 'origin/python-v0.9.4-patch' into python-v0.9.4-patch
Pin chrono version
2025-12-24 22:09:58 +00:00 · 2025-02-26 15:03:57 +00:00 · 2025-02-26 11:31:34 -03:30 · 2025-02-26 11:31:29 -03:30 · 2025-02-26 13:29:54 +00:00 · 2025-02-26 09:56:08 -03:30
42 changed files with 612 additions and 143 deletions
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.5.2"
+current_version = "0.5.2-final.1"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/.github/workflows/build_windows_wheel/action.yml
+++ b/.github/workflows/build_windows_wheel/action.yml
@@ -28,7 +28,7 @@ runs:
        args: ${{ inputs.args }}
        docker-options: "-e PIP_EXTRA_INDEX_URL=https://pypi.fury.io/lancedb/"
        working-directory: python
-    - uses: actions/upload-artifact@v3
+    - uses: actions/upload-artifact@v4
      with:
        name: windows-wheels
        path: python\target\wheels
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,13 +20,11 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]

 [workspace.dependencies]
-lance = { "version" = "=0.12.4", "features" = [
-    "dynamodb",
-]}
-lance-index = { "version" = "=0.12.4" }
-lance-linalg = { "version" = "=0.12.4" }
-lance-testing = { "version" = "=0.12.4" }
-lance-datafusion = { "version" = "=0.12.4" }
+lance = { "version" = "=0.13.0", "features" = ["dynamodb"] }
+lance-index = { "version" = "=0.13.0" }
+lance-linalg = { "version" = "=0.13.0" }
+lance-testing = { "version" = "=0.13.0" }
+lance-datafusion = { "version" = "=0.13.0" }
 # Note that this one does not include pyarrow
 arrow = { version = "51.0", optional = false }
 arrow-array = "51.0"
@@ -37,7 +35,7 @@ arrow-schema = "51.0"
 arrow-arith = "51.0"
 arrow-cast = "51.0"
 async-trait = "0"
-chrono = "0.4.35"
+chrono = "=0.4.39"
 datafusion-physical-plan = "37.1"
 half = { "version" = "=2.4.1", default-features = false, features = [
    "num-traits",
--- a/docs/src/fts.md
+++ b/docs/src/fts.md
@@ -54,6 +54,16 @@ This returns the result as a list of dictionaries as follows.
 !!! note
    LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.

+## Tokenization
+By default, the text is tokenized by splitting on punctuation and whitespace, and then removing tokens that are longer than 40 characters. For language-specific tokenization, pass the `tokenizer_name` argument set to the two-letter language code followed by "_stem". For example, for English it would be "en_stem".
+
+```python
+table.create_fts_index("text", tokenizer_name="en_stem")
+```
+
+The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
+
+
 ## Index multiple columns

 If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
@@ -139,6 +149,7 @@ is treated as a phrase query.
 In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
 double quotes replaced by single quotes.

+
 ## Configurations

 By default, LanceDB configures a 1GB heap size limit for creating the index. You can
--- a/node/package.json
+++ b/node/package.json
@@ -1,12 +1,12 @@
 {
  "name": "vectordb",
-  "version": "0.5.2",
+  "version": "0.5.2-final.1",
  "description": " Serverless, low-latency vector database for AI applications",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "tsc": "tsc -b",
-    "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb-node index.node -- cargo build --message-format=json",
+    "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb_node index.node -- cargo build --message-format=json",
    "build-release": "npm run build -- --release",
    "test": "npm run tsc && mocha -recursive dist/test",
    "integration-test": "npm run tsc && mocha -recursive dist/integration_test",
--- a/nodejs/test/connection.test.ts
+++ b/nodejs/test/connection.test.ts
@@ -57,6 +57,18 @@ describe("given a connection", () => {
    expect(db.isOpen()).toBe(false);
    await expect(db.tableNames()).rejects.toThrow("Connection is closed");
  });
+  it("should be able to create a table from an object arg `createTable(options)`, or args `createTable(name, data, options)`", async () => {
+    let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
+    await expect(tbl.countRows()).resolves.toBe(2);
+
+    tbl = await db.createTable({
+      name: "test",
+      data: [{ id: 3 }],
+      mode: "overwrite",
+    });
+
+    await expect(tbl.countRows()).resolves.toBe(1);
+  });

  it("should fail if creating table twice, unless overwrite is true", async () => {
    let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
--- a/nodejs/test/embedding.test.ts
+++ b/nodejs/test/embedding.test.ts
@@ -230,7 +230,7 @@ describe("embedding functions", () => {
    },
  );

-  test.only.each([new Float16(), new Float32(), new Float64()])(
+  test.each([new Float16(), new Float32(), new Float64()])(
    "should be able to provide auto embeddings with multiple float datatypes",
    async (floatType) => {
      @register("test1")
--- a/nodejs/test/table.test.ts
+++ b/nodejs/test/table.test.ts
@@ -305,6 +305,7 @@ describe("When creating an index", () => {
    const indices = await tbl.listIndices();
    expect(indices.length).toBe(1);
    expect(indices[0]).toEqual({
+      name: "vec_idx",
      indexType: "IvfPq",
      columns: ["vec"],
    });
@@ -361,6 +362,24 @@ describe("When creating an index", () => {
    for await (const r of tbl.query().where("id > 1").select(["id"])) {
      expect(r.numRows).toBe(298);
    }
+    // should also work with 'filter' alias
+    for await (const r of tbl.query().filter("id > 1").select(["id"])) {
+      expect(r.numRows).toBe(298);
+    }
+  });
+
+  test("should be able to get index stats", async () => {
+    await tbl.createIndex("id");
+
+    const stats = await tbl.indexStats("id_idx");
+    expect(stats).toBeDefined();
+    expect(stats?.numIndexedRows).toEqual(300);
+    expect(stats?.numUnindexedRows).toEqual(0);
+  });
+
+  test("when getting stats on non-existent index", async () => {
+    const stats = await tbl.indexStats("some non-existent index");
+    expect(stats).toBeUndefined();
  });

  // TODO: Move this test to the query API test (making sure we can reject queries
--- a/nodejs/lancedb/connection.ts
+++ b/nodejs/lancedb/connection.ts
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-import { Table as ArrowTable, Schema } from "./arrow";
+import { Table as ArrowTable, Data, Schema } from "./arrow";
 import { fromTableToBuffer, makeEmptyTable } from "./arrow";
 import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
 import { Connection as LanceDbConnection } from "./native";
@@ -151,6 +151,19 @@ export abstract class Connection {
    options?: Partial<OpenTableOptions>,
  ): Promise<Table>;

+  /**
+   * Creates a new Table and initializes it with new data.
+   * @param {object} options - The options object.
+   * @param {string} options.name - The name of the table.
+   * @param {Data} options.data - Non-empty Array of Records to be inserted into the table
+   *
+   */
+  abstract createTable(
+    options: {
+      name: string;
+      data: Data;
+    } & Partial<CreateTableOptions>,
+  ): Promise<Table>;
  /**
   * Creates a new Table and initialize it with new data.
   * @param {string} name - The name of the table.
@@ -219,13 +232,22 @@ export class LocalConnection extends Connection {
  }

  async createTable(
-    name: string,
-    data: Record<string, unknown>[] | ArrowTable,
+    nameOrOptions:
+      | string
+      | ({ name: string; data: Data } & Partial<CreateTableOptions>),
+    data?: Record<string, unknown>[] | ArrowTable,
    options?: Partial<CreateTableOptions>,
  ): Promise<Table> {
+    if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
+      const { name, data, ...options } = nameOrOptions;
+      return this.createTable(name, data, options);
+    }
+    if (data === undefined) {
+      throw new Error("data is required");
+    }
    const { buf, mode } = await Table.parseTableData(data, options);
    const innerTable = await this.inner.createTable(
-      name,
+      nameOrOptions,
      buf,
      mode,
      cleanseStorageOptions(options?.storageOptions),
--- a/nodejs/lancedb/index.ts
+++ b/nodejs/lancedb/index.ts
@@ -31,6 +31,9 @@ export {
  AddColumnsSql,
  ColumnAlteration,
  ConnectionOptions,
+  IndexStatistics,
+  IndexMetadata,
+  IndexConfig,
 } from "./native.js";

 export {
@@ -56,12 +59,7 @@ export {

 export { Index, IndexOptions, IvfPqOptions } from "./indices";

-export {
-  Table,
-  AddDataOptions,
-  IndexConfig,
-  UpdateOptions,
-} from "./table";
+export { Table, AddDataOptions, UpdateOptions } from "./table";

 export * as embedding from "./embedding";

@@ -76,15 +74,61 @@ export * as embedding from "./embedding";
 * @param {string} uri - The uri of the database. If the database uri starts
 * with `db://` then it connects to a remote database.
 * @see {@link ConnectionOptions} for more details on the URI format.
+ * @example
+ * ```ts
+ * const conn = await connect("/path/to/database");
+ * ```
+ * @example
+ * ```ts
+ * const conn = await connect(
+ *   "s3://bucket/path/to/database",
+ *   {storageOptions: {timeout: "60s"}
+ * });
+ * ```
 */
 export async function connect(
  uri: string,
  opts?: Partial<ConnectionOptions | RemoteConnectionOptions>,
+): Promise<Connection>;
+/**
+ * Connect to a LanceDB instance at the given URI.
+ *
+ * Accepted formats:
+ *
+ * - `/path/to/database` - local database
+ * - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
+ * - `db://host:port` - remote database (LanceDB cloud)
+ * @param options - The options to use when connecting to the database, including the required `uri`
+ * @see {@link ConnectionOptions} for more details on the URI format.
+ * @example
+ * ```ts
+ * const conn = await connect({
+ *   uri: "/path/to/database",
+ *   storageOptions: {timeout: "60s"}
+ * });
+ * ```
+ */
+export async function connect(
+  opts: Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string },
+): Promise<Connection>;
+export async function connect(
+  uriOrOptions:
+    | string
+    | (Partial<RemoteConnectionOptions | ConnectionOptions> & { uri: string }),
+  opts: Partial<ConnectionOptions | RemoteConnectionOptions> = {},
 ): Promise<Connection> {
+  let uri: string | undefined;
+  if (typeof uriOrOptions !== "string") {
+    const { uri: uri_, ...options } = uriOrOptions;
+    uri = uri_;
+    opts = options;
+  } else {
+    uri = uriOrOptions;
+  }
+
  if (!uri) {
    throw new Error("uri is required");
  }
-  opts = opts ?? {};

  if (uri?.startsWith("db://")) {
    return new RemoteConnection(uri, opts as RemoteConnectionOptions);
--- a/nodejs/lancedb/query.ts
+++ b/nodejs/lancedb/query.ts
@@ -114,6 +114,14 @@ export class QueryBase<
    this.inner.onlyIf(predicate);
    return this as unknown as QueryType;
  }
+  /**
+   * A filter statement to be applied to this query.
+   * @alias where
+   * @deprecated Use `where` instead
+   */
+  filter(predicate: string): QueryType {
+    return this.where(predicate);
+  }

  /**
   * Return only the specified columns.
--- a/nodejs/lancedb/remote/connection.ts
+++ b/nodejs/lancedb/remote/connection.ts
@@ -106,10 +106,19 @@ export class RemoteConnection extends Connection {
  }

  async createTable(
-    tableName: string,
-    data: Data,
+    nameOrOptions:
+      | string
+      | ({ name: string; data: Data } & Partial<CreateTableOptions>),
+    data?: Data,
    options?: Partial<CreateTableOptions> | undefined,
  ): Promise<Table> {
+    if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
+      const { name, data, ...options } = nameOrOptions;
+      return this.createTable(name, data, options);
+    }
+    if (data === undefined) {
+      throw new Error("data is required");
+    }
    if (options?.mode) {
      console.warn(
        "option 'mode' is not supported in LanceDB Cloud",
@@ -132,7 +141,7 @@ export class RemoteConnection extends Connection {
    );

    await this.#client.post(
-      `/v1/table/${encodeURIComponent(tableName)}/create/`,
+      `/v1/table/${encodeURIComponent(nameOrOptions)}/create/`,
      buf,
      {
        config: {
@@ -141,8 +150,8 @@ export class RemoteConnection extends Connection {
        headers: { "Content-Type": "application/vnd.apache.arrow.stream" },
      },
    );
-    this.#tableCache.set(tableName, true);
-    return new RemoteTable(this.#client, tableName, this.#dbName);
+    this.#tableCache.set(nameOrOptions, true);
+    return new RemoteTable(this.#client, nameOrOptions, this.#dbName);
  }

  async createEmptyTable(
--- a/nodejs/lancedb/remote/table.ts
+++ b/nodejs/lancedb/remote/table.ts
@@ -16,6 +16,7 @@ import { Table as ArrowTable } from "apache-arrow";

 import { Data, IntoVector } from "../arrow";

+import { IndexStatistics } from "..";
 import { CreateTableOptions } from "../connection";
 import { IndexOptions } from "../indices";
 import { MergeInsertBuilder } from "../merge";
@@ -34,6 +35,10 @@ export class RemoteTable extends Table {
    return `/v1/table/${encodeURIComponent(this.#name)}/`;
  }

+  get name(): string {
+    return this.#name;
+  }
+
  public constructor(
    client: RestfulLanceDBClient,
    tableName: string,
@@ -161,4 +166,7 @@ export class RemoteTable extends Table {
  mergeInsert(_on: string | string[]): MergeInsertBuilder {
    throw new Error("mergeInsert() is not yet supported on the LanceDB cloud");
  }
+  async indexStats(_name: string): Promise<IndexStatistics | undefined> {
+    throw new Error("indexStats() is not yet supported on the LanceDB cloud");
+  }
 }
--- a/nodejs/lancedb/table.ts
+++ b/nodejs/lancedb/table.ts
@@ -33,11 +33,11 @@ import {
  AddColumnsSql,
  ColumnAlteration,
  IndexConfig,
+  IndexStatistics,
  OptimizeStats,
  Table as _NativeTable,
 } from "./native";
 import { Query, VectorQuery } from "./query";
-export { IndexConfig } from "./native";

 /**
 * Options for adding data to a table.
@@ -98,6 +98,8 @@ export abstract class Table {
  [Symbol.for("nodejs.util.inspect.custom")](): string {
    return this.display();
  }
+  /** Returns the name of the table */
+  abstract get name(): string;

  /** Return true if the table has not been closed */
  abstract isOpen(): boolean;
@@ -158,6 +160,9 @@ export abstract class Table {
   * Indices on vector columns will speed up vector searches.
   * Indices on scalar columns will speed up filtering (in both
   * vector and non-vector searches)
+   *
+   * @note We currently don't support custom named indexes.
+   * The index name will always be `${column}_idx`.
   * @example
   * // If the column has a vector (fixed size list) data type then
   * // an IvfPq vector index will be created.
@@ -368,6 +373,13 @@ export abstract class Table {

  abstract mergeInsert(on: string | string[]): MergeInsertBuilder;

+  /** Get the statistics of the specified index.
+   *
+   * @param {string} name The name of the index.
+   * @returns {IndexStatistics | undefined} The stats of the index. If the index does not exist, it will return undefined
+   */
+  abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
+
  static async parseTableData(
    // biome-ignore lint/suspicious/noExplicitAny: <explanation>
    data: Record<string, unknown>[] | ArrowTable<any>,
@@ -412,7 +424,9 @@ export class LocalTable extends Table {
    super();
    this.inner = inner;
  }
-
+  get name(): string {
+    return this.inner.name;
+  }
  isOpen(): boolean {
    return this.inner.isOpen();
  }
@@ -565,6 +579,13 @@ export class LocalTable extends Table {
    return await this.query().toArrow();
  }

+  async indexStats(name: string): Promise<IndexStatistics | undefined> {
+    const stats = await this.inner.indexStats(name);
+    if (stats === null) {
+      return undefined;
+    }
+    return stats;
+  }
  mergeInsert(on: string | string[]): MergeInsertBuilder {
    on = Array.isArray(on) ? on : [on];
    return new MergeInsertBuilder(this.inner.mergeInsert(on));
--- a/nodejs/npm/darwin-arm64/package.json
+++ b/nodejs/npm/darwin-arm64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-arm64",
-	"version": "0.5.2",
+	"version": "0.5.2-final.1",
 	"os": ["darwin"],
 	"cpu": ["arm64"],
 	"main": "lancedb.darwin-arm64.node",
--- a/nodejs/npm/darwin-x64/package.json
+++ b/nodejs/npm/darwin-x64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-x64",
-	"version": "0.5.2",
+	"version": "0.5.2-final.1",
 	"os": ["darwin"],
 	"cpu": ["x64"],
 	"main": "lancedb.darwin-x64.node",
--- a/nodejs/npm/linux-arm64-gnu/package.json
+++ b/nodejs/npm/linux-arm64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-arm64-gnu",
-	"version": "0.5.2",
+	"version": "0.5.2-final.1",
 	"os": ["linux"],
 	"cpu": ["arm64"],
 	"main": "lancedb.linux-arm64-gnu.node",
--- a/nodejs/npm/linux-x64-gnu/package.json
+++ b/nodejs/npm/linux-x64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-x64-gnu",
-	"version": "0.5.2",
+	"version": "0.5.2-final.1",
 	"os": ["linux"],
 	"cpu": ["x64"],
 	"main": "lancedb.linux-x64-gnu.node",
--- a/nodejs/npm/win32-x64-msvc/package.json
+++ b/nodejs/npm/win32-x64-msvc/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-win32-x64-msvc",
-	"version": "0.5.2",
+	"version": "0.5.2-final.1",
 	"os": ["win32"],
 	"cpu": ["x64"],
 	"main": "lancedb.win32-x64-msvc.node",
--- a/nodejs/package-lock.json
+++ b/nodejs/package-lock.json
@@ -18,10 +18,8 @@
        "win32"
      ],
      "dependencies": {
-        "@types/axios": "^0.14.0",
        "apache-arrow": "^15.0.0",
        "axios": "^1.7.2",
-        "memoize": "^10.0.0",
        "openai": "^4.29.2",
        "reflect-metadata": "^0.2.2"
      },
@@ -31,6 +29,7 @@
        "@biomejs/biome": "^1.7.3",
        "@jest/globals": "^29.7.0",
        "@napi-rs/cli": "^2.18.0",
+        "@types/axios": "^0.14.0",
        "@types/jest": "^29.1.2",
        "@types/tmp": "^0.2.6",
        "apache-arrow-old": "npm:apache-arrow@13.0.0",
@@ -3131,6 +3130,7 @@
      "resolved": "https://registry.npmjs.org/@types/axios/-/axios-0.14.0.tgz",
      "integrity": "sha512-KqQnQbdYE54D7oa/UmYVMZKq7CO4l8DEENzOKc4aBRwxCXSlJXGz83flFx5L7AWrOQnmuN3kVsRdt+GZPPjiVQ==",
      "deprecated": "This is a stub types definition for axios (https://github.com/mzabriskie/axios). axios provides its own type definitions, so you don't need @types/axios installed!",
+      "dev": true,
      "dependencies": {
        "axios": "*"
      }
@@ -5942,20 +5942,6 @@
        "is-buffer": "~1.1.6"
      }
    },
-    "node_modules/memoize": {
-      "version": "10.0.0",
-      "resolved": "https://registry.npmjs.org/memoize/-/memoize-10.0.0.tgz",
-      "integrity": "sha512-H6cBLgsi6vMWOcCpvVCdFFnl3kerEXbrYh9q+lY6VXvQSmM6CkmV08VOwT+WE2tzIEqRPFfAq3fm4v/UIW6mSA==",
-      "dependencies": {
-        "mimic-function": "^5.0.0"
-      },
-      "engines": {
-        "node": ">=18"
-      },
-      "funding": {
-        "url": "https://github.com/sindresorhus/memoize?sponsor=1"
-      }
-    },
    "node_modules/merge-stream": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
@@ -6003,17 +5989,6 @@
        "node": ">= 0.6"
      }
    },
-    "node_modules/mimic-function": {
-      "version": "5.0.1",
-      "resolved": "https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz",
-      "integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==",
-      "engines": {
-        "node": ">=18"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/sindresorhus"
-      }
-    },
    "node_modules/minimatch": {
      "version": "3.1.2",
      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
--- a/nodejs/package.json
+++ b/nodejs/package.json
@@ -1,6 +1,16 @@
 {
  "name": "@lancedb/lancedb",
-  "version": "0.5.2",
+  "description": "LanceDB: A serverless, low-latency vector database for AI applications",
+  "keywords": [
+    "database",
+    "lance",
+    "lancedb",
+    "search",
+    "vector",
+    "vector database",
+    "ann"
+  ],
+  "version": "0.5.2-final.1",
  "main": "dist/index.js",
  "exports": {
    ".": "./dist/index.js",
@@ -38,7 +48,8 @@
    "typedoc": "^0.25.7",
    "typedoc-plugin-markdown": "^3.17.1",
    "typescript": "^5.3.3",
-    "typescript-eslint": "^7.1.0"
+    "typescript-eslint": "^7.1.0",
+    "@types/axios": "^0.14.0"
  },
  "ava": {
    "timeout": "3m"
@@ -65,7 +76,6 @@
    "version": "napi version"
  },
  "dependencies": {
-    "@types/axios": "^0.14.0",
    "apache-arrow": "^15.0.0",
    "axios": "^1.7.2",
    "openai": "^4.29.2",
--- a/nodejs/src/connection.rs
+++ b/nodejs/src/connection.rs
@@ -56,12 +56,6 @@ impl Connection {
    #[napi(factory)]
    pub async fn new(uri: String, options: ConnectionOptions) -> napi::Result<Self> {
        let mut builder = ConnectBuilder::new(&uri);
-        if let Some(api_key) = options.api_key {
-            builder = builder.api_key(&api_key);
-        }
-        if let Some(host_override) = options.host_override {
-            builder = builder.host_override(&host_override);
-        }
        if let Some(interval) = options.read_consistency_interval {
            builder =
                builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));
--- a/nodejs/src/lib.rs
+++ b/nodejs/src/lib.rs
@@ -28,8 +28,6 @@ mod util;
 #[napi(object)]
 #[derive(Debug)]
 pub struct ConnectionOptions {
-    pub api_key: Option<String>,
-    pub host_override: Option<String>,
    /// (For LanceDB OSS only): The interval, in seconds, at which to check for
    /// updates to the table from other processes. If None, then consistency is not
    /// checked. For performance reasons, this is the default. For strong
--- a/nodejs/src/table.rs
+++ b/nodejs/src/table.rs
@@ -30,7 +30,7 @@ use crate::query::{Query, VectorQuery};
 pub struct Table {
    // We keep a duplicate of the table name so we can use it for error
    // messages even if the table has been closed
-    name: String,
+    pub name: String,
    pub(crate) inner: Option<LanceDbTable>,
 }

@@ -330,6 +330,13 @@ impl Table {
            .collect::<Vec<_>>())
    }

+    #[napi]
+    pub async fn index_stats(&self, index_name: String) -> napi::Result<Option<IndexStatistics>> {
+        let tbl = self.inner_ref()?.as_native().unwrap();
+        let stats = tbl.index_stats(&index_name).await.default_error()?;
+        Ok(stats.map(IndexStatistics::from))
+    }
+
    #[napi]
    pub fn merge_insert(&self, on: Vec<String>) -> napi::Result<NativeMergeInsertBuilder> {
        let on: Vec<_> = on.iter().map(String::as_str).collect();
@@ -340,11 +347,13 @@ impl Table {
 #[napi(object)]
 /// A description of an index currently configured on a column
 pub struct IndexConfig {
+    /// The name of the index
+    pub name: String,
    /// The type of the index
    pub index_type: String,
    /// The columns in the index
    ///
-    /// Currently this is always an array of size 1.  In the future there may
+    /// Currently this is always an array of size 1. In the future there may
    /// be more columns to represent composite indices.
    pub columns: Vec<String>,
 }
@@ -355,6 +364,7 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
        Self {
            index_type,
            columns: value.columns,
+            name: value.name,
        }
    }
 }
@@ -437,3 +447,40 @@ pub struct AddColumnsSql {
    /// The expression can reference other columns in the table.
    pub value_sql: String,
 }
+
+#[napi(object)]
+pub struct IndexStatistics {
+    /// The number of rows indexed by the index
+    pub num_indexed_rows: f64,
+    /// The number of rows not indexed
+    pub num_unindexed_rows: f64,
+    /// The type of the index
+    pub index_type: Option<String>,
+    /// The metadata for each index
+    pub indices: Vec<IndexMetadata>,
+}
+impl From<lancedb::index::IndexStatistics> for IndexStatistics {
+    fn from(value: lancedb::index::IndexStatistics) -> Self {
+        Self {
+            num_indexed_rows: value.num_indexed_rows as f64,
+            num_unindexed_rows: value.num_unindexed_rows as f64,
+            index_type: value.index_type.map(|t| format!("{:?}", t)),
+            indices: value.indices.into_iter().map(Into::into).collect(),
+        }
+    }
+}
+
+#[napi(object)]
+pub struct IndexMetadata {
+    pub metric_type: Option<String>,
+    pub index_type: Option<String>,
+}
+
+impl From<lancedb::index::IndexMetadata> for IndexMetadata {
+    fn from(value: lancedb::index::IndexMetadata) -> Self {
+        Self {
+            metric_type: value.metric_type,
+            index_type: value.index_type,
+        }
+    }
+}
--- a/python/.bumpversion.toml
+++ b/python/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.8.2"
+current_version = "0.9.0-beta.8"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.8.2"
+version = "0.9.0-beta.8"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -19,6 +19,8 @@ lancedb = { path = "../rust/lancedb" }
 env_logger = "0.10"
 pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
 pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
+base64ct = "=1.6.0" # workaround for https://github.com/RustCrypto/formats/issues/1684
+chrono = "=0.4.39"

 # Prevent dynamic linking of lzma, which comes from datafusion
 lzma-sys = { version = "*", features = ["static"] }
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -3,7 +3,7 @@ name = "lancedb"
 # version in Cargo.toml
 dependencies = [
    "deprecation",
-    "pylance==0.12.2-beta.2",
+    "pylance==0.13.0",
    "ratelimiter~=1.0",
    "requests>=2.31.0",
    "retry>=0.9.2",
@@ -13,6 +13,7 @@ dependencies = [
    "packaging",
    "cachetools",
    "overrides>=0.7",
+    "urllib3==1.26.19"
 ]
 description = "lancedb"
 authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
--- a/python/python/lancedb/init.py
+++ b/python/python/lancedb/init.py
@@ -35,6 +35,7 @@ def connect(
    host_override: Optional[str] = None,
    read_consistency_interval: Optional[timedelta] = None,
    request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
+    storage_options: Optional[Dict[str, str]] = None,
    **kwargs,
 ) -> DBConnection:
    """Connect to a LanceDB database.
@@ -70,6 +71,9 @@ def connect(
        executor will be used for making requests. This is for LanceDB Cloud
        only and is only used when making batch requests (i.e., passing in
        multiple queries to the search method at once).
+    storage_options: dict, optional
+        Additional options for the storage backend. See available options at
+        https://lancedb.github.io/lancedb/guides/storage/

    Examples
    --------
@@ -105,12 +109,16 @@ def connect(
            region,
            host_override,
            request_thread_pool=request_thread_pool,
+            storage_options=storage_options,
            **kwargs,
        )

    if kwargs:
        raise ValueError(f"Unknown keyword arguments: {kwargs}")
-    return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)
+    return LanceDBConnection(
+        uri,
+        read_consistency_interval=read_consistency_interval,
+    )


 async def connect_async(
--- a/python/python/lancedb/fts.py
+++ b/python/python/lancedb/fts.py
@@ -29,7 +29,10 @@ from .table import LanceTable


 def create_index(
-    index_path: str, text_fields: List[str], ordering_fields: List[str] = None
+    index_path: str,
+    text_fields: List[str],
+    ordering_fields: List[str] = None,
+    tokenizer_name: str = "default",
 ) -> tantivy.Index:
    """
    Create a new Index (not populated)
@@ -42,6 +45,8 @@ def create_index(
        List of text fields to index
    ordering_fields: List[str]
        List of unsigned type fields to order by at search time
+    tokenizer_name : str, default "default"
+        The tokenizer to use

    Returns
    -------
@@ -56,7 +61,7 @@ def create_index(
    schema_builder.add_integer_field("doc_id", stored=True)
    # data fields
    for name in text_fields:
-        schema_builder.add_text_field(name, stored=True)
+        schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
    if ordering_fields:
        for name in ordering_fields:
            schema_builder.add_unsigned_field(name, fast=True)
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -117,6 +117,8 @@ class Query(pydantic.BaseModel):

    with_row_id: bool = False

+    fast_search: bool = False
+

 class LanceQueryBuilder(ABC):
    """An abstract query builder. Subclasses are defined for vector search,
@@ -125,12 +127,14 @@ class LanceQueryBuilder(ABC):

    @classmethod
    def create(
-        cls,
-        table: "Table",
-        query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
-        query_type: str,
-        vector_column_name: str,
-        ordering_field_name: str = None,
+            cls,
+            table: "Table",
+            query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
+            query_type: str,
+            vector_column_name: str,
+            ordering_field_name: Optional[str] = None,
+            fts_columns: Union[str, List[str]] = [],
+            fast_search: bool = False,
    ) -> LanceQueryBuilder:
        """
        Create a query builder based on the given query and query type.
@@ -147,13 +151,18 @@ class LanceQueryBuilder(ABC):
            If "auto", the query type is inferred based on the query.
        vector_column_name: str
            The name of the vector column to use for vector search.
+        fast_search: bool
+            Skip flat search of unindexed data.
        """
-        if query is None:
-            return LanceEmptyQueryBuilder(table)
-
+        # Check hybrid search first as it supports empty query pattern
        if query_type == "hybrid":
            # hybrid fts and vector query
-            return LanceHybridQueryBuilder(table, query, vector_column_name)
+            return LanceHybridQueryBuilder(
+                table, query, vector_column_name, fts_columns=fts_columns
+            )
+
+        if query is None:
+            return LanceEmptyQueryBuilder(table)

        # remember the string query for reranking purpose
        str_query = query if isinstance(query, str) else None
@@ -165,12 +174,17 @@ class LanceQueryBuilder(ABC):
        )

        if query_type == "hybrid":
-            return LanceHybridQueryBuilder(table, query, vector_column_name)
+            return LanceHybridQueryBuilder(
+                table, query, vector_column_name, fts_columns=fts_columns
+            )

        if isinstance(query, str):
            # fts
            return LanceFtsQueryBuilder(
-                table, query, ordering_field_name=ordering_field_name
+                table,
+                query,
+                ordering_field_name=ordering_field_name,
+                fts_columns=fts_columns,
            )

        if isinstance(query, list):
@@ -180,7 +194,9 @@ class LanceQueryBuilder(ABC):
        else:
            raise TypeError(f"Unsupported query type: {type(query)}")

-        return LanceVectorQueryBuilder(table, query, vector_column_name, str_query)
+        return LanceVectorQueryBuilder(
+            table, query, vector_column_name, str_query, fast_search
+        )

    @classmethod
    def _resolve_query(cls, table, query, query_type, vector_column_name):
@@ -196,8 +212,6 @@ class LanceQueryBuilder(ABC):
        elif query_type == "auto":
            if isinstance(query, (list, np.ndarray)):
                return query, "vector"
-            if isinstance(query, tuple):
-                return query, "hybrid"
            else:
                conf = table.embedding_functions.get(vector_column_name)
                if conf is not None:
@@ -224,9 +238,14 @@ class LanceQueryBuilder(ABC):
    def __init__(self, table: "Table"):
        self._table = table
        self._limit = 10
+        self._offset = 0
        self._columns = None
        self._where = None
+        self._prefilter = False
        self._with_row_id = False
+        self._vector = None
+        self._text = None
+        self._ef = None

    @deprecation.deprecated(
        deprecated_in="0.3.1",
@@ -337,11 +356,13 @@ class LanceQueryBuilder(ABC):
        ----------
        limit: int
            The maximum number of results to return.
-            By default the query is limited to the first 10.
-            Call this method and pass 0, a negative value,
-            or None to remove the limit.
-            *WARNING* if you have a large dataset, removing
-            the limit can potentially result in reading a
+            The default query limit is 10 results.
+            For ANN/KNN queries, you must specify a limit.
+            Passing 0, a negative number, or None raises ValueError
+            for vector queries; other builders clear the stored limit.
+            *WARNING* if you have a large dataset, setting
+            the limit to a large number, e.g. the table size,
+            can potentially result in reading a
            large amount of data into memory and cause
            out of memory issues.

@@ -351,11 +372,33 @@ class LanceQueryBuilder(ABC):
            The LanceQueryBuilder object.
        """
        if limit is None or limit <= 0:
-            self._limit = None
+            if isinstance(self, LanceVectorQueryBuilder):
+                raise ValueError("Limit is required for ANN/KNN queries")
+            else:
+                self._limit = None
        else:
            self._limit = limit
        return self

+    def offset(self, offset: int) -> LanceQueryBuilder:
+        """Set the offset for the results.
+
+        Parameters
+        ----------
+        offset: int
+            The offset to start fetching results from.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        if offset is None or offset <= 0:
+            self._offset = 0
+        else:
+            self._offset = offset
+        return self
+
    def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
        """Set the columns to return.

@@ -417,6 +460,80 @@ class LanceQueryBuilder(ABC):
        self._with_row_id = with_row_id
        return self

+    def explain_plan(self, verbose: Optional[bool] = False) -> str:
+        """Return the execution plan for this query.
+
+        Examples
+        --------
+        >>> import lancedb
+        >>> db = lancedb.connect("./.lancedb")
+        >>> table = db.create_table("my_table", [{"vector": [99, 99]}])
+        >>> query = [100, 100]
+        >>> plan = table.search(query).explain_plan(True)
+        >>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+        ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
+        GlobalLimitExec: skip=0, fetch=10
+          FilterExec: _distance@2 IS NOT NULL
+            SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
+              KNNVectorDistance: metric=l2
+                LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
+
+        Parameters
+        ----------
+        verbose : bool, default False
+            Use a verbose output format.
+
+        Returns
+        -------
+        plan : str
+        """  # noqa: E501
+        ds = self._table.to_lance()
+        return ds.scanner(
+            nearest={
+                "column": self._vector_column,
+                "q": self._query,
+                "k": self._limit,
+                "metric": self._metric,
+                "nprobes": self._nprobes,
+                "refine_factor": self._refine_factor,
+            },
+            prefilter=self._prefilter,
+            filter=self._str_query,
+            limit=self._limit,
+            with_row_id=self._with_row_id,
+            offset=self._offset,
+        ).explain_plan(verbose)
+
+    def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
+        """Set the vector to search for.
+
+        Parameters
+        ----------
+        vector: np.ndarray or list
+            The vector to search for.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        raise NotImplementedError
+
+    def text(self, text: str) -> LanceQueryBuilder:
+        """Set the text to search for.
+
+        Parameters
+        ----------
+        text: str
+            The text to search for.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        raise NotImplementedError
+

 class LanceVectorQueryBuilder(LanceQueryBuilder):
    """
@@ -440,11 +557,12 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
    """

    def __init__(
-        self,
-        table: "Table",
-        query: Union[np.ndarray, list, "PIL.Image.Image"],
-        vector_column: str,
-        str_query: Optional[str] = None,
+            self,
+            table: "Table",
+            query: Union[np.ndarray, list, "PIL.Image.Image"],
+            vector_column: str,
+            str_query: Optional[str] = None,
+            fast_search: bool = False,
    ):
        super().__init__(table)
        self._query = query
@@ -455,13 +573,14 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        self._prefilter = False
        self._reranker = None
        self._str_query = str_query
+        self._fast_search = fast_search

-    def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
+    def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
        """Set the distance metric to use.

        Parameters
        ----------
-        metric: "L2" or "cosine"
+        metric: "L2" or "cosine" or "dot"
            The distance metric to use. By default "L2" is used.

        Returns
@@ -469,7 +588,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
-        self._metric = metric
+        self._metric = metric.lower()
        return self

    def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
@@ -494,6 +613,28 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        self._nprobes = nprobes
        return self

+    def ef(self, ef: int) -> LanceVectorQueryBuilder:
+        """Set the number of candidates to consider during search.
+
+        Higher values will yield better recall (more likely to find vectors if
+        they exist) at the expense of latency.
+
+        This only applies to the HNSW-related index.
+        The default value is 1.5 * limit.
+
+        Parameters
+        ----------
+        ef: int
+            The number of candidates to consider during search.
+
+        Returns
+        -------
+        LanceVectorQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        self._ef = ef
+        return self
+
    def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
        """Set the refine factor to use, increasing the number of vectors sampled.

@@ -554,15 +695,11 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
            refine_factor=self._refine_factor,
            vector_column=self._vector_column,
            with_row_id=self._with_row_id,
+            offset=self._offset,
+            fast_search=self._fast_search,
+            ef=self._ef,
        )
        result_set = self._table._execute_query(query, batch_size)
-        if self._reranker is not None:
-            rs_table = result_set.read_all()
-            result_set = self._reranker.rerank_vector(self._str_query, rs_table)
-            # convert result_set back to RecordBatchReader
-            result_set = pa.RecordBatchReader.from_batches(
-                result_set.schema, result_set.to_batches()
-            )

        return result_set

@@ -591,7 +728,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        return self

    def rerank(
-        self, reranker: Reranker, query_string: Optional[str] = None
+            self, reranker: Reranker, query_string: Optional[str] = None
    ) -> LanceVectorQueryBuilder:
        """Rerank the results using the specified reranker.

@@ -756,12 +893,34 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):

 class LanceEmptyQueryBuilder(LanceQueryBuilder):
    def to_arrow(self) -> pa.Table:
-        ds = self._table.to_lance()
-        return ds.to_table(
+        return self.to_batches().read_all()
+
+    def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
+        query = Query(
            columns=self._columns,
            filter=self._where,
-            limit=self._limit,
+            k=self._limit or 10,
+            with_row_id=self._with_row_id,
+            vector=[],
+            # not actually respected in remote query
+            offset=self._offset or 0,
        )
+        return self._table._execute_query(query)
+
+    def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
+        """Rerank the results using the specified reranker.
+
+        Parameters
+        ----------
+        reranker: Reranker
+            The reranker to use.
+
+        Returns
+        -------
+        LanceEmptyQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        raise NotImplementedError("Reranking is not yet supported.")


 class LanceHybridQueryBuilder(LanceQueryBuilder):
--- a/python/python/lancedb/remote/client.py
+++ b/python/python/lancedb/remote/client.py
@@ -55,11 +55,13 @@ class RestfulLanceDBClient:
    region: str
    api_key: Credential
    host_override: Optional[str] = attrs.field(default=None)
+    db_prefix: Optional[str] = attrs.field(default=None)

    closed: bool = attrs.field(default=False, init=False)

    connection_timeout: float = attrs.field(default=120.0, kw_only=True)
    read_timeout: float = attrs.field(default=300.0, kw_only=True)
+    storage_options: Optional[Dict[str, str]] = attrs.field(default=None, kw_only=True)

    @functools.cached_property
    def session(self) -> requests.Session:
@@ -92,6 +94,18 @@ class RestfulLanceDBClient:
            headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
        if self.host_override:
            headers["x-lancedb-database"] = self.db_name
+        if self.storage_options:
+            if self.storage_options.get("account_name") is not None:
+                headers["x-azure-storage-account-name"] = self.storage_options[
+                    "account_name"
+                ]
+            if self.storage_options.get("azure_storage_account_name") is not None:
+                headers["x-azure-storage-account-name"] = self.storage_options[
+                    "azure_storage_account_name"
+                ]
+        if self.db_prefix:
+            headers["x-lancedb-database-prefix"] = self.db_prefix
+
        return headers

    @staticmethod
@@ -158,6 +172,7 @@ class RestfulLanceDBClient:
            headers["content-type"] = content_type
        if request_id is not None:
            headers["x-request-id"] = request_id
+
        with self.session.post(
            urljoin(self.url, uri),
            headers=headers,
@@ -245,7 +260,6 @@ def retry_adapter(options: Dict[str, Any]) -> HTTPAdapter:
            connect=connect_retries,
            read=read_retries,
            backoff_factor=backoff_factor,
-            backoff_jitter=backoff_jitter,
            status_forcelist=statuses,
            allowed_methods=methods,
        )
--- a/python/python/lancedb/remote/db.py
+++ b/python/python/lancedb/remote/db.py
@@ -15,7 +15,7 @@ import inspect
 import logging
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Union
 from urllib.parse import urlparse

 from cachetools import TTLCache
@@ -44,20 +44,25 @@ class RemoteDBConnection(DBConnection):
        request_thread_pool: Optional[ThreadPoolExecutor] = None,
        connection_timeout: float = 120.0,
        read_timeout: float = 300.0,
+        storage_options: Optional[Dict[str, str]] = None,
    ):
        """Connect to a remote LanceDB database."""
        parsed = urlparse(db_url)
        if parsed.scheme != "db":
            raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
        self.db_name = parsed.netloc
+        prefix = parsed.path.lstrip("/")
+        self.db_prefix = None if not prefix else prefix
        self.api_key = api_key
        self._client = RestfulLanceDBClient(
            self.db_name,
            region,
            api_key,
            host_override,
+            self.db_prefix,
            connection_timeout=connection_timeout,
            read_timeout=read_timeout,
+            storage_options=storage_options,
        )
        self._request_thread_pool = request_thread_pool
        self._table_cache = TTLCache(maxsize=10000, ttl=300)
--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -15,13 +15,14 @@ import logging
 import uuid
 from concurrent.futures import Future
 from functools import cached_property
-from typing import Dict, Iterable, Optional, Union
+from typing import Dict, Iterable, Optional, Union, Literal

 import pyarrow as pa
 from lance import json_to_schema

 from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
 from lancedb.merge import LanceMergeInsertBuilder
+from lancedb.query import LanceQueryBuilder

 from ..query import LanceVectorQueryBuilder
 from ..table import Query, Table, _sanitize_data
@@ -81,6 +82,7 @@ class RemoteTable(Table):
    def create_scalar_index(
        self,
        column: str,
+        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
    ):
        """Creates a scalar index
        Parameters
@@ -89,8 +91,6 @@ class RemoteTable(Table):
            The column to be indexed.  Must be a boolean, integer, float,
            or string column.
        """
-        index_type = "scalar"
-
        data = {
            "column": column,
            "index_type": index_type,
@@ -228,10 +228,21 @@ class RemoteTable(Table):
            content_type=ARROW_STREAM_CONTENT_TYPE,
        )

+    def query(
+        self,
+        query: Union[VEC, str] = None,
+        query_type: str = "vector",
+        vector_column_name: Optional[str] = None,
+        fast_search: bool = False,
+    ) -> LanceVectorQueryBuilder:
+        return self.search(query, query_type, vector_column_name, fast_search)
+
    def search(
        self,
-        query: Union[VEC, str],
+        query: Union[VEC, str] = None,
+        query_type: str = "vector",
        vector_column_name: Optional[str] = None,
+        fast_search: bool = False,
    ) -> LanceVectorQueryBuilder:
        """Create a search query to find the nearest neighbors
        of the given query vector. We currently support [vector search][search]
@@ -278,6 +289,11 @@ class RemoteTable(Table):
            - If the table has multiple vector columns then the *vector_column_name*
            needs to be specified. Otherwise, an error is raised.

+        fast_search: bool, optional
+            Skip a flat search of unindexed data. This may improve
+            search performance but search results will not include unindexed data.
+
+            - *default False*.
        Returns
        -------
        LanceQueryBuilder
@@ -293,7 +309,14 @@ class RemoteTable(Table):
        """
        if vector_column_name is None:
            vector_column_name = inf_vector_column_query(self.schema)
-        return LanceVectorQueryBuilder(self, query, vector_column_name)
+
+        return LanceQueryBuilder.create(
+            self,
+            query,
+            query_type,
+            vector_column_name=vector_column_name,
+            fast_search=fast_search,
+        )

    def _execute_query(
        self, query: Query, batch_size: Optional[int] = None
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -1171,6 +1171,7 @@ class LanceTable(Table):
        *,
        replace: bool = False,
        writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
+        tokenizer_name: str = "default",
    ):
        """Create a full-text search index on the table.

@@ -1189,6 +1190,10 @@ class LanceTable(Table):
        ordering_field_names:
            A list of unsigned type fields to index to optionally order
            results on at search time
+        tokenizer_name: str, default "default"
+            The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
+            language code followed by "_stem". So for english it would be "en_stem".
+            For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
        """
        from .fts import create_index, populate_index

@@ -1214,6 +1219,7 @@ class LanceTable(Table):
            self._get_fts_index_path(),
            field_names,
            ordering_fields=ordering_field_names,
+            tokenizer_name=tokenizer_name,
        )
        populate_index(
            index,
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -66,6 +66,17 @@ def test_create_index(tmp_path):
    assert os.path.exists(str(tmp_path / "index"))


+def test_create_index_with_stemming(tmp_path, table):
+    index = ldb.fts.create_index(
+        str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
+    )
+    assert isinstance(index, tantivy.Index)
+    assert os.path.exists(str(tmp_path / "index"))
+
+    # Check stemming by running tokenizer on non empty table
+    table.create_fts_index("text", tokenizer_name="en_stem")
+
+
 def test_populate_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
--- a/python/python/tests/test_remote_db.py
+++ b/python/python/tests/test_remote_db.py
@@ -21,6 +21,7 @@ class FakeLanceDBClient:
        pass

    def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
+        print(f"{query=}")
        assert table_name == "test"
        t = pa.schema([]).empty_table()
        return VectorQueryResult(t)
@@ -39,3 +40,21 @@ def test_remote_db():
    table = conn["test"]
    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
    table.search([1.0, 2.0]).to_pandas()
+
+
+def test_empty_query_with_filter():
+    conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
+    setattr(conn, "_client", FakeLanceDBClient())
+
+    table = conn["test"]
+    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
+    print(table.query().select(["vector"]).where("foo == bar").to_arrow())
+
+
+def test_fast_search_query_with_filter():
+    conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
+    setattr(conn, "_client", FakeLanceDBClient())
+
+    table = conn["test"]
+    table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
+    print(table.query([0, 0], fast_search=True).select(["vector"]).where("foo == bar").to_arrow())
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -735,7 +735,7 @@ def test_create_scalar_index(db):
    indices = table.to_lance().list_indices()
    assert len(indices) == 1
    scalar_index = indices[0]
-    assert scalar_index["type"] == "Scalar"
+    assert scalar_index["type"] == "BTree"

    # Confirm that prefiltering still works with the scalar index column
    results = table.search().where("x = 'c'").to_arrow()
--- a/rust/ffi/node/Cargo.toml
+++ b/rust/ffi/node/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.5.2"
+version = "0.5.2-final.1"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
--- a/rust/ffi/node/src/table.rs
+++ b/rust/ffi/node/src/table.rs
@@ -463,6 +463,7 @@ impl JsTable {
        Ok(promise)
    }

+    #[allow(deprecated)]
    pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
        let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
        let rt = runtime(&mut cx)?;
--- a/rust/lancedb/Cargo.toml
+++ b/rust/lancedb/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.5.2"
+version = "0.5.2-final.1"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
--- a/rust/lancedb/src/index.rs
+++ b/rust/lancedb/src/index.rs
@@ -80,6 +80,8 @@ pub enum IndexType {

 /// A description of an index currently configured on a column
 pub struct IndexConfig {
+    /// The name of the index
+    pub name: String,
    /// The type of the index
    pub index_type: IndexType,
    /// The columns in the index
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -1206,28 +1206,36 @@ impl NativeTable {
            .await)
    }

+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
    pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
+        #[allow(deprecated)]
        match self.load_index_stats(index_uuid).await? {
            Some(stats) => Ok(Some(stats.num_indexed_rows)),
            None => Ok(None),
        }
    }

+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
    pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
+        #[allow(deprecated)]
        match self.load_index_stats(index_uuid).await? {
            Some(stats) => Ok(Some(stats.num_unindexed_rows)),
            None => Ok(None),
        }
    }

+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
    pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> {
+        #[allow(deprecated)]
        match self.load_index_stats(index_uuid).await? {
            Some(stats) => Ok(Some(stats.index_type.unwrap_or_default())),
            None => Ok(None),
        }
    }

+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
    pub async fn get_distance_type(&self, index_uuid: &str) -> Result<Option<String>> {
+        #[allow(deprecated)]
        match self.load_index_stats(index_uuid).await? {
            Some(stats) => Ok(Some(
                stats
@@ -1240,16 +1248,8 @@ impl NativeTable {
        }
    }

-    pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
-        let dataset = self.dataset.get().await?;
-        let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
-        Ok(indices
-            .iter()
-            .map(|i| VectorIndex::new_from_format(&mf, i))
-            .collect())
-    }
-
-    async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
+    #[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
+    pub async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
        let index = self
            .load_indices()
            .await?
@@ -1268,6 +1268,35 @@ impl NativeTable {
        Ok(Some(index_stats))
    }

+    /// Get statistics about an index.
+    /// Returns `Ok(None)` if the lookup fails (e.g. unknown index); errors only on bad stats JSON.
+    pub async fn index_stats<S: AsRef<str>>(
+        &self,
+        index_name: S,
+    ) -> Result<Option<IndexStatistics>> {
+        self.dataset
+            .get()
+            .await?
+            .index_statistics(index_name.as_ref())
+            .await
+            .ok()
+            .map(|stats| {
+                serde_json::from_str(&stats).map_err(|e| Error::InvalidInput {
+                    message: format!("error deserializing index statistics: {}", e),
+                })
+            })
+            .transpose()
+    }
+
+    pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
+        let dataset = self.dataset.get().await?;
+        let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
+        Ok(indices
+            .iter()
+            .map(|i| VectorIndex::new_from_format(&mf, i))
+            .collect())
+    }
+
    async fn create_ivf_pq_index(
        &self,
        index: IvfPqIndexBuilder,
@@ -1860,12 +1889,20 @@ impl TableInternal for NativeTable {
                }
                columns.push(field.name.clone());
            }
-            Ok(IndexConfig { index_type: if is_vector { crate::index::IndexType::IvfPq } else { crate::index::IndexType::BTree }, columns })
+            let index_type = if is_vector {
+                crate::index::IndexType::IvfPq
+            } else {
+                crate::index::IndexType::BTree
+            };
+
+            let name = idx.name.clone();
+            Ok(IndexConfig { index_type, columns, name })
        }).collect::<Result<Vec<_>>>()
    }
 }

 #[cfg(test)]
+#[allow(deprecated)]
 mod tests {
    use std::iter;
    use std::sync::atomic::{AtomicBool, Ordering};
Author	SHA1	Message	Date
Lance Release	1884fe8a3e	Bump version: 0.9.0-beta.7 → 0.9.0-beta.8	2025-02-26 15:03:57 +00:00
Ryan Green	d8111b259c	Merge remote-tracking branch 'origin/python-v0.9.4-patch' into python-v0.9.4-patch	2025-02-26 11:31:34 -03:30
Ryan Green	3c74bf5c7a	Pin chrono version	2025-02-26 11:31:29 -03:30
Lance Release	b64bb75a82	Bump version: 0.9.0-beta.6 → 0.9.0-beta.7	2025-02-26 13:29:54 +00:00
Ryan Green	93e03ec702	revert worfklow	2025-02-26 09:56:08 -03:30
Ryan Green	7a94a7e171	Merge remote-tracking branch 'origin/python-v0.9.4-patch' into python-v0.9.4-patch	2025-02-26 09:52:55 -03:30
Ryan Green	acae6522fb	workaround "edition2024" issue	2025-02-26 09:52:48 -03:30
Lance Release	005d5b64ac	Bump version: 0.5.2 → 0.5.2-final.1	2025-02-26 13:05:01 +00:00
Lance Release	1e89d07fe2	Bump version: 0.9.0-beta.5 → 0.9.0-beta.6	2025-02-26 13:04:48 +00:00
Ryan Green	1da55719e7	fix windows workflow	2025-02-26 09:33:42 -03:30
Ryan Green	9d0ca5a823	merge PyPi Publish workflow from main	2025-02-26 09:31:18 -03:30
Lance Release	1e0cc69401	Bump version: 0.9.0-beta.4 → 0.9.0-beta.5	2025-02-26 12:46:00 +00:00
Ryan Green	f31e0c749d	hotfix: add support for scalar index type in remote table	2025-02-26 09:13:30 -03:30
Lance Release	7a3ef68306	Bump version: 0.9.0-beta.3 → 0.9.0-beta.4	2024-12-20 16:02:53 +00:00
Ryan Green	43952e01d7	bump version	2024-12-20 09:44:46 -06:00
Ryan Green	495c335831	Fix fast_search	2024-12-20 09:43:39 -06:00
Ryan Green	77707db543	Backport fast_search and empty query builder for remote table	2024-12-20 09:21:05 -06:00
Ryan Green	d6d7ad3b06	bump version	2024-12-18 10:21:04 -06:00
Ryan Green	e58d64c286	Remove unsupported Retry params	2024-12-18 10:08:38 -06:00
Ryan Green	76cbd18c46	bump version	2024-12-18 09:38:36 -06:00
Ryan Green	4abb38ac70	bump version	2024-12-18 09:37:58 -06:00
Ryan Green	cc7bc5011d	Merge remote-tracking branch 'origin/python-v0.9.0-patch' into python-v0.9.0-patch # Conflicts: # python/pyproject.toml	2024-12-18 08:59:35 -06:00
Ryan Green	8193183304	override urllib3 version	2024-12-18 08:59:24 -06:00
Ryan Green	cf28b58b7d	override urllib3 version	2024-12-18 08:58:41 -06:00
Lance Release	e3b7ee47b9	Bump version: 0.9.0 → 0.9.0-final.1	2024-12-13 01:16:24 +00:00
Lu Qiu	97c9c906e4	Fix version test	2024-12-12 17:10:07 -08:00
Lu Qiu	358f86b9c6	fix	2024-12-12 16:44:24 -08:00
Lu Qiu	5489e215a3	Support storage options and folder prefix	2024-12-12 16:17:34 -08:00
Lance Release	bc0814767b	Bump version: 0.9.0-beta.0 → 0.9.0	2024-06-25 00:25:27 +00:00
Lance Release	8960a8e535	Bump version: 0.8.2 → 0.9.0-beta.0	2024-06-25 00:25:27 +00:00
Weston Pace	a8568ddc72	feat: upgrade to lance 0.13.0 (#1404 )	2024-06-24 17:22:57 -07:00
Cory Grinstead	55f88346d0	feat(nodejs): table.indexStats (#1361 ) closes https://github.com/lancedb/lancedb/issues/1359	2024-06-21 17:06:52 -05:00
Will Jones	dfb9a28795	ci(node): add description and keywords for lancedb package (#1398 )	2024-06-21 14:43:35 -07:00
Cory Grinstead	a797f5fe59	feat(nodejs): feature parity [5/N] - add `query.filter()` alias (#1391 ) to make the transition from `vectordb` to `@lancedb/lancedb` as seamless as possible, this adds `query.filter` with a deprecated tag. depends on https://github.com/lancedb/lancedb/pull/1390 see actual diff here https://github.com/universalmind303/lancedb/compare/list-indices-name...universalmind303:query-filter	2024-06-21 16:03:58 -05:00
Cory Grinstead	3cd84c9375	feat(nodejs): feature parity [4/N] - add 'name' to 'IndexConfig' for 'listIndices' (#1390 ) depends on https://github.com/lancedb/lancedb/pull/1386 see actual diff here https://github.com/universalmind303/lancedb/compare/create-table-args...universalmind303:list-indices-name	2024-06-21 15:45:02 -05:00
Cory Grinstead	5ca83fdc99	fix(node): node build (#1396 ) i have no idea why this fixes the build.	2024-06-21 15:42:22 -05:00
Cory Grinstead	33cc9b682f	feat(nodejs): feature parity [3/N] - `createTable({name, data, ...options})` (#1386 ) adds support for the `vectordb` syntax of `createTable({name, data, ...options})`. depends on https://github.com/lancedb/lancedb/pull/1380 see actual diff here https://github.com/universalmind303/lancedb/compare/table-name...universalmind303:create-table-args	2024-06-21 12:17:39 -05:00
Cory Grinstead	b3e5ac6d2a	feat(nodejs): feature parity [2/N] - add `table.name` and `lancedb.connect({args})` (#1380 ) depends on https://github.com/lancedb/lancedb/pull/1378 see proper diff here https://github.com/universalmind303/lancedb/compare/remote-table-node...universalmind303:lancedb:table-name	2024-06-21 11:38:26 -05:00
josca42	0fe844034d	feat: enable stemming (#1356 ) Added the ability to specify tokenizer_name, when creating a full text search index using tantivy. This enables the use of language specific stemming. Also updated the [guide on full text search](https://lancedb.github.io/lancedb/fts/) with a short section on choosing tokenizer. Fixes #1315	2024-06-20 14:23:55 -07:00