mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
54 Commits
v0.21.0
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f79295c697 | ||
|
|
381fad9b65 | ||
|
|
055bf91d3e | ||
|
|
050f0086b8 | ||
|
|
10fa23e0d6 | ||
|
|
43d9fc28b0 | ||
|
|
f45f0d0431 | ||
|
|
b9e3c36d82 | ||
|
|
3cd7dd3375 | ||
|
|
12d4ce4cfe | ||
|
|
3d1f102087 | ||
|
|
81afd8a42f | ||
|
|
c2aa03615a | ||
|
|
d2c6759e7f | ||
|
|
94fb9f364a | ||
|
|
fbff244ed8 | ||
|
|
7e7466d224 | ||
|
|
cceaf27d79 | ||
|
|
7a15337e03 | ||
|
|
96c66fd087 | ||
|
|
0579303602 | ||
|
|
75edb8756c | ||
|
|
88283110f4 | ||
|
|
b3a637fdeb | ||
|
|
ce24457531 | ||
|
|
087fe6343d | ||
|
|
ab8cbe62dd | ||
|
|
f076bb41f4 | ||
|
|
902fb83d54 | ||
|
|
779118339f | ||
|
|
03b62599d7 | ||
|
|
4c999fb651 | ||
|
|
6d23d32ab5 | ||
|
|
704cec34e1 | ||
|
|
a300a238db | ||
|
|
a41ff1df0a | ||
|
|
77b005d849 | ||
|
|
167fccc427 | ||
|
|
2bffbcefa5 | ||
|
|
905552f993 | ||
|
|
e4898c9313 | ||
|
|
cab36d94b2 | ||
|
|
b64252d4fd | ||
|
|
6fc006072c | ||
|
|
d4bb59b542 | ||
|
|
6b2dd6de51 | ||
|
|
dbccd9e4f1 | ||
|
|
b12ebfed4c | ||
|
|
1dadb2aefa | ||
|
|
eb9784d7f2 | ||
|
|
ba755626cc | ||
|
|
7760799cb8 | ||
|
|
4beb2d2877 | ||
|
|
a00b8595d1 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.21.0-beta.0"
|
||||
current_version = "0.21.2-beta.1"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
10
.github/workflows/cargo-publish.yml
vendored
10
.github/workflows/cargo-publish.yml
vendored
@@ -5,8 +5,8 @@ on:
|
||||
tags-ignore:
|
||||
# We don't publish pre-releases for Rust. Crates.io is just a source
|
||||
# distribution, so we don't need to publish pre-releases.
|
||||
- 'v*-beta*'
|
||||
- '*-v*' # for example, python-vX.Y.Z
|
||||
- "v*-beta*"
|
||||
- "*-v*" # for example, python-vX.Y.Z
|
||||
|
||||
env:
|
||||
# This env var is used by Swatinem/rust-cache@v2 for the cache
|
||||
@@ -19,6 +19,8 @@ env:
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
id-token: write
|
||||
timeout-minutes: 30
|
||||
# Only runs on tags that matches the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
@@ -31,6 +33,8 @@ jobs:
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
- uses: rust-lang/crates-io-auth-action@v1
|
||||
id: auth
|
||||
- name: Publish the package
|
||||
run: |
|
||||
cargo publish -p lancedb --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
cargo publish -p lancedb --all-features --token ${{ steps.auth.outputs.token }}
|
||||
|
||||
24
CLAUDE.md
Normal file
24
CLAUDE.md
Normal file
@@ -0,0 +1,24 @@
|
||||
LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
|
||||
It is a wrapper around Lance. There are two backends: local (in-process like SQLite) and
|
||||
remote (against LanceDB Cloud).
|
||||
|
||||
The core of LanceDB is written in Rust. There are bindings in Python, Typescript, and Java.
|
||||
|
||||
Project layout:
|
||||
|
||||
* `rust/lancedb`: The LanceDB core Rust implementation.
|
||||
* `python`: The Python bindings, using PyO3.
|
||||
* `nodejs`: The Typescript bindings, using napi-rs
|
||||
* `java`: The Java bindings
|
||||
|
||||
(`rust/ffi` and `node/` are for a deprecated package. You can ignore them.)
|
||||
|
||||
Common commands:
|
||||
|
||||
* Check for compiler errors: `cargo check --features remote --tests --examples`
|
||||
* Run tests: `cargo test --features remote --tests`
|
||||
* Run specific test: `cargo test --features remote -p <package_name> --test <test_name>`
|
||||
* Lint: `cargo clippy --features remote --tests --examples`
|
||||
* Format: `cargo fmt --all`
|
||||
|
||||
Before committing changes, run formatting.
|
||||
513
Cargo.lock
generated
513
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
32
Cargo.toml
32
Cargo.toml
@@ -21,14 +21,14 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.30.0", "features" = ["dynamodb"] }
|
||||
lance-io = "=0.30.0"
|
||||
lance-index = "=0.30.0"
|
||||
lance-linalg = "=0.30.0"
|
||||
lance-table = "=0.30.0"
|
||||
lance-testing = "=0.30.0"
|
||||
lance-datafusion = "=0.30.0"
|
||||
lance-encoding = "=0.30.0"
|
||||
lance = { "version" = "=0.32.0", "features" = ["dynamodb"] }
|
||||
lance-io = "=0.32.0"
|
||||
lance-index = "=0.32.0"
|
||||
lance-linalg = "=0.32.0"
|
||||
lance-table = "=0.32.0"
|
||||
lance-testing = "=0.32.0"
|
||||
lance-datafusion = "=0.32.0"
|
||||
lance-encoding = "=0.32.0"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "55.1", optional = false }
|
||||
arrow-array = "55.1"
|
||||
@@ -39,20 +39,20 @@ arrow-schema = "55.1"
|
||||
arrow-arith = "55.1"
|
||||
arrow-cast = "55.1"
|
||||
async-trait = "0"
|
||||
datafusion = { version = "47.0", default-features = false }
|
||||
datafusion-catalog = "47.0"
|
||||
datafusion-common = { version = "47.0", default-features = false }
|
||||
datafusion-execution = "47.0"
|
||||
datafusion-expr = "47.0"
|
||||
datafusion-physical-plan = "47.0"
|
||||
datafusion = { version = "48.0", default-features = false }
|
||||
datafusion-catalog = "48.0"
|
||||
datafusion-common = { version = "48.0", default-features = false }
|
||||
datafusion-execution = "48.0"
|
||||
datafusion-expr = "48.0"
|
||||
datafusion-physical-plan = "48.0"
|
||||
env_logger = "0.11"
|
||||
half = { "version" = "=2.5.0", default-features = false, features = [
|
||||
half = { "version" = "2.6.0", default-features = false, features = [
|
||||
"num-traits",
|
||||
] }
|
||||
futures = "0"
|
||||
log = "0.4"
|
||||
moka = { version = "0.12", features = ["future"] }
|
||||
object_store = "0.11.0"
|
||||
object_store = "0.12.0"
|
||||
pin-project = "1.0.7"
|
||||
snafu = "0.8"
|
||||
url = "2"
|
||||
|
||||
@@ -47,10 +47,10 @@ def extract_features(line: str) -> list:
|
||||
"""
|
||||
import re
|
||||
|
||||
match = re.search(r'"features"\s*=\s*\[(.*?)\]', line)
|
||||
match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
|
||||
if match:
|
||||
features_str = match.group(1)
|
||||
return [f.strip('"') for f in features_str.split(",")]
|
||||
return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
|
||||
return []
|
||||
|
||||
|
||||
@@ -63,10 +63,24 @@ def update_cargo_toml(line_updater):
|
||||
lines = f.readlines()
|
||||
|
||||
new_lines = []
|
||||
lance_line = ""
|
||||
is_parsing_lance_line = False
|
||||
for line in lines:
|
||||
if line.startswith("lance"):
|
||||
# Update the line using the provided function
|
||||
new_lines.append(line_updater(line))
|
||||
if line.strip().endswith("}"):
|
||||
new_lines.append(line_updater(line))
|
||||
else:
|
||||
lance_line = line
|
||||
is_parsing_lance_line = True
|
||||
elif is_parsing_lance_line:
|
||||
lance_line += line
|
||||
if line.strip().endswith("}"):
|
||||
new_lines.append(line_updater(lance_line))
|
||||
lance_line = ""
|
||||
is_parsing_lance_line = False
|
||||
else:
|
||||
print("doesn't end with }:", line)
|
||||
else:
|
||||
# Keep the line unchanged
|
||||
new_lines.append(line)
|
||||
|
||||
12
docs/package-lock.json
generated
12
docs/package-lock.json
generated
@@ -19,7 +19,7 @@
|
||||
},
|
||||
"../node": {
|
||||
"name": "vectordb",
|
||||
"version": "0.12.0",
|
||||
"version": "0.21.2-beta.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -65,11 +65,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.12.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.12.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.12.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.12.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
# SQL Querying
|
||||
|
||||
You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
|
||||
This guide will show how to query Lance tables them using both.
|
||||
|
||||
We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
|
||||
We will re-use the dataset [created previously](./tables.md):
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
@@ -27,15 +29,10 @@ arrow_table = table.to_lance()
|
||||
duckdb.query("SELECT * FROM arrow_table")
|
||||
```
|
||||
|
||||
```
|
||||
┌─────────────┬─────────┬────────┐
|
||||
│ vector │ item │ price │
|
||||
│ float[] │ varchar │ double │
|
||||
├─────────────┼─────────┼────────┤
|
||||
│ [3.1, 4.1] │ foo │ 10.0 │
|
||||
│ [5.9, 26.5] │ bar │ 20.0 │
|
||||
└─────────────┴─────────┴────────┘
|
||||
```
|
||||
| vector | item | price |
|
||||
| ----------- | ---- | ----- |
|
||||
| [3.1, 4.1] | foo | 10.0 |
|
||||
| [5.9, 26.5] | bar | 20.0 |
|
||||
|
||||
## Querying a LanceDB Table with Apache Datafusion
|
||||
|
||||
@@ -57,12 +54,7 @@ Register the table created with the Datafusion session context.
|
||||
--8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
|
||||
```
|
||||
|
||||
```
|
||||
┌─────────────┬─────────┬────────┐
|
||||
│ vector │ item │ price │
|
||||
│ float[] │ varchar │ double │
|
||||
├─────────────┼─────────┼────────┤
|
||||
│ [3.1, 4.1] │ foo │ 10.0 │
|
||||
│ [5.9, 26.5] │ bar │ 20.0 │
|
||||
└─────────────┴─────────┴────────┘
|
||||
```
|
||||
| vector | item | price |
|
||||
| ----------- | ---- | ----- |
|
||||
| [3.1, 4.1] | foo | 10.0 |
|
||||
| [5.9, 26.5] | bar | 20.0 |
|
||||
|
||||
@@ -41,6 +41,7 @@ Creates an instance of MatchQuery.
|
||||
- `fuzziness`: The fuzziness level for the query (default is 0).
|
||||
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
||||
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
- `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
|
||||
|
||||
* **options.boost?**: `number`
|
||||
|
||||
@@ -50,6 +51,8 @@ Creates an instance of MatchQuery.
|
||||
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
|
||||
* **options.prefixLength?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`MatchQuery`](MatchQuery.md)
|
||||
|
||||
84
docs/src/js/classes/Session.md
Normal file
84
docs/src/js/classes/Session.md
Normal file
@@ -0,0 +1,84 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / Session
|
||||
|
||||
# Class: Session
|
||||
|
||||
A session for managing caches and object stores across LanceDB operations.
|
||||
|
||||
Sessions allow you to configure cache sizes for index and metadata caches,
|
||||
which can significantly impact performance for large datasets.
|
||||
|
||||
## Constructors
|
||||
|
||||
### new Session()
|
||||
|
||||
```ts
|
||||
new Session(indexCacheSizeBytes?, metadataCacheSizeBytes?): Session
|
||||
```
|
||||
|
||||
Create a new session with custom cache sizes.
|
||||
|
||||
# Parameters
|
||||
|
||||
- `index_cache_size_bytes`: The size of the index cache in bytes.
|
||||
Defaults to 6GB if not specified.
|
||||
- `metadata_cache_size_bytes`: The size of the metadata cache in bytes.
|
||||
Defaults to 1GB if not specified.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **indexCacheSizeBytes?**: `null` \| `bigint`
|
||||
|
||||
* **metadataCacheSizeBytes?**: `null` \| `bigint`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`Session`](Session.md)
|
||||
|
||||
## Methods
|
||||
|
||||
### approxNumItems()
|
||||
|
||||
```ts
|
||||
approxNumItems(): number
|
||||
```
|
||||
|
||||
Get the approximate number of items cached in the session.
|
||||
|
||||
#### Returns
|
||||
|
||||
`number`
|
||||
|
||||
***
|
||||
|
||||
### sizeBytes()
|
||||
|
||||
```ts
|
||||
sizeBytes(): bigint
|
||||
```
|
||||
|
||||
Get the current size of the session caches in bytes.
|
||||
|
||||
#### Returns
|
||||
|
||||
`bigint`
|
||||
|
||||
***
|
||||
|
||||
### default()
|
||||
|
||||
```ts
|
||||
static default(): Session
|
||||
```
|
||||
|
||||
Create a session with default cache sizes.
|
||||
|
||||
This is equivalent to creating a session with 6GB index cache
|
||||
and 1GB metadata cache.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`Session`](Session.md)
|
||||
@@ -612,7 +612,7 @@ of the given query
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
the query, a vector or string
|
||||
|
||||
* **queryType?**: `string`
|
||||
@@ -799,7 +799,7 @@ by `query`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -386,6 +386,53 @@ called then every valid row from the table will be returned.
|
||||
|
||||
***
|
||||
|
||||
### maximumNprobes()
|
||||
|
||||
```ts
|
||||
maximumNprobes(maximumNprobes): VectorQuery
|
||||
```
|
||||
|
||||
Set the maximum number of probes used.
|
||||
|
||||
This controls the maximum number of partitions that will be searched. If this
|
||||
number is greater than minimumNprobes then the excess partitions will _only_ be
|
||||
searched if we have not found enough results. This can be useful when there is
|
||||
a narrow filter to allow these queries to spend more time searching and avoid
|
||||
potential false negatives.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **maximumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`VectorQuery`](VectorQuery.md)
|
||||
|
||||
***
|
||||
|
||||
### minimumNprobes()
|
||||
|
||||
```ts
|
||||
minimumNprobes(minimumNprobes): VectorQuery
|
||||
```
|
||||
|
||||
Set the minimum number of probes used.
|
||||
|
||||
This controls the minimum number of partitions that will be searched. This
|
||||
parameter will impact every query against a vector index, regardless of the
|
||||
filter. See `nprobes` for more details. Higher values will increase recall
|
||||
but will also increase latency.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **minimumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`VectorQuery`](VectorQuery.md)
|
||||
|
||||
***
|
||||
|
||||
### nprobes()
|
||||
|
||||
```ts
|
||||
@@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against
|
||||
your actual data to find the smallest possible value that will still give
|
||||
you the desired recall.
|
||||
|
||||
For more fine grained control over behavior when you have a very narrow filter
|
||||
you can use `minimumNprobes` and `maximumNprobes`. This method sets both
|
||||
the minimum and maximum to the same value.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **nprobes**: `number`
|
||||
|
||||
@@ -10,6 +10,7 @@ Enum representing the occurrence of terms in full-text queries.
|
||||
|
||||
- `Must`: The term must be present in the document.
|
||||
- `Should`: The term should contribute to the document score, but is not required.
|
||||
- `MustNot`: The term must not be present in the document.
|
||||
|
||||
## Enumeration Members
|
||||
|
||||
@@ -21,6 +22,14 @@ Must: "MUST";
|
||||
|
||||
***
|
||||
|
||||
### MustNot
|
||||
|
||||
```ts
|
||||
MustNot: "MUST_NOT";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### Should
|
||||
|
||||
```ts
|
||||
|
||||
@@ -6,10 +6,13 @@
|
||||
|
||||
# Function: connect()
|
||||
|
||||
## connect(uri, options)
|
||||
## connect(uri, options, session)
|
||||
|
||||
```ts
|
||||
function connect(uri, options?): Promise<Connection>
|
||||
function connect(
|
||||
uri,
|
||||
options?,
|
||||
session?): Promise<Connection>
|
||||
```
|
||||
|
||||
Connect to a LanceDB instance at the given URI.
|
||||
@@ -29,6 +32,8 @@ Accepted formats:
|
||||
* **options?**: `Partial`<[`ConnectionOptions`](../interfaces/ConnectionOptions.md)>
|
||||
The options to use when connecting to the database
|
||||
|
||||
* **session?**: [`Session`](../classes/Session.md)
|
||||
|
||||
### Returns
|
||||
|
||||
`Promise`<[`Connection`](../classes/Connection.md)>
|
||||
@@ -77,7 +82,7 @@ Accepted formats:
|
||||
|
||||
[ConnectionOptions](../interfaces/ConnectionOptions.md) for more details on the URI format.
|
||||
|
||||
### Example
|
||||
### Examples
|
||||
|
||||
```ts
|
||||
const conn = await connect({
|
||||
@@ -85,3 +90,11 @@ const conn = await connect({
|
||||
storageOptions: {timeout: "60s"}
|
||||
});
|
||||
```
|
||||
|
||||
```ts
|
||||
const session = Session.default();
|
||||
const conn = await connect({
|
||||
uri: "/path/to/database",
|
||||
session: session
|
||||
});
|
||||
```
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
- [Query](classes/Query.md)
|
||||
- [QueryBase](classes/QueryBase.md)
|
||||
- [RecordBatchIterator](classes/RecordBatchIterator.md)
|
||||
- [Session](classes/Session.md)
|
||||
- [Table](classes/Table.md)
|
||||
- [TagContents](classes/TagContents.md)
|
||||
- [Tags](classes/Tags.md)
|
||||
@@ -84,6 +85,7 @@
|
||||
- [FieldLike](type-aliases/FieldLike.md)
|
||||
- [IntoSql](type-aliases/IntoSql.md)
|
||||
- [IntoVector](type-aliases/IntoVector.md)
|
||||
- [MultiVector](type-aliases/MultiVector.md)
|
||||
- [RecordBatchLike](type-aliases/RecordBatchLike.md)
|
||||
- [SchemaLike](type-aliases/SchemaLike.md)
|
||||
- [TableLike](type-aliases/TableLike.md)
|
||||
|
||||
@@ -70,6 +70,17 @@ Defaults to 'us-east-1'.
|
||||
|
||||
***
|
||||
|
||||
### session?
|
||||
|
||||
```ts
|
||||
optional session: Session;
|
||||
```
|
||||
|
||||
(For LanceDB OSS only): the session to use for this connection. Holds
|
||||
shared caches and other session-specific state.
|
||||
|
||||
***
|
||||
|
||||
### storageOptions?
|
||||
|
||||
```ts
|
||||
|
||||
@@ -23,7 +23,7 @@ whether to remove punctuation
|
||||
### baseTokenizer?
|
||||
|
||||
```ts
|
||||
optional baseTokenizer: "raw" | "simple" | "whitespace";
|
||||
optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
|
||||
```
|
||||
|
||||
The tokenizer to use when building the index.
|
||||
@@ -71,6 +71,36 @@ tokens longer than this length will be ignored
|
||||
|
||||
***
|
||||
|
||||
### ngramMaxLength?
|
||||
|
||||
```ts
|
||||
optional ngramMaxLength: number;
|
||||
```
|
||||
|
||||
ngram max length
|
||||
|
||||
***
|
||||
|
||||
### ngramMinLength?
|
||||
|
||||
```ts
|
||||
optional ngramMinLength: number;
|
||||
```
|
||||
|
||||
ngram min length
|
||||
|
||||
***
|
||||
|
||||
### prefixOnly?
|
||||
|
||||
```ts
|
||||
optional prefixOnly: boolean;
|
||||
```
|
||||
|
||||
whether to only index the prefix of the token for ngram tokenizer
|
||||
|
||||
***
|
||||
|
||||
### removeStopWords?
|
||||
|
||||
```ts
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
## Properties
|
||||
|
||||
### indexCacheSize?
|
||||
### ~~indexCacheSize?~~
|
||||
|
||||
```ts
|
||||
optional indexCacheSize: number;
|
||||
@@ -16,6 +16,11 @@ optional indexCacheSize: number;
|
||||
|
||||
Set the size of the index cache, specified as a number of entries
|
||||
|
||||
#### Deprecated
|
||||
|
||||
Use session-level cache configuration instead.
|
||||
Create a Session with custom cache sizes and pass it to the connect() function.
|
||||
|
||||
The exact meaning of an "entry" will depend on the type of index:
|
||||
- IVF: there is one entry for each IVF partition
|
||||
- BTREE: there is one entry for the entire index
|
||||
|
||||
@@ -24,10 +24,10 @@ The default is 7 days
|
||||
// Delete all versions older than 1 day
|
||||
const olderThan = new Date();
|
||||
olderThan.setDate(olderThan.getDate() - 1));
|
||||
tbl.cleanupOlderVersions(olderThan);
|
||||
tbl.optimize({cleanupOlderThan: olderThan});
|
||||
|
||||
// Delete all versions except the current version
|
||||
tbl.cleanupOlderVersions(new Date());
|
||||
tbl.optimize({cleanupOlderThan: new Date()});
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
11
docs/src/js/type-aliases/MultiVector.md
Normal file
11
docs/src/js/type-aliases/MultiVector.md
Normal file
@@ -0,0 +1,11 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / MultiVector
|
||||
|
||||
# Type Alias: MultiVector
|
||||
|
||||
```ts
|
||||
type MultiVector: IntoVector[];
|
||||
```
|
||||
@@ -428,7 +428,7 @@
|
||||
"\n",
|
||||
"**Why?** \n",
|
||||
"Embedding the UFO dataset and ingesting it into LanceDB takes **~2 hours on a T4 GPU**. To save time: \n",
|
||||
"- **Use the pre-prepared table with index created ** (provided below) to proceed directly to step7: search. \n",
|
||||
"- **Use the pre-prepared table with index created** (provided below) to proceed directly to **Step 7**: search. \n",
|
||||
"- **Step 5a** contains the full ingestion code for reference (run it only if necessary). \n",
|
||||
"- **Step 6** contains the details on creating the index on the multivector column"
|
||||
]
|
||||
|
||||
@@ -30,7 +30,8 @@ excluded_globs = [
|
||||
"../src/rag/advanced_techniques/*.md",
|
||||
"../src/guides/scalar_index.md",
|
||||
"../src/guides/storage.md",
|
||||
"../src/search.md"
|
||||
"../src/search.md",
|
||||
"../src/guides/sql_querying.md",
|
||||
]
|
||||
|
||||
python_prefix = "py"
|
||||
|
||||
19
java/.mvn/wrapper/maven-wrapper.properties
vendored
Normal file
19
java/.mvn/wrapper/maven-wrapper.properties
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
wrapperVersion=3.3.2
|
||||
distributionType=only-script
|
||||
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.9/apache-maven-3.9.9-bin.zip
|
||||
37
java/README.md
Normal file
37
java/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# LanceDB Java SDK
|
||||
|
||||
## Configuration and Initialization
|
||||
|
||||
### LanceDB Cloud
|
||||
|
||||
For LanceDB Cloud, use the simplified builder API:
|
||||
|
||||
```java
|
||||
import com.lancedb.lance.namespace.LanceRestNamespace;
|
||||
|
||||
// If your DB url is db://example-db, then your database here is example-db
|
||||
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
|
||||
.apiKey("your_lancedb_cloud_api_key")
|
||||
.database("your_database_name")
|
||||
.build();
|
||||
```
|
||||
|
||||
### LanceDB Enterprise
|
||||
|
||||
For Enterprise deployments, use your VPC endpoint:
|
||||
|
||||
```java
|
||||
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
|
||||
.apiKey("your_lancedb_enterprise_api_key")
|
||||
.database("your-top-dir") // Your top level folder under your cloud bucket, e.g. s3://your-bucket/your-top-dir/
|
||||
.hostOverride("http://<vpc_endpoint_dns_name>:80")
|
||||
.build();
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
Build:
|
||||
|
||||
```shell
|
||||
./mvnw install
|
||||
```
|
||||
@@ -19,7 +19,7 @@ lancedb = { path = "../../../rust/lancedb" }
|
||||
lance = { workspace = true }
|
||||
arrow = { workspace = true, features = ["ffi"] }
|
||||
arrow-schema.workspace = true
|
||||
tokio = "1.23"
|
||||
tokio = "1.46"
|
||||
jni = "0.21.1"
|
||||
snafu.workspace = true
|
||||
lazy_static.workspace = true
|
||||
|
||||
@@ -8,18 +8,24 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.21.0-beta.0</version>
|
||||
<version>0.21.2-beta.1</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>lancedb-core</artifactId>
|
||||
<name>LanceDB Core</name>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Core</description>
|
||||
<packaging>jar</packaging>
|
||||
<properties>
|
||||
<rust.release.build>false</rust.release.build>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lance-namespace-core</artifactId>
|
||||
<version>0.0.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.arrow</groupId>
|
||||
<artifactId>arrow-vector</artifactId>
|
||||
|
||||
26
java/lance-namespace/pom.xml
Normal file
26
java/lance-namespace/pom.xml
Normal file
@@ -0,0 +1,26 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.21.2-beta.1</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>lancedb-lance-namespace</artifactId>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java Integration with Lance Namespace</description>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lance-namespace-core</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.lancedb.lancedb;
|
||||
|
||||
import com.lancedb.lance.namespace.LanceRestNamespace;
|
||||
import com.lancedb.lance.namespace.client.apache.ApiClient;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
/** Util class to help construct a {@link LanceRestNamespace} for LanceDB. */
|
||||
public class LanceDbRestNamespaces {
|
||||
private static final String DEFAULT_REGION = "us-east-1";
|
||||
private static final String CLOUD_URL_PATTERN = "https://%s.%s.api.lancedb.com";
|
||||
|
||||
private String apiKey;
|
||||
private String database;
|
||||
private Optional<String> hostOverride = Optional.empty();
|
||||
private Optional<String> region = Optional.empty();
|
||||
private Map<String, String> additionalConfig = new HashMap<>();
|
||||
|
||||
private LanceDbRestNamespaces() {}
|
||||
|
||||
/**
|
||||
* Create a new builder instance.
|
||||
*
|
||||
* @return A new LanceRestNamespaceBuilder
|
||||
*/
|
||||
public static LanceDbRestNamespaces builder() {
|
||||
return new LanceDbRestNamespaces();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the API key (required).
|
||||
*
|
||||
* @param apiKey The LanceDB API key
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces apiKey(String apiKey) {
|
||||
if (apiKey == null || apiKey.trim().isEmpty()) {
|
||||
throw new IllegalArgumentException("API key cannot be null or empty");
|
||||
}
|
||||
this.apiKey = apiKey;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the database name (required).
|
||||
*
|
||||
* @param database The database name
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces database(String database) {
|
||||
if (database == null || database.trim().isEmpty()) {
|
||||
throw new IllegalArgumentException("Database cannot be null or empty");
|
||||
}
|
||||
this.database = database;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a custom host override (optional). When set, this overrides the default LanceDB Cloud URL
|
||||
* construction. Use this for LanceDB Enterprise deployments.
|
||||
*
|
||||
* @param hostOverride The complete base URL (e.g., "http://your-vpc-endpoint:80")
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces hostOverride(String hostOverride) {
|
||||
this.hostOverride = Optional.ofNullable(hostOverride);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the region for LanceDB Cloud (optional). Defaults to "us-east-1" if not specified. This is
|
||||
* ignored when hostOverride is set.
|
||||
*
|
||||
* @param region The AWS region (e.g., "us-east-1", "eu-west-1")
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces region(String region) {
|
||||
this.region = Optional.ofNullable(region);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add additional configuration parameters.
|
||||
*
|
||||
* @param key The configuration key
|
||||
* @param value The configuration value
|
||||
* @return This builder
|
||||
*/
|
||||
public LanceDbRestNamespaces config(String key, String value) {
|
||||
this.additionalConfig.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the LanceRestNamespace instance.
|
||||
*
|
||||
* @return A configured LanceRestNamespace
|
||||
* @throws IllegalStateException if required parameters are missing
|
||||
*/
|
||||
public LanceRestNamespace build() {
|
||||
// Validate required fields
|
||||
if (apiKey == null) {
|
||||
throw new IllegalStateException("API key is required");
|
||||
}
|
||||
if (database == null) {
|
||||
throw new IllegalStateException("Database is required");
|
||||
}
|
||||
|
||||
// Build configuration map
|
||||
Map<String, String> config = new HashMap<>(additionalConfig);
|
||||
config.put("headers.x-lancedb-database", database);
|
||||
config.put("headers.x-api-key", apiKey);
|
||||
|
||||
// Determine base URL
|
||||
String baseUrl;
|
||||
if (hostOverride.isPresent()) {
|
||||
baseUrl = hostOverride.get();
|
||||
config.put("host_override", hostOverride.get());
|
||||
} else {
|
||||
String effectiveRegion = region.orElse(DEFAULT_REGION);
|
||||
baseUrl = String.format(CLOUD_URL_PATTERN, database, effectiveRegion);
|
||||
config.put("region", effectiveRegion);
|
||||
}
|
||||
|
||||
// Create and configure ApiClient
|
||||
ApiClient apiClient = new ApiClient();
|
||||
apiClient.setBasePath(baseUrl);
|
||||
|
||||
return new LanceRestNamespace(apiClient, config);
|
||||
}
|
||||
}
|
||||
259
java/mvnw
vendored
Executable file
259
java/mvnw
vendored
Executable file
@@ -0,0 +1,259 @@
|
||||
#!/bin/sh
|
||||
# ----------------------------------------------------------------------------
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Apache Maven Wrapper startup batch script, version 3.3.2
|
||||
#
|
||||
# Optional ENV vars
|
||||
# -----------------
|
||||
# JAVA_HOME - location of a JDK home dir, required when download maven via java source
|
||||
# MVNW_REPOURL - repo url base for downloading maven distribution
|
||||
# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
|
||||
# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
set -euf
|
||||
[ "${MVNW_VERBOSE-}" != debug ] || set -x
|
||||
|
||||
# OS specific support.
|
||||
native_path() { printf %s\\n "$1"; }
|
||||
case "$(uname)" in
|
||||
CYGWIN* | MINGW*)
|
||||
[ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")"
|
||||
native_path() { cygpath --path --windows "$1"; }
|
||||
;;
|
||||
esac
|
||||
|
||||
# set JAVACMD and JAVACCMD
|
||||
set_java_home() {
|
||||
# For Cygwin and MinGW, ensure paths are in Unix format before anything is touched
|
||||
if [ -n "${JAVA_HOME-}" ]; then
|
||||
if [ -x "$JAVA_HOME/jre/sh/java" ]; then
|
||||
# IBM's JDK on AIX uses strange locations for the executables
|
||||
JAVACMD="$JAVA_HOME/jre/sh/java"
|
||||
JAVACCMD="$JAVA_HOME/jre/sh/javac"
|
||||
else
|
||||
JAVACMD="$JAVA_HOME/bin/java"
|
||||
JAVACCMD="$JAVA_HOME/bin/javac"
|
||||
|
||||
if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then
|
||||
echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2
|
||||
echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
else
|
||||
JAVACMD="$(
|
||||
'set' +e
|
||||
'unset' -f command 2>/dev/null
|
||||
'command' -v java
|
||||
)" || :
|
||||
JAVACCMD="$(
|
||||
'set' +e
|
||||
'unset' -f command 2>/dev/null
|
||||
'command' -v javac
|
||||
)" || :
|
||||
|
||||
if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then
|
||||
echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# hash string like Java String::hashCode
|
||||
hash_string() {
|
||||
str="${1:-}" h=0
|
||||
while [ -n "$str" ]; do
|
||||
char="${str%"${str#?}"}"
|
||||
h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296))
|
||||
str="${str#?}"
|
||||
done
|
||||
printf %x\\n $h
|
||||
}
|
||||
|
||||
verbose() { :; }
|
||||
[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; }
|
||||
|
||||
die() {
|
||||
printf %s\\n "$1" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
trim() {
|
||||
# MWRAPPER-139:
|
||||
# Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds.
|
||||
# Needed for removing poorly interpreted newline sequences when running in more
|
||||
# exotic environments such as mingw bash on Windows.
|
||||
printf "%s" "${1}" | tr -d '[:space:]'
|
||||
}
|
||||
|
||||
# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties
|
||||
while IFS="=" read -r key value; do
|
||||
case "${key-}" in
|
||||
distributionUrl) distributionUrl=$(trim "${value-}") ;;
|
||||
distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;;
|
||||
esac
|
||||
done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties"
|
||||
[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties"
|
||||
|
||||
case "${distributionUrl##*/}" in
|
||||
maven-mvnd-*bin.*)
|
||||
MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/
|
||||
case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in
|
||||
*AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;;
|
||||
:Darwin*x86_64) distributionPlatform=darwin-amd64 ;;
|
||||
:Darwin*arm64) distributionPlatform=darwin-aarch64 ;;
|
||||
:Linux*x86_64*) distributionPlatform=linux-amd64 ;;
|
||||
*)
|
||||
echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2
|
||||
distributionPlatform=linux-amd64
|
||||
;;
|
||||
esac
|
||||
distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip"
|
||||
;;
|
||||
maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;;
|
||||
*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;;
|
||||
esac
|
||||
|
||||
# apply MVNW_REPOURL and calculate MAVEN_HOME
|
||||
# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
|
||||
[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}"
|
||||
distributionUrlName="${distributionUrl##*/}"
|
||||
distributionUrlNameMain="${distributionUrlName%.*}"
|
||||
distributionUrlNameMain="${distributionUrlNameMain%-bin}"
|
||||
MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}"
|
||||
MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")"
|
||||
|
||||
exec_maven() {
|
||||
unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || :
|
||||
exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD"
|
||||
}
|
||||
|
||||
if [ -d "$MAVEN_HOME" ]; then
|
||||
verbose "found existing MAVEN_HOME at $MAVEN_HOME"
|
||||
exec_maven "$@"
|
||||
fi
|
||||
|
||||
case "${distributionUrl-}" in
|
||||
*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;;
|
||||
*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;;
|
||||
esac
|
||||
|
||||
# prepare tmp dir
|
||||
if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then
|
||||
clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; }
|
||||
trap clean HUP INT TERM EXIT
|
||||
else
|
||||
die "cannot create temp dir"
|
||||
fi
|
||||
|
||||
mkdir -p -- "${MAVEN_HOME%/*}"
|
||||
|
||||
# Download and Install Apache Maven
|
||||
verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
|
||||
verbose "Downloading from: $distributionUrl"
|
||||
verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"
|
||||
|
||||
# select .zip or .tar.gz
|
||||
if ! command -v unzip >/dev/null; then
|
||||
distributionUrl="${distributionUrl%.zip}.tar.gz"
|
||||
distributionUrlName="${distributionUrl##*/}"
|
||||
fi
|
||||
|
||||
# verbose opt
|
||||
__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR=''
|
||||
[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v
|
||||
|
||||
# normalize http auth
|
||||
case "${MVNW_PASSWORD:+has-password}" in
|
||||
'') MVNW_USERNAME='' MVNW_PASSWORD='' ;;
|
||||
has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;;
|
||||
esac
|
||||
|
||||
if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then
|
||||
verbose "Found wget ... using wget"
|
||||
wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl"
|
||||
elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then
|
||||
verbose "Found curl ... using curl"
|
||||
curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl"
|
||||
elif set_java_home; then
|
||||
verbose "Falling back to use Java to download"
|
||||
javaSource="$TMP_DOWNLOAD_DIR/Downloader.java"
|
||||
targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName"
|
||||
cat >"$javaSource" <<-END
|
||||
public class Downloader extends java.net.Authenticator
|
||||
{
|
||||
protected java.net.PasswordAuthentication getPasswordAuthentication()
|
||||
{
|
||||
return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() );
|
||||
}
|
||||
public static void main( String[] args ) throws Exception
|
||||
{
|
||||
setDefault( new Downloader() );
|
||||
java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() );
|
||||
}
|
||||
}
|
||||
END
|
||||
# For Cygwin/MinGW, switch paths to Windows format before running javac and java
|
||||
verbose " - Compiling Downloader.java ..."
|
||||
"$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java"
|
||||
verbose " - Running Downloader.java ..."
|
||||
"$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")"
|
||||
fi
|
||||
|
||||
# If specified, validate the SHA-256 sum of the Maven distribution zip file
|
||||
if [ -n "${distributionSha256Sum-}" ]; then
|
||||
distributionSha256Result=false
|
||||
if [ "$MVN_CMD" = mvnd.sh ]; then
|
||||
echo "Checksum validation is not supported for maven-mvnd." >&2
|
||||
echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
|
||||
exit 1
|
||||
elif command -v sha256sum >/dev/null; then
|
||||
if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then
|
||||
distributionSha256Result=true
|
||||
fi
|
||||
elif command -v shasum >/dev/null; then
|
||||
if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then
|
||||
distributionSha256Result=true
|
||||
fi
|
||||
else
|
||||
echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
|
||||
echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
|
||||
exit 1
|
||||
fi
|
||||
if [ $distributionSha256Result = false ]; then
|
||||
echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2
|
||||
echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# unzip and move
|
||||
if command -v unzip >/dev/null; then
|
||||
unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip"
|
||||
else
|
||||
tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar"
|
||||
fi
|
||||
printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url"
|
||||
mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME"
|
||||
|
||||
clean || :
|
||||
exec_maven "$@"
|
||||
14
java/pom.xml
14
java/pom.xml
@@ -6,11 +6,10 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.21.0-beta.0</version>
|
||||
<version>0.21.2-beta.1</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>LanceDB Parent</name>
|
||||
<description>LanceDB vector database Java API</description>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
<url>http://lancedb.com/</url>
|
||||
|
||||
<developers>
|
||||
@@ -29,6 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-namespace.verison>0.0.1</lance-namespace.verison>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
@@ -52,6 +52,7 @@
|
||||
|
||||
<modules>
|
||||
<module>core</module>
|
||||
<module>lance-namespace</module>
|
||||
</modules>
|
||||
|
||||
<scm>
|
||||
@@ -62,6 +63,11 @@
|
||||
|
||||
<dependencyManagement>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lance-namespace-core</artifactId>
|
||||
<version>${lance-namespace.verison}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.arrow</groupId>
|
||||
<artifactId>arrow-vector</artifactId>
|
||||
|
||||
44
node/package-lock.json
generated
44
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.20.1-beta.2",
|
||||
"version": "0.21.2-beta.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.20.1-beta.2",
|
||||
"version": "0.21.2-beta.1",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,11 +52,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.20.1-beta.2",
|
||||
"@lancedb/vectordb-darwin-x64": "0.20.1-beta.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.20.1-beta.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.20.1-beta.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.20.1-beta.2"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -327,9 +327,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.20.1-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.20.1-beta.2.tgz",
|
||||
"integrity": "sha512-mqi0yI+ZwBTydaDy1FRHAUZwrWS28u6tbHTe1s4uSrmERbVI6PfmoPR+NZWWAp6ZhlseSdl/+yeI4imk11rQSw==",
|
||||
"version": "0.21.2-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2-beta.1.tgz",
|
||||
"integrity": "sha512-7QXVJNTei7PMuXRyyc+F3WGiudRNq9HfeOaMmMOJJpuCAO0zLq1pM9DCl5aPF5MddrodPHJxi+IWV+iAFH7zcg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -339,9 +339,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.20.1-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.20.1-beta.2.tgz",
|
||||
"integrity": "sha512-m8EYYA8JZIeNsJqQsBDUMu6r31/u7FzpjonJ4Y+CjapVl6UdvI65KUkeL2dYrFao++RuIoaiqcm3e7gRgFZpXQ==",
|
||||
"version": "0.21.2-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2-beta.1.tgz",
|
||||
"integrity": "sha512-M/TWcJ3WVc6DNFgG/lWI7L5tQ05IF3WoWuZfRfbbimGhRvY7xf1O3uOt+jMcNJCa5mHFGCg2SZDA8mebd/mL7g==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -351,9 +351,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.20.1-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.20.1-beta.2.tgz",
|
||||
"integrity": "sha512-3Og2+bk4GlWmMO1Yg2HBfeb5zrOMLaIHD7bEqQ4+6yw4IckAaV+ke05H0tyyqmOVrOQ0LpvtXgD7pPztjm9r9A==",
|
||||
"version": "0.21.2-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2-beta.1.tgz",
|
||||
"integrity": "sha512-OEsM9znf9DDmdwGuTg2EVu+ebwuWQ1lCx0cYy4+hNy3ntolwMC39ePg2H9WD9SsEnQ2vcGJgBJTQLPKgXww+iQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -363,9 +363,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.20.1-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.20.1-beta.2.tgz",
|
||||
"integrity": "sha512-mwTQyA/FBoU/FkPuvCNBZG3y83gBN+iYoejehBH2HBkLUIcmlsDgSRZ1OQ+f9ijj12EMBCA11tBUPA9zhHzyrw==",
|
||||
"version": "0.21.2-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2-beta.1.tgz",
|
||||
"integrity": "sha512-7FTq/O1zNzD71rgX2PEVmkct4jk2wc+ADU3rss+0VqoBSO9XeMqZEVD2WgZWuSTg6bYai//FHGDHSaknHBNsdw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -375,9 +375,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.20.1-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.20.1-beta.2.tgz",
|
||||
"integrity": "sha512-VkjNpqhK3l3uHLLPmox+HrmKPMaZgV+qsGQWx0nfseGnSOEmXAWZWQFe0APVCQ9y0xTypQB0oH7eSOPZv2t4WQ==",
|
||||
"version": "0.21.2-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2-beta.1.tgz",
|
||||
"integrity": "sha512-mN1p/J0kdqy6MrlKtmA8set/PibqFPyytQJFAuxSLXC/rwD7vgqUCt0SI0zVWPGG7J5Y65kvdc99l7Yl7lJtwQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"private": false,
|
||||
"main": "dist/index.js",
|
||||
@@ -89,10 +89,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.0-beta.0",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.0-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.0-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.0-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.0-beta.0"
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.1",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.1"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ describe('LanceDB Mirrored Store Integration test', function () {
|
||||
it('s3://...?mirroredStore=... param is processed correctly', async function () {
|
||||
this.timeout(600000)
|
||||
|
||||
const dir = tmpdir()
|
||||
const dir = await fs.promises.mkdtemp(path.join(tmpdir(), 'lancedb-mirror-'))
|
||||
console.log(dir)
|
||||
const conn = await lancedb.connect({ uri: `s3://lancedb-integtest?mirroredStore=${dir}`, storageOptions: { allowHttp: 'true' } })
|
||||
const data = Array(200).fill({ vector: Array(128).fill(1.0), id: 0 })
|
||||
@@ -63,118 +63,93 @@ describe('LanceDB Mirrored Store Integration test', function () {
|
||||
const t = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })
|
||||
|
||||
const mirroredPath = path.join(dir, `${tableName}.lance`)
|
||||
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
// there should be three dirs
|
||||
assert.equal(files.length, 3)
|
||||
assert.isTrue(files[0].isDirectory())
|
||||
assert.isTrue(files[1].isDirectory())
|
||||
|
||||
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].name.endsWith('.txn'))
|
||||
})
|
||||
const files = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
|
||||
// there should be three dirs
|
||||
assert.equal(files.length, 3, 'files after table creation')
|
||||
assert.isTrue(files[0].isDirectory())
|
||||
assert.isTrue(files[1].isDirectory())
|
||||
|
||||
fs.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].name.endsWith('.manifest'))
|
||||
})
|
||||
const transactionFiles = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
|
||||
assert.equal(transactionFiles.length, 1, 'transactionFiles after table creation')
|
||||
assert.isTrue(transactionFiles[0].name.endsWith('.txn'))
|
||||
|
||||
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].name.endsWith('.lance'))
|
||||
})
|
||||
})
|
||||
const versionFiles = await fs.promises.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true })
|
||||
assert.equal(versionFiles.length, 1, 'versionFiles after table creation')
|
||||
assert.isTrue(versionFiles[0].name.endsWith('.manifest'))
|
||||
|
||||
const dataFiles = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
|
||||
assert.equal(dataFiles.length, 1, 'dataFiles after table creation')
|
||||
assert.isTrue(dataFiles[0].name.endsWith('.lance'))
|
||||
|
||||
// try create index and check if it's mirrored
|
||||
await t.createIndex({ column: 'vector', type: 'ivf_pq' })
|
||||
|
||||
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
// there should be four dirs
|
||||
assert.equal(files.length, 4)
|
||||
assert.isTrue(files[0].isDirectory())
|
||||
assert.isTrue(files[1].isDirectory())
|
||||
assert.isTrue(files[2].isDirectory())
|
||||
const filesAfterIndex = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
|
||||
// there should be four dirs
|
||||
assert.equal(filesAfterIndex.length, 4, 'filesAfterIndex')
|
||||
assert.isTrue(filesAfterIndex[0].isDirectory())
|
||||
assert.isTrue(filesAfterIndex[1].isDirectory())
|
||||
assert.isTrue(filesAfterIndex[2].isDirectory())
|
||||
|
||||
// Two TXs now
|
||||
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 2)
|
||||
assert.isTrue(files[0].name.endsWith('.txn'))
|
||||
assert.isTrue(files[1].name.endsWith('.txn'))
|
||||
})
|
||||
// Two TXs now
|
||||
const transactionFilesAfterIndex = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
|
||||
assert.equal(transactionFilesAfterIndex.length, 2, 'transactionFilesAfterIndex')
|
||||
assert.isTrue(transactionFilesAfterIndex[0].name.endsWith('.txn'))
|
||||
assert.isTrue(transactionFilesAfterIndex[1].name.endsWith('.txn'))
|
||||
|
||||
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].name.endsWith('.lance'))
|
||||
})
|
||||
const dataFilesAfterIndex = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
|
||||
assert.equal(dataFilesAfterIndex.length, 1, 'dataFilesAfterIndex')
|
||||
assert.isTrue(dataFilesAfterIndex[0].name.endsWith('.lance'))
|
||||
|
||||
fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].isDirectory())
|
||||
const indicesFiles = await fs.promises.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true })
|
||||
assert.equal(indicesFiles.length, 1, 'indicesFiles')
|
||||
assert.isTrue(indicesFiles[0].isDirectory())
|
||||
|
||||
fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].isFile())
|
||||
assert.isTrue(files[0].name.endsWith('.idx'))
|
||||
})
|
||||
})
|
||||
})
|
||||
const indexFiles = await fs.promises.readdir(path.join(mirroredPath, '_indices', indicesFiles[0].name), { withFileTypes: true })
|
||||
console.log(`DEBUG indexFiles in ${indicesFiles[0].name}:`, indexFiles.map(f => `${f.name} (${f.isFile() ? 'file' : 'dir'})`))
|
||||
assert.equal(indexFiles.length, 2, 'indexFiles')
|
||||
const fileNames = indexFiles.map(f => f.name).sort()
|
||||
assert.isTrue(fileNames.includes('auxiliary.idx'), 'auxiliary.idx should be present')
|
||||
assert.isTrue(fileNames.includes('index.idx'), 'index.idx should be present')
|
||||
assert.isTrue(indexFiles.every(f => f.isFile()), 'all index files should be files')
|
||||
|
||||
// try delete and check if it's mirrored
|
||||
await t.delete('id = 0')
|
||||
|
||||
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
// there should be five dirs
|
||||
assert.equal(files.length, 5)
|
||||
assert.isTrue(files[0].isDirectory())
|
||||
assert.isTrue(files[1].isDirectory())
|
||||
assert.isTrue(files[2].isDirectory())
|
||||
assert.isTrue(files[3].isDirectory())
|
||||
assert.isTrue(files[4].isDirectory())
|
||||
const filesAfterDelete = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
|
||||
// there should be five dirs
|
||||
assert.equal(filesAfterDelete.length, 5, 'filesAfterDelete')
|
||||
assert.isTrue(filesAfterDelete[0].isDirectory())
|
||||
assert.isTrue(filesAfterDelete[1].isDirectory())
|
||||
assert.isTrue(filesAfterDelete[2].isDirectory())
|
||||
assert.isTrue(filesAfterDelete[3].isDirectory())
|
||||
assert.isTrue(filesAfterDelete[4].isDirectory())
|
||||
|
||||
// Three TXs now
|
||||
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 3)
|
||||
assert.isTrue(files[0].name.endsWith('.txn'))
|
||||
assert.isTrue(files[1].name.endsWith('.txn'))
|
||||
})
|
||||
// Three TXs now
|
||||
const transactionFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
|
||||
assert.equal(transactionFilesAfterDelete.length, 3, 'transactionFilesAfterDelete')
|
||||
assert.isTrue(transactionFilesAfterDelete[0].name.endsWith('.txn'))
|
||||
assert.isTrue(transactionFilesAfterDelete[1].name.endsWith('.txn'))
|
||||
|
||||
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].name.endsWith('.lance'))
|
||||
})
|
||||
const dataFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
|
||||
assert.equal(dataFilesAfterDelete.length, 1, 'dataFilesAfterDelete')
|
||||
assert.isTrue(dataFilesAfterDelete[0].name.endsWith('.lance'))
|
||||
|
||||
fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].isDirectory())
|
||||
const indicesFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true })
|
||||
assert.equal(indicesFilesAfterDelete.length, 1, 'indicesFilesAfterDelete')
|
||||
assert.isTrue(indicesFilesAfterDelete[0].isDirectory())
|
||||
|
||||
fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
const indexFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_indices', indicesFilesAfterDelete[0].name), { withFileTypes: true })
|
||||
console.log(`DEBUG indexFilesAfterDelete in ${indicesFilesAfterDelete[0].name}:`, indexFilesAfterDelete.map(f => `${f.name} (${f.isFile() ? 'file' : 'dir'})`))
|
||||
assert.equal(indexFilesAfterDelete.length, 2, 'indexFilesAfterDelete')
|
||||
const fileNamesAfterDelete = indexFilesAfterDelete.map(f => f.name).sort()
|
||||
assert.isTrue(fileNamesAfterDelete.includes('auxiliary.idx'), 'auxiliary.idx should be present after delete')
|
||||
assert.isTrue(fileNamesAfterDelete.includes('index.idx'), 'index.idx should be present after delete')
|
||||
assert.isTrue(indexFilesAfterDelete.every(f => f.isFile()), 'all index files should be files after delete')
|
||||
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].isFile())
|
||||
assert.isTrue(files[0].name.endsWith('.idx'))
|
||||
})
|
||||
})
|
||||
|
||||
fs.readdir(path.join(mirroredPath, '_deletions'), { withFileTypes: true }, (err, files) => {
|
||||
if (err != null) throw err
|
||||
assert.equal(files.length, 1)
|
||||
assert.isTrue(files[0].name.endsWith('.arrow'))
|
||||
})
|
||||
})
|
||||
const deletionFiles = await fs.promises.readdir(path.join(mirroredPath, '_deletions'), { withFileTypes: true })
|
||||
assert.equal(deletionFiles.length, 1, 'deletionFiles')
|
||||
assert.isTrue(deletionFiles[0].name.endsWith('.arrow'))
|
||||
})
|
||||
})
|
||||
|
||||
13
nodejs/CLAUDE.md
Normal file
@@ -0,0 +1,13 @@
These are the typescript bindings of LanceDB.
The core Rust library is in the `../rust/lancedb` directory, the rust binding
code is in the `src/` directory and the typescript bindings are in
the `lancedb/` directory.

Whenever you change the Rust code, you will need to recompile: `npm run build`.

Common commands:
* Build: `npm run build`
* Lint: `npm run lint`
* Fix lints: `npm run lint-fix`
* Test: `npm test`
* Run single test file: `npm test __test__/arrow.test.ts`
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.21.0-beta.0"
|
||||
version = "0.21.2-beta.1"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import { Schema } from "apache-arrow";
|
||||
import { Bool, Field, Int32, List, Schema, Struct, Utf8 } from "apache-arrow";
|
||||
|
||||
import * as arrow15 from "apache-arrow-15";
|
||||
import * as arrow16 from "apache-arrow-16";
|
||||
@@ -11,10 +11,12 @@ import * as arrow18 from "apache-arrow-18";
|
||||
import {
|
||||
convertToTable,
|
||||
fromBufferToRecordBatch,
|
||||
fromDataToBuffer,
|
||||
fromRecordBatchToBuffer,
|
||||
fromTableToBuffer,
|
||||
makeArrowTable,
|
||||
makeEmptyTable,
|
||||
tableFromIPC,
|
||||
} from "../lancedb/arrow";
|
||||
import {
|
||||
EmbeddingFunction,
|
||||
@@ -375,8 +377,221 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(table2.schema).toEqual(schema);
|
||||
});
|
||||
|
||||
it("will handle missing columns in schema alignment when using embeddings", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("domain", new Utf8(), true),
|
||||
new Field("name", new Utf8(), true),
|
||||
new Field("description", new Utf8(), true),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{ domain: "google.com", name: "Google" },
|
||||
{ domain: "facebook.com", name: "Facebook" },
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(3);
|
||||
expect(table.numRows).toBe(2);
|
||||
|
||||
const descriptionColumn = table.getChild("description");
|
||||
expect(descriptionColumn).toBeDefined();
|
||||
expect(descriptionColumn?.nullCount).toBe(2);
|
||||
expect(descriptionColumn?.toArray()).toEqual([null, null]);
|
||||
|
||||
expect(table.getChild("domain")?.toArray()).toEqual([
|
||||
"google.com",
|
||||
"facebook.com",
|
||||
]);
|
||||
expect(table.getChild("name")?.toArray()).toEqual([
|
||||
"Google",
|
||||
"Facebook",
|
||||
]);
|
||||
});
|
||||
|
||||
it("will handle completely missing nested struct columns", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("id", new Utf8(), true),
|
||||
new Field("name", new Utf8(), true),
|
||||
new Field(
|
||||
"metadata",
|
||||
new Struct([
|
||||
new Field("version", new Int32(), true),
|
||||
new Field("author", new Utf8(), true),
|
||||
new Field(
|
||||
"tags",
|
||||
new List(new Field("item", new Utf8(), true)),
|
||||
true,
|
||||
),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{ id: "doc1", name: "Document 1" },
|
||||
{ id: "doc2", name: "Document 2" },
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(3);
|
||||
expect(table.numRows).toBe(2);
|
||||
|
||||
const buf = await fromTableToBuffer(table);
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
|
||||
const rows = [];
|
||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
||||
rows.push(retrievedTable.get(i));
|
||||
}
|
||||
|
||||
expect(rows[0].metadata.version).toBe(null);
|
||||
expect(rows[0].metadata.author).toBe(null);
|
||||
expect(rows[0].metadata.tags).toBe(null);
|
||||
expect(rows[0].id).toBe("doc1");
|
||||
expect(rows[0].name).toBe("Document 1");
|
||||
});
|
||||
|
||||
it("will handle partially missing nested struct fields", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("id", new Utf8(), true),
|
||||
new Field(
|
||||
"metadata",
|
||||
new Struct([
|
||||
new Field("version", new Int32(), true),
|
||||
new Field("author", new Utf8(), true),
|
||||
new Field("created_at", new Utf8(), true),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{ id: "doc1", metadata: { version: 1, author: "Alice" } },
|
||||
{ id: "doc2", metadata: { version: 2 } },
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(2);
|
||||
expect(table.numRows).toBe(2);
|
||||
|
||||
const metadataColumn = table.getChild("metadata");
|
||||
expect(metadataColumn).toBeDefined();
|
||||
expect(metadataColumn?.type.toString()).toBe(
|
||||
"Struct<{version:Int32, author:Utf8, created_at:Utf8}>",
|
||||
);
|
||||
});
|
||||
|
||||
it("will handle multiple levels of nested structures", async function () {
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("id", new Utf8(), true),
|
||||
new Field(
|
||||
"config",
|
||||
new Struct([
|
||||
new Field("database", new Utf8(), true),
|
||||
new Field(
|
||||
"connection",
|
||||
new Struct([
|
||||
new Field("host", new Utf8(), true),
|
||||
new Field("port", new Int32(), true),
|
||||
new Field(
|
||||
"ssl",
|
||||
new Struct([
|
||||
new Field("enabled", new Bool(), true),
|
||||
new Field("cert_path", new Utf8(), true),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
]),
|
||||
true,
|
||||
),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const data = [
|
||||
{
|
||||
id: "config1",
|
||||
config: {
|
||||
database: "postgres",
|
||||
connection: { host: "localhost" },
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "config2",
|
||||
config: { database: "mysql" },
|
||||
},
|
||||
{
|
||||
id: "config3",
|
||||
},
|
||||
];
|
||||
|
||||
const table = await convertToTable(data, undefined, { schema });
|
||||
|
||||
expect(table.numCols).toBe(2);
|
||||
expect(table.numRows).toBe(3);
|
||||
|
||||
const configColumn = table.getChild("config");
|
||||
expect(configColumn).toBeDefined();
|
||||
expect(configColumn?.type.toString()).toBe(
|
||||
"Struct<{database:Utf8, connection:Struct<{host:Utf8, port:Int32, ssl:Struct<{enabled:Bool, cert_path:Utf8}>}>}>",
|
||||
);
|
||||
});
|
||||
|
||||
it("will handle missing columns in Arrow table input when using embeddings", async function () {
|
||||
const incompleteTable = makeArrowTable([
|
||||
{ domain: "google.com", name: "Google" },
|
||||
{ domain: "facebook.com", name: "Facebook" },
|
||||
]);
|
||||
|
||||
const schema = new Schema(
|
||||
[
|
||||
new Field("domain", new Utf8(), true),
|
||||
new Field("name", new Utf8(), true),
|
||||
new Field("description", new Utf8(), true),
|
||||
],
|
||||
new Map([["embedding_functions", JSON.stringify([])]]),
|
||||
);
|
||||
|
||||
const buf = await fromDataToBuffer(incompleteTable, undefined, schema);
|
||||
|
||||
expect(buf.byteLength).toBeGreaterThan(0);
|
||||
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
expect(retrievedTable.numCols).toBe(3);
|
||||
expect(retrievedTable.numRows).toBe(2);
|
||||
|
||||
const descriptionColumn = retrievedTable.getChild("description");
|
||||
expect(descriptionColumn).toBeDefined();
|
||||
expect(descriptionColumn?.nullCount).toBe(2);
|
||||
expect(descriptionColumn?.toArray()).toEqual([null, null]);
|
||||
|
||||
expect(retrievedTable.getChild("domain")?.toArray()).toEqual([
|
||||
"google.com",
|
||||
"facebook.com",
|
||||
]);
|
||||
expect(retrievedTable.getChild("name")?.toArray()).toEqual([
|
||||
"Google",
|
||||
"Facebook",
|
||||
]);
|
||||
});
|
||||
|
||||
it("should correctly retain values in nested struct fields", async function () {
|
||||
// Define test data with nested struct
|
||||
const testData = [
|
||||
{
|
||||
id: "doc1",
|
||||
@@ -400,10 +615,8 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
},
|
||||
];
|
||||
|
||||
// Create Arrow table from the data
|
||||
const table = makeArrowTable(testData);
|
||||
|
||||
// Verify schema has the nested struct fields
|
||||
const metadataField = table.schema.fields.find(
|
||||
(f) => f.name === "metadata",
|
||||
);
|
||||
@@ -417,23 +630,17 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
"text",
|
||||
]);
|
||||
|
||||
// Convert to buffer and back (simulating storage and retrieval)
|
||||
const buf = await fromTableToBuffer(table);
|
||||
const retrievedTable = tableFromIPC(buf);
|
||||
|
||||
// Verify the retrieved table has the same structure
|
||||
const rows = [];
|
||||
for (let i = 0; i < retrievedTable.numRows; i++) {
|
||||
rows.push(retrievedTable.get(i));
|
||||
}
|
||||
|
||||
// Check values in the first row
|
||||
const firstRow = rows[0];
|
||||
expect(firstRow.id).toBe("doc1");
|
||||
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
|
||||
|
||||
// Verify metadata values are preserved (this is where the bug is)
|
||||
expect(firstRow.metadata).toBeDefined();
|
||||
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
|
||||
expect(firstRow.metadata.startLine).toBe(10);
|
||||
expect(firstRow.metadata.endLine).toBe(20);
|
||||
|
||||
46
nodejs/__test__/session.test.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import * as tmp from "tmp";
|
||||
import { Session, connect } from "../lancedb";
|
||||
|
||||
describe("Session", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
beforeEach(() => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
});
|
||||
afterEach(() => tmpDir.removeCallback());
|
||||
|
||||
it("should configure cache sizes and work with database operations", async () => {
|
||||
// Create session with small cache limits for testing
|
||||
const indexCacheSize = BigInt(1024 * 1024); // 1MB
|
||||
const metadataCacheSize = BigInt(512 * 1024); // 512KB
|
||||
|
||||
const session = new Session(indexCacheSize, metadataCacheSize);
|
||||
|
||||
// Record initial cache state
|
||||
const initialCacheSize = session.sizeBytes();
|
||||
const initialCacheItems = session.approxNumItems();
|
||||
|
||||
// Test session works with database connection
|
||||
const db = await connect({ uri: tmpDir.name, session: session });
|
||||
|
||||
// Create and use a table to exercise the session
|
||||
const data = Array.from({ length: 100 }, (_, i) => ({
|
||||
id: i,
|
||||
text: `item ${i}`,
|
||||
}));
|
||||
const table = await db.createTable("test", data);
|
||||
const results = await table.query().limit(5).toArray();
|
||||
|
||||
expect(results).toHaveLength(5);
|
||||
|
||||
// Verify cache usage increased after operations
|
||||
const finalCacheSize = session.sizeBytes();
|
||||
const finalCacheItems = session.approxNumItems();
|
||||
|
||||
expect(finalCacheSize).toBeGreaterThan(initialCacheSize); // Cache should have grown
|
||||
expect(finalCacheItems).toBeGreaterThanOrEqual(initialCacheItems); // Items should not decrease
|
||||
expect(initialCacheSize).toBeLessThan(indexCacheSize + metadataCacheSize); // Within limits
|
||||
});
|
||||
});
|
||||
@@ -368,9 +368,9 @@ describe("merge insert", () => {
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
|
||||
expect(
|
||||
JSON.parse(JSON.stringify((await table.toArrow()).toArray())),
|
||||
).toEqual(expected);
|
||||
const result = (await table.toArrow()).toArray().sort((a, b) => a.a - b.a);
|
||||
|
||||
expect(result.map((row) => ({ ...row }))).toEqual(expected);
|
||||
});
|
||||
test("conditional update", async () => {
|
||||
const newData = [
|
||||
@@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(mustNotResults.length).toBe(1);
|
||||
});
|
||||
|
||||
test("full text search ngram", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
||||
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
|
||||
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts({ baseTokenizer: "ngram" }),
|
||||
});
|
||||
|
||||
const results = await table.search("lan").toArray();
|
||||
expect(results.length).toBe(2);
|
||||
const resultSet = new Set(results.map((r) => r.text));
|
||||
expect(resultSet.has("lance database")).toBe(true);
|
||||
expect(resultSet.has("lance is cool")).toBe(true);
|
||||
|
||||
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
|
||||
expect(results2.length).toBe(2);
|
||||
const resultSet2 = new Set(results2.map((r) => r.text));
|
||||
expect(resultSet2.has("lance database")).toBe(true);
|
||||
expect(resultSet2.has("lance is cool")).toBe(true);
|
||||
|
||||
// the default min_ngram_length is 3, so "la" should not match
|
||||
const results3 = await table.search("la").toArray();
|
||||
expect(results3.length).toBe(0);
|
||||
|
||||
// test setting min_ngram_length and prefix_only
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts({
|
||||
baseTokenizer: "ngram",
|
||||
ngramMinLength: 2,
|
||||
prefixOnly: true,
|
||||
}),
|
||||
replace: true,
|
||||
});
|
||||
|
||||
const results4 = await table.search("lan").toArray();
|
||||
expect(results4.length).toBe(2);
|
||||
const resultSet4 = new Set(results4.map((r) => r.text));
|
||||
expect(resultSet4.has("lance database")).toBe(true);
|
||||
expect(resultSet4.has("lance is cool")).toBe(true);
|
||||
|
||||
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
|
||||
expect(results5.length).toBe(0);
|
||||
|
||||
const results6 = await table.search("la").toArray();
|
||||
expect(results6.length).toBe(2);
|
||||
const resultSet6 = new Set(results6.map((r) => r.text));
|
||||
expect(resultSet6.has("lance database")).toBe(true);
|
||||
expect(resultSet6.has("lance is cool")).toBe(true);
|
||||
});
|
||||
|
||||
test.each([
|
||||
[0.4, 0.5, 0.599], // number[]
|
||||
Float32Array.of(0.4, 0.5, 0.599), // Float32Array
|
||||
@@ -1809,4 +1863,43 @@ describe("column name options", () => {
|
||||
expect(results[0].query_index).toBe(0);
|
||||
expect(results[1].query_index).toBe(1);
|
||||
});
|
||||
|
||||
test("index and search multivectors", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [];
|
||||
// generate 256 random multivectors
|
||||
for (let i = 0; i < 256; i++) {
|
||||
data.push({
|
||||
multivector: Array.from({ length: 10 }, () =>
|
||||
Array(2).fill(Math.random()),
|
||||
),
|
||||
});
|
||||
}
|
||||
const table = await db.createTable("multivectors", data, {
|
||||
schema: new Schema([
|
||||
new Field(
|
||||
"multivector",
|
||||
new List(
|
||||
new Field(
|
||||
"item",
|
||||
new FixedSizeList(2, new Field("item", new Float32())),
|
||||
),
|
||||
),
|
||||
),
|
||||
]),
|
||||
});
|
||||
|
||||
const results = await table.search(data[0].multivector).limit(10).toArray();
|
||||
expect(results.length).toBe(10);
|
||||
|
||||
await table.createIndex("multivector", {
|
||||
config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
|
||||
});
|
||||
|
||||
const results2 = await table
|
||||
.search(data[0].multivector)
|
||||
.limit(10)
|
||||
.toArray();
|
||||
expect(results2.length).toBe(10);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -107,6 +107,20 @@ export type IntoVector =
|
||||
| number[]
|
||||
| Promise<Float32Array | Float64Array | number[]>;
|
||||
|
||||
export type MultiVector = IntoVector[];
|
||||
|
||||
export function isMultiVector(value: unknown): value is MultiVector {
|
||||
return Array.isArray(value) && isIntoVector(value[0]);
|
||||
}
|
||||
|
||||
export function isIntoVector(value: unknown): value is IntoVector {
|
||||
return (
|
||||
value instanceof Float32Array ||
|
||||
value instanceof Float64Array ||
|
||||
(Array.isArray(value) && !Array.isArray(value[0]))
|
||||
);
|
||||
}
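To make the distinction concrete, a small sketch of how these guards classify inputs (the relative import path is illustrative; in the package sources the helpers live in the arrow module):

```ts
import { isIntoVector, isMultiVector } from "./arrow"; // illustrative relative import

// A flat numeric array is a single query vector...
const single = [0.1, 0.2, 0.3];
console.log(isIntoVector(single)); // true
console.log(isMultiVector(single)); // false

// ...while an array of vectors (for example one per token or image patch)
// is treated as a multivector.
const multi = [
  [0.1, 0.2, 0.3],
  [0.4, 0.5, 0.6],
];
console.log(isMultiVector(multi)); // true
```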
|
||||
|
||||
export function isArrowTable(value: object): value is TableLike {
|
||||
if (value instanceof ArrowTable) return true;
|
||||
return "schema" in value && "batches" in value;
|
||||
@@ -839,6 +853,15 @@ async function applyEmbeddingsFromMetadata(
|
||||
const vector = makeVector(vectors, destType);
|
||||
columns[destColumn] = vector;
|
||||
}
|
||||
|
||||
// Add any missing columns from the schema as null vectors
|
||||
for (const field of schema.fields) {
|
||||
if (!(field.name in columns)) {
|
||||
const nullValues = new Array(table.numRows).fill(null);
|
||||
columns[field.name] = makeVector(nullValues, field.type);
|
||||
}
|
||||
}
|
||||
|
||||
const newTable = new ArrowTable(columns);
|
||||
return alignTable(newTable, schema);
|
||||
}
|
||||
@@ -987,7 +1010,21 @@ export async function convertToTable(
|
||||
embeddings?: EmbeddingFunctionConfig,
|
||||
makeTableOptions?: Partial<MakeArrowTableOptions>,
|
||||
): Promise<ArrowTable> {
|
||||
const table = makeArrowTable(data, makeTableOptions);
|
||||
let processedData = data;
|
||||
|
||||
// If we have a schema with embedding metadata, we need to preprocess the data
|
||||
// to ensure all nested fields are present
|
||||
if (
|
||||
makeTableOptions?.schema &&
|
||||
makeTableOptions.schema.metadata?.has("embedding_functions")
|
||||
) {
|
||||
processedData = ensureNestedFieldsExist(
|
||||
data,
|
||||
makeTableOptions.schema as Schema,
|
||||
);
|
||||
}
|
||||
|
||||
const table = makeArrowTable(processedData, makeTableOptions);
|
||||
return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
|
||||
}
|
||||
|
||||
@@ -1080,7 +1117,16 @@ export async function fromDataToBuffer(
|
||||
schema = sanitizeSchema(schema);
|
||||
}
|
||||
if (isArrowTable(data)) {
|
||||
return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
|
||||
const table = sanitizeTable(data);
|
||||
// If we have a schema with embedding functions, we need to ensure all columns exist
|
||||
// before applying embeddings, since applyEmbeddingsFromMetadata expects all columns
|
||||
// to be present in the table
|
||||
if (schema && schema.metadata?.has("embedding_functions")) {
|
||||
const alignedTable = alignTableToSchema(table, schema);
|
||||
return fromTableToBuffer(alignedTable, embeddings, schema);
|
||||
} else {
|
||||
return fromTableToBuffer(table, embeddings, schema);
|
||||
}
|
||||
} else {
|
||||
const table = await convertToTable(data, embeddings, { schema });
|
||||
return fromTableToBuffer(table);
|
||||
@@ -1149,7 +1195,7 @@ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
|
||||
type: new Struct(schema.fields),
|
||||
length: batch.numRows,
|
||||
nullCount: batch.nullCount,
|
||||
children: alignedChildren,
|
||||
children: alignedChildren as unknown as ArrowData<DataType>[],
|
||||
});
|
||||
return new RecordBatch(schema, newData);
|
||||
}
|
||||
@@ -1221,6 +1267,79 @@ function validateSchemaEmbeddings(
|
||||
return new Schema(fields, schema.metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that all nested fields defined in the schema exist in the data,
|
||||
* filling missing fields with null values.
|
||||
*/
|
||||
export function ensureNestedFieldsExist(
|
||||
data: Array<Record<string, unknown>>,
|
||||
schema: Schema,
|
||||
): Array<Record<string, unknown>> {
|
||||
return data.map((row) => {
|
||||
const completeRow: Record<string, unknown> = {};
|
||||
|
||||
for (const field of schema.fields) {
|
||||
if (field.name in row) {
|
||||
if (
|
||||
field.type.constructor.name === "Struct" &&
|
||||
row[field.name] !== null &&
|
||||
row[field.name] !== undefined
|
||||
) {
|
||||
// Handle nested struct
|
||||
const nestedValue = row[field.name] as Record<string, unknown>;
|
||||
completeRow[field.name] = ensureStructFieldsExist(
|
||||
nestedValue,
|
||||
field.type,
|
||||
);
|
||||
} else {
|
||||
// Non-struct field or null struct value
|
||||
completeRow[field.name] = row[field.name];
|
||||
}
|
||||
} else {
|
||||
// Field is missing from the data - set to null
|
||||
completeRow[field.name] = null;
|
||||
}
|
||||
}
|
||||
|
||||
return completeRow;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively ensures that all fields in a struct type exist in the data,
|
||||
* filling missing fields with null values.
|
||||
*/
|
||||
function ensureStructFieldsExist(
|
||||
data: Record<string, unknown>,
|
||||
structType: Struct,
|
||||
): Record<string, unknown> {
|
||||
const completeStruct: Record<string, unknown> = {};
|
||||
|
||||
for (const childField of structType.children) {
|
||||
if (childField.name in data) {
|
||||
if (
|
||||
childField.type.constructor.name === "Struct" &&
|
||||
data[childField.name] !== null &&
|
||||
data[childField.name] !== undefined
|
||||
) {
|
||||
// Recursively handle nested struct
|
||||
completeStruct[childField.name] = ensureStructFieldsExist(
|
||||
data[childField.name] as Record<string, unknown>,
|
||||
childField.type,
|
||||
);
|
||||
} else {
|
||||
// Non-struct field or null struct value
|
||||
completeStruct[childField.name] = data[childField.name];
|
||||
}
|
||||
} else {
|
||||
// Field is missing - set to null
|
||||
completeStruct[childField.name] = null;
|
||||
}
|
||||
}
|
||||
|
||||
return completeStruct;
|
||||
}
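A short sketch of the resulting behaviour, assuming `ensureNestedFieldsExist` is imported from this arrow module (the import path and sample schema are illustrative):

```ts
import { Field, Int32, Schema, Struct, Utf8 } from "apache-arrow";
import { ensureNestedFieldsExist } from "./arrow"; // illustrative relative import

const schema = new Schema([
  new Field("id", new Utf8(), true),
  new Field(
    "metadata",
    new Struct([
      new Field("version", new Int32(), true),
      new Field("author", new Utf8(), true),
    ]),
    true,
  ),
]);

const rows = [
  { id: "doc1", metadata: { version: 1 } }, // "author" missing inside the struct
  { id: "doc2" }, // whole "metadata" struct missing
];

console.log(ensureNestedFieldsExist(rows, schema));
// [
//   { id: "doc1", metadata: { version: 1, author: null } },
//   { id: "doc2", metadata: null },
// ]
```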
|
||||
|
||||
interface JsonDataType {
|
||||
type: string;
|
||||
fields?: JsonField[];
|
||||
@@ -1354,3 +1473,64 @@ function fieldToJson(field: Field): JsonField {
|
||||
metadata: field.metadata,
|
||||
};
|
||||
}
|
||||
|
||||
function alignTableToSchema(
|
||||
table: ArrowTable,
|
||||
targetSchema: Schema,
|
||||
): ArrowTable {
|
||||
const existingColumns = new Map<string, Vector>();
|
||||
|
||||
// Map existing columns
|
||||
for (const field of table.schema.fields) {
|
||||
existingColumns.set(field.name, table.getChild(field.name)!);
|
||||
}
|
||||
|
||||
// Create vectors for all fields in target schema
|
||||
const alignedColumns: Record<string, Vector> = {};
|
||||
|
||||
for (const field of targetSchema.fields) {
|
||||
if (existingColumns.has(field.name)) {
|
||||
// Column exists, use it
|
||||
alignedColumns[field.name] = existingColumns.get(field.name)!;
|
||||
} else {
|
||||
// Column missing, create null vector
|
||||
alignedColumns[field.name] = createNullVector(field, table.numRows);
|
||||
}
|
||||
}
|
||||
|
||||
// Create new table with aligned schema and columns
|
||||
return new ArrowTable(targetSchema, alignedColumns);
|
||||
}
|
||||
|
||||
function createNullVector(field: Field, numRows: number): Vector {
|
||||
if (field.type.constructor.name === "Struct") {
|
||||
// For struct types, create a struct with null fields
|
||||
const structType = field.type as Struct;
|
||||
const childVectors = structType.children.map((childField) =>
|
||||
createNullVector(childField, numRows),
|
||||
);
|
||||
|
||||
// Create struct data
|
||||
const structData = makeData({
|
||||
type: structType,
|
||||
length: numRows,
|
||||
nullCount: 0,
|
||||
children: childVectors.map((v) => v.data[0]),
|
||||
});
|
||||
|
||||
return arrowMakeVector(structData);
|
||||
} else {
|
||||
// For other types, create a vector of nulls
|
||||
const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));
|
||||
// All bits are 0, meaning all values are null
|
||||
|
||||
const data = makeData({
|
||||
type: field.type,
|
||||
length: numRows,
|
||||
nullCount: numRows,
|
||||
nullBitmap,
|
||||
});
|
||||
|
||||
return arrowMakeVector(data);
|
||||
}
|
||||
}
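The all-null column above relies on Arrow's validity bitmap: a zero-filled bitmap marks every slot as null, so no values need to be materialised. A standalone sketch of the same idea, mirroring the `makeData` call used here (the row count is a placeholder):

```ts
import { Int32, makeData, makeVector } from "apache-arrow";

const numRows = 4;
// All bits zero => every row is flagged as null.
const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));

const data = makeData({
  type: new Int32(),
  length: numRows,
  nullCount: numRows,
  nullBitmap,
});
const column = makeVector(data);

console.log(column.nullCount); // 4
console.log([...column]); // [null, null, null, null]
```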
|
||||
|
||||
@@ -85,6 +85,9 @@ export interface OpenTableOptions {
|
||||
/**
|
||||
* Set the size of the index cache, specified as a number of entries
|
||||
*
|
||||
* @deprecated Use session-level cache configuration instead.
|
||||
* Create a Session with custom cache sizes and pass it to the connect() function.
|
||||
*
|
||||
* The exact meaning of an "entry" will depend on the type of index:
|
||||
* - IVF: there is one entry for each IVF partition
|
||||
* - BTREE: there is one entry for the entire index
|
||||
|
||||
@@ -10,6 +10,7 @@ import {
|
||||
import {
|
||||
ConnectionOptions,
|
||||
Connection as LanceDbConnection,
|
||||
Session,
|
||||
} from "./native.js";
|
||||
|
||||
export {
|
||||
@@ -51,6 +52,8 @@ export {
|
||||
OpenTableOptions,
|
||||
} from "./connection";
|
||||
|
||||
export { Session } from "./native.js";
|
||||
|
||||
export {
|
||||
ExecutableQuery,
|
||||
Query,
|
||||
@@ -100,6 +103,7 @@ export {
|
||||
RecordBatchLike,
|
||||
DataLike,
|
||||
IntoVector,
|
||||
MultiVector,
|
||||
} from "./arrow";
|
||||
export { IntoSql, packBits } from "./util";
|
||||
|
||||
@@ -130,6 +134,7 @@ export { IntoSql, packBits } from "./util";
|
||||
export async function connect(
|
||||
uri: string,
|
||||
options?: Partial<ConnectionOptions>,
|
||||
session?: Session,
|
||||
): Promise<Connection>;
|
||||
/**
|
||||
* Connect to a LanceDB instance at the given URI.
|
||||
@@ -148,31 +153,43 @@ export async function connect(
|
||||
* storageOptions: {timeout: "60s"}
|
||||
* });
|
||||
* ```
|
||||
*
|
||||
* @example
|
||||
* ```ts
|
||||
* const session = Session.default();
|
||||
* const conn = await connect({
|
||||
* uri: "/path/to/database",
|
||||
* session: session
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export async function connect(
|
||||
options: Partial<ConnectionOptions> & { uri: string },
|
||||
): Promise<Connection>;
|
||||
export async function connect(
|
||||
uriOrOptions: string | (Partial<ConnectionOptions> & { uri: string }),
|
||||
options: Partial<ConnectionOptions> = {},
|
||||
options?: Partial<ConnectionOptions>,
|
||||
): Promise<Connection> {
|
||||
let uri: string | undefined;
|
||||
let finalOptions: Partial<ConnectionOptions> = {};
|
||||
|
||||
if (typeof uriOrOptions !== "string") {
|
||||
const { uri: uri_, ...opts } = uriOrOptions;
|
||||
uri = uri_;
|
||||
options = opts;
|
||||
finalOptions = opts;
|
||||
} else {
|
||||
uri = uriOrOptions;
|
||||
finalOptions = options || {};
|
||||
}
|
||||
|
||||
if (!uri) {
|
||||
throw new Error("uri is required");
|
||||
}
|
||||
|
||||
options = (options as ConnectionOptions) ?? {};
|
||||
(<ConnectionOptions>options).storageOptions = cleanseStorageOptions(
|
||||
(<ConnectionOptions>options).storageOptions,
|
||||
finalOptions = (finalOptions as ConnectionOptions) ?? {};
|
||||
(<ConnectionOptions>finalOptions).storageOptions = cleanseStorageOptions(
|
||||
(<ConnectionOptions>finalOptions).storageOptions,
|
||||
);
|
||||
const nativeConn = await LanceDbConnection.new(uri, options);
|
||||
const nativeConn = await LanceDbConnection.new(uri, finalOptions);
|
||||
return new LocalConnection(nativeConn);
|
||||
}
|
||||
|
||||
@@ -439,7 +439,7 @@ export interface FtsOptions {
|
||||
*
|
||||
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
|
||||
*/
|
||||
baseTokenizer?: "simple" | "whitespace" | "raw";
|
||||
baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
|
||||
|
||||
/**
|
||||
* language for stemming and stop words
|
||||
@@ -472,6 +472,21 @@ export interface FtsOptions {
|
||||
* whether to fold non-ascii characters (e.g. accented letters) into their ascii equivalents
|
||||
*/
|
||||
asciiFolding?: boolean;
|
||||
|
||||
/**
|
||||
* ngram min length
|
||||
*/
|
||||
ngramMinLength?: number;
|
||||
|
||||
/**
|
||||
* ngram max length
|
||||
*/
|
||||
ngramMaxLength?: number;
|
||||
|
||||
/**
|
||||
* whether to only index the prefix of the token for ngram tokenizer
|
||||
*/
|
||||
prefixOnly?: boolean;
|
||||
}
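A hedged end-to-end sketch of how these ngram options are typically combined (database path, table name, and data are placeholders):

```ts
import { Index, connect } from "@lancedb/lancedb";

async function ngramIndexDemo() {
  const db = await connect("/tmp/ngram-demo");
  const table = await db.createTable("docs", [
    { text: "lance database", vector: [0.1, 0.2] },
    { text: "lance is cool", vector: [0.3, 0.4] },
  ]);

  // Index character ngrams of length 2..4; with prefixOnly left unset, any
  // substring of at least ngramMinLength characters can match.
  await table.createIndex("text", {
    config: Index.fts({
      baseTokenizer: "ngram",
      ngramMinLength: 2,
      ngramMaxLength: 4,
    }),
  });

  // A two-character query is enough once ngramMinLength is 2.
  const hits = await table.search("la").toArray();
  console.log(hits.map((r) => r.text)); // ["lance database", "lance is cool"] (order not guaranteed)
}
```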
|
||||
|
||||
export class Index {
|
||||
@@ -608,6 +623,9 @@ export class Index {
|
||||
options?.stem,
|
||||
options?.removeStopWords,
|
||||
options?.asciiFolding,
|
||||
options?.ngramMinLength,
|
||||
options?.ngramMaxLength,
|
||||
options?.prefixOnly,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -6,9 +6,11 @@ import {
|
||||
Data,
|
||||
DataType,
|
||||
IntoVector,
|
||||
MultiVector,
|
||||
Schema,
|
||||
dataTypeToJson,
|
||||
fromDataToBuffer,
|
||||
isMultiVector,
|
||||
tableFromIPC,
|
||||
} from "./arrow";
|
||||
|
||||
@@ -75,10 +77,10 @@ export interface OptimizeOptions {
|
||||
* // Delete all versions older than 1 day
|
||||
* const olderThan = new Date();
|
||||
* olderThan.setDate(olderThan.getDate() - 1);
|
||||
* tbl.cleanupOlderVersions(olderThan);
|
||||
* tbl.optimize({cleanupOlderThan: olderThan});
|
||||
*
|
||||
* // Delete all versions except the current version
|
||||
* tbl.cleanupOlderVersions(new Date());
|
||||
* tbl.optimize({cleanupOlderThan: new Date()});
|
||||
*/
|
||||
cleanupOlderThan: Date;
|
||||
deleteUnverified: boolean;
|
||||
@@ -346,7 +348,7 @@ export abstract class Table {
|
||||
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
|
||||
*/
|
||||
abstract search(
|
||||
query: string | IntoVector | FullTextQuery,
|
||||
query: string | IntoVector | MultiVector | FullTextQuery,
|
||||
queryType?: string,
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query;
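In practice the dispatch looks like this; a sketch that assumes `table` is an open Table whose `text` column already has an FTS index and whose vector column is 3-dimensional:

```ts
import type { Table } from "@lancedb/lancedb";

async function searchBoth(table: Table) {
  // A plain string runs a full text search when no embedding function is
  // registered on the table (an FTS index on the text column is assumed)...
  const textHits = await table.search("hello world").limit(5).toArray();

  // ...while a numeric array (or a Float32Array/Float64Array) runs a vector search.
  const vectorHits = await table.search([0.1, 0.2, 0.3]).limit(5).toArray();

  return { textHits, vectorHits };
}
```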
|
||||
@@ -357,7 +359,7 @@ export abstract class Table {
|
||||
* is the same thing as calling `nearestTo` on the builder returned
|
||||
* by `query`. @see {@link Query#nearestTo} for more details.
|
||||
*/
|
||||
abstract vectorSearch(vector: IntoVector): VectorQuery;
|
||||
abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
|
||||
/**
|
||||
* Add new columns with defined values.
|
||||
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
|
||||
@@ -668,7 +670,7 @@ export class LocalTable extends Table {
|
||||
}
|
||||
|
||||
search(
|
||||
query: string | IntoVector | FullTextQuery,
|
||||
query: string | IntoVector | MultiVector | FullTextQuery,
|
||||
queryType: string = "auto",
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query {
|
||||
@@ -715,7 +717,15 @@ export class LocalTable extends Table {
|
||||
return this.query().nearestTo(queryPromise);
|
||||
}
|
||||
|
||||
vectorSearch(vector: IntoVector): VectorQuery {
|
||||
vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
|
||||
if (isMultiVector(vector)) {
|
||||
const query = this.query().nearestTo(vector[0]);
|
||||
for (const v of vector.slice(1)) {
|
||||
query.addQueryVector(v);
|
||||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
return this.query().nearestTo(vector);
|
||||
}
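A sketch of what this expansion means for callers, assuming `table` has a 2-dimensional multivector column like the one in the test above (the query vectors are placeholders):

```ts
import type { Table } from "@lancedb/lancedb";

async function multivectorQuery(table: Table) {
  const queryVectors = [
    [0.1, 0.9],
    [0.3, 0.7],
  ];

  // Passing the whole multivector...
  const viaSearch = await table.vectorSearch(queryVectors).limit(10).toArray();

  // ...is expanded internally into a single query seeded with the first
  // vector, with the remaining vectors added to the same query.
  const query = table.query().nearestTo(queryVectors[0]);
  query.addQueryVector(queryVectors[1]);
  const viaBuilder = await query.limit(10).toArray();

  return { viaSearch, viaBuilder };
}
```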
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.20.1-beta.2",
|
||||
"version": "0.21.2-beta.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.20.1-beta.2",
|
||||
"version": "0.21.2-beta.1",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.21.0-beta.0",
|
||||
"version": "0.21.2-beta.1",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -74,6 +74,10 @@ impl Connection {
|
||||
builder = builder.host_override(&host_override);
|
||||
}
|
||||
|
||||
if let Some(session) = options.session {
|
||||
builder = builder.session(session.inner.clone());
|
||||
}
|
||||
|
||||
Ok(Self::inner_new(builder.execute().await.default_error()?))
|
||||
}
|
||||
|
||||
|
||||
@@ -123,6 +123,9 @@ impl Index {
|
||||
stem: Option<bool>,
|
||||
remove_stop_words: Option<bool>,
|
||||
ascii_folding: Option<bool>,
|
||||
ngram_min_length: Option<u32>,
|
||||
ngram_max_length: Option<u32>,
|
||||
prefix_only: Option<bool>,
|
||||
) -> Self {
|
||||
let mut opts = FtsIndexBuilder::default();
|
||||
if let Some(with_position) = with_position {
|
||||
@@ -149,6 +152,15 @@ impl Index {
|
||||
if let Some(ascii_folding) = ascii_folding {
|
||||
opts = opts.ascii_folding(ascii_folding);
|
||||
}
|
||||
if let Some(ngram_min_length) = ngram_min_length {
|
||||
opts = opts.ngram_min_length(ngram_min_length);
|
||||
}
|
||||
if let Some(ngram_max_length) = ngram_max_length {
|
||||
opts = opts.ngram_max_length(ngram_max_length);
|
||||
}
|
||||
if let Some(prefix_only) = prefix_only {
|
||||
opts = opts.ngram_prefix_only(prefix_only);
|
||||
}
|
||||
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||
|
||||
@@ -14,6 +14,7 @@ pub mod merge;
|
||||
mod query;
|
||||
pub mod remote;
|
||||
mod rerankers;
|
||||
mod session;
|
||||
mod table;
|
||||
mod util;
|
||||
|
||||
@@ -34,6 +35,9 @@ pub struct ConnectionOptions {
|
||||
///
|
||||
/// The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
||||
pub storage_options: Option<HashMap<String, String>>,
|
||||
/// (For LanceDB OSS only): the session to use for this connection. Holds
|
||||
/// shared caches and other session-specific state.
|
||||
pub session: Option<session::Session>,
|
||||
|
||||
/// (For LanceDB cloud only): configuration for the remote HTTP client.
|
||||
pub client_config: Option<remote::ClientConfig>,
|
||||
|
||||
102
nodejs/src/session.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use lancedb::{ObjectStoreRegistry, Session as LanceSession};
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi_derive::*;
|
||||
|
||||
/// A session for managing caches and object stores across LanceDB operations.
|
||||
///
|
||||
/// Sessions allow you to configure cache sizes for index and metadata caches,
|
||||
/// which can significantly impact memory use and performance. They can
|
||||
/// also be re-used across multiple connections to share the same cache state.
|
||||
#[napi]
|
||||
#[derive(Clone)]
|
||||
pub struct Session {
|
||||
pub(crate) inner: Arc<LanceSession>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Session {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Session")
|
||||
.field("size_bytes", &self.inner.size_bytes())
|
||||
.field("approx_num_items", &self.inner.approx_num_items())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[napi]
|
||||
impl Session {
|
||||
/// Create a new session with custom cache sizes.
|
||||
///
|
||||
/// # Parameters
|
||||
///
|
||||
/// - `index_cache_size_bytes`: The size of the index cache in bytes.
|
||||
/// Index data is stored in memory in this cache to speed up queries.
|
||||
/// Defaults to 6GB if not specified.
|
||||
/// - `metadata_cache_size_bytes`: The size of the metadata cache in bytes.
|
||||
/// The metadata cache stores file metadata and schema information in memory.
|
||||
/// This cache improves scan and write performance.
|
||||
/// Defaults to 1GB if not specified.
|
||||
#[napi(constructor)]
|
||||
pub fn new(
|
||||
index_cache_size_bytes: Option<BigInt>,
|
||||
metadata_cache_size_bytes: Option<BigInt>,
|
||||
) -> napi::Result<Self> {
|
||||
let index_cache_size = index_cache_size_bytes
|
||||
.map(|size| size.get_u64().1 as usize)
|
||||
.unwrap_or(6 * 1024 * 1024 * 1024); // 6GB default
|
||||
|
||||
let metadata_cache_size = metadata_cache_size_bytes
|
||||
.map(|size| size.get_u64().1 as usize)
|
||||
.unwrap_or(1024 * 1024 * 1024); // 1GB default
|
||||
|
||||
let session = LanceSession::new(
|
||||
index_cache_size,
|
||||
metadata_cache_size,
|
||||
Arc::new(ObjectStoreRegistry::default()),
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
inner: Arc::new(session),
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a session with default cache sizes.
|
||||
///
|
||||
/// This is equivalent to creating a session with 6GB index cache
|
||||
/// and 1GB metadata cache.
|
||||
#[napi(factory)]
|
||||
pub fn default() -> Self {
|
||||
Self {
|
||||
inner: Arc::new(LanceSession::default()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the current size of the session caches in bytes.
|
||||
#[napi]
|
||||
pub fn size_bytes(&self) -> BigInt {
|
||||
BigInt::from(self.inner.size_bytes())
|
||||
}
|
||||
|
||||
/// Get the approximate number of items cached in the session.
|
||||
#[napi]
|
||||
pub fn approx_num_items(&self) -> u32 {
|
||||
self.inner.approx_num_items() as u32
|
||||
}
|
||||
}
|
||||
|
||||
// Implement FromNapiValue for Session to work with napi(object)
|
||||
impl napi::bindgen_prelude::FromNapiValue for Session {
|
||||
unsafe fn from_napi_value(
|
||||
env: napi::sys::napi_env,
|
||||
napi_val: napi::sys::napi_value,
|
||||
) -> napi::Result<Self> {
|
||||
let object: napi::bindgen_prelude::ClassInstance<Session> =
|
||||
napi::bindgen_prelude::ClassInstance::from_napi_value(env, napi_val)?;
|
||||
let copy = object.clone();
|
||||
Ok(copy)
|
||||
}
|
||||
}
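On the TypeScript side this binding surfaces as the exported `Session` class; a usage sketch (cache sizes and database paths are illustrative):

```ts
import { Session, connect } from "@lancedb/lancedb";

async function sharedSessionDemo() {
  // Both arguments are optional; they default to 6 GiB (index cache)
  // and 1 GiB (metadata cache).
  const session = new Session(
    BigInt(2 * 1024 * 1024 * 1024), // 2 GiB index cache
    BigInt(256 * 1024 * 1024), // 256 MiB metadata cache
  );

  // The same session (and therefore the same caches) can back several connections.
  const dbA = await connect({ uri: "/tmp/db-a", session });
  const dbB = await connect({ uri: "/tmp/db-b", session });

  console.log(session.sizeBytes(), session.approxNumItems());
  return { dbA, dbB };
}
```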
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.24.0"
|
||||
current_version = "0.24.2"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
19
python/CLAUDE.md
Normal file
@@ -0,0 +1,19 @@
These are the Python bindings of LanceDB.
The core Rust library is in the `../rust/lancedb` directory, the rust binding
code is in the `src/` directory and the Python bindings are in the `lancedb/` directory.

Common commands:

* Build: `make develop`
* Format: `make format`
* Lint: `make check`
* Fix lints: `make fix`
* Test: `make test`
* Doc test: `make doctest`

Before committing changes, run lints and then formatting.

When you change the Rust code, you will need to recompile the Python bindings: `make develop`.

When you export new types from Rust to Python, you must manually update `python/lancedb/_lancedb.pyi`
with the corresponding type hints. You can run `pyright` to check for type errors in the Python code.
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.24.0"
|
||||
version = "0.24.2"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -85,8 +85,8 @@ embeddings = [
|
||||
"boto3>=1.28.57",
|
||||
"awscli>=1.29.57",
|
||||
"botocore>=1.31.57",
|
||||
"ollama",
|
||||
"ibm-watsonx-ai>=1.1.2",
|
||||
'ibm-watsonx-ai>=1.1.2; python_version >= "3.10"',
|
||||
"ollama>=0.3.0",
|
||||
]
|
||||
azure = ["adlfs>=2024.2.0"]
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ from .remote import ClientConfig
|
||||
from .remote.db import RemoteDBConnection
|
||||
from .schema import vector
|
||||
from .table import AsyncTable
|
||||
from ._lancedb import Session
|
||||
|
||||
|
||||
def connect(
|
||||
@@ -30,6 +31,7 @@ def connect(
|
||||
request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
|
||||
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
**kwargs: Any,
|
||||
) -> DBConnection:
|
||||
"""Connect to a LanceDB database.
|
||||
@@ -64,6 +66,12 @@ def connect(
|
||||
storage_options: dict, optional
|
||||
Additional options for the storage backend. See available options at
|
||||
<https://lancedb.github.io/lancedb/guides/storage/>
|
||||
session: Session, optional
|
||||
(For LanceDB OSS only)
|
||||
A session to use for this connection. Sessions allow you to configure
|
||||
cache sizes for index and metadata caches, which can significantly
|
||||
impact memory use and performance. They can also be re-used across
|
||||
multiple connections to share the same cache state.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -92,7 +100,7 @@ def connect(
|
||||
if api_key is None:
|
||||
api_key = os.environ.get("LANCEDB_API_KEY")
|
||||
if api_key is None:
|
||||
raise ValueError(f"api_key is required to connected LanceDB cloud: {uri}")
|
||||
raise ValueError(f"api_key is required to connect to LanceDB cloud: {uri}")
|
||||
if isinstance(request_thread_pool, int):
|
||||
request_thread_pool = ThreadPoolExecutor(request_thread_pool)
|
||||
return RemoteDBConnection(
|
||||
@@ -113,6 +121,7 @@ def connect(
|
||||
uri,
|
||||
read_consistency_interval=read_consistency_interval,
|
||||
storage_options=storage_options,
|
||||
session=session,
|
||||
)
|
||||
|
||||
|
||||
@@ -125,6 +134,7 @@ async def connect_async(
|
||||
read_consistency_interval: Optional[timedelta] = None,
|
||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
) -> AsyncConnection:
|
||||
"""Connect to a LanceDB database.
|
||||
|
||||
@@ -158,6 +168,12 @@ async def connect_async(
|
||||
storage_options: dict, optional
|
||||
Additional options for the storage backend. See available options at
|
||||
<https://lancedb.github.io/lancedb/guides/storage/>
|
||||
session: Session, optional
|
||||
(For LanceDB OSS only)
|
||||
A session to use for this connection. Sessions allow you to configure
|
||||
cache sizes for index and metadata caches, which can significantly
|
||||
impact memory use and performance. They can also be re-used across
|
||||
multiple connections to share the same cache state.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -197,6 +213,7 @@ async def connect_async(
|
||||
read_consistency_interval_secs,
|
||||
client_config,
|
||||
storage_options,
|
||||
session,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -212,6 +229,7 @@ __all__ = [
|
||||
"DBConnection",
|
||||
"LanceDBConnection",
|
||||
"RemoteDBConnection",
|
||||
"Session",
|
||||
"__version__",
|
||||
]
|
||||
|
||||
|
||||
@@ -6,6 +6,19 @@ import pyarrow as pa
|
||||
from .index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
|
||||
from .remote import ClientConfig
|
||||
|
||||
class Session:
|
||||
def __init__(
|
||||
self,
|
||||
index_cache_size_bytes: Optional[int] = None,
|
||||
metadata_cache_size_bytes: Optional[int] = None,
|
||||
): ...
|
||||
@staticmethod
|
||||
def default() -> "Session": ...
|
||||
@property
|
||||
def size_bytes(self) -> int: ...
|
||||
@property
|
||||
def approx_num_items(self) -> int: ...
|
||||
|
||||
class Connection(object):
|
||||
uri: str
|
||||
async def table_names(
|
||||
@@ -89,6 +102,7 @@ async def connect(
|
||||
read_consistency_interval: Optional[float],
|
||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
|
||||
storage_options: Optional[Dict[str, str]],
|
||||
session: Optional[Session],
|
||||
) -> Connection: ...
|
||||
|
||||
class RecordBatchStream:
|
||||
|
||||
@@ -94,9 +94,9 @@ def data_to_reader(
|
||||
else:
|
||||
raise TypeError(
|
||||
f"Unknown data type {type(data)}. "
|
||||
"Please check "
|
||||
"https://lancedb.github.io/lance/read_and_write.html "
|
||||
"to see supported types."
|
||||
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
||||
"pyarrow Table/RecordBatch, or Pydantic models. "
|
||||
"See https://lancedb.github.io/lancedb/guides/tables/ for examples."
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ if TYPE_CHECKING:
|
||||
from ._lancedb import Connection as LanceDbConnection
|
||||
from .common import DATA, URI
|
||||
from .embeddings import EmbeddingFunctionConfig
|
||||
from ._lancedb import Session
|
||||
|
||||
|
||||
class DBConnection(EnforceOverrides):
|
||||
@@ -247,6 +248,9 @@ class DBConnection(EnforceOverrides):
|
||||
name: str
|
||||
The name of the table.
|
||||
index_cache_size: int, default 256
|
||||
**Deprecated**: Use session-level cache configuration instead.
|
||||
Create a Session with custom cache sizes and pass it to lancedb.connect().
|
||||
|
||||
Set the size of the index cache, specified as a number of entries
|
||||
|
||||
The exact meaning of an "entry" will depend on the type of index:
|
||||
@@ -354,6 +358,7 @@ class LanceDBConnection(DBConnection):
|
||||
*,
|
||||
read_consistency_interval: Optional[timedelta] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
):
|
||||
if not isinstance(uri, Path):
|
||||
scheme = get_uri_scheme(uri)
|
||||
@@ -367,6 +372,7 @@ class LanceDBConnection(DBConnection):
|
||||
self._entered = False
|
||||
self.read_consistency_interval = read_consistency_interval
|
||||
self.storage_options = storage_options
|
||||
self.session = session
|
||||
|
||||
if read_consistency_interval is not None:
|
||||
read_consistency_interval_secs = read_consistency_interval.total_seconds()
|
||||
@@ -382,6 +388,7 @@ class LanceDBConnection(DBConnection):
|
||||
read_consistency_interval_secs,
|
||||
None,
|
||||
storage_options,
|
||||
session,
|
||||
)
|
||||
|
||||
self._conn = AsyncConnection(LOOP.run(do_connect()))
|
||||
@@ -475,6 +482,17 @@ class LanceDBConnection(DBConnection):
|
||||
-------
|
||||
A LanceTable object representing the table.
|
||||
"""
|
||||
if index_cache_size is not None:
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"index_cache_size is deprecated. Use session-level cache "
|
||||
"configuration instead. Create a Session with custom cache sizes "
|
||||
"and pass it to lancedb.connect().",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return LanceTable.open(
|
||||
self,
|
||||
name,
|
||||
@@ -820,6 +838,9 @@ class AsyncConnection(object):
|
||||
See available options at
|
||||
<https://lancedb.github.io/lancedb/guides/storage/>
|
||||
index_cache_size: int, default 256
|
||||
**Deprecated**: Use session-level cache configuration instead.
|
||||
Create a Session with custom cache sizes and pass it to lancedb.connect().
|
||||
|
||||
Set the size of the index cache, specified as a number of entries
|
||||
|
||||
The exact meaning of an "entry" will depend on the type of index:
|
||||
|
||||
@@ -11,7 +11,7 @@ from .instructor import InstructorEmbeddingFunction
|
||||
from .ollama import OllamaEmbeddings
|
||||
from .open_clip import OpenClipEmbeddings
|
||||
from .openai import OpenAIEmbeddings
|
||||
from .registry import EmbeddingFunctionRegistry, get_registry
|
||||
from .registry import EmbeddingFunctionRegistry, get_registry, register
|
||||
from .sentence_transformers import SentenceTransformerEmbeddings
|
||||
from .gte import GteEmbeddings
|
||||
from .transformers import TransformersEmbeddingFunction, ColbertEmbeddings
|
||||
|
||||
@@ -9,11 +9,14 @@ from huggingface_hub import snapshot_download
|
||||
from pydantic import BaseModel
|
||||
from transformers import BertTokenizer
|
||||
|
||||
from .utils import create_import_stub
|
||||
|
||||
try:
|
||||
import mlx.core as mx
|
||||
import mlx.nn as nn
|
||||
except ImportError:
|
||||
raise ImportError("You need to install MLX to use this model use - pip install mlx")
|
||||
mx = create_import_stub("mlx.core", "mlx")
|
||||
nn = create_import_stub("mlx.nn", "mlx")
|
||||
|
||||
|
||||
def average_pool(last_hidden_state: mx.array, attention_mask: mx.array) -> mx.array:
|
||||
@@ -72,7 +75,7 @@ class TransformerEncoder(nn.Module):
|
||||
super().__init__()
|
||||
self.layers = [
|
||||
TransformerEncoderLayer(dims, num_heads, mlp_dims)
|
||||
for i in range(num_layers)
|
||||
for _ in range(num_layers)
|
||||
]
|
||||
|
||||
def __call__(self, x, mask):
|
||||
|
||||
@@ -2,14 +2,15 @@
 # SPDX-FileCopyrightText: Copyright The LanceDB Authors

 from functools import cached_property
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Sequence, Union
+
+import numpy as np

 from ..util import attempt_import_or_raise
 from .base import TextEmbeddingFunction
 from .registry import register

 if TYPE_CHECKING:
     import numpy as np
     import ollama


@@ -28,23 +29,21 @@ class OllamaEmbeddings(TextEmbeddingFunction):
     keep_alive: Optional[Union[float, str]] = None
     ollama_client_kwargs: Optional[dict] = {}

-    def ndims(self):
+    def ndims(self) -> int:
         return len(self.generate_embeddings(["foo"])[0])

-    def _compute_embedding(self, text) -> Union["np.array", None]:
-        return (
-            self._ollama_client.embeddings(
-                model=self.name,
-                prompt=text,
-                options=self.options,
-                keep_alive=self.keep_alive,
-            )["embedding"]
-            or None
-        )
+    def _compute_embedding(self, text: Sequence[str]) -> Sequence[Sequence[float]]:
+        response = self._ollama_client.embed(
+            model=self.name,
+            input=text,
+            options=self.options,
+            keep_alive=self.keep_alive,
+        )
+        return response.embeddings

     def generate_embeddings(
-        self, texts: Union[List[str], "np.ndarray"]
-    ) -> list[Union["np.array", None]]:
+        self, texts: Union[List[str], np.ndarray]
+    ) -> list[Union[np.array, None]]:
         """
         Get the embeddings for the given texts

@@ -54,8 +53,8 @@ class OllamaEmbeddings(TextEmbeddingFunction):
             The texts to embed
         """
         # TODO retry, rate limit, token limit
-        embeddings = [self._compute_embedding(text) for text in texts]
-        return embeddings
+        embeddings = self._compute_embedding(texts)
+        return list(embeddings)

     @cached_property
     def _ollama_client(self) -> "ollama.Client":
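With the switch from per-string `embeddings()` calls to a single batched `embed()` request, one round trip now covers every text passed to `generate_embeddings`. A minimal sketch of exercising the updated function through the embedding registry; it assumes a locally running Ollama server, that the function is registered under the "ollama" alias, and that the model name is only illustrative:

import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Assumptions: Ollama runs locally and "nomic-embed-text" has been pulled.
func = get_registry().get("ollama").create(name="nomic-embed-text")

class Docs(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()

db = lancedb.connect("/tmp/ollama_demo")
tbl = db.create_table("docs", schema=Docs, mode="overwrite")
# All three rows are embedded with one batched embed() call.
tbl.add([{"text": t} for t in ["hello world", "lance database", "vector search"]])
print(tbl.search("database").limit(1).to_list())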
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright The LanceDB Authors

 import json
-from typing import Dict, Optional
+from typing import Dict, Optional, Type

 from .base import EmbeddingFunction, EmbeddingFunctionConfig

@@ -43,7 +43,7 @@ class EmbeddingFunctionRegistry:
         self._functions = {}
         self._variables = {}

-    def register(self, alias: str = None):
+    def register(self, alias: Optional[str] = None):
         """
         This creates a decorator that can be used to register
         an EmbeddingFunction.

@@ -75,7 +75,7 @@ class EmbeddingFunctionRegistry:
         """
         self._functions = {}

-    def get(self, name: str):
+    def get(self, name: str) -> Type[EmbeddingFunction]:
         """
         Fetch an embedding function class by name
@@ -21,6 +21,36 @@ from ..dependencies import pandas as pd
 from ..util import attempt_import_or_raise


+def create_import_stub(module_name: str, package_name: str = None):
+    """
+    Create a stub module that allows class definition but fails when used.
+    This allows modules to be imported for doctest collection even when
+    optional dependencies are not available.
+
+    Parameters
+    ----------
+    module_name : str
+        The name of the module to create a stub for
+    package_name : str, optional
+        The package name to suggest in the error message
+
+    Returns
+    -------
+    object
+        A stub object that can be used in place of the module
+    """
+
+    class _ImportStub:
+        def __getattr__(self, name):
+            return _ImportStub  # Return stub for chained access like nn.Module
+
+        def __call__(self, *args, **kwargs):
+            pkg = package_name or module_name
+            raise ImportError(f"You need to install {pkg} to use this functionality")
+
+    return _ImportStub()
+
+
 # ruff: noqa: PERF203
 def retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1):
     def wrapper(fn):
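A short sketch of the deferred-failure pattern this helper enables, assuming it ships as `lancedb.embeddings.utils.create_import_stub` as in this diff: the module importing the stub can still be collected (e.g. for doctests), and only calling the stub object itself raises the installation hint.

from lancedb.embeddings.utils import create_import_stub

# Stand-in for a missing optional dependency.
mx = create_import_stub("mlx.core", "mlx")

# Attribute access works, so annotations and class bodies that reference
# the stub can still be evaluated at import time.
ArrayType = mx.array

# Calling the stub object itself reports the missing package.
try:
    mx([1.0, 2.0])
except ImportError as err:
    print(err)  # You need to install mlx to use this functionality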
@@ -137,6 +137,9 @@ class FTS:
     stem: bool = True
     remove_stop_words: bool = True
     ascii_folding: bool = True
+    ngram_min_length: int = 3
+    ngram_max_length: int = 3
+    prefix_only: bool = False


 @dataclass
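The new fields can be passed through the `FTS` index config on the async API, mirroring the tests later in this changeset; a sketch, assuming `base_tokenizer` remains the field that selects the tokenizer and that the data shown is only illustrative:

import asyncio

import lancedb
from lancedb.index import FTS

async def main():
    db = await lancedb.connect_async("/tmp/fts_config_demo")
    tbl = await db.create_table(
        "docs",
        [{"text": "lance database"}, {"text": "lance is cool"}],
        mode="overwrite",
    )
    # ngram tokenizer with a 2-character minimum, prefixes only.
    await tbl.create_index(
        "text",
        config=FTS(base_tokenizer="ngram", ngram_min_length=2, prefix_only=True),
    )
    query = await tbl.search("la", query_type="fts", fts_columns="text")
    print(await query.limit(5).to_list())

asyncio.run(main())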
@@ -14,7 +14,7 @@ from typing import (
|
||||
Literal,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
Any,
|
||||
)
|
||||
@@ -58,6 +58,8 @@ if TYPE_CHECKING:
|
||||
else:
|
||||
from typing_extensions import Self
|
||||
|
||||
T = TypeVar("T", bound="LanceModel")
|
||||
|
||||
|
||||
# Pydantic validation function for vector queries
|
||||
def ensure_vector_query(
|
||||
@@ -746,8 +748,8 @@ class LanceQueryBuilder(ABC):
|
||||
return self.to_arrow(timeout=timeout).to_pylist()
|
||||
|
||||
def to_pydantic(
|
||||
self, model: Type[LanceModel], *, timeout: Optional[timedelta] = None
|
||||
) -> List[LanceModel]:
|
||||
self, model: type[T], *, timeout: Optional[timedelta] = None
|
||||
) -> list[T]:
|
||||
"""Return the table as a list of pydantic models.
|
||||
|
||||
Parameters
|
||||
@@ -906,11 +908,11 @@ class LanceQueryBuilder(ABC):
|
||||
>>> plan = table.search(query).explain_plan(True)
|
||||
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceRead: uri=..., projection=[vector], ...
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -940,19 +942,19 @@ class LanceQueryBuilder(ABC):
|
||||
>>> plan = table.search(query).analyze_plan()
|
||||
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||
AnalyzeExec verbose=true, metrics=[]
|
||||
ProjectionExec: expr=[...], metrics=[...]
|
||||
GlobalLimitExec: skip=0, fetch=10, metrics=[...]
|
||||
FilterExec: _distance@2 IS NOT NULL,
|
||||
metrics=[output_rows=..., elapsed_compute=...]
|
||||
SortExec: TopK(fetch=10), expr=[...],
|
||||
preserve_partitioning=[...],
|
||||
metrics=[output_rows=..., elapsed_compute=..., row_replacements=...]
|
||||
KNNVectorDistance: metric=l2,
|
||||
metrics=[output_rows=..., elapsed_compute=..., output_batches=...]
|
||||
LanceScan: uri=..., projection=[vector], row_id=true,
|
||||
row_addr=false, ordered=false,
|
||||
metrics=[output_rows=..., elapsed_compute=...,
|
||||
bytes_read=..., iops=..., requests=...]
|
||||
TracedExec, metrics=[]
|
||||
ProjectionExec: expr=[...], metrics=[...]
|
||||
GlobalLimitExec: skip=0, fetch=10, metrics=[...]
|
||||
FilterExec: _distance@2 IS NOT NULL,
|
||||
metrics=[output_rows=..., elapsed_compute=...]
|
||||
SortExec: TopK(fetch=10), expr=[...],
|
||||
preserve_partitioning=[...],
|
||||
metrics=[output_rows=..., elapsed_compute=..., row_replacements=...]
|
||||
KNNVectorDistance: metric=l2,
|
||||
metrics=[output_rows=..., elapsed_compute=..., output_batches=...]
|
||||
LanceRead: uri=..., projection=[vector], ...
|
||||
metrics=[output_rows=..., elapsed_compute=...,
|
||||
bytes_read=..., iops=..., requests=...]
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -1374,6 +1376,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
if query_string is not None and not isinstance(query_string, str):
|
||||
raise ValueError("Reranking currently only supports string queries")
|
||||
self._str_query = query_string if query_string is not None else self._str_query
|
||||
if reranker.score == "all":
|
||||
self.with_row_id(True)
|
||||
return self
|
||||
|
||||
def bypass_vector_index(self) -> LanceVectorQueryBuilder:
|
||||
@@ -1569,6 +1573,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
The LanceQueryBuilder object.
|
||||
"""
|
||||
self._reranker = reranker
|
||||
if reranker.score == "all":
|
||||
self.with_row_id(True)
|
||||
return self
|
||||
|
||||
|
||||
@@ -1845,6 +1851,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
|
||||
self._norm = normalize
|
||||
self._reranker = reranker
|
||||
if reranker.score == "all":
|
||||
self.with_row_id(True)
|
||||
|
||||
return self
|
||||
|
||||
@@ -2037,7 +2045,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||
LanceRead: uri=..., projection=[vector], ...
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -2423,7 +2431,7 @@ class AsyncQueryBase(object):
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||
LanceRead: uri=..., projection=[vector], ...
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -3042,15 +3050,21 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
|
||||
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||
Vector Search Plan:
|
||||
ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
|
||||
Take: columns="vector, _rowid, _distance, (text)"
|
||||
CoalesceBatchesExec: target_batch_size=1024
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||
Take: columns="vector, _rowid, _distance, (text)"
|
||||
CoalesceBatchesExec: target_batch_size=1024
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceRead: uri=..., projection=[vector], ...
|
||||
<BLANKLINE>
|
||||
FTS Search Plan:
|
||||
LanceScan: uri=..., projection=[vector, text], row_id=false, row_addr=false, ordered=true
|
||||
ProjectionExec: expr=[vector@2 as vector, text@3 as text, _score@1 as _score]
|
||||
Take: columns="_rowid, _score, (vector), (text)"
|
||||
CoalesceBatchesExec: target_batch_size=1024
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
MatchQuery: query=hello
|
||||
<BLANKLINE>
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
@@ -18,7 +18,7 @@ from lancedb._lancedb import (
|
||||
UpdateResult,
|
||||
)
|
||||
from lancedb.embeddings.base import EmbeddingFunctionConfig
|
||||
from lancedb.index import FTS, BTree, Bitmap, HnswPq, HnswSq, IvfFlat, IvfPq, LabelList
|
||||
from lancedb.index import FTS, BTree, Bitmap, HnswSq, IvfFlat, IvfPq, LabelList
|
||||
from lancedb.remote.db import LOOP
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -89,7 +89,7 @@ class RemoteTable(Table):
|
||||
|
||||
def to_pandas(self):
|
||||
"""to_pandas() is not yet supported on LanceDB cloud."""
|
||||
return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||
raise NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||
|
||||
def checkout(self, version: Union[int, str]):
|
||||
return LOOP.run(self._table.checkout(version))
|
||||
@@ -158,6 +158,9 @@ class RemoteTable(Table):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
):
|
||||
config = FTS(
|
||||
with_position=with_position,
|
||||
@@ -168,6 +171,9 @@ class RemoteTable(Table):
|
||||
stem=stem,
|
||||
remove_stop_words=remove_stop_words,
|
||||
ascii_folding=ascii_folding,
|
||||
ngram_min_length=ngram_min_length,
|
||||
ngram_max_length=ngram_max_length,
|
||||
prefix_only=prefix_only,
|
||||
)
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
@@ -186,6 +192,8 @@ class RemoteTable(Table):
|
||||
accelerator: Optional[str] = None,
|
||||
index_type="vector",
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
*,
|
||||
num_bits: int = 8,
|
||||
):
|
||||
"""Create an index on the table.
|
||||
Currently, the only parameters that matter are
|
||||
@@ -220,11 +228,6 @@ class RemoteTable(Table):
|
||||
>>> table.create_index("l2", "vector") # doctest: +SKIP
|
||||
"""
|
||||
|
||||
if num_partitions is not None:
|
||||
logging.warning(
|
||||
"num_partitions is not supported on LanceDB cloud."
|
||||
"This parameter will be tuned automatically."
|
||||
)
|
||||
if num_sub_vectors is not None:
|
||||
logging.warning(
|
||||
"num_sub_vectors is not supported on LanceDB cloud."
|
||||
@@ -244,13 +247,21 @@ class RemoteTable(Table):
|
||||
|
||||
index_type = index_type.upper()
|
||||
if index_type == "VECTOR" or index_type == "IVF_PQ":
|
||||
config = IvfPq(distance_type=metric)
|
||||
config = IvfPq(
|
||||
distance_type=metric,
|
||||
num_partitions=num_partitions,
|
||||
num_sub_vectors=num_sub_vectors,
|
||||
num_bits=num_bits,
|
||||
)
|
||||
elif index_type == "IVF_HNSW_PQ":
|
||||
config = HnswPq(distance_type=metric)
|
||||
raise ValueError(
|
||||
"IVF_HNSW_PQ is not supported on LanceDB cloud."
|
||||
"Please use IVF_HNSW_SQ instead."
|
||||
)
|
||||
elif index_type == "IVF_HNSW_SQ":
|
||||
config = HnswSq(distance_type=metric)
|
||||
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_FLAT":
|
||||
config = IvfFlat(distance_type=metric)
|
||||
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown vector index type: {index_type}. Valid options are"
|
||||
|
||||
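Since `num_partitions`, `num_sub_vectors`, and the new keyword-only `num_bits` are now forwarded into the `IvfPq` config instead of being ignored, a remote index can be tuned explicitly. A sketch with placeholder connection details:

import lancedb

# Placeholders: substitute your own database URI, API key, and region.
db = lancedb.connect("db://my-project", api_key="sk-...", region="us-east-1")
tbl = db.open_table("docs")

tbl.create_index(
    metric="cosine",
    vector_column_name="vector",
    index_type="IVF_PQ",
    num_partitions=256,
    num_sub_vectors=16,
    num_bits=8,
)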
@@ -74,9 +74,7 @@ class AnswerdotaiRerankers(Reranker):
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"Answerdotai Reranker does not support score='all' yet"
|
||||
)
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
)
|
||||
|
||||
@@ -232,6 +232,39 @@ class Reranker(ABC):
|
||||
|
||||
return deduped_table
|
||||
|
||||
def _merge_and_keep_scores(self, vector_results: pa.Table, fts_results: pa.Table):
|
||||
"""
|
||||
Merge the results from the vector and FTS search and keep the scores.
|
||||
This op is slower than just keeping relevance score but can be useful
|
||||
for debugging.
|
||||
"""
|
||||
# add nulls to fts results for _distance
|
||||
if "_distance" not in fts_results.column_names:
|
||||
fts_results = fts_results.append_column(
|
||||
"_distance",
|
||||
pa.array([None] * len(fts_results), type=pa.float32()),
|
||||
)
|
||||
# add nulls to vector results for _score
|
||||
if "_score" not in vector_results.column_names:
|
||||
vector_results = vector_results.append_column(
|
||||
"_score",
|
||||
pa.array([None] * len(vector_results), type=pa.float32()),
|
||||
)
|
||||
|
||||
# combine them and fill the scores
|
||||
vector_results_dict = {row["_rowid"]: row for row in vector_results.to_pylist()}
|
||||
fts_results_dict = {row["_rowid"]: row for row in fts_results.to_pylist()}
|
||||
|
||||
# merge them into vector_results
|
||||
for key, value in fts_results_dict.items():
|
||||
if key in vector_results_dict:
|
||||
vector_results_dict[key]["_score"] = value["_score"]
|
||||
else:
|
||||
vector_results_dict[key] = value
|
||||
|
||||
combined = pa.Table.from_pylist(list(vector_results_dict.values()))
|
||||
return combined
|
||||
|
||||
def _keep_relevance_score(self, combined_results: pa.Table):
|
||||
if self.score == "relevance":
|
||||
if "_score" in combined_results.column_names:
|
||||
|
||||
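`_merge_and_keep_scores` is the path the rerankers below now take when `score == "all"`, so hybrid results keep `_score` and `_distance` alongside `_relevance_score`. A sketch, assuming `tbl` is an existing table with a vector column and an FTS index, and that sentence-transformers is installed:

from lancedb.rerankers import CrossEncoderReranker

reranker = CrossEncoderReranker(return_score="all")
results = (
    tbl.search("single player experience", query_type="hybrid")  # tbl: assumed table
    .rerank(reranker=reranker)
    .to_arrow()
)
# The reranker's _relevance_score plus the original _score (FTS) and
# _distance (vector) columns are all present.
print(results.column_names)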
@@ -92,14 +92,14 @@ class CohereReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for cohere reranker"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||
|
||||
@@ -81,15 +81,15 @@ class CrossEncoderReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
# sort the results by _score
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for CrossEncoderReranker"
|
||||
)
|
||||
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
)
|
||||
|
||||
@@ -97,14 +97,14 @@ class JinaReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for JinaReranker"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||
|
||||
@@ -88,14 +88,13 @@ class OpenaiReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"OpenAI Reranker does not support score='all' yet"
|
||||
)
|
||||
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
|
||||
@@ -94,14 +94,14 @@ class VoyageAIReranker(Reranker):
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
if self.score == "all":
|
||||
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
|
||||
else:
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_results = self._rerank(combined_results, query)
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
elif self.score == "all":
|
||||
raise NotImplementedError(
|
||||
"return_score='all' not implemented for voyageai reranker"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_vector(self, query: str, vector_results: pa.Table):
|
||||
|
||||
@@ -102,7 +102,9 @@ if TYPE_CHECKING:
|
||||
)
|
||||
|
||||
|
||||
def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
|
||||
def _into_pyarrow_reader(
|
||||
data, schema: Optional[pa.Schema] = None
|
||||
) -> pa.RecordBatchReader:
|
||||
from lancedb.dependencies import datasets
|
||||
|
||||
if _check_for_hugging_face(data):
|
||||
@@ -123,6 +125,12 @@ def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
|
||||
raise ValueError("Cannot add a single dictionary to a table. Use a list.")
|
||||
|
||||
if isinstance(data, list):
|
||||
# Handle empty list case
|
||||
if not data:
|
||||
if schema is None:
|
||||
raise ValueError("Cannot create table from empty list without a schema")
|
||||
return pa.Table.from_pylist(data, schema=schema).to_reader()
|
||||
|
||||
# convert to list of dict if data is a bunch of LanceModels
|
||||
if isinstance(data[0], LanceModel):
|
||||
schema = data[0].__class__.to_arrow_schema()
|
||||
@@ -165,9 +173,9 @@ def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
|
||||
else:
|
||||
raise TypeError(
|
||||
f"Unknown data type {type(data)}. "
|
||||
"Please check "
|
||||
"https://lancedb.github.io/lancedb/python/python/ "
|
||||
"to see supported types."
|
||||
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
||||
"pyarrow Table/RecordBatch, or Pydantic models. "
|
||||
"See https://lancedb.github.io/lancedb/guides/tables/ for examples."
|
||||
)
|
||||
|
||||
|
||||
@@ -236,7 +244,7 @@ def _sanitize_data(
|
||||
# 1. There might be embedding columns missing that will be added
|
||||
# in the add_embeddings step.
|
||||
# 2. If `allow_subschemas` is True, there might be columns missing.
|
||||
reader = _into_pyarrow_reader(data)
|
||||
reader = _into_pyarrow_reader(data, target_schema)
|
||||
|
||||
reader = _append_vector_columns(reader, target_schema, metadata=metadata)
|
||||
|
||||
@@ -838,6 +846,9 @@ class Table(ABC):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
):
|
||||
"""Create a full-text search index on the table.
|
||||
@@ -877,6 +888,7 @@ class Table(ABC):
|
||||
- "simple": Splits text by whitespace and punctuation.
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-Gram tokenizer.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
max_token_length : int, default 40
|
||||
@@ -894,6 +906,12 @@ class Table(ABC):
|
||||
ascii_folding : bool, default True
|
||||
Whether to fold ASCII characters. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
ngram_min_length: int, default 3
|
||||
The minimum length of an n-gram.
|
||||
ngram_max_length: int, default 3
|
||||
The maximum length of an n-gram.
|
||||
prefix_only: bool, default False
|
||||
Whether to only index the prefix of the token for ngram tokenizer.
|
||||
wait_timeout: timedelta, optional
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
"""
|
||||
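The same knobs are exposed as keyword arguments on the synchronous `create_fts_index`, as exercised by `test_fts_ngram` later in this diff; a sketch with illustrative data:

import lancedb

db = lancedb.connect("/tmp/fts_ngram_demo")
tbl = db.create_table(
    "docs",
    [{"text": "hello world"}, {"text": "lance database"}, {"text": "lance is cool"}],
    mode="overwrite",
)
tbl.create_fts_index(
    "text",
    use_tantivy=False,
    base_tokenizer="ngram",
    ngram_min_length=2,
    ngram_max_length=3,
    prefix_only=True,  # only index ngrams anchored at the start of each token
)
# Matches on a 2-character prefix; with prefix_only=True, "nce" would not match.
print(tbl.search("la", query_type="fts").limit(5).to_list())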
@@ -1981,6 +1999,9 @@ class LanceTable(Table):
|
||||
stem: bool = True,
|
||||
remove_stop_words: bool = True,
|
||||
ascii_folding: bool = True,
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
):
|
||||
if not use_tantivy:
|
||||
if not isinstance(field_names, str):
|
||||
@@ -1996,6 +2017,9 @@ class LanceTable(Table):
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
"ngram_min_length": ngram_min_length,
|
||||
"ngram_max_length": ngram_max_length,
|
||||
"prefix_only": prefix_only,
|
||||
}
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
@@ -2065,6 +2089,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
elif tokenizer_name == "raw":
|
||||
return {
|
||||
@@ -2075,6 +2102,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
elif tokenizer_name == "whitespace":
|
||||
return {
|
||||
@@ -2085,6 +2115,9 @@ class LanceTable(Table):
|
||||
"stem": False,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
|
||||
# or it's with language stemming with pattern like "en_stem"
|
||||
@@ -2103,6 +2136,9 @@ class LanceTable(Table):
|
||||
"stem": True,
|
||||
"remove_stop_words": False,
|
||||
"ascii_folding": False,
|
||||
"ngram_min_length": 3,
|
||||
"ngram_max_length": 3,
|
||||
"prefix_only": False,
|
||||
}
|
||||
|
||||
def add(
|
||||
|
||||
@@ -25,4 +25,4 @@ IndexType = Literal[
 ]

 # Tokenizer literals
-BaseTokenizerType = Literal["simple", "raw", "whitespace"]
+BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
@@ -33,8 +33,11 @@ tantivy = pytest.importorskip("tantivy")
|
||||
|
||||
@pytest.fixture
|
||||
def table(tmp_path) -> ldb.table.LanceTable:
|
||||
# Use local random state to avoid affecting other tests
|
||||
rng = np.random.RandomState(42)
|
||||
local_random = random.Random(42)
|
||||
db = ldb.connect(tmp_path)
|
||||
vectors = [np.random.randn(128) for _ in range(100)]
|
||||
vectors = [rng.randn(128) for _ in range(100)]
|
||||
|
||||
text_nouns = ("puppy", "car")
|
||||
text2_nouns = ("rabbit", "girl", "monkey")
|
||||
@@ -44,10 +47,10 @@ def table(tmp_path) -> ldb.table.LanceTable:
|
||||
text = [
|
||||
" ".join(
|
||||
[
|
||||
text_nouns[random.randrange(0, len(text_nouns))],
|
||||
verbs[random.randrange(0, 5)],
|
||||
adv[random.randrange(0, 5)],
|
||||
adj[random.randrange(0, 5)],
|
||||
text_nouns[local_random.randrange(0, len(text_nouns))],
|
||||
verbs[local_random.randrange(0, 5)],
|
||||
adv[local_random.randrange(0, 5)],
|
||||
adj[local_random.randrange(0, 5)],
|
||||
]
|
||||
)
|
||||
for _ in range(100)
|
||||
@@ -55,15 +58,15 @@ def table(tmp_path) -> ldb.table.LanceTable:
|
||||
text2 = [
|
||||
" ".join(
|
||||
[
|
||||
text2_nouns[random.randrange(0, len(text2_nouns))],
|
||||
verbs[random.randrange(0, 5)],
|
||||
adv[random.randrange(0, 5)],
|
||||
adj[random.randrange(0, 5)],
|
||||
text2_nouns[local_random.randrange(0, len(text2_nouns))],
|
||||
verbs[local_random.randrange(0, 5)],
|
||||
adv[local_random.randrange(0, 5)],
|
||||
adj[local_random.randrange(0, 5)],
|
||||
]
|
||||
)
|
||||
for _ in range(100)
|
||||
]
|
||||
count = [random.randint(1, 10000) for _ in range(100)]
|
||||
count = [local_random.randint(1, 10000) for _ in range(100)]
|
||||
table = db.create_table(
|
||||
"test",
|
||||
data=pd.DataFrame(
|
||||
@@ -82,8 +85,11 @@ def table(tmp_path) -> ldb.table.LanceTable:
|
||||
|
||||
@pytest.fixture
|
||||
async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
# Use local random state to avoid affecting other tests
|
||||
rng = np.random.RandomState(42)
|
||||
local_random = random.Random(42)
|
||||
db = await ldb.connect_async(tmp_path)
|
||||
vectors = [np.random.randn(128) for _ in range(100)]
|
||||
vectors = [rng.randn(128) for _ in range(100)]
|
||||
|
||||
text_nouns = ("puppy", "car")
|
||||
text2_nouns = ("rabbit", "girl", "monkey")
|
||||
@@ -93,10 +99,10 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
text = [
|
||||
" ".join(
|
||||
[
|
||||
text_nouns[random.randrange(0, len(text_nouns))],
|
||||
verbs[random.randrange(0, 5)],
|
||||
adv[random.randrange(0, 5)],
|
||||
adj[random.randrange(0, 5)],
|
||||
text_nouns[local_random.randrange(0, len(text_nouns))],
|
||||
verbs[local_random.randrange(0, 5)],
|
||||
adv[local_random.randrange(0, 5)],
|
||||
adj[local_random.randrange(0, 5)],
|
||||
]
|
||||
)
|
||||
for _ in range(100)
|
||||
@@ -104,15 +110,15 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
text2 = [
|
||||
" ".join(
|
||||
[
|
||||
text2_nouns[random.randrange(0, len(text2_nouns))],
|
||||
verbs[random.randrange(0, 5)],
|
||||
adv[random.randrange(0, 5)],
|
||||
adj[random.randrange(0, 5)],
|
||||
text2_nouns[local_random.randrange(0, len(text2_nouns))],
|
||||
verbs[local_random.randrange(0, 5)],
|
||||
adv[local_random.randrange(0, 5)],
|
||||
adj[local_random.randrange(0, 5)],
|
||||
]
|
||||
)
|
||||
for _ in range(100)
|
||||
]
|
||||
count = [random.randint(1, 10000) for _ in range(100)]
|
||||
count = [local_random.randint(1, 10000) for _ in range(100)]
|
||||
table = await db.create_table(
|
||||
"test",
|
||||
data=pd.DataFrame(
|
||||
@@ -669,3 +675,46 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
|
||||
res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
|
||||
assert len(res) == 2
|
||||
|
||||
|
||||
def test_fts_ngram(mem_db: DBConnection):
|
||||
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
results = (
|
||||
table.search("nce", query_type="fts").limit(10).to_list()
|
||||
) # spellchecker:disable-line
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
# the default min_ngram_length is 3, so "la" should not match
|
||||
results = table.search("la", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 0
|
||||
|
||||
# test setting min_ngram_length and prefix_only
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
base_tokenizer="ngram",
|
||||
replace=True,
|
||||
ngram_min_length=2,
|
||||
prefix_only=True,
|
||||
)
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
results = (
|
||||
table.search("nce", query_type="fts").limit(10).to_list()
|
||||
) # spellchecker:disable-line
|
||||
assert len(results) == 0
|
||||
|
||||
results = table.search("la", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
@@ -166,7 +166,7 @@ async def test_explain_plan(table: AsyncTable):
|
||||
assert "Vector Search Plan" in plan
|
||||
assert "KNNVectorDistance" in plan
|
||||
assert "FTS Search Plan" in plan
|
||||
assert "LanceScan" in plan
|
||||
assert "LanceRead" in plan
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -272,7 +272,9 @@ async def test_distance_range_with_new_rows_async():
|
||||
# append more rows so that execution plan would be mixed with ANN & Flat KNN
|
||||
new_data = pa.table(
|
||||
{
|
||||
"vector": pa.FixedShapeTensorArray.from_numpy_ndarray(np.random.rand(4, 2)),
|
||||
"vector": pa.FixedShapeTensorArray.from_numpy_ndarray(
|
||||
np.random.rand(4, 2) + 1
|
||||
),
|
||||
}
|
||||
)
|
||||
await table.add(new_data)
|
||||
@@ -775,6 +777,83 @@ async def test_explain_plan_async(table_async: AsyncTable):
|
||||
assert "KNN" in plan
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan_fts(table_async: AsyncTable):
|
||||
"""Test explain plan for FTS queries"""
|
||||
# Create FTS index
|
||||
from lancedb.index import FTS
|
||||
|
||||
await table_async.create_index("text", config=FTS())
|
||||
|
||||
# Test pure FTS query
|
||||
query = await table_async.search("dog", query_type="fts", fts_columns="text")
|
||||
plan = await query.explain_plan()
|
||||
# Should show FTS details (issue #2465 is now fixed)
|
||||
assert "MatchQuery: query=dog" in plan
|
||||
assert "GlobalLimitExec" in plan # Default limit
|
||||
|
||||
# Test FTS query with limit
|
||||
query_with_limit = await table_async.search(
|
||||
"dog", query_type="fts", fts_columns="text"
|
||||
)
|
||||
plan_with_limit = await query_with_limit.limit(1).explain_plan()
|
||||
assert "MatchQuery: query=dog" in plan_with_limit
|
||||
assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
|
||||
|
||||
# Test FTS query with offset and limit
|
||||
query_with_offset = await table_async.search(
|
||||
"dog", query_type="fts", fts_columns="text"
|
||||
)
|
||||
plan_with_offset = await query_with_offset.offset(1).limit(1).explain_plan()
|
||||
assert "MatchQuery: query=dog" in plan_with_offset
|
||||
assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan_vector_with_limit_offset(table_async: AsyncTable):
|
||||
"""Test explain plan for vector queries with limit and offset"""
|
||||
# Test vector query with limit
|
||||
plan_with_limit = await (
|
||||
table_async.query().nearest_to(pa.array([1, 2])).limit(1).explain_plan()
|
||||
)
|
||||
assert "KNN" in plan_with_limit
|
||||
assert "GlobalLimitExec: skip=0, fetch=1" in plan_with_limit
|
||||
|
||||
# Test vector query with offset and limit
|
||||
plan_with_offset = await (
|
||||
table_async.query()
|
||||
.nearest_to(pa.array([1, 2]))
|
||||
.offset(1)
|
||||
.limit(1)
|
||||
.explain_plan()
|
||||
)
|
||||
assert "KNN" in plan_with_offset
|
||||
assert "GlobalLimitExec: skip=1, fetch=1" in plan_with_offset
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan_with_filters(table_async: AsyncTable):
|
||||
"""Test explain plan for queries with filters"""
|
||||
# Test vector query with filter
|
||||
plan_with_filter = await (
|
||||
table_async.query().nearest_to(pa.array([1, 2])).where("id = 1").explain_plan()
|
||||
)
|
||||
assert "KNN" in plan_with_filter
|
||||
assert "LanceRead" in plan_with_filter
|
||||
|
||||
# Test FTS query with filter
|
||||
from lancedb.index import FTS
|
||||
|
||||
await table_async.create_index("text", config=FTS())
|
||||
query_fts_filter = await table_async.search(
|
||||
"dog", query_type="fts", fts_columns="text"
|
||||
)
|
||||
plan_fts_filter = await query_fts_filter.where("id = 1").explain_plan()
|
||||
assert "MatchQuery: query=dog" in plan_fts_filter
|
||||
assert "LanceRead" in plan_fts_filter
|
||||
assert "full_filter=id = Int64(1)" in plan_fts_filter # Should show filter details
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_query_camelcase_async(tmp_path):
|
||||
db = await lancedb.connect_async(tmp_path)
|
||||
@@ -1260,3 +1339,20 @@ async def test_query_timeout_async(tmp_path):
|
||||
.nearest_to([0.0, 0.0])
|
||||
.to_list(timeout=timedelta(0))
|
||||
)
|
||||
|
||||
|
||||
def test_search_empty_table(mem_db):
|
||||
"""Test searching on empty table should not crash
|
||||
|
||||
Regression test for issue #303:
|
||||
https://github.com/lancedb/lancedb/issues/303
|
||||
Searching on empty table produces scary error message
|
||||
"""
|
||||
schema = pa.schema(
|
||||
[pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("id", pa.int64())]
|
||||
)
|
||||
table = mem_db.create_table("test_empty_search", schema=schema)
|
||||
|
||||
# Search on empty table should return empty results, not crash
|
||||
results = table.search([1.0, 2.0]).limit(5).to_list()
|
||||
assert results == []
|
||||
|
||||
@@ -210,6 +210,25 @@ async def test_retry_error():
|
||||
assert cause.status_code == 429
|
||||
|
||||
|
||||
def test_table_unimplemented_functions():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/create/?mode=create":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(b"{}")
|
||||
else:
|
||||
request.send_response(404)
|
||||
request.end_headers()
|
||||
|
||||
with mock_lancedb_connection(handler) as db:
|
||||
table = db.create_table("test", [{"id": 1}])
|
||||
with pytest.raises(NotImplementedError):
|
||||
table.to_arrow()
|
||||
with pytest.raises(NotImplementedError):
|
||||
table.to_pandas()
|
||||
|
||||
|
||||
def test_table_add_in_threadpool():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/insert/":
|
||||
|
||||
@@ -499,3 +499,19 @@ def test_empty_result_reranker():
|
||||
.rerank(reranker)
|
||||
.to_arrow()
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
|
||||
pytest.importorskip("sentence_transformers")
|
||||
reranker = CrossEncoderReranker(return_score="all")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
query = "single player experience"
|
||||
result = (
|
||||
table.search(query, query_type="hybrid", vector_column_name="vector")
|
||||
.rerank(reranker=reranker)
|
||||
.to_arrow()
|
||||
)
|
||||
assert "_relevance_score" in result.column_names
|
||||
assert "_score" in result.column_names
|
||||
assert "_distance" in result.column_names
|
||||
|
||||
python/python/tests/test_session.py (new file, 38 additions)
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import lancedb
|
||||
|
||||
|
||||
def test_session_cache_configuration(tmp_path):
|
||||
"""Test Session cache configuration and basic functionality."""
|
||||
# Create session with small cache limits for testing
|
||||
index_cache_size = 1024 * 1024 # 1MB
|
||||
metadata_cache_size = 512 * 1024 # 512KB
|
||||
|
||||
session = lancedb.Session(
|
||||
index_cache_size_bytes=index_cache_size,
|
||||
metadata_cache_size_bytes=metadata_cache_size,
|
||||
)
|
||||
|
||||
# Record initial cache state
|
||||
initial_cache_size = session.size_bytes
|
||||
initial_cache_items = session.approx_num_items
|
||||
|
||||
# Test session works with database connection
|
||||
db = lancedb.connect(tmp_path, session=session)
|
||||
|
||||
# Create and use a table to exercise the session
|
||||
data = [{"id": i, "text": f"item {i}"} for i in range(100)]
|
||||
table = db.create_table("test", data)
|
||||
results = list(table.to_arrow().to_pylist())
|
||||
|
||||
assert len(results) == 100
|
||||
|
||||
# Verify cache usage increased after operations
|
||||
final_cache_size = session.size_bytes
|
||||
final_cache_items = session.approx_num_items
|
||||
|
||||
assert final_cache_size > initial_cache_size # Cache should have grown
|
||||
assert final_cache_items >= initial_cache_items # Items should not decrease
|
||||
assert initial_cache_size < index_cache_size + metadata_cache_size
|
||||
@@ -1804,3 +1804,45 @@ def test_stats(mem_db: DBConnection):
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def test_create_table_empty_list_with_schema(mem_db: DBConnection):
|
||||
"""Test creating table with empty list data and schema
|
||||
|
||||
Regression test for IndexError: list index out of range
|
||||
when calling create_table(name, data=[], schema=schema)
|
||||
"""
|
||||
schema = pa.schema(
|
||||
[pa.field("vector", pa.list_(pa.float32(), 2)), pa.field("id", pa.int64())]
|
||||
)
|
||||
table = mem_db.create_table("test_empty_list", data=[], schema=schema)
|
||||
assert table.count_rows() == 0
|
||||
assert table.schema == schema
|
||||
|
||||
|
||||
def test_create_table_empty_list_no_schema_error(mem_db: DBConnection):
|
||||
"""Test that creating table with empty list and no schema raises error"""
|
||||
with pytest.raises(
|
||||
ValueError, match="Cannot create table from empty list without a schema"
|
||||
):
|
||||
mem_db.create_table("test_empty_no_schema", data=[])
|
||||
|
||||
|
||||
def test_add_table_with_empty_embeddings(tmp_path):
|
||||
"""Test exact scenario from issue #1968
|
||||
|
||||
Regression test for issue #1968:
|
||||
https://github.com/lancedb/lancedb/issues/1968
|
||||
"""
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
class MySchema(LanceModel):
|
||||
text: str
|
||||
embedding: Vector(16)
|
||||
|
||||
table = db.create_table("test", schema=MySchema)
|
||||
table.add(
|
||||
[{"text": "bar", "embedding": [0.1] * 16}],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
assert table.count_rows() == 1
|
||||
|
||||
@@ -179,7 +179,7 @@ impl Connection {
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None))]
|
||||
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn connect(
|
||||
py: Python,
|
||||
@@ -190,6 +190,7 @@ pub fn connect(
|
||||
read_consistency_interval: Option<f64>,
|
||||
client_config: Option<PyClientConfig>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
session: Option<crate::session::Session>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
future_into_py(py, async move {
|
||||
let mut builder = lancedb::connect(&uri);
|
||||
@@ -213,6 +214,9 @@ pub fn connect(
|
||||
if let Some(client_config) = client_config {
|
||||
builder = builder.client_config(client_config.into());
|
||||
}
|
||||
if let Some(session) = session {
|
||||
builder = builder.session(session.inner.clone());
|
||||
}
|
||||
Ok(Connection::new(builder.execute().await.infer_error()?))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
.max_token_length(params.max_token_length)
|
||||
.remove_stop_words(params.remove_stop_words)
|
||||
.stem(params.stem)
|
||||
.ascii_folding(params.ascii_folding);
|
||||
.ascii_folding(params.ascii_folding)
|
||||
.ngram_min_length(params.ngram_min_length)
|
||||
.ngram_max_length(params.ngram_max_length)
|
||||
.ngram_prefix_only(params.prefix_only);
|
||||
Ok(LanceDbIndex::FTS(inner_opts))
|
||||
},
|
||||
"IvfFlat" => {
|
||||
@@ -130,6 +133,9 @@ struct FtsParams {
|
||||
stem: bool,
|
||||
remove_stop_words: bool,
|
||||
ascii_folding: bool,
|
||||
ngram_min_length: u32,
|
||||
ngram_max_length: u32,
|
||||
prefix_only: bool,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
|
||||
@@ -11,6 +11,7 @@ use pyo3::{
|
||||
wrap_pyfunction, Bound, PyResult, Python,
|
||||
};
|
||||
use query::{FTSQuery, HybridQuery, Query, VectorQuery};
|
||||
use session::Session;
|
||||
use table::{
|
||||
AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, MergeResult,
|
||||
Table, UpdateResult,
|
||||
@@ -21,6 +22,7 @@ pub mod connection;
|
||||
pub mod error;
|
||||
pub mod index;
|
||||
pub mod query;
|
||||
pub mod session;
|
||||
pub mod table;
|
||||
pub mod util;
|
||||
|
||||
@@ -31,6 +33,7 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
.write_style("LANCEDB_LOG_STYLE");
|
||||
env_logger::init_from_env(env);
|
||||
m.add_class::<Connection>()?;
|
||||
m.add_class::<Session>()?;
|
||||
m.add_class::<Table>()?;
|
||||
m.add_class::<IndexConfig>()?;
|
||||
m.add_class::<Query>()?;
|
||||
|
||||
@@ -52,7 +52,7 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
|
||||
let operator = ob.getattr("operator")?.extract::<String>()?;
|
||||
let prefix_length = ob.getattr("prefix_length")?.extract()?;
|
||||
|
||||
Ok(PyLanceDB(
|
||||
Ok(Self(
|
||||
MatchQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_boost(boost)
|
||||
@@ -70,7 +70,7 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
|
||||
let column = ob.getattr("column")?.extract()?;
|
||||
let slop = ob.getattr("slop")?.extract()?;
|
||||
|
||||
Ok(PyLanceDB(
|
||||
Ok(Self(
|
||||
PhraseQuery::new(query)
|
||||
.with_column(Some(column))
|
||||
.with_slop(slop)
|
||||
@@ -78,10 +78,10 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
|
||||
))
|
||||
}
|
||||
"BoostQuery" => {
|
||||
let positive: PyLanceDB<FtsQuery> = ob.getattr("positive")?.extract()?;
|
||||
let negative: PyLanceDB<FtsQuery> = ob.getattr("negative")?.extract()?;
|
||||
let positive: Self = ob.getattr("positive")?.extract()?;
|
||||
let negative: Self = ob.getattr("negative")?.extract()?;
|
||||
let negative_boost = ob.getattr("negative_boost")?.extract()?;
|
||||
Ok(PyLanceDB(
|
||||
Ok(Self(
|
||||
BoostQuery::new(positive.0, negative.0, negative_boost).into(),
|
||||
))
|
||||
}
|
||||
@@ -103,18 +103,17 @@ impl FromPyObject<'_> for PyLanceDB<FtsQuery> {
|
||||
let op = Operator::try_from(operator.as_str())
|
||||
.map_err(|e| PyValueError::new_err(format!("Invalid operator: {}", e)))?;
|
||||
|
||||
Ok(PyLanceDB(q.with_operator(op).into()))
|
||||
Ok(Self(q.with_operator(op).into()))
|
||||
}
|
||||
"BooleanQuery" => {
|
||||
let queries: Vec<(String, PyLanceDB<FtsQuery>)> =
|
||||
ob.getattr("queries")?.extract()?;
|
||||
let queries: Vec<(String, Self)> = ob.getattr("queries")?.extract()?;
|
||||
let mut sub_queries = Vec::with_capacity(queries.len());
|
||||
for (occur, q) in queries {
|
||||
let occur = Occur::try_from(occur.as_str())
|
||||
.map_err(|e| PyValueError::new_err(e.to_string()))?;
|
||||
sub_queries.push((occur, q.0));
|
||||
}
|
||||
Ok(PyLanceDB(BooleanQuery::new(sub_queries).into()))
|
||||
Ok(Self(BooleanQuery::new(sub_queries).into()))
|
||||
}
|
||||
name => Err(PyValueError::new_err(format!(
|
||||
"Unsupported FTS query type: {}",
|
||||
@@ -155,8 +154,8 @@ impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
|
||||
.call((query.terms, query.column.unwrap()), Some(&kwargs))
|
||||
}
|
||||
FtsQuery::Boost(query) => {
|
||||
let positive = PyLanceDB(query.positive.as_ref().clone()).into_pyobject(py)?;
|
||||
let negative = PyLanceDB(query.negative.as_ref().clone()).into_pyobject(py)?;
|
||||
let positive = Self(query.positive.as_ref().clone()).into_pyobject(py)?;
|
||||
let negative = Self(query.negative.as_ref().clone()).into_pyobject(py)?;
|
||||
let kwargs = PyDict::new(py);
|
||||
kwargs.set_item("negative_boost", query.negative_boost)?;
|
||||
namespace
|
||||
@@ -182,13 +181,13 @@ impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
|
||||
query.should.len() + query.must.len() + query.must_not.len(),
|
||||
);
|
||||
for q in query.should {
|
||||
queries.push((Occur::Should.into(), PyLanceDB(q).into_pyobject(py)?));
|
||||
queries.push((Occur::Should.into(), Self(q).into_pyobject(py)?));
|
||||
}
|
||||
for q in query.must {
|
||||
queries.push((Occur::Must.into(), PyLanceDB(q).into_pyobject(py)?));
|
||||
queries.push((Occur::Must.into(), Self(q).into_pyobject(py)?));
|
||||
}
|
||||
for q in query.must_not {
|
||||
queries.push((Occur::MustNot.into(), PyLanceDB(q).into_pyobject(py)?));
|
||||
queries.push((Occur::MustNot.into(), Self(q).into_pyobject(py)?));
|
||||
}
|
||||
|
||||
namespace
|
||||
@@ -563,7 +562,10 @@ impl FTSQuery {
|
||||
}
|
||||
|
||||
pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
let inner = self_
|
||||
.inner
|
||||
.clone()
|
||||
.full_text_search(self_.fts_query.clone());
|
||||
future_into_py(self_.py(), async move {
|
||||
inner
|
||||
.explain_plan(verbose)
|
||||
@@ -573,7 +575,10 @@ impl FTSQuery {
|
||||
}
|
||||
|
||||
pub fn analyze_plan(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
let inner = self_
|
||||
.inner
|
||||
.clone()
|
||||
.full_text_search(self_.fts_query.clone());
|
||||
future_into_py(self_.py(), async move {
|
||||
inner
|
||||
.analyze_plan()
|
||||
|
||||
python/src/session.rs (new file, 107 additions)
@@ -0,0 +1,107 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use lancedb::{ObjectStoreRegistry, Session as LanceSession};
|
||||
use pyo3::{pyclass, pymethods, PyResult};
|
||||
|
||||
/// A session for managing caches and object stores across LanceDB operations.
|
||||
///
|
||||
/// Sessions allow you to configure cache sizes for index and metadata caches,
|
||||
/// which can significantly impact memory use and performance. They can
|
||||
/// also be re-used across multiple connections to share the same cache state.
|
||||
#[pyclass]
|
||||
#[derive(Clone)]
|
||||
pub struct Session {
|
||||
pub(crate) inner: Arc<LanceSession>,
|
||||
}
|
||||
|
||||
impl Default for Session {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: Arc::new(LanceSession::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Session {
|
||||
/// Create a new session with custom cache sizes.
|
||||
///
|
||||
/// Parameters
|
||||
/// ----------
|
||||
/// index_cache_size_bytes : int, optional
|
||||
/// The size of the index cache in bytes.
|
||||
/// Index data is stored in memory in this cache to speed up queries.
|
||||
/// Default: 6GB (6 * 1024 * 1024 * 1024 bytes)
|
||||
/// metadata_cache_size_bytes : int, optional
|
||||
/// The size of the metadata cache in bytes.
|
||||
/// The metadata cache stores file metadata and schema information in memory.
|
||||
/// This cache improves scan and write performance.
|
||||
/// Default: 1GB (1024 * 1024 * 1024 bytes)
|
||||
#[new]
|
||||
#[pyo3(signature = (index_cache_size_bytes=None, metadata_cache_size_bytes=None))]
|
||||
pub fn new(
|
||||
index_cache_size_bytes: Option<usize>,
|
||||
metadata_cache_size_bytes: Option<usize>,
|
||||
) -> PyResult<Self> {
|
||||
let index_cache_size = index_cache_size_bytes.unwrap_or(6 * 1024 * 1024 * 1024); // 6GB default
|
||||
let metadata_cache_size = metadata_cache_size_bytes.unwrap_or(1024 * 1024 * 1024); // 1GB default
|
||||
|
||||
let session = LanceSession::new(
|
||||
index_cache_size,
|
||||
metadata_cache_size,
|
||||
Arc::new(ObjectStoreRegistry::default()),
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
inner: Arc::new(session),
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a session with default cache sizes.
|
||||
///
|
||||
/// This is equivalent to creating a session with 6GB index cache
|
||||
/// and 1GB metadata cache.
|
||||
///
|
||||
/// Returns
|
||||
/// -------
|
||||
/// Session
|
||||
/// A new Session with default cache sizes
|
||||
#[staticmethod]
|
||||
#[allow(clippy::should_implement_trait)]
|
||||
pub fn default() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
|
||||
/// Get the current size of the session caches in bytes.
|
||||
///
|
||||
/// Returns
|
||||
/// -------
|
||||
/// int
|
||||
/// The total size of all caches in the session
|
||||
#[getter]
|
||||
pub fn size_bytes(&self) -> u64 {
|
||||
self.inner.size_bytes()
|
||||
}
|
||||
|
||||
/// Get the approximate number of items cached in the session.
|
||||
///
|
||||
/// Returns
|
||||
/// -------
|
||||
/// int
|
||||
/// The number of cached items across all caches
|
||||
#[getter]
|
||||
pub fn approx_num_items(&self) -> usize {
|
||||
self.inner.approx_num_items()
|
||||
}
|
||||
|
||||
fn __repr__(&self) -> String {
|
||||
format!(
|
||||
"Session(size_bytes={}, approx_num_items={})",
|
||||
self.size_bytes(),
|
||||
self.approx_num_items()
|
||||
)
|
||||
}
|
||||
}
|
||||
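On the Python side this maps to the `lancedb.Session` class used in `test_session.py` above; a sketch of sharing one session (and therefore one cache) across two connections, with illustrative sizes:

import lancedb

session = lancedb.Session(
    index_cache_size_bytes=2 * 1024 * 1024 * 1024,  # 2 GB
    metadata_cache_size_bytes=256 * 1024 * 1024,    # 256 MB
)

# Both connections reuse the same index/metadata caches.
db_a = lancedb.connect("/tmp/db_a", session=session)
db_b = lancedb.connect("/tmp/db_b", session=session)

db_a.create_table("t", [{"id": i} for i in range(10)])
print(session.size_bytes, session.approx_num_items)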
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.21.0-beta.0"
+version = "0.21.2-beta.1"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.21.0-beta.0"
+version = "0.21.2-beta.1"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -105,7 +105,7 @@ impl ListingCatalog {
|
||||
}
|
||||
|
||||
async fn open_path(path: &str) -> Result<Self> {
|
||||
let (object_store, base_path) = ObjectStore::from_uri(path).await.unwrap();
|
||||
let (object_store, base_path) = ObjectStore::from_uri(path).await?;
|
||||
if object_store.is_local() {
|
||||
Self::try_create_dir(path).context(CreateDirSnafu { path })?;
|
||||
}
|
||||
@@ -216,6 +216,7 @@ impl Catalog for ListingCatalog {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: Default::default(),
|
||||
session: None,
|
||||
};
|
||||
|
||||
// Add the db options to the connect request
|
||||
@@ -243,6 +244,7 @@ impl Catalog for ListingCatalog {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: Default::default(),
|
||||
session: None,
|
||||
};
|
||||
|
||||
// Add the db options to the connect request
|
||||
@@ -312,6 +314,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let catalog = ListingCatalog::connect(&request).await.unwrap();
|
||||
@@ -573,6 +576,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let catalog = ListingCatalog::connect(&request).await.unwrap();
|
||||
@@ -592,6 +596,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let catalog = ListingCatalog::connect(&request).await.unwrap();
|
||||
@@ -608,6 +613,7 @@ mod tests {
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let result = ListingCatalog::connect(&request).await;
|
||||
|
||||
@@ -627,6 +627,12 @@ pub struct ConnectRequest {
|
||||
/// consistency only applies to read operations. Write operations are
|
||||
/// always consistent.
|
||||
pub read_consistency_interval: Option<std::time::Duration>,
|
||||
|
||||
/// Optional session for object stores and caching
|
||||
///
|
||||
/// If provided, this session will be used instead of creating a default one.
|
||||
/// This allows for custom configuration of object store registries, caching, etc.
|
||||
pub session: Option<Arc<lance::session::Session>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -645,6 +651,7 @@ impl ConnectBuilder {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: HashMap::new(),
|
||||
session: None,
|
||||
},
|
||||
embedding_registry: None,
|
||||
}
|
||||
@@ -802,6 +809,20 @@ impl ConnectBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Set a custom session for object stores and caching.
|
||||
///
|
||||
/// By default, a new session with default configuration will be created.
|
||||
/// This method allows you to provide a custom session with your own
|
||||
/// configuration for object store registries, caching, etc.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `session` - A custom session to use for this connection
|
||||
pub fn session(mut self, session: Arc<lance::session::Session>) -> Self {
|
||||
self.request.session = Some(session);
|
||||
self
|
||||
}
|
||||
|
||||
#[cfg(feature = "remote")]
|
||||
fn execute_remote(self) -> Result<Connection> {
|
||||
use crate::remote::db::RemoteDatabaseOptions;
|
||||
@@ -884,6 +905,7 @@ impl CatalogConnectBuilder {
|
||||
client_config: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
options: HashMap::new(),
|
||||
session: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::path::Path;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use lance::dataset::{ReadParams, WriteMode};
|
||||
use lance::io::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore};
|
||||
use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore};
|
||||
use lance_datafusion::utils::StreamingWriteSource;
|
||||
use lance_encoding::version::LanceFileVersion;
|
||||
use lance_table::io::commit::commit_handler_from_url;
|
||||
@@ -217,6 +217,9 @@ pub struct ListingDatabase {
|
||||
|
||||
// Options for tables created by this connection
|
||||
new_table_config: NewTableConfig,
|
||||
|
||||
// Session for object stores and caching
|
||||
session: Arc<lance::session::Session>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ListingDatabase {
|
||||
@@ -262,6 +265,7 @@ impl ListingDatabase {
|
||||
uri,
|
||||
request.read_consistency_interval,
|
||||
options.new_table_config,
|
||||
request.session.clone(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -313,13 +317,20 @@ impl ListingDatabase {
 
                 let plain_uri = url.to_string();
 
-                let registry = Arc::new(ObjectStoreRegistry::default());
+                let session = request
+                    .session
+                    .clone()
+                    .unwrap_or_else(|| Arc::new(lance::session::Session::default()));
                 let os_params = ObjectStoreParams {
                     storage_options: Some(options.storage_options.clone()),
                     ..Default::default()
                 };
-                let (object_store, base_path) =
-                    ObjectStore::from_uri_and_params(registry, &plain_uri, &os_params).await?;
+                let (object_store, base_path) = ObjectStore::from_uri_and_params(
+                    session.store_registry(),
+                    &plain_uri,
+                    &os_params,
+                )
+                .await?;
                 if object_store.is_local() {
                     Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
                 }
@@ -342,6 +353,7 @@ impl ListingDatabase {
                     read_consistency_interval: request.read_consistency_interval,
                     storage_options: options.storage_options,
                     new_table_config: options.new_table_config,
+                    session,
                 })
             }
             Err(_) => {
@@ -349,6 +361,7 @@ impl ListingDatabase {
                     uri,
                     request.read_consistency_interval,
                     options.new_table_config,
+                    request.session.clone(),
                 )
                 .await
             }
@@ -359,8 +372,15 @@ impl ListingDatabase {
         path: &str,
         read_consistency_interval: Option<std::time::Duration>,
         new_table_config: NewTableConfig,
+        session: Option<Arc<lance::session::Session>>,
     ) -> Result<Self> {
-        let (object_store, base_path) = ObjectStore::from_uri(path).await?;
+        let session = session.unwrap_or_else(|| Arc::new(lance::session::Session::default()));
+        let (object_store, base_path) = ObjectStore::from_uri_and_params(
+            session.store_registry(),
+            path,
+            &ObjectStoreParams::default(),
+        )
+        .await?;
         if object_store.is_local() {
             Self::try_create_dir(path).context(CreateDirSnafu { path })?;
         }
@@ -374,6 +394,7 @@ impl ListingDatabase {
             read_consistency_interval,
             storage_options: HashMap::new(),
             new_table_config,
+            session,
         })
     }
 
@@ -441,6 +462,128 @@ impl ListingDatabase {
         }
         Ok(())
     }
+
+    /// Inherit storage options from the connection into the target map
+    fn inherit_storage_options(&self, target: &mut HashMap<String, String>) {
+        for (key, value) in self.storage_options.iter() {
+            if !target.contains_key(key) {
+                target.insert(key.clone(), value.clone());
+            }
+        }
+    }
+
+    /// Extract storage option overrides from the request
+    fn extract_storage_overrides(
+        &self,
+        request: &CreateTableRequest,
+    ) -> Result<(Option<LanceFileVersion>, Option<bool>)> {
+        let storage_options = request
+            .write_options
+            .lance_write_params
+            .as_ref()
+            .and_then(|p| p.store_params.as_ref())
+            .and_then(|sp| sp.storage_options.as_ref());
+
+        let storage_version_override = storage_options
+            .and_then(|opts| opts.get(OPT_NEW_TABLE_STORAGE_VERSION))
+            .map(|s| s.parse::<LanceFileVersion>())
+            .transpose()?;
+
+        let v2_manifest_override = storage_options
+            .and_then(|opts| opts.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS))
+            .map(|s| s.parse::<bool>())
+            .transpose()
+            .map_err(|_| Error::InvalidInput {
+                message: "enable_v2_manifest_paths must be a boolean".to_string(),
+            })?;
+
+        Ok((storage_version_override, v2_manifest_override))
+    }
+
+    /// Prepare write parameters for table creation
+    fn prepare_write_params(
+        &self,
+        request: &CreateTableRequest,
+        storage_version_override: Option<LanceFileVersion>,
+        v2_manifest_override: Option<bool>,
+    ) -> lance::dataset::WriteParams {
+        let mut write_params = request
+            .write_options
+            .lance_write_params
+            .clone()
+            .unwrap_or_default();
+
+        // Only modify the storage options if we actually have something to
+        // inherit. There is a difference between storage_options=None and
+        // storage_options=Some({}). Using storage_options=None will cause the
+        // connection's session store registry to be used. Supplying Some({})
+        // will cause a new connection to be created, and that connection will
+        // be dropped from the cache when python GCs the table object, which
+        // confounds reuse across tables.
+        if !self.storage_options.is_empty() {
+            let storage_options = write_params
+                .store_params
+                .get_or_insert_with(Default::default)
+                .storage_options
+                .get_or_insert_with(Default::default);
+            self.inherit_storage_options(storage_options);
+        }
+
+        write_params.data_storage_version = self
+            .new_table_config
+            .data_storage_version
+            .or(storage_version_override);
+
+        if let Some(enable_v2_manifest_paths) = self
+            .new_table_config
+            .enable_v2_manifest_paths
+            .or(v2_manifest_override)
+        {
+            write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
+        }
+
+        if matches!(&request.mode, CreateTableMode::Overwrite) {
+            write_params.mode = WriteMode::Overwrite;
+        }
+
+        write_params.session = Some(self.session.clone());
+
+        write_params
+    }
+
+    /// Handle the case where table already exists based on the create mode
+    async fn handle_table_exists(
+        &self,
+        table_name: &str,
+        mode: CreateTableMode,
+        data_schema: &arrow_schema::Schema,
+    ) -> Result<Arc<dyn BaseTable>> {
+        match mode {
+            CreateTableMode::Create => Err(Error::TableAlreadyExists {
+                name: table_name.to_string(),
+            }),
+            CreateTableMode::ExistOk(callback) => {
+                let req = OpenTableRequest {
+                    name: table_name.to_string(),
+                    index_cache_size: None,
+                    lance_read_params: None,
+                };
+                let req = (callback)(req);
+                let table = self.open_table(req).await?;
+
+                let table_schema = table.schema().await?;
+
+                if table_schema.as_ref() != data_schema {
+                    return Err(Error::Schema {
+                        message: "Provided schema does not match existing table schema".to_string(),
+                    });
+                }
+
+                Ok(table)
+            }
+            CreateTableMode::Overwrite => unreachable!(),
+        }
+    }
 }
 
 #[async_trait::async_trait]
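The override parsing in `extract_storage_overrides` above relies on `Option::transpose` to turn `Option<Result<_, _>>` into `Result<Option<_>, _>`, so a missing key is simply `None` while a malformed value becomes an error. A self-contained sketch of the same pattern (the function name and error type are illustrative, not from this diff):

    use std::collections::HashMap;

    // Parse an optional boolean flag out of a string map.
    fn parse_bool_option(opts: &HashMap<String, String>, key: &str) -> Result<Option<bool>, String> {
        opts.get(key)
            .map(|s| s.parse::<bool>()) // Option<Result<bool, ParseBoolError>>
            .transpose()                // Result<Option<bool>, ParseBoolError>
            .map_err(|_| format!("{key} must be a boolean"))
    }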
@@ -475,50 +618,14 @@ impl Database for ListingDatabase {
         Ok(f)
     }
 
-    async fn create_table(&self, mut request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
+    async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
         let table_uri = self.table_uri(&request.name)?;
-        // Inherit storage options from the connection
-        let storage_options = request
-            .write_options
-            .lance_write_params
-            .get_or_insert_with(Default::default)
-            .store_params
-            .get_or_insert_with(Default::default)
-            .storage_options
-            .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
-        }
 
-        let storage_options = storage_options.clone();
+        let (storage_version_override, v2_manifest_override) =
+            self.extract_storage_overrides(&request)?;
 
-        let mut write_params = request.write_options.lance_write_params.unwrap_or_default();
-
-        if let Some(storage_version) = &self.new_table_config.data_storage_version {
-            write_params.data_storage_version = Some(*storage_version);
-        } else {
-            // Allow the user to override the storage version via storage options (backwards compatibility)
-            if let Some(data_storage_version) = storage_options.get(OPT_NEW_TABLE_STORAGE_VERSION) {
-                write_params.data_storage_version = Some(data_storage_version.parse()?);
-            }
-        }
-        if let Some(enable_v2_manifest_paths) = self.new_table_config.enable_v2_manifest_paths {
-            write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
-        } else {
-            // Allow the user to override the storage version via storage options (backwards compatibility)
-            if let Some(enable_v2_manifest_paths) = storage_options
-                .get(OPT_NEW_TABLE_V2_MANIFEST_PATHS)
-                .map(|s| s.parse::<bool>().unwrap())
-            {
-                write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
-            }
-        }
-
-        if matches!(&request.mode, CreateTableMode::Overwrite) {
-            write_params.mode = WriteMode::Overwrite;
-        }
+        let write_params =
+            self.prepare_write_params(&request, storage_version_override, v2_manifest_override);
 
         let data_schema = request.data.arrow_schema();
 
@@ -533,30 +640,10 @@ impl Database for ListingDatabase {
         .await
         {
             Ok(table) => Ok(Arc::new(table)),
-            Err(Error::TableAlreadyExists { name }) => match request.mode {
-                CreateTableMode::Create => Err(Error::TableAlreadyExists { name }),
-                CreateTableMode::ExistOk(callback) => {
-                    let req = OpenTableRequest {
-                        name: request.name.clone(),
-                        index_cache_size: None,
-                        lance_read_params: None,
-                    };
-                    let req = (callback)(req);
-                    let table = self.open_table(req).await?;
-
-                    let table_schema = table.schema().await?;
-
-                    if table_schema != data_schema {
-                        return Err(Error::Schema {
-                            message: "Provided schema does not match existing table schema"
-                                .to_string(),
-                        });
-                    }
-
-                    Ok(table)
-                }
-                CreateTableMode::Overwrite => unreachable!(),
-            },
+            Err(Error::TableAlreadyExists { .. }) => {
+                self.handle_table_exists(&request.name, request.mode, &data_schema)
+                    .await
+            }
             Err(err) => Err(err),
         }
     }
@@ -564,18 +651,22 @@ impl Database for ListingDatabase {
     async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
         let table_uri = self.table_uri(&request.name)?;
 
-        // Inherit storage options from the connection
-        let storage_options = request
-            .lance_read_params
-            .get_or_insert_with(Default::default)
-            .store_options
-            .get_or_insert_with(Default::default)
-            .storage_options
-            .get_or_insert_with(Default::default);
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
+        // Only modify the storage options if we actually have something to
+        // inherit. There is a difference between storage_options=None and
+        // storage_options=Some({}). Using storage_options=None will cause the
+        // connection's session store registry to be used. Supplying Some({})
+        // will cause a new connection to be created, and that connection will
+        // be dropped from the cache when python GCs the table object, which
+        // confounds reuse across tables.
+        if !self.storage_options.is_empty() {
+            let storage_options = request
+                .lance_read_params
+                .get_or_insert_with(Default::default)
+                .store_options
+                .get_or_insert_with(Default::default)
+                .storage_options
+                .get_or_insert_with(Default::default);
+            self.inherit_storage_options(storage_options);
         }
 
         // Some ReadParams are exposed in the OpenTableBuilder, but we also
@@ -584,13 +675,15 @@ impl Database for ListingDatabase {
         // If we have a user provided ReadParams use that
         // If we don't then start with the default ReadParams and customize it with
         // the options from the OpenTableBuilder
-        let read_params = request.lance_read_params.unwrap_or_else(|| {
+        let mut read_params = request.lance_read_params.unwrap_or_else(|| {
             let mut default_params = ReadParams::default();
             if let Some(index_cache_size) = request.index_cache_size {
-                default_params.index_cache_size = index_cache_size as usize;
+                #[allow(deprecated)]
+                default_params.index_cache_size(index_cache_size as usize);
             }
             default_params
         });
+        read_params.session(self.session.clone());
 
         let native_table = Arc::new(
             NativeTable::open_with_params(
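The inheritance block in `open_table` above initializes each nested `Option` layer lazily with `get_or_insert_with`, and only when the connection actually has options to merge, so an untouched `None` keeps the table on the connection's shared session. A standalone sketch of that pattern (the structs here are illustrative stand-ins, not the real Lance parameter types):

    use std::collections::HashMap;

    #[derive(Default)]
    struct StoreOptions {
        storage_options: Option<HashMap<String, String>>,
    }

    #[derive(Default)]
    struct ReadOptions {
        store_options: Option<StoreOptions>,
    }

    // Merge connection-level defaults into per-request options without
    // overwriting keys the request already set, and without turning
    // None ("use the shared defaults") into Some({}) (a private, empty config).
    fn inherit_defaults(opts: &mut Option<ReadOptions>, defaults: &HashMap<String, String>) {
        if defaults.is_empty() {
            return;
        }
        let target = opts
            .get_or_insert_with(Default::default)
            .store_options
            .get_or_insert_with(Default::default)
            .storage_options
            .get_or_insert_with(Default::default);
        for (key, value) in defaults {
            if !target.contains_key(key) {
                target.insert(key.clone(), value.clone());
            }
        }
    }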
@@ -107,7 +107,7 @@ impl ObjectStore for MirroringObjectStore {
         self.primary.delete(location).await
     }
 
-    fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result<ObjectMeta>> {
+    fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
         self.primary.list(prefix)
     }
 
Some files were not shown because too many files have changed in this diff.