Compare commits

...

15 Commits

Author SHA1 Message Date
Lance Release
aeb1c3ee6a Bump version: 0.30.0-beta.2 → 0.30.0-beta.3 2026-02-28 01:29:53 +00:00
Weston Pace
f9ae46c0e7 feat: upgrade lance to 3.0.0-rc.2 and add bindings for fast_search (#3083) 2026-02-27 17:27:01 -08:00
Will Jones
84bf022fb1 fix(python): pin pylance to make datafusion table provider match version (#3080) 2026-02-27 13:34:05 -08:00
Will Jones
310967eceb ci(rust): fix linux job (#3076) 2026-02-26 19:25:46 -08:00
Jack Ye
154dbeee2a chore: fix clippy for PreprocessingOutput without remote feature (#3070)
Fix clippy:

```
error: fields `overwrite` and `rescannable` are never read
Error:    --> /home/runner/work/xxxx/xxxx/src/lancedb/rust/lancedb/src/table/add_data.rs:158:9
    |
156 | pub struct PreprocessingOutput {
    |            ------------------- fields in this struct
157 |     pub plan: Arc<dyn datafusion_physical_plan::ExecutionPlan>,
158 |     pub overwrite: bool,
    |         ^^^^^^^^^
159 |     pub rescannable: bool,
    |         ^^^^^^^^^^^
    |
    = note: `-D dead-code` implied by `-D warnings`
    = help: to override `-D warnings` add `#[allow(dead_code)]`
```
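One conventional fix, per the clippy help above, is to allow `dead_code` on the unread fields; this is a sketch of that approach, not necessarily the patch this commit applied:

```rust
use std::sync::Arc;

pub struct PreprocessingOutput {
    pub plan: Arc<dyn datafusion_physical_plan::ExecutionPlan>,
    // These fields are only read with the `remote` feature enabled, so
    // allow dead_code for other builds. (Sketch; not the commit's patch.)
    #[allow(dead_code)]
    pub overwrite: bool,
    #[allow(dead_code)]
    pub rescannable: bool,
}
```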
2026-02-25 14:59:32 -08:00
Lance Release
c9c08ac8b9 Bump version: 0.27.0-beta.1 → 0.27.0-beta.2 2026-02-25 07:47:54 +00:00
Lance Release
e253f5d9b6 Bump version: 0.30.0-beta.1 → 0.30.0-beta.2 2026-02-25 07:46:06 +00:00
LanceDB Robot
05b4fb0990 chore: update lance dependency to v3.1.0-beta.2 (#3068)
## Summary
- Bump Lance Rust workspace dependencies to `v3.1.0-beta.2` via
`ci/set_lance_version.py`.
- Update Java `lance-core.version` to `3.1.0-beta.2`.

## Verification
- `cargo clippy --workspace --tests --all-features -- -D warnings`
- `cargo fmt --all`

## Release Reference
- refs/tags/v3.1.0-beta.2
2026-02-24 23:02:22 -08:00
Mesut-Doner
613b9c1099 feat(rust): add expression builder API for type-safe query filters (#3032)
## Summary

Adds a Rust expression builder API as a type-safe alternative to SQL
strings for query filters.

## Motivation

Filtering with raw SQL strings can be awkward when using variables and
special types:


Closes #3038

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2026-02-24 18:44:03 -08:00
Will Jones
d5948576b9 feat: parallel inserts for local tables (#3062)
When input data is sufficiently large, we automatically split it up into
parallel writes using a round-robin exchange operator. We sample the
first batch to estimate data width, targeting partitions of 1 million rows
or 2 GB, whichever is smaller.
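
The heuristic lands as `estimate_write_partitions` in the `rust/lancedb` diff further down; a worked sketch of the arithmetic, with illustrative numbers:

```rust
fn main() {
    // Per-row size comes from a sample batch; the partition count is the
    // larger of the row-based and byte-based targets, capped at parallelism.
    let sample_bytes = 256_000_000usize; // sample batch: 256 MB
    let sample_rows = 500_000usize;      // -> 512 bytes per row
    let total_rows = 10_000_000usize;    // known total row count

    let bytes_per_row = sample_bytes / sample_rows;
    let total_bytes = total_rows * bytes_per_row; // ~5.12 GB
    let by_rows = total_rows.div_ceil(1_000_000); // 10
    let by_bytes = total_bytes.div_ceil(2 * 1024 * 1024 * 1024); // 3
    let partitions = by_rows.max(by_bytes).max(1).min(8); // cap: 8 cores
    assert_eq!(partitions, 8);
}
```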
2026-02-24 12:26:51 -08:00
Will Jones
0d3fc7860a ci: fix python DataFusion test (#3060) 2026-02-24 07:59:12 -08:00
Weston Pace
531cec075c fix: don't expect all offsets to fit in one batch in permutation reader (#3065)
This would cause takes against large permutations to fail
2026-02-24 06:32:54 -08:00
Will Jones
0e486511fa feat: hook up new writer for insert (#3029)
This hooks up a new writer implementation for the `add()` method. The
main immediate benefit is that it allows streaming requests to remote
tables while still allowing retries for most inputs.

In NodeJS, we always convert the data to `Vec<RecordBatch>`, so it's
always retry-able.

For Python, all inputs are retry-able except `Iterator` and
`pa.RecordBatchReader`, which can only be consumed once. Some, like
`pa.dataset.Dataset`, are retry-able *and* streaming.
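
In the Rust core, retry-ability maps to the `Scannable::rescannable` flag (see the `data/scannable.rs` diff below). A hedged sketch of how a retrying writer can branch on it; `write_once` and `is_retryable` are illustrative stand-ins, not this PR's API:

```rust
use lancedb::arrow::SendableRecordBatchStream;
use lancedb::data::scannable::Scannable; // trait shown in the diff below
use lancedb::{Error, Result};

// Illustrative stubs, not this PR's API.
async fn write_once(_stream: SendableRecordBatchStream) -> Result<()> { Ok(()) }
fn is_retryable(_e: &Error) -> bool { true }

async fn add_with_retry(mut src: Box<dyn Scannable>) -> Result<()> {
    loop {
        match write_once(src.scan_as_stream()).await {
            Ok(()) => return Ok(()),
            // A retry needs a fresh stream, so only rescannable sources retry.
            Err(e) if is_retryable(&e) && src.rescannable() => continue,
            Err(e) => return Err(e),
        }
    }
}
```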

A lot of the changes here are to make the new DataFusion write pipeline
maintain the same behavior as the existing Python-based preprocessing,
such as:

* casting input data to target schema
* rejecting NaN values if `on_bad_vectors="error"`
* applying embedding functions.

In future PRs, we'll enhance these by moving the embedding calls into
DataFusion and making sure we parallelize them. See:
https://github.com/lancedb/lancedb/issues/3048

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 14:43:31 -08:00
Will Jones
367262662d feat(nodejs): upgrade napi-rs from v2 to v3 (#3057)
## Summary

- Upgrades `@napi-rs/cli` from v2 to v3 and the `napi`/`napi-derive` Rust
crates to 3.x
- Fixes a bug
([napi-rs#1170](https://github.com/napi-rs/napi-rs/issues/1170)) where
the CLI fails to locate the built `.node` binary when a custom Cargo
target directory is set (via `config.toml`)

## Changes

**package.json / CLI**:
- `napi.name` → `napi.binaryName`, `napi.triples` → `napi.targets`
- Removed `--no-const-enum` flag and fixed output dir arg
- `napi universal` → `napi universalize`

**Rust API migration**:
- `#[napi::module_init]` → `#[napi_derive::module_init]`
- `napi::JsObject` → `Object`, `.get::<_, T>()` → `.get::<T>()`
- `ErrorStrategy` removed; `ThreadsafeFunction` now takes an explicit
`Return` type with a `CalleeHandled = false` const generic (see the sketch
after this list)
- `JsFunction` + `create_threadsafe_function` replaced by typed
`Function<Args, Return>` + `build_threadsafe_function().build()`
- `RerankerCallbacks` struct removed (`Function<'env,...>` can't be
stored in structs); `VectorQuery::rerank` now accepts the function
directly
- `ClassInstance::clone()` now returns `ClassInstance`; fixed with an
explicit deref
- `Vec<u8>` in `#[napi(object)]` now maps to `Array<number>` in v3;
changed to `Buffer` to preserve the TypeScript `Buffer` type
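
Condensed from the header-provider diff below, the core of the `ThreadsafeFunction` migration (the v2 form is shown in comments):

```rust
use std::collections::HashMap;
use napi::bindgen_prelude::{Function, Promise, Result, Status};
use napi::threadsafe_function::ThreadsafeFunction;

// v2: ThreadsafeFunction<(), ErrorStrategy::CalleeHandled>, built with
//     cb.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))
// v3: the Return type and CalleeHandled are explicit const generics.
type GetHeadersFn =
    ThreadsafeFunction<(), Promise<HashMap<String, String>>, (), Status, false>;

fn make_tsfn(cb: Function<(), Promise<HashMap<String, String>>>) -> Result<GetHeadersFn> {
    // v2 built the tsfn via create_threadsafe_function and invoked it with
    // call_async(Ok(())); v3 builds it here and invokes with call_async(()).
    cb.build_threadsafe_function().build()
}
```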

**TypeScript**:
- `inner.rerank({ rerankHybrid: async (_, args) => ... })` →
`inner.rerank(async (args) => ...)`
- Header provider callback wrapped in `async` to match stricter typed
constructor signature

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 14:42:55 -08:00
Lance Release
11efaf46ae Bump version: 0.27.0-beta.0 → 0.27.0-beta.1 2026-02-23 18:34:48 +00:00
59 changed files with 5653 additions and 1051 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.27.0-beta.0"
current_version = "0.27.0-beta.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -78,8 +78,11 @@ jobs:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
name: Setup Node.js 20 for build
with:
node-version: ${{ matrix.node-version }}
# @napi-rs/cli v3 requires Node >= 20.12 (via @inquirer/prompts@8).
# Build always on Node 20; tests run on the matrix version below.
node-version: 20
cache: 'npm'
cache-dependency-path: nodejs/package-lock.json
- uses: Swatinem/rust-cache@v2
@@ -87,12 +90,16 @@ jobs:
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
npm install -g @napi-rs/cli
- name: Build
run: |
npm ci --include=optional
npm run build:debug -- --profile ci
npm run tsc
- uses: actions/setup-node@v3
name: Setup Node.js ${{ matrix.node-version }} for test
with:
node-version: ${{ matrix.node-version }}
- name: Compile TypeScript
run: npm run tsc
- name: Setup localstack
working-directory: .
run: docker compose up --detach --wait
@@ -145,7 +152,6 @@ jobs:
- name: Install dependencies
run: |
brew install protobuf
npm install -g @napi-rs/cli
- name: Build
run: |
npm ci --include=optional

View File

@@ -128,16 +128,13 @@ jobs:
- target: x86_64-unknown-linux-musl
# This one seems to need some extra memory
host: ubuntu-2404-8x-x64
# https://github.com/napi-rs/napi-rs/blob/main/alpine.Dockerfile
docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine
features: fp16kernels
pre_build: |-
set -e &&
apk add protobuf-dev curl &&
ln -s /usr/lib/gcc/x86_64-alpine-linux-musl/14.2.0/crtbeginS.o /usr/lib/crtbeginS.o &&
ln -s /usr/lib/libgcc_s.so /usr/lib/libgcc.so &&
CC=gcc &&
CXX=g++
sudo apt-get update &&
sudo apt-get install -y protobuf-compiler pkg-config &&
rustup target add x86_64-unknown-linux-musl &&
export EXTRA_ARGS="-x"
- target: aarch64-unknown-linux-gnu
host: ubuntu-2404-8x-x64
# https://github.com/napi-rs/napi-rs/blob/main/debian-aarch64.Dockerfile
@@ -153,15 +150,13 @@ jobs:
rustup target add aarch64-unknown-linux-gnu
- target: aarch64-unknown-linux-musl
host: ubuntu-2404-8x-x64
# https://github.com/napi-rs/napi-rs/blob/main/alpine.Dockerfile
docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine
features: ","
pre_build: |-
set -e &&
apk add protobuf-dev &&
sudo apt-get update &&
sudo apt-get install -y protobuf-compiler &&
rustup target add aarch64-unknown-linux-musl &&
export CC_aarch64_unknown_linux_musl=aarch64-linux-musl-gcc &&
export CXX_aarch64_unknown_linux_musl=aarch64-linux-musl-g++
export EXTRA_ARGS="-x"
name: build - ${{ matrix.settings.target }}
runs-on: ${{ matrix.settings.host }}
defaults:
@@ -192,12 +187,18 @@ jobs:
.cargo-cache
target/
key: nodejs-${{ matrix.settings.target }}-cargo-${{ matrix.settings.host }}
- name: Setup toolchain
run: ${{ matrix.settings.setup }}
if: ${{ matrix.settings.setup }}
shell: bash
- name: Install dependencies
run: npm ci
- name: Install Zig
uses: mlugg/setup-zig@v2
if: ${{ contains(matrix.settings.target, 'musl') }}
with:
version: 0.14.1
- name: Install cargo-zigbuild
uses: taiki-e/install-action@v2
if: ${{ contains(matrix.settings.target, 'musl') }}
with:
tool: cargo-zigbuild
- name: Build in docker
uses: addnab/docker-run-action@v3
if: ${{ matrix.settings.docker }}
@@ -210,24 +211,24 @@ jobs:
run: |
set -e
${{ matrix.settings.pre_build }}
npx napi build --platform --release --no-const-enum \
npx napi build --platform --release \
--features ${{ matrix.settings.features }} \
--target ${{ matrix.settings.target }} \
--dts ../lancedb/native.d.ts \
--js ../lancedb/native.js \
--strip \
dist/
--output-dir dist/
- name: Build
run: |
${{ matrix.settings.pre_build }}
npx napi build --platform --release --no-const-enum \
npx napi build --platform --release \
--features ${{ matrix.settings.features }} \
--target ${{ matrix.settings.target }} \
--dts ../lancedb/native.d.ts \
--js ../lancedb/native.js \
--strip \
$EXTRA_ARGS \
dist/
--output-dir dist/
if: ${{ !matrix.settings.docker }}
shell: bash
- name: Upload artifact

View File

@@ -100,7 +100,9 @@ jobs:
lfs: true
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: sudo apt install -y protobuf-compiler libssl-dev
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- uses: rui314/setup-mold@v1
- name: Make Swap
run: |

Cargo.lock (generated)

File diff suppressed because it is too large.

View File

@@ -15,20 +15,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=3.0.0-beta.5", default-features = false, "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=3.0.0-beta.5", default-features = false, "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=3.0.0-beta.5", default-features = false, "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=3.0.0-rc.2", default-features = false, "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=3.0.0-rc.2", default-features = false, "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=3.0.0-rc.2", default-features = false, "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=3.0.0-rc.2", "tag" = "v3.0.0-rc.2", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "57.2", optional = false }
@@ -40,13 +40,15 @@ arrow-schema = "57.2"
arrow-select = "57.2"
arrow-cast = "57.2"
async-trait = "0"
datafusion = { version = "51.0", default-features = false }
datafusion-catalog = "51.0"
datafusion-common = { version = "51.0", default-features = false }
datafusion-execution = "51.0"
datafusion-expr = "51.0"
datafusion-physical-plan = "51.0"
datafusion-physical-expr = "51.0"
datafusion = { version = "52.1", default-features = false }
datafusion-catalog = "52.1"
datafusion-common = { version = "52.1", default-features = false }
datafusion-execution = "52.1"
datafusion-expr = "52.1"
datafusion-functions = "52.1"
datafusion-physical-plan = "52.1"
datafusion-physical-expr = "52.1"
datafusion-sql = "52.1"
env_logger = "0.11"
half = { "version" = "2.7.1", default-features = false, features = [
"num-traits",

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId>
<version>0.27.0-beta.0</version>
<version>0.27.0-beta.2</version>
</dependency>
```

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.27.0-beta.0</version>
<version>0.27.0-beta.2</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.27.0-beta.0</version>
<version>0.27.0-beta.2</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>3.0.0-beta.5</lance-core.version>
<lance-core.version>3.1.0-beta.2</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.27.0-beta.0"
version = "0.27.0-beta.2"
license.workspace = true
description.workspace = true
repository.workspace = true
@@ -19,11 +19,11 @@ arrow-schema.workspace = true
env_logger.workspace = true
futures.workspace = true
lancedb = { path = "../rust/lancedb", default-features = false }
napi = { version = "2.16.8", default-features = false, features = [
napi = { version = "3.8.3", default-features = false, features = [
"napi9",
"async"
] }
napi-derive = "2.16.4"
napi-derive = "3.5.2"
# Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] }
log.workspace = true
@@ -33,7 +33,7 @@ aws-lc-sys = "=0.28.0"
aws-lc-rs = "=1.13.0"
[build-dependencies]
napi-build = "2.1"
napi-build = "2.3.1"
[features]
default = ["remote", "lancedb/aws", "lancedb/gcs", "lancedb/azure", "lancedb/dynamodb", "lancedb/oss", "lancedb/huggingface"]

View File

@@ -273,7 +273,9 @@ export async function connect(
let nativeProvider: NativeJsHeaderProvider | undefined;
if (finalHeaderProvider) {
if (typeof finalHeaderProvider === "function") {
nativeProvider = new NativeJsHeaderProvider(finalHeaderProvider);
nativeProvider = new NativeJsHeaderProvider(async () =>
finalHeaderProvider(),
);
} else if (
finalHeaderProvider &&
typeof finalHeaderProvider.getHeaders === "function"

View File

@@ -684,19 +684,17 @@ export class VectorQuery extends StandardQueryBase<NativeVectorQuery> {
rerank(reranker: Reranker): VectorQuery {
super.doCall((inner) =>
inner.rerank({
rerankHybrid: async (_, args) => {
const vecResults = await fromBufferToRecordBatch(args.vecResults);
const ftsResults = await fromBufferToRecordBatch(args.ftsResults);
const result = await reranker.rerankHybrid(
args.query,
vecResults as RecordBatch,
ftsResults as RecordBatch,
);
inner.rerank(async (args) => {
const vecResults = await fromBufferToRecordBatch(args.vecResults);
const ftsResults = await fromBufferToRecordBatch(args.ftsResults);
const result = await reranker.rerankHybrid(
args.query,
vecResults as RecordBatch,
ftsResults as RecordBatch,
);
const buffer = fromRecordBatchToBuffer(result);
return buffer;
},
const buffer = fromRecordBatchToBuffer(result);
return buffer;
}),
);

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

nodejs/package-lock.json (generated)

File diff suppressed because it is too large.

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.27.0-beta.0",
"version": "0.27.0-beta.2",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",
@@ -21,19 +21,16 @@
},
"types": "dist/index.d.ts",
"napi": {
"name": "lancedb",
"triples": {
"defaults": false,
"additional": [
"aarch64-apple-darwin",
"x86_64-unknown-linux-gnu",
"aarch64-unknown-linux-gnu",
"x86_64-unknown-linux-musl",
"aarch64-unknown-linux-musl",
"x86_64-pc-windows-msvc",
"aarch64-pc-windows-msvc"
]
}
"binaryName": "lancedb",
"targets": [
"aarch64-apple-darwin",
"x86_64-unknown-linux-gnu",
"aarch64-unknown-linux-gnu",
"x86_64-unknown-linux-musl",
"aarch64-unknown-linux-musl",
"x86_64-pc-windows-msvc",
"aarch64-pc-windows-msvc"
]
},
"license": "Apache-2.0",
"repository": {
@@ -46,7 +43,7 @@
"@aws-sdk/client-s3": "^3.33.0",
"@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0",
"@napi-rs/cli": "^2.18.3",
"@napi-rs/cli": "^3.5.1",
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2",
"@types/node": "^22.7.4",
@@ -75,9 +72,9 @@
"os": ["darwin", "linux", "win32"],
"scripts": {
"artifacts": "napi artifacts",
"build:debug": "napi build --platform --no-const-enum --dts ../lancedb/native.d.ts --js ../lancedb/native.js lancedb",
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
"build:release": "napi build --platform --no-const-enum --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js dist/",
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
"postbuild:release": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
"build": "npm run build:debug && npm run tsc",
"build-release": "npm run build:release && npm run tsc",
@@ -91,7 +88,7 @@
"prepublishOnly": "napi prepublish -t npm",
"test": "jest --verbose",
"integration": "S3_TEST=1 npm run test",
"universal": "napi universal",
"universal": "napi universalize",
"version": "napi version"
},
"dependencies": {

View File

@@ -1,20 +1,19 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use napi::{
bindgen_prelude::*,
threadsafe_function::{ErrorStrategy, ThreadsafeFunction},
};
use napi::{bindgen_prelude::*, threadsafe_function::ThreadsafeFunction};
use napi_derive::napi;
use std::collections::HashMap;
use std::sync::Arc;
type GetHeadersFn = ThreadsafeFunction<(), Promise<HashMap<String, String>>, (), Status, false>;
/// JavaScript HeaderProvider implementation that wraps a JavaScript callback.
/// This is the only native header provider - all header provider implementations
/// should provide a JavaScript function that returns headers.
#[napi]
pub struct JsHeaderProvider {
get_headers_fn: Arc<ThreadsafeFunction<(), ErrorStrategy::CalleeHandled>>,
get_headers_fn: Arc<GetHeadersFn>,
}
impl Clone for JsHeaderProvider {
@@ -29,9 +28,12 @@ impl Clone for JsHeaderProvider {
impl JsHeaderProvider {
/// Create a new JsHeaderProvider from a JavaScript callback
#[napi(constructor)]
pub fn new(get_headers_callback: JsFunction) -> Result<Self> {
pub fn new(
get_headers_callback: Function<(), Promise<HashMap<String, String>>>,
) -> Result<Self> {
let get_headers_fn = get_headers_callback
.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))
.build_threadsafe_function()
.build()
.map_err(|e| {
Error::new(
Status::GenericFailure,
@@ -51,7 +53,7 @@ impl lancedb::remote::HeaderProvider for JsHeaderProvider {
async fn get_headers(&self) -> lancedb::error::Result<HashMap<String, String>> {
// Call the JavaScript function asynchronously
let promise: Promise<HashMap<String, String>> =
self.get_headers_fn.call_async(Ok(())).await.map_err(|e| {
self.get_headers_fn.call_async(()).await.map_err(|e| {
lancedb::error::Error::Runtime {
message: format!("Failed to call JavaScript get_headers: {}", e),
}

View File

@@ -60,7 +60,7 @@ pub struct OpenTableOptions {
pub storage_options: Option<HashMap<String, String>>,
}
#[napi::module_init]
#[napi_derive::module_init]
fn init() {
let env = Env::new()
.filter_or("LANCEDB_LOG", "warn")

View File

@@ -20,8 +20,8 @@ use napi_derive::napi;
use crate::error::convert_error;
use crate::error::NapiErrorExt;
use crate::iterator::RecordBatchIterator;
use crate::rerankers::RerankHybridCallbackArgs;
use crate::rerankers::Reranker;
use crate::rerankers::RerankerCallbacks;
use crate::util::{parse_distance_type, schema_to_buffer};
#[napi]
@@ -42,7 +42,7 @@ impl Query {
}
#[napi]
pub fn full_text_search(&mut self, query: napi::JsObject) -> napi::Result<()> {
pub fn full_text_search(&mut self, query: Object) -> napi::Result<()> {
let query = parse_fts_query(query)?;
self.inner = self.inner.clone().full_text_search(query);
Ok(())
@@ -235,7 +235,7 @@ impl VectorQuery {
}
#[napi]
pub fn full_text_search(&mut self, query: napi::JsObject) -> napi::Result<()> {
pub fn full_text_search(&mut self, query: Object) -> napi::Result<()> {
let query = parse_fts_query(query)?;
self.inner = self.inner.clone().full_text_search(query);
Ok(())
@@ -272,11 +272,13 @@ impl VectorQuery {
}
#[napi]
pub fn rerank(&mut self, callbacks: RerankerCallbacks) {
self.inner = self
.inner
.clone()
.rerank(Arc::new(Reranker::new(callbacks)));
pub fn rerank(
&mut self,
rerank_hybrid: Function<RerankHybridCallbackArgs, Promise<Buffer>>,
) -> napi::Result<()> {
let reranker = Reranker::new(rerank_hybrid)?;
self.inner = self.inner.clone().rerank(Arc::new(reranker));
Ok(())
}
#[napi(catch_unwind)]
@@ -523,12 +525,12 @@ impl JsFullTextQuery {
}
}
fn parse_fts_query(query: napi::JsObject) -> napi::Result<FullTextSearchQuery> {
if let Ok(Some(query)) = query.get::<_, &JsFullTextQuery>("query") {
fn parse_fts_query(query: Object) -> napi::Result<FullTextSearchQuery> {
if let Ok(Some(query)) = query.get::<&JsFullTextQuery>("query") {
Ok(FullTextSearchQuery::new_query(query.inner.clone()))
} else if let Ok(Some(query_text)) = query.get::<_, String>("query") {
} else if let Ok(Some(query_text)) = query.get::<String>("query") {
let mut query_text = query_text;
let columns = query.get::<_, Option<Vec<String>>>("columns")?.flatten();
let columns = query.get::<Option<Vec<String>>>("columns")?.flatten();
let is_phrase =
query_text.len() >= 2 && query_text.starts_with('"') && query_text.ends_with('"');

View File

@@ -3,10 +3,7 @@
use arrow_array::RecordBatch;
use async_trait::async_trait;
use napi::{
bindgen_prelude::*,
threadsafe_function::{ErrorStrategy, ThreadsafeFunction},
};
use napi::{bindgen_prelude::*, threadsafe_function::ThreadsafeFunction};
use napi_derive::napi;
use lancedb::ipc::batches_to_ipc_file;
@@ -15,27 +12,28 @@ use lancedb::{error::Error, ipc::ipc_file_to_batches};
use crate::error::NapiErrorExt;
type RerankHybridFn = ThreadsafeFunction<
RerankHybridCallbackArgs,
Promise<Buffer>,
RerankHybridCallbackArgs,
Status,
false,
>;
/// Reranker implementation that "wraps" a NodeJS Reranker implementation.
/// This contains references to the callbacks that can be used to invoke the
/// reranking methods on the NodeJS implementation and handles serializing the
/// record batches to Arrow IPC buffers.
#[napi]
pub struct Reranker {
/// callback to the Javascript which will call the rerankHybrid method of
/// some Reranker implementation
rerank_hybrid: ThreadsafeFunction<RerankHybridCallbackArgs, ErrorStrategy::CalleeHandled>,
rerank_hybrid: RerankHybridFn,
}
#[napi]
impl Reranker {
#[napi]
pub fn new(callbacks: RerankerCallbacks) -> Self {
let rerank_hybrid = callbacks
.rerank_hybrid
.create_threadsafe_function(0, move |ctx| Ok(vec![ctx.value]))
.unwrap();
Self { rerank_hybrid }
pub fn new(
rerank_hybrid: Function<RerankHybridCallbackArgs, Promise<Buffer>>,
) -> napi::Result<Self> {
let rerank_hybrid = rerank_hybrid.build_threadsafe_function().build()?;
Ok(Self { rerank_hybrid })
}
}
@@ -49,16 +47,16 @@ impl lancedb::rerankers::Reranker for Reranker {
) -> lancedb::error::Result<RecordBatch> {
let callback_args = RerankHybridCallbackArgs {
query: query.to_string(),
vec_results: batches_to_ipc_file(&[vector_results])?,
fts_results: batches_to_ipc_file(&[fts_results])?,
vec_results: Buffer::from(batches_to_ipc_file(&[vector_results])?.as_ref()),
fts_results: Buffer::from(batches_to_ipc_file(&[fts_results])?.as_ref()),
};
let promised_buffer: Promise<Buffer> = self
.rerank_hybrid
.call_async(Ok(callback_args))
.call_async(callback_args)
.await
.map_err(|e| Error::Runtime {
message: format!("napi error status={}, reason={}", e.status, e.reason),
})?;
message: format!("napi error status={}, reason={}", e.status, e.reason),
})?;
let buffer = promised_buffer.await.map_err(|e| Error::Runtime {
message: format!("napi error status={}, reason={}", e.status, e.reason),
})?;
@@ -77,16 +75,11 @@ impl std::fmt::Debug for Reranker {
}
}
#[napi(object)]
pub struct RerankerCallbacks {
pub rerank_hybrid: JsFunction,
}
#[napi(object)]
pub struct RerankHybridCallbackArgs {
pub query: String,
pub vec_results: Vec<u8>,
pub fts_results: Vec<u8>,
pub vec_results: Buffer,
pub fts_results: Buffer,
}
fn buffer_to_record_batch(buffer: Buffer) -> Result<RecordBatch> {

View File

@@ -96,7 +96,6 @@ impl napi::bindgen_prelude::FromNapiValue for Session {
) -> napi::Result<Self> {
let object: napi::bindgen_prelude::ClassInstance<Self> =
napi::bindgen_prelude::ClassInstance::from_napi_value(env, napi_val)?;
let copy = object.clone();
Ok(copy)
Ok((*object).clone())
}
}

View File

@@ -71,6 +71,17 @@ impl Table {
pub async fn add(&self, buf: Buffer, mode: String) -> napi::Result<AddResult> {
let batches = ipc_file_to_batches(buf.to_vec())
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
let batches = batches
.into_iter()
.map(|batch| {
batch.map_err(|e| {
napi::Error::from_reason(format!(
"Failed to read record batch from IPC file: {}",
e
))
})
})
.collect::<Result<Vec<_>>>()?;
let mut op = self.inner_ref()?.add(batches);
op = if mode == "append" {

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.30.0-beta.1"
current_version = "0.30.0-beta.3"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.30.0-beta.1"
version = "0.30.0-beta.3"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -59,9 +59,9 @@ tests = [
"polars>=0.19, <=1.3.0",
"tantivy",
"pyarrow-stubs",
"pylance>=1.0.0b14",
"pylance>=1.0.0b14,<3.0.0",
"requests",
"datafusion",
"datafusion<52",
]
dev = [
"ruff",

View File

@@ -1,8 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from functools import singledispatch
from typing import List, Optional, Tuple, Union
from lancedb.pydantic import LanceModel, model_to_dict
import pyarrow as pa
from ._lancedb import RecordBatchStream
@@ -80,3 +82,32 @@ def peek_reader(
yield from reader
return batch, pa.RecordBatchReader.from_batches(batch.schema, all_batches())
@singledispatch
def to_arrow(data) -> pa.Table:
"""Convert a single data object to a pa.Table."""
raise NotImplementedError(f"to_arrow not implemented for type {type(data)}")
@to_arrow.register(pa.RecordBatch)
def _arrow_from_batch(data: pa.RecordBatch) -> pa.Table:
return pa.Table.from_batches([data])
@to_arrow.register(pa.Table)
def _arrow_from_table(data: pa.Table) -> pa.Table:
return data
@to_arrow.register(list)
def _arrow_from_list(data: list) -> pa.Table:
if not data:
raise ValueError("Cannot create table from empty list without a schema")
if isinstance(data[0], LanceModel):
schema = data[0].__class__.to_arrow_schema()
dicts = [model_to_dict(d) for d in data]
return pa.Table.from_pylist(dicts, schema=schema)
return pa.Table.from_pylist(data)

View File

@@ -1462,6 +1462,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
self._phrase_query = False
self.ordering_field_name = ordering_field_name
self._reranker = None
self._fast_search = None
if isinstance(fts_columns, str):
fts_columns = [fts_columns]
self._fts_columns = fts_columns
@@ -1483,6 +1484,19 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
self._phrase_query = phrase_query
return self
def fast_search(self) -> LanceFtsQueryBuilder:
"""
Skip a flat search of unindexed data. This will improve
search performance but search results will not include unindexed data.
Returns
-------
LanceFtsQueryBuilder
The LanceFtsQueryBuilder object.
"""
self._fast_search = True
return self
def to_query_object(self) -> Query:
return Query(
columns=self._columns,
@@ -1494,6 +1508,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
query=self._query, columns=self._fts_columns
),
offset=self._offset,
fast_search=self._fast_search,
)
def output_schema(self) -> pa.Schema:

View File

@@ -0,0 +1,214 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from dataclasses import dataclass
from functools import singledispatch
import sys
from typing import Callable, Iterator, Optional
from lancedb.arrow import to_arrow
import pyarrow as pa
import pyarrow.dataset as ds
from .pydantic import LanceModel
@dataclass
class Scannable:
schema: pa.Schema
num_rows: Optional[int]
# Factory function to create a new reader each time (supports re-scanning)
reader: Callable[[], pa.RecordBatchReader]
# Whether reader can be called more than once. For example, an iterator can
# only be consumed once, while a DataFrame can be converted to a new reader
# each time.
rescannable: bool = True
@singledispatch
def to_scannable(data) -> Scannable:
# Fallback: try iterable protocol
if hasattr(data, "__iter__"):
return _from_iterable(iter(data))
raise NotImplementedError(f"to_scannable not implemented for type {type(data)}")
@to_scannable.register(pa.RecordBatchReader)
def _from_reader(data: pa.RecordBatchReader) -> Scannable:
# RecordBatchReader can only be consumed once - not rescannable
return Scannable(
schema=data.schema, num_rows=None, reader=lambda: data, rescannable=False
)
@to_scannable.register(pa.RecordBatch)
def _from_batch(data: pa.RecordBatch) -> Scannable:
return Scannable(
schema=data.schema,
num_rows=data.num_rows,
reader=lambda: pa.RecordBatchReader.from_batches(data.schema, [data]),
)
@to_scannable.register(pa.Table)
def _from_table(data: pa.Table) -> Scannable:
return Scannable(schema=data.schema, num_rows=data.num_rows, reader=data.to_reader)
@to_scannable.register(ds.Dataset)
def _from_dataset(data: ds.Dataset) -> Scannable:
return Scannable(
schema=data.schema,
num_rows=data.count_rows(),
reader=lambda: data.scanner().to_reader(),
)
@to_scannable.register(ds.Scanner)
def _from_scanner(data: ds.Scanner) -> Scannable:
# Scanner can only be consumed once - not rescannable
return Scannable(
schema=data.projected_schema,
num_rows=None,
reader=data.to_reader,
rescannable=False,
)
@to_scannable.register(list)
def _from_list(data: list) -> Scannable:
if not data:
raise ValueError("Cannot create table from empty list without a schema")
table = to_arrow(data)
return Scannable(
schema=table.schema, num_rows=table.num_rows, reader=table.to_reader
)
@to_scannable.register(dict)
def _from_dict(data: dict) -> Scannable:
raise ValueError("Cannot add a single dictionary to a table. Use a list.")
@to_scannable.register(LanceModel)
def _from_lance_model(data: LanceModel) -> Scannable:
raise ValueError("Cannot add a single LanceModel to a table. Use a list.")
def _from_iterable(data: Iterator) -> Scannable:
first_item = next(data, None)
if first_item is None:
raise ValueError("Cannot create table from empty iterator")
first = to_arrow(first_item)
schema = first.schema
def iter():
yield from first.to_batches()
for item in data:
batch = to_arrow(item)
if batch.schema != schema:
try:
batch = batch.cast(schema)
except pa.lib.ArrowInvalid:
raise ValueError(
f"Input iterator yielded a batch with schema that "
f"does not match the schema of other batches.\n"
f"Expected:\n{schema}\nGot:\n{batch.schema}"
)
yield from batch.to_batches()
reader = pa.RecordBatchReader.from_batches(schema, iter())
return to_scannable(reader)
_registered_modules: set[str] = set()
def _register_optional_converters():
"""Register converters for optional dependencies that are already imported."""
if "pandas" in sys.modules and "pandas" not in _registered_modules:
_registered_modules.add("pandas")
import pandas as pd
@to_arrow.register(pd.DataFrame)
def _arrow_from_pandas(data: pd.DataFrame) -> pa.Table:
table = pa.Table.from_pandas(data, preserve_index=False)
return table.replace_schema_metadata(None)
@to_scannable.register(pd.DataFrame)
def _from_pandas(data: pd.DataFrame) -> Scannable:
return to_scannable(_arrow_from_pandas(data))
if "polars" in sys.modules and "polars" not in _registered_modules:
_registered_modules.add("polars")
import polars as pl
@to_arrow.register(pl.DataFrame)
def _arrow_from_polars(data: pl.DataFrame) -> pa.Table:
return data.to_arrow()
@to_scannable.register(pl.DataFrame)
def _from_polars(data: pl.DataFrame) -> Scannable:
arrow = data.to_arrow()
return Scannable(
schema=arrow.schema, num_rows=len(data), reader=arrow.to_reader
)
@to_scannable.register(pl.LazyFrame)
def _from_polars_lazy(data: pl.LazyFrame) -> Scannable:
arrow = data.collect().to_arrow()
return Scannable(
schema=arrow.schema, num_rows=arrow.num_rows, reader=arrow.to_reader
)
if "datasets" in sys.modules and "datasets" not in _registered_modules:
_registered_modules.add("datasets")
from datasets import Dataset as HFDataset
from datasets import DatasetDict as HFDatasetDict
@to_scannable.register(HFDataset)
def _from_hf_dataset(data: HFDataset) -> Scannable:
table = data.data.table # Access underlying Arrow table
return Scannable(
schema=table.schema, num_rows=len(data), reader=table.to_reader
)
@to_scannable.register(HFDatasetDict)
def _from_hf_dataset_dict(data: HFDatasetDict) -> Scannable:
# HuggingFace DatasetDict: combine all splits with a 'split' column
schema = data[list(data.keys())[0]].features.arrow_schema
if "split" not in schema.names:
schema = schema.append(pa.field("split", pa.string()))
def gen():
for split_name, dataset in data.items():
for batch in dataset.data.to_batches():
split_arr = pa.array(
[split_name] * len(batch), type=pa.string()
)
yield pa.RecordBatch.from_arrays(
list(batch.columns) + [split_arr], schema=schema
)
total_rows = sum(len(dataset) for dataset in data.values())
return Scannable(
schema=schema,
num_rows=total_rows,
reader=lambda: pa.RecordBatchReader.from_batches(schema, gen()),
)
if "lance" in sys.modules and "lance" not in _registered_modules:
_registered_modules.add("lance")
import lance
@to_scannable.register(lance.LanceDataset)
def _from_lance(data: lance.LanceDataset) -> Scannable:
return Scannable(
schema=data.schema,
num_rows=data.count_rows(),
reader=lambda: data.scanner().to_reader(),
)
# Register on module load
_register_optional_converters()

View File

@@ -25,6 +25,8 @@ from typing import (
)
from urllib.parse import urlparse
from lancedb.scannable import _register_optional_converters, to_scannable
from . import __version__
from lancedb.arrow import peek_reader
from lancedb.background_loop import LOOP
@@ -3727,18 +3729,31 @@ class AsyncTable:
on_bad_vectors = "error"
if fill_value is None:
fill_value = 0.0
data = _sanitize_data(
data,
schema,
metadata=schema.metadata,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
allow_subschema=True,
)
if isinstance(data, pa.Table):
data = data.to_reader()
return await self._inner.add(data, mode or "append")
# _sanitize_data is an old code path, but we will use it until the
# new code path is ready.
if on_bad_vectors != "error" or (
schema.metadata is not None and b"embedding_functions" in schema.metadata
):
data = _sanitize_data(
data,
schema,
metadata=schema.metadata,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
allow_subschema=True,
)
_register_optional_converters()
data = to_scannable(data)
try:
return await self._inner.add(data, mode or "append")
except RuntimeError as e:
if "Cast error" in str(e):
raise ValueError(e)
elif "Vector column contains NaN" in str(e):
raise ValueError(e)
else:
raise
def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
"""

View File

@@ -882,3 +882,105 @@ def test_fts_query_to_json():
'"must_not":[]}}'
)
assert json_str == expected
def test_fts_fast_search(table):
table.create_fts_index("text", use_tantivy=False)
# Insert some unindexed data
table.add(
[
{
"text": "xyz",
"vector": [0 for _ in range(128)],
"id": 101,
"text2": "xyz",
"nested": {"text": "xyz"},
"count": 10,
}
]
)
# Without fast_search, the query object should not have fast_search set
builder = table.search("xyz", query_type="fts").limit(10)
query = builder.to_query_object()
assert query.fast_search is None
# With fast_search, the query object should have fast_search=True
builder = table.search("xyz", query_type="fts").fast_search().limit(10)
query = builder.to_query_object()
assert query.fast_search is True
# fast_search should be chainable with other methods
builder = (
table.search("xyz", query_type="fts").fast_search().select(["text"]).limit(5)
)
query = builder.to_query_object()
assert query.fast_search is True
assert query.limit == 5
assert query.columns == ["text"]
# Verify it executes without error and skips unindexed data
results = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
assert len(results) == 0
# Update index and verify it returns results
table.optimize()
results = table.search("xyz", query_type="fts").fast_search().limit(5).to_list()
assert len(results) > 0
@pytest.mark.asyncio
async def test_fts_fast_search_async(async_table):
await async_table.create_index("text", config=FTS())
# Insert some unindexed data
await async_table.add(
[
{
"text": "xyz",
"vector": [0 for _ in range(128)],
"id": 101,
"text2": "xyz",
"nested": {"text": "xyz"},
"count": 10,
}
]
)
# Without fast_search, should return results
results = await async_table.query().nearest_to_text("xyz").limit(5).to_list()
assert len(results) > 0
# With fast_search, should return no results (the new data is unindexed)
fast_results = (
await async_table.query()
.nearest_to_text("xyz")
.fast_search()
.limit(5)
.to_list()
)
assert len(fast_results) == 0
# Update index and verify it returns results
await async_table.optimize()
fast_results = (
await async_table.query()
.nearest_to_text("xyz")
.fast_search()
.limit(5)
.to_list()
)
assert len(fast_results) > 0
# fast_search should be chainable with other methods
results = (
await async_table.query()
.nearest_to_text("xyz")
.fast_search()
.select(["text"])
.limit(5)
.to_list()
)
assert len(results) > 0

View File

@@ -810,7 +810,7 @@ def test_create_index_name_and_train_parameters(
)
def test_add_with_nans(mem_db: DBConnection):
def test_create_with_nans(mem_db: DBConnection):
# by default we raise an error on bad input vectors
bad_data = [
{"vector": [np.nan], "item": "bar", "price": 20.0},
@@ -854,6 +854,57 @@ def test_add_with_nans(mem_db: DBConnection):
assert np.allclose(v, np.array([0.0, 0.0]))
def test_add_with_nans(mem_db: DBConnection):
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
pa.field("item", pa.string(), nullable=True),
pa.field("price", pa.float64(), nullable=False),
],
)
table = mem_db.create_table("test", schema=schema)
# by default we raise an error on bad input vectors
bad_data = [
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [5], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
{"vector": [np.nan, 5.0], "item": "bar", "price": 20.0},
]
for row in bad_data:
with pytest.raises(ValueError):
table.add(
data=[row],
)
table.add(
[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [2.1, 4.1], "item": "foo", "price": 9.0},
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [5], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
],
on_bad_vectors="drop",
)
assert len(table) == 2
table.delete("true")
# We can fill bad input with some value
table.add(
data=[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [np.nan], "item": "bar", "price": 20.0},
{"vector": [np.nan, np.nan], "item": "bar", "price": 20.0},
],
on_bad_vectors="fill",
fill_value=0.0,
)
assert len(table) == 3
arrow_tbl = table.search().where("item == 'bar'").to_arrow()
v = arrow_tbl["vector"].to_pylist()[0]
assert np.allclose(v, np.array([0.0, 0.0]))
def test_restore(mem_db: DBConnection):
table = mem_db.create_table(
"my_table",

View File

@@ -7,6 +7,7 @@ use crate::{
error::PythonErrorExt,
index::{extract_index_params, IndexConfig},
query::{Query, TakeQuery},
table::scannable::PyScannable,
};
use arrow::{
datatypes::{DataType, Schema},
@@ -25,6 +26,8 @@ use pyo3::{
};
use pyo3_async_runtimes::tokio::future_into_py;
mod scannable;
/// Statistics about a compaction operation.
#[pyclass(get_all)]
#[derive(Clone, Debug)]
@@ -293,12 +296,10 @@ impl Table {
pub fn add<'a>(
self_: PyRef<'a, Self>,
data: Bound<'_, PyAny>,
data: PyScannable,
mode: String,
) -> PyResult<Bound<'a, PyAny>> {
let batches: Box<dyn arrow::array::RecordBatchReader + Send> =
Box::new(ArrowArrayStreamReader::from_pyarrow_bound(&data)?);
let mut op = self_.inner_ref()?.add(batches);
let mut op = self_.inner_ref()?.add(data);
if mode == "append" {
op = op.mode(AddDataMode::Append);
} else if mode == "overwrite" {

View File

@@ -0,0 +1,145 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use std::sync::Arc;
use arrow::{
datatypes::{Schema, SchemaRef},
ffi_stream::ArrowArrayStreamReader,
pyarrow::{FromPyArrow, PyArrowType},
};
use futures::StreamExt;
use lancedb::{
arrow::{SendableRecordBatchStream, SimpleRecordBatchStream},
data::scannable::Scannable,
Error,
};
use pyo3::{types::PyAnyMethods, FromPyObject, Py, PyAny, Python};
/// Adapter that implements Scannable for a Python reader factory callable.
///
/// This holds a Python callable that returns a RecordBatchReader when called.
/// For rescannable sources, the callable can be invoked multiple times to
/// get fresh readers.
pub struct PyScannable {
/// Python callable that returns a RecordBatchReader
reader_factory: Py<PyAny>,
schema: SchemaRef,
num_rows: Option<usize>,
rescannable: bool,
}
impl std::fmt::Debug for PyScannable {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("PyScannable")
.field("schema", &self.schema)
.field("num_rows", &self.num_rows)
.field("rescannable", &self.rescannable)
.finish()
}
}
impl Scannable for PyScannable {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn scan_as_stream(&mut self) -> SendableRecordBatchStream {
let reader: Result<ArrowArrayStreamReader, Error> = {
Python::attach(|py| {
let result =
self.reader_factory
.call0(py)
.map_err(|e| lancedb::Error::Runtime {
message: format!("Python reader factory failed: {}", e),
})?;
ArrowArrayStreamReader::from_pyarrow_bound(result.bind(py)).map_err(|e| {
lancedb::Error::Runtime {
message: format!("Failed to create Arrow reader from Python: {}", e),
}
})
})
};
// Reader is blocking but stream is non-blocking, so we need to spawn a task to pull.
let (tx, rx) = tokio::sync::mpsc::channel(1);
let join_handle = tokio::task::spawn_blocking(move || {
let reader = match reader {
Ok(reader) => reader,
Err(e) => {
let _ = tx.blocking_send(Err(e));
return;
}
};
for batch in reader {
match batch {
Ok(batch) => {
if tx.blocking_send(Ok(batch)).is_err() {
// Receiver dropped, stop processing
break;
}
}
Err(source) => {
let _ = tx.blocking_send(Err(Error::Arrow { source }));
break;
}
}
}
});
let schema = self.schema.clone();
let stream = futures::stream::unfold(
(rx, Some(join_handle)),
|(mut rx, join_handle)| async move {
match rx.recv().await {
Some(Ok(batch)) => Some((Ok(batch), (rx, join_handle))),
Some(Err(e)) => Some((Err(e), (rx, join_handle))),
None => {
// Channel closed. Check if the task panicked — a panic
// drops the sender without sending an error, so without
// this check we'd silently return a truncated stream.
if let Some(handle) = join_handle {
if let Err(join_err) = handle.await {
return Some((
Err(Error::Runtime {
message: format!("Reader task panicked: {}", join_err),
}),
(rx, None),
));
}
}
None
}
}
},
);
Box::pin(SimpleRecordBatchStream::new(stream.fuse(), schema))
}
fn num_rows(&self) -> Option<usize> {
self.num_rows
}
fn rescannable(&self) -> bool {
self.rescannable
}
}
impl<'py> FromPyObject<'py> for PyScannable {
fn extract_bound(ob: &pyo3::Bound<'py, PyAny>) -> pyo3::PyResult<Self> {
// Convert from Scannable dataclass.
let schema: PyArrowType<Schema> = ob.getattr("schema")?.extract()?;
let schema = Arc::new(schema.0);
let num_rows: Option<usize> = ob.getattr("num_rows")?.extract()?;
let rescannable: bool = ob.getattr("rescannable")?.extract()?;
let reader_factory: Py<PyAny> = ob.getattr("reader")?.unbind();
Ok(Self {
schema,
reader_factory,
num_rows,
rescannable,
})
}
}

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.27.0-beta.0"
version = "0.27.0-beta.2"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -25,7 +25,9 @@ datafusion-catalog.workspace = true
datafusion-common.workspace = true
datafusion-execution.workspace = true
datafusion-expr.workspace = true
datafusion-functions.workspace = true
datafusion-physical-expr.workspace = true
datafusion-sql.workspace = true
datafusion-physical-plan.workspace = true
datafusion.workspace = true
object_store = { workspace = true }

View File

@@ -155,9 +155,7 @@ impl IntoArrowStream for SendableRecordBatchStream {
impl IntoArrowStream for datafusion_physical_plan::SendableRecordBatchStream {
fn into_arrow(self) -> Result<SendableRecordBatchStream> {
let schema = self.schema();
let stream = self.map_err(|df_err| Error::Runtime {
message: df_err.to_string(),
});
let stream = self.map_err(|df_err| df_err.into());
Ok(Box::pin(SimpleRecordBatchStream::new(stream, schema)))
}
}

View File

@@ -9,13 +9,6 @@
use std::sync::Arc;
use arrow_array::{RecordBatch, RecordBatchIterator, RecordBatchReader};
use arrow_schema::{ArrowError, SchemaRef};
use async_trait::async_trait;
use futures::stream::once;
use futures::StreamExt;
use lance_datafusion::utils::StreamingWriteSource;
use crate::arrow::{
SendableRecordBatchStream, SendableRecordBatchStreamExt, SimpleRecordBatchStream,
};
@@ -25,6 +18,12 @@ use crate::embeddings::{
};
use crate::table::{ColumnDefinition, ColumnKind, TableDefinition};
use crate::{Error, Result};
use arrow_array::{ArrayRef, RecordBatch, RecordBatchIterator, RecordBatchReader};
use arrow_schema::{ArrowError, SchemaRef};
use async_trait::async_trait;
use futures::stream::once;
use futures::StreamExt;
use lance_datafusion::utils::StreamingWriteSource;
pub trait Scannable: Send {
/// Returns the schema of the data.
@@ -228,6 +227,19 @@ impl WithEmbeddingsScannable {
let table_definition = TableDefinition::new(output_schema, column_definitions);
let output_schema = table_definition.into_rich_schema();
Self::with_schema(inner, embeddings, output_schema)
}
/// Create a WithEmbeddingsScannable with a specific output schema.
///
/// Use this when the table schema is already known (e.g. during add) to
/// avoid nullability mismatches between the embedding function's declared
/// type and the table's stored type.
pub fn with_schema(
inner: Box<dyn Scannable>,
embeddings: Vec<(EmbeddingDefinition, Arc<dyn EmbeddingFunction>)>,
output_schema: SchemaRef,
) -> Result<Self> {
Ok(Self {
inner,
embeddings,
@@ -245,9 +257,11 @@ impl Scannable for WithEmbeddingsScannable {
let inner_stream = self.inner.scan_as_stream();
let embeddings = self.embeddings.clone();
let output_schema = self.output_schema.clone();
let stream_schema = output_schema.clone();
let mapped_stream = inner_stream.then(move |batch_result| {
let embeddings = embeddings.clone();
let output_schema = output_schema.clone();
async move {
let batch = batch_result?;
let result = tokio::task::spawn_blocking(move || {
@@ -257,12 +271,29 @@ impl Scannable for WithEmbeddingsScannable {
.map_err(|e| Error::Runtime {
message: format!("Task panicked during embedding computation: {}", e),
})??;
// Cast columns to match the declared output schema. The data is
// identical but field metadata (e.g. nested nullability) may
// differ between the embedding function output and the table.
let columns: Vec<ArrayRef> = result
.columns()
.iter()
.enumerate()
.map(|(i, col)| {
let target_type = output_schema.field(i).data_type();
if col.data_type() == target_type {
Ok(col.clone())
} else {
arrow_cast::cast(col, target_type).map_err(Error::from)
}
})
.collect::<Result<_>>()?;
let result = RecordBatch::try_new(output_schema, columns)?;
Ok(result)
}
});
Box::pin(SimpleRecordBatchStream {
schema: output_schema,
schema: stream_schema,
stream: mapped_stream,
})
}
@@ -303,8 +334,13 @@ pub fn scannable_with_embeddings(
}
if !embeddings.is_empty() {
return Ok(Box::new(WithEmbeddingsScannable::try_new(
inner, embeddings,
// Use the table's schema so embedding column types (including nested
// nullability) match what's stored, avoiding mismatches with the
// embedding function's declared dest_type.
return Ok(Box::new(WithEmbeddingsScannable::with_schema(
inner,
embeddings,
table_definition.schema.clone(),
)?));
}
}
@@ -312,6 +348,133 @@ pub fn scannable_with_embeddings(
Ok(inner)
}
/// A wrapper that buffers the first RecordBatch from a Scannable so we can
/// inspect it (e.g. to estimate data size) without losing it.
pub(crate) struct PeekedScannable {
inner: Box<dyn Scannable>,
peeked: Option<RecordBatch>,
/// The first item from the stream, if it was an error. Stored so we can
/// re-emit it from `scan_as_stream` instead of silently dropping it.
first_error: Option<crate::Error>,
stream: Option<SendableRecordBatchStream>,
}
impl PeekedScannable {
pub fn new(inner: Box<dyn Scannable>) -> Self {
Self {
inner,
peeked: None,
first_error: None,
stream: None,
}
}
/// Reads and buffers the first batch from the inner scannable.
/// Returns a clone of it. Subsequent calls return the same batch.
///
/// Returns `None` if the stream is empty or the first item is an error.
/// Errors are preserved and re-emitted by `scan_as_stream`.
pub async fn peek(&mut self) -> Option<RecordBatch> {
if self.peeked.is_some() {
return self.peeked.clone();
}
// Already peeked and got an error or empty stream.
if self.stream.is_some() || self.first_error.is_some() {
return None;
}
let mut stream = self.inner.scan_as_stream();
match stream.next().await {
Some(Ok(batch)) => {
self.peeked = Some(batch.clone());
self.stream = Some(stream);
Some(batch)
}
Some(Err(e)) => {
self.first_error = Some(e);
self.stream = Some(stream);
None
}
None => {
self.stream = Some(stream);
None
}
}
}
}
impl Scannable for PeekedScannable {
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
fn num_rows(&self) -> Option<usize> {
self.inner.num_rows()
}
fn rescannable(&self) -> bool {
self.inner.rescannable()
}
fn scan_as_stream(&mut self) -> SendableRecordBatchStream {
let schema = self.inner.schema();
// If peek() hit an error, prepend it so downstream sees the error.
let error_item = self.first_error.take().map(Err);
match (self.peeked.take(), self.stream.take()) {
(Some(batch), Some(rest)) => {
let prepend = futures::stream::once(std::future::ready(Ok(batch)));
Box::pin(SimpleRecordBatchStream {
schema,
stream: prepend.chain(rest),
})
}
(Some(batch), None) => Box::pin(SimpleRecordBatchStream {
schema,
stream: futures::stream::once(std::future::ready(Ok(batch))),
}),
(None, Some(rest)) => {
if let Some(err) = error_item {
let stream = futures::stream::once(std::future::ready(err));
Box::pin(SimpleRecordBatchStream { schema, stream })
} else {
rest
}
}
(None, None) => {
// peek() was never called — just delegate
self.inner.scan_as_stream()
}
}
}
}
/// Compute the number of write partitions based on data size estimates.
///
/// `sample_bytes` and `sample_rows` come from a representative batch and are
/// used to estimate per-row size. `total_rows_hint` is the total row count
/// when known; otherwise `sample_rows` row count is used as a lower bound
/// estimate.
///
/// Targets roughly 1 million rows or 2 GB per partition, capped at
/// `max_partitions` (typically the number of available CPU cores).
pub(crate) fn estimate_write_partitions(
sample_bytes: usize,
sample_rows: usize,
total_rows_hint: Option<usize>,
max_partitions: usize,
) -> usize {
if sample_rows == 0 {
return 1;
}
let bytes_per_row = sample_bytes / sample_rows;
let total_rows = total_rows_hint.unwrap_or(sample_rows);
let total_bytes = total_rows * bytes_per_row;
let by_rows = total_rows.div_ceil(1_000_000);
let by_bytes = total_bytes.div_ceil(2 * 1024 * 1024 * 1024);
by_rows.max(by_bytes).max(1).min(max_partitions)
}
#[cfg(test)]
mod tests {
use super::*;
@@ -408,6 +571,231 @@ mod tests {
assert!(result2.unwrap().is_err());
}
mod peeked_scannable_tests {
use crate::test_utils::TestCustomError;
use super::*;
#[tokio::test]
async fn test_peek_returns_first_batch() {
let batch = record_batch!(("id", Int64, [1, 2, 3])).unwrap();
let mut peeked = PeekedScannable::new(Box::new(batch.clone()));
let first = peeked.peek().await.unwrap();
assert_eq!(first, batch);
}
#[tokio::test]
async fn test_peek_is_idempotent() {
let batch = record_batch!(("id", Int64, [1, 2, 3])).unwrap();
let mut peeked = PeekedScannable::new(Box::new(batch.clone()));
let first = peeked.peek().await.unwrap();
let second = peeked.peek().await.unwrap();
assert_eq!(first, second);
}
#[tokio::test]
async fn test_scan_after_peek_returns_all_data() {
let batches = vec![
record_batch!(("id", Int64, [1, 2])).unwrap(),
record_batch!(("id", Int64, [3, 4, 5])).unwrap(),
];
let mut peeked = PeekedScannable::new(Box::new(batches.clone()));
let first = peeked.peek().await.unwrap();
assert_eq!(first, batches[0]);
let result: Vec<RecordBatch> = peeked.scan_as_stream().try_collect().await.unwrap();
assert_eq!(result.len(), 2);
assert_eq!(result[0], batches[0]);
assert_eq!(result[1], batches[1]);
}
#[tokio::test]
async fn test_scan_without_peek_passes_through() {
let batch = record_batch!(("id", Int64, [1, 2, 3])).unwrap();
let mut peeked = PeekedScannable::new(Box::new(batch.clone()));
let result: Vec<RecordBatch> = peeked.scan_as_stream().try_collect().await.unwrap();
assert_eq!(result.len(), 1);
assert_eq!(result[0], batch);
}
#[tokio::test]
async fn test_delegates_num_rows() {
let batches = vec![
record_batch!(("id", Int64, [1, 2])).unwrap(),
record_batch!(("id", Int64, [3])).unwrap(),
];
let peeked = PeekedScannable::new(Box::new(batches));
assert_eq!(peeked.num_rows(), Some(3));
}
#[tokio::test]
async fn test_non_rescannable_stream_data_preserved() {
let batches = vec![
record_batch!(("id", Int64, [1, 2])).unwrap(),
record_batch!(("id", Int64, [3])).unwrap(),
];
let schema = batches[0].schema();
let inner = futures::stream::iter(batches.clone().into_iter().map(Ok));
let stream: SendableRecordBatchStream = Box::pin(SimpleRecordBatchStream {
schema,
stream: inner,
});
let mut peeked = PeekedScannable::new(Box::new(stream));
assert!(!peeked.rescannable());
assert_eq!(peeked.num_rows(), None);
let first = peeked.peek().await.unwrap();
assert_eq!(first, batches[0]);
// All data is still available via scan_as_stream
let result: Vec<RecordBatch> = peeked.scan_as_stream().try_collect().await.unwrap();
assert_eq!(result.len(), 2);
assert_eq!(result[0], batches[0]);
assert_eq!(result[1], batches[1]);
}
#[tokio::test]
async fn test_error_in_first_batch_propagates() {
let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
"id",
arrow_schema::DataType::Int64,
false,
)]));
let inner = futures::stream::iter(vec![Err(Error::External {
source: Box::new(TestCustomError),
})]);
let stream: SendableRecordBatchStream = Box::pin(SimpleRecordBatchStream {
schema,
stream: inner,
});
let mut peeked = PeekedScannable::new(Box::new(stream));
// peek returns None for errors
assert!(peeked.peek().await.is_none());
// But the error should come through when scanning
let mut stream = peeked.scan_as_stream();
let first = stream.next().await.unwrap();
assert!(first.is_err());
let err = first.unwrap_err();
assert!(
matches!(&err, Error::External { source } if source.downcast_ref::<TestCustomError>().is_some()),
"Expected TestCustomError to be preserved, got: {err}"
);
}
#[tokio::test]
async fn test_error_in_later_batch_propagates() {
let good_batch = record_batch!(("id", Int64, [1, 2])).unwrap();
let schema = good_batch.schema();
let inner = futures::stream::iter(vec![
Ok(good_batch.clone()),
Err(Error::External {
source: Box::new(TestCustomError),
}),
]);
let stream: SendableRecordBatchStream = Box::pin(SimpleRecordBatchStream {
schema,
stream: inner,
});
let mut peeked = PeekedScannable::new(Box::new(stream));
// peek succeeds with the first batch
let first = peeked.peek().await.unwrap();
assert_eq!(first, good_batch);
// scan_as_stream should yield the first batch, then the error
let mut stream = peeked.scan_as_stream();
let batch1 = stream.next().await.unwrap().unwrap();
assert_eq!(batch1, good_batch);
let batch2 = stream.next().await.unwrap();
assert!(batch2.is_err());
let err = batch2.unwrap_err();
assert!(
matches!(&err, Error::External { source } if source.downcast_ref::<TestCustomError>().is_some()),
"Expected TestCustomError to be preserved, got: {err}"
);
}
#[tokio::test]
async fn test_empty_stream_returns_none() {
let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
"id",
arrow_schema::DataType::Int64,
false,
)]));
let inner = futures::stream::empty();
let stream: SendableRecordBatchStream = Box::pin(SimpleRecordBatchStream {
schema,
stream: inner,
});
let mut peeked = PeekedScannable::new(Box::new(stream));
assert!(peeked.peek().await.is_none());
// Scanning an empty (post-peek) stream should yield nothing
let result: Vec<RecordBatch> = peeked.scan_as_stream().try_collect().await.unwrap();
assert!(result.is_empty());
}
}
mod estimate_write_partitions_tests {
use super::*;
#[test]
fn test_small_data_single_partition() {
// 100 rows * 24 bytes/row = 2400 bytes — well under both thresholds
assert_eq!(estimate_write_partitions(2400, 100, Some(100), 8), 1);
}
#[test]
fn test_scales_by_row_count() {
// 2.5M rows at 24 bytes/row — row threshold dominates
// ceil(2_500_000 / 1_000_000) = 3
assert_eq!(estimate_write_partitions(72, 3, Some(2_500_000), 8), 3);
}
#[test]
fn test_scales_by_byte_size() {
// 100k rows at 40KB/row = ~4GB total → ceil(4GB / 2GB) = 2
let sample_bytes = 40_000 * 10;
assert_eq!(
estimate_write_partitions(sample_bytes, 10, Some(100_000), 8),
2
);
}
#[test]
fn test_capped_at_max_partitions() {
// 10M rows would want 10 partitions, but capped at 4
assert_eq!(estimate_write_partitions(72, 3, Some(10_000_000), 4), 4);
}
#[test]
fn test_zero_sample_rows_returns_one() {
assert_eq!(estimate_write_partitions(0, 0, Some(1_000_000), 8), 1);
}
#[test]
fn test_no_row_hint_uses_sample_size() {
// Without a hint, uses sample_rows (3), which is small
assert_eq!(estimate_write_partitions(72, 3, None, 8), 1);
}
#[test]
fn test_always_at_least_one() {
assert_eq!(estimate_write_partitions(24, 1, Some(1), 8), 1);
}
}
mod embedding_tests {
use super::*;
use crate::embeddings::MemoryRegistry;

View File

@@ -426,6 +426,7 @@ impl PermutationReader {
row_ids_query = row_ids_query.limit(limit as usize);
}
let mut row_ids = row_ids_query.execute().await?;
let mut idx_offset = 0;
while let Some(batch) = row_ids.try_next().await? {
let row_ids = batch
.column(0)
@@ -433,8 +434,9 @@ impl PermutationReader {
.values()
.to_vec();
for (i, row_id) in row_ids.iter().enumerate() {
offset_map.insert(i as u64 + idx_offset, *row_id);
}
idx_offset += batch.num_rows() as u64;
}
let offset_map = Arc::new(offset_map);
*offset_map_ref = Some(offset_map.clone());
@@ -845,4 +847,106 @@ mod tests {
.to_vec();
assert_eq!(idx_values, vec![row_ids[2] as i32]);
}
#[tokio::test]
async fn test_filtered_permutation_full_iteration() {
use crate::dataloader::permutation::builder::PermutationBuilder;
// Create a base table with 10000 rows where idx goes 0..10000.
// Filter to even values only, giving 5000 rows in the permutation.
let base_table = lance_datagen::gen_batch()
.col("idx", lance_datagen::array::step::<Int32Type>())
.into_mem_table("tbl", RowCount::from(10000), BatchCount::from(1))
.await;
let permutation_table = PermutationBuilder::new(base_table.clone())
.with_filter("idx % 2 = 0".to_string())
.build()
.await
.unwrap();
assert_eq!(permutation_table.count_rows(None).await.unwrap(), 5000);
let reader = PermutationReader::try_from_tables(
base_table.base_table().clone(),
permutation_table.base_table().clone(),
0,
)
.await
.unwrap();
assert_eq!(reader.count_rows(), 5000);
// Iterate through all batches using a batch size that doesn't evenly divide
// the row count (5000 / 128 = 39 full batches + 1 batch of 8 rows).
let batch_size = 128;
let mut stream = reader
.read(
Select::All,
QueryExecutionOptions {
max_batch_length: batch_size,
..Default::default()
},
)
.await
.unwrap();
let mut total_rows = 0u64;
let mut all_idx_values = Vec::new();
while let Some(batch) = stream.try_next().await.unwrap() {
assert!(batch.num_rows() <= batch_size as usize);
total_rows += batch.num_rows() as u64;
let idx_col = batch.column(0).as_primitive::<Int32Type>().values();
all_idx_values.extend(idx_col.iter().copied());
}
assert_eq!(total_rows, 5000);
assert_eq!(all_idx_values.len(), 5000);
// Every value should be even (from the filter)
assert!(all_idx_values.iter().all(|v| v % 2 == 0));
// Should have 5000 unique values
let unique: std::collections::HashSet<i32> = all_idx_values.iter().copied().collect();
assert_eq!(unique.len(), 5000);
// Use take_offsets to fetch rows from the beginning, middle, and end
// of the permutation. The values should match what we saw during iteration.
// Beginning
let batch = reader.take_offsets(&[0, 1, 2], Select::All).await.unwrap();
assert_eq!(batch.num_rows(), 3);
let idx_values = batch
.column(0)
.as_primitive::<Int32Type>()
.values()
.to_vec();
assert_eq!(idx_values, &all_idx_values[0..3]);
// Middle
let batch = reader
.take_offsets(&[2499, 2500, 2501], Select::All)
.await
.unwrap();
assert_eq!(batch.num_rows(), 3);
let idx_values = batch
.column(0)
.as_primitive::<Int32Type>()
.values()
.to_vec();
assert_eq!(idx_values, &all_idx_values[2499..2502]);
// End (last 3 rows)
let batch = reader
.take_offsets(&[4997, 4998, 4999], Select::All)
.await
.unwrap();
assert_eq!(batch.num_rows(), 3);
let idx_values = batch
.column(0)
.as_primitive::<Int32Type>()
.values()
.to_vec();
assert_eq!(idx_values, &all_idx_values[4997..5000]);
}
}

View File

@@ -4,6 +4,7 @@
use std::sync::PoisonError;
use arrow_schema::ArrowError;
use datafusion_common::DataFusionError;
use snafu::Snafu;
pub(crate) type BoxError = Box<dyn std::error::Error + Send + Sync>;
@@ -96,28 +97,74 @@ pub type Result<T> = std::result::Result<T, Error>;
impl From<ArrowError> for Error {
fn from(source: ArrowError) -> Self {
match source {
ArrowError::ExternalError(source) => Self::from_box_error(source),
_ => Self::Arrow { source },
}
}
}
impl From<DataFusionError> for Error {
fn from(source: DataFusionError) -> Self {
match source {
DataFusionError::ArrowError(source, _) => (*source).into(),
DataFusionError::External(source) => Self::from_box_error(source),
other => Self::External {
source: Box::new(other),
},
}
}
}
impl From<lance::Error> for Error {
fn from(source: lance::Error) -> Self {
// Try to unwrap external errors that were wrapped by lance
match source {
lance::Error::Wrapped { error, .. } => Self::from_box_error(error),
lance::Error::External { source } => Self::from_box_error(source),
_ => Self::Lance { source },
}
}
}
impl Error {
fn from_box_error(mut source: Box<dyn std::error::Error + Send + Sync>) -> Self {
source = match source.downcast::<Self>() {
Ok(e) => match *e {
Self::External { source } => return Self::from_box_error(source),
other => return other,
},
Err(source) => source,
};
source = match source.downcast::<lance::Error>() {
Ok(e) => match *e {
lance::Error::Wrapped { error, .. } => return Self::from_box_error(error),
other => return other.into(),
},
Err(source) => source,
};
source = match source.downcast::<ArrowError>() {
Ok(e) => match *e {
ArrowError::ExternalError(source) => return Self::from_box_error(source),
other => return other.into(),
},
Err(source) => source,
};
source = match source.downcast::<DataFusionError>() {
Ok(e) => match *e {
DataFusionError::ArrowError(source, _) => return (*source).into(),
DataFusionError::External(source) => return Self::from_box_error(source),
other => return other.into(),
},
Err(source) => source,
};
Self::External { source }
}
}
impl From<object_store::Error> for Error {
fn from(source: object_store::Error) -> Self {
Self::ObjectStore { source }

rust/lancedb/src/expr.rs Normal file
View File

@@ -0,0 +1,131 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Expression builder API for type-safe query construction
//!
//! This module provides a fluent API for building expressions that can be used
//! in filters and projections. It wraps DataFusion's expression system.
//!
//! # Examples
//!
//! ```rust
//! use std::ops::Mul;
//! use lancedb::expr::{col, lit};
//!
//! let expr = col("age").gt(lit(18));
//! let expr = col("age").gt(lit(18)).and(col("status").eq(lit("active")));
//! let expr = col("price") * lit(1.1);
//! ```
mod sql;
pub use sql::expr_to_sql_string;
use std::sync::Arc;
use arrow_schema::DataType;
use datafusion_expr::{expr_fn::cast, Expr, ScalarUDF};
use datafusion_functions::string::expr_fn as string_expr_fn;
pub use datafusion_expr::{col, lit};
pub use datafusion_expr::Expr as DfExpr;
pub fn lower(expr: Expr) -> Expr {
string_expr_fn::lower(expr)
}
pub fn upper(expr: Expr) -> Expr {
string_expr_fn::upper(expr)
}
pub fn contains(expr: Expr, search: Expr) -> Expr {
string_expr_fn::contains(expr, search)
}
pub fn expr_cast(expr: Expr, data_type: DataType) -> Expr {
cast(expr, data_type)
}
lazy_static::lazy_static! {
static ref FUNC_REGISTRY: std::sync::RwLock<std::collections::HashMap<String, Arc<ScalarUDF>>> = {
let mut m = std::collections::HashMap::new();
m.insert("lower".to_string(), datafusion_functions::string::lower());
m.insert("upper".to_string(), datafusion_functions::string::upper());
m.insert("contains".to_string(), datafusion_functions::string::contains());
m.insert("btrim".to_string(), datafusion_functions::string::btrim());
m.insert("ltrim".to_string(), datafusion_functions::string::ltrim());
m.insert("rtrim".to_string(), datafusion_functions::string::rtrim());
m.insert("concat".to_string(), datafusion_functions::string::concat());
m.insert("octet_length".to_string(), datafusion_functions::string::octet_length());
std::sync::RwLock::new(m)
};
}
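/// Build a call to one of the scalar functions registered in this module's
/// registry, by name.
///
/// A minimal sketch (mirroring `test_func` below); unknown names return
/// `Error::InvalidInput`:
///
/// ```ignore
/// use lancedb::expr::{col, func};
/// let expr = func("lower", vec![col("name")])?;
/// ```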
pub fn func(name: impl AsRef<str>, args: Vec<Expr>) -> crate::Result<Expr> {
let name = name.as_ref();
let registry = FUNC_REGISTRY
.read()
.map_err(|e| crate::Error::InvalidInput {
message: format!("lock poisoned: {}", e),
})?;
let udf = registry
.get(name)
.ok_or_else(|| crate::Error::InvalidInput {
message: format!("unknown function: {}", name),
})?;
Ok(Expr::ScalarFunction(
datafusion_expr::expr::ScalarFunction::new_udf(udf.clone(), args),
))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_col_lit_comparisons() {
let expr = col("age").gt(lit(18));
let sql = expr_to_sql_string(&expr).unwrap();
assert!(sql.contains("age") && sql.contains("18"));
let expr = col("name").eq(lit("Alice"));
let sql = expr_to_sql_string(&expr).unwrap();
assert!(sql.contains("name") && sql.contains("Alice"));
}
#[test]
fn test_compound_expression() {
let expr = col("age").gt(lit(18)).and(col("status").eq(lit("active")));
let sql = expr_to_sql_string(&expr).unwrap();
assert!(sql.contains("age") && sql.contains("status"));
}
#[test]
fn test_string_functions() {
let expr = lower(col("name"));
let sql = expr_to_sql_string(&expr).unwrap();
assert!(sql.to_lowercase().contains("lower"));
let expr = contains(col("text"), lit("search"));
let sql = expr_to_sql_string(&expr).unwrap();
assert!(sql.to_lowercase().contains("contains"));
}
#[test]
fn test_func() {
let expr = func("lower", vec![col("x")]).unwrap();
let sql = expr_to_sql_string(&expr).unwrap();
assert!(sql.to_lowercase().contains("lower"));
let result = func("unknown_func", vec![col("x")]);
assert!(result.is_err());
}
#[test]
fn test_arithmetic() {
let expr = col("price") * lit(1.1);
let sql = expr_to_sql_string(&expr).unwrap();
assert!(sql.contains("price"));
}
}

View File

@@ -0,0 +1,12 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use datafusion_expr::Expr;
use datafusion_sql::unparser;
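/// Render a DataFusion expression as a SQL string via DataFusion's unparser.
///
/// A small sketch (the exact output formatting, e.g. parenthesization, is
/// determined by the unparser):
///
/// ```ignore
/// use datafusion_expr::{col, lit};
/// let sql = expr_to_sql_string(&col("age").gt(lit(18)))?;
/// assert!(sql.contains("age") && sql.contains("18"));
/// ```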
pub fn expr_to_sql_string(expr: &Expr) -> crate::Result<String> {
let ast = unparser::expr_to_sql(expr).map_err(|e| crate::Error::InvalidInput {
message: format!("failed to serialize expression to SQL: {}", e),
})?;
Ok(ast.to_string())
}

View File

@@ -169,6 +169,7 @@ pub mod database;
pub mod dataloader;
pub mod embeddings;
pub mod error;
pub mod expr;
pub mod index;
pub mod io;
pub mod ipc;

View File

@@ -359,6 +359,28 @@ pub trait QueryBase {
/// on the filter column(s).
fn only_if(self, filter: impl AsRef<str>) -> Self;
/// Only return rows which match the filter, using an expression builder.
///
/// Use [`crate::expr`] for building type-safe expressions:
///
/// ```
/// use lancedb::expr::{col, lit};
/// use lancedb::query::{QueryBase, ExecutableQuery};
///
/// # use lancedb::Table;
/// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
/// let results = table.query()
/// .only_if_expr(col("age").gt(lit(18)).and(col("status").eq(lit("active"))))
/// .execute()
/// .await?;
/// # Ok(())
/// # }
/// ```
///
/// Note: for remote/server-side queries, expression filters are serialized
/// to SQL before being sent to the server; only Substrait filters are
/// unsupported for remote tables.
fn only_if_expr(self, filter: datafusion_expr::Expr) -> Self;
/// Perform a full text search on the table.
///
/// The results will be returned in order of BM25 scores.
@@ -468,6 +490,11 @@ impl<T: HasQuery> QueryBase for T {
self
}
fn only_if_expr(mut self, filter: datafusion_expr::Expr) -> Self {
self.mut_query().filter = Some(QueryFilter::Datafusion(filter));
self
}
fn full_text_search(mut self, query: FullTextSearchQuery) -> Self {
if self.mut_query().limit.is_none() {
self.mut_query().limit = Some(DEFAULT_TOP_K);

View File

@@ -724,12 +724,58 @@ pub mod test_utils {
}
}
/// Consume a reqwest body into bytes, returning an error if the body
/// stream fails. This is used by MockSender to materialize streaming
/// bodies so that data pipeline errors (e.g. NaN rejection) are triggered
/// during mock sends just as they would be during a real HTTP upload.
pub async fn try_collect_body(body: reqwest::Body) -> std::result::Result<Vec<u8>, String> {
use http_body::Body;
use std::pin::Pin;
let mut body = body;
let mut data = Vec::new();
let mut body_pin = Pin::new(&mut body);
while let Some(frame) = futures::StreamExt::next(&mut futures::stream::poll_fn(|cx| {
body_pin.as_mut().poll_frame(cx)
}))
.await
{
match frame {
Ok(frame) => {
if let Some(bytes) = frame.data_ref() {
data.extend_from_slice(bytes);
}
}
Err(e) => return Err(e.to_string()),
}
}
Ok(data)
}
impl HttpSend for MockSender {
async fn send(
&self,
_client: &reqwest::Client,
mut request: reqwest::Request,
) -> reqwest::Result<reqwest::Response> {
// Consume any streaming body to materialize it into bytes.
// This triggers data pipeline errors (e.g. NaN rejection) that
// would otherwise only fire when a real HTTP client reads the body.
if let Some(body) = request.body_mut().take() {
match try_collect_body(body).await {
Ok(bytes) => {
*request.body_mut() = Some(reqwest::Body::from(bytes));
}
Err(msg) => {
// Simulate a failed request by returning a 500 response.
return Ok(http::Response::builder()
.status(500)
.body(msg)
.unwrap()
.into());
}
}
}
let response = (self.f)(request);
Ok(response)
}

View File

@@ -60,6 +60,34 @@ impl<'a> RetryCounter<'a> {
self.check_out_of_retries(Box::new(source), status_code)
}
/// Increment the appropriate failure counter based on the error type.
///
/// For `Error::Http` whose source is a connect error, increments
/// `connect_failures`. For read errors (`is_body` or `is_decode`),
/// increments `read_failures`. For all other errors, increments
/// `request_failures`. Calls `check_out_of_retries` to enforce global limits.
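///
/// Illustrative retry-loop usage (as in the remote `add` implementation):
///
/// ```ignore
/// if retryable {
///     // Errors with `Error::Retry` once any configured limit is exceeded.
///     retry_counter.increment_from_error(err)?;
///     tokio::time::sleep(retry_counter.next_sleep_time()).await;
///     continue;
/// }
/// ```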
pub fn increment_from_error(&mut self, source: crate::Error) -> crate::Result<()> {
let reqwest_err = match &source {
crate::Error::Http { source, .. } => source.downcast_ref::<reqwest::Error>(),
_ => None,
};
if reqwest_err.is_some_and(|e| e.is_connect()) {
self.connect_failures += 1;
} else if reqwest_err.is_some_and(|e| e.is_body() || e.is_decode()) {
self.read_failures += 1;
} else {
self.request_failures += 1;
}
let status_code = if let crate::Error::Http { status_code, .. } = &source {
*status_code
} else {
None
};
self.check_out_of_retries(Box::new(source), status_code)
}
pub fn increment_connect_failures(&mut self, source: reqwest::Error) -> crate::Result<()> {
self.connect_failures += 1;
let status_code = source.status();
@@ -77,7 +105,7 @@ impl<'a> RetryCounter<'a> {
let jitter = rand::random::<f32>() * self.config.backoff_jitter;
let sleep_time = Duration::from_secs_f32(backoff + jitter);
debug!(
"Retrying request {:?} ({}/{} connect, {}/{} read, {}/{} read) in {:?}",
"Retrying request {:?} ({}/{} connect, {}/{} request, {}/{} read) in {:?}",
self.request_id,
self.connect_failures,
self.config.connect_retries,
@@ -91,6 +119,115 @@ impl<'a> RetryCounter<'a> {
}
}
#[cfg(test)]
mod tests {
use super::*;
fn test_config() -> ResolvedRetryConfig {
ResolvedRetryConfig {
retries: 3,
connect_retries: 2,
read_retries: 3,
backoff_factor: 0.0,
backoff_jitter: 0.0,
statuses: vec![reqwest::StatusCode::BAD_GATEWAY],
}
}
/// Get a real reqwest connect error by trying to connect to a refused port.
async fn make_connect_error() -> reqwest::Error {
// Port 1 is almost always refused/unavailable.
reqwest::Client::new()
.get("http://127.0.0.1:1")
.send()
.await
.unwrap_err()
}
#[tokio::test]
async fn test_increment_from_error_connect() {
let config = test_config();
let mut counter = RetryCounter::new(&config, "test".to_string());
let connect_err = make_connect_error().await;
assert!(connect_err.is_connect());
let http_err = crate::Error::Http {
source: Box::new(connect_err),
request_id: "test".to_string(),
status_code: None,
};
// First connect failure: should be ok (1 < 2)
counter.increment_from_error(http_err).unwrap();
assert_eq!(counter.connect_failures, 1);
assert_eq!(counter.request_failures, 0);
// Second connect failure: should hit the limit (2 >= 2)
let connect_err2 = make_connect_error().await;
let http_err2 = crate::Error::Http {
source: Box::new(connect_err2),
request_id: "test".to_string(),
status_code: None,
};
let result = counter.increment_from_error(http_err2);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
crate::Error::Retry {
connect_failures: 2,
max_connect_failures: 2,
..
}
));
}
#[test]
fn test_increment_from_error_request() {
let config = test_config();
let mut counter = RetryCounter::new(&config, "test".to_string());
let http_err = crate::Error::Http {
source: "bad gateway".into(),
request_id: "test".to_string(),
status_code: Some(reqwest::StatusCode::BAD_GATEWAY),
};
counter.increment_from_error(http_err).unwrap();
assert_eq!(counter.request_failures, 1);
assert_eq!(counter.connect_failures, 0);
}
#[tokio::test]
async fn test_increment_from_error_respects_global_limits() {
// If request_failures is already at max, a connect error should still
// trigger the global limit check.
let config = test_config();
let mut counter = RetryCounter::new(&config, "test".to_string());
counter.request_failures = 3; // at max
let connect_err = make_connect_error().await;
let http_err = crate::Error::Http {
source: Box::new(connect_err),
request_id: "test".to_string(),
status_code: None,
};
// Even though connect_failures would be 1 (under limit of 2),
// request_failures is already at 3 (>= limit of 3), so this should fail.
let result = counter.increment_from_error(http_err);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
crate::Error::Retry {
request_failures: 3,
connect_failures: 1,
..
}
));
}
}
#[derive(Debug, Clone)]
pub struct ResolvedRetryConfig {
pub retries: u8,

View File

@@ -3,17 +3,17 @@
pub mod insert;
use self::insert::RemoteInsertExec;
use crate::expr::expr_to_sql_string;
use super::client::RequestResultExt;
use super::client::{HttpSend, RestfulLanceDbClient, Sender};
use super::db::ServerVersion;
use super::util::stream_as_body;
use super::ARROW_STREAM_CONTENT_TYPE;
use crate::data::scannable::Scannable;
use crate::index::waiter::wait_for_index;
use crate::index::Index;
use crate::index::IndexStatistics;
use crate::query::{QueryFilter, QueryRequest, Select, VectorQueryRequest};
use crate::remote::util::stream_as_ipc;
use crate::table::query::create_multi_vector_plan;
use crate::table::AddColumnsResult;
use crate::table::AddResult;
@@ -23,7 +23,7 @@ use crate::table::DropColumnsResult;
use crate::table::MergeResult;
use crate::table::Tags;
use crate::table::UpdateResult;
use crate::table::{AnyQuery, Filter, TableStatistics};
use crate::utils::background_cache::BackgroundCache;
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
use crate::{
@@ -202,7 +202,6 @@ impl<S: HttpSend + 'static> Tags for RemoteTags<'_, S> {
}
pub struct RemoteTable<S: HttpSend = Sender> {
client: RestfulLanceDbClient<S>,
name: String,
namespace: Vec<String>,
@@ -358,110 +357,6 @@ impl<S: HttpSend> RemoteTable<S> {
Ok(res)
}
/// Send a request with data from a Scannable source.
///
/// For rescannable sources, this will retry on retryable errors by re-reading
/// the data. For non-rescannable sources (streams), only a single attempt is made.
async fn send_scannable(
&self,
req_builder: RequestBuilder,
data: &mut dyn Scannable,
) -> Result<(String, Response)> {
use crate::remote::retry::RetryCounter;
// Right now, Python and Typescript don't pass down re-scannable data yet.
// So to preserve existing retry behavior, we have to collect data in
// memory for now. Once they expose rescannable data sources, we can remove this.
if !data.rescannable() && self.client.retry_config.retries > 0 {
let mut body = Vec::new();
stream_as_ipc(data.scan_as_stream())?
.try_for_each(|b| {
body.extend_from_slice(&b);
futures::future::ok(())
})
.await?;
let req_builder = req_builder.body(body);
return self.client.send_with_retry(req_builder, None, true).await;
}
let rescannable = data.rescannable();
let max_retries = if rescannable {
self.client.retry_config.retries
} else {
0
};
// Clone the request builder to extract the request id
let tmp_req = req_builder.try_clone().ok_or_else(|| Error::Runtime {
message: "Attempted to retry a request that cannot be cloned".to_string(),
})?;
let (_, r) = tmp_req.build_split();
let mut r = r.map_err(|e| Error::Runtime {
message: format!("Failed to build request: {}", e),
})?;
let request_id = self.client.extract_request_id(&mut r);
let mut retry_counter = RetryCounter::new(&self.client.retry_config, request_id.clone());
loop {
// Re-read data on each attempt
let stream = data.scan_as_stream();
let body = stream_as_body(stream)?;
let mut req_builder = req_builder.try_clone().ok_or_else(|| Error::Runtime {
message: "Attempted to retry a request that cannot be cloned".to_string(),
})?;
req_builder = req_builder.body(body);
let (c, request) = req_builder.build_split();
let mut request = request.map_err(|e| Error::Runtime {
message: format!("Failed to build request: {}", e),
})?;
self.client.set_request_id(&mut request, &request_id);
// Apply dynamic headers
request = self.client.apply_dynamic_headers(request).await?;
self.client.log_request(&request, &request_id);
let response = match self.client.sender.send(&c, request).await {
Ok(r) => r,
Err(err) => {
if err.is_connect() {
retry_counter.increment_connect_failures(err)?;
} else if err.is_body() || err.is_decode() {
retry_counter.increment_read_failures(err)?;
} else {
return Err(crate::Error::Http {
source: err.into(),
request_id,
status_code: None,
});
}
tokio::time::sleep(retry_counter.next_sleep_time()).await;
continue;
}
};
let status = response.status();
// Check for retryable status codes
if self.client.retry_config.statuses.contains(&status)
&& retry_counter.request_failures < max_retries
{
let http_err = crate::Error::Http {
source: format!("Retryable status code: {}", status).into(),
request_id: request_id.clone(),
status_code: Some(status),
};
retry_counter.increment_request_failures(http_err)?;
tokio::time::sleep(retry_counter.next_sleep_time()).await;
continue;
}
return Ok((request_id, response));
}
}
pub(super) async fn handle_table_not_found(
table_name: &str,
response: reqwest::Response,
@@ -552,13 +447,17 @@ impl<S: HttpSend> RemoteTable<S> {
body["k"] = serde_json::Value::Number(serde_json::Number::from(limit));
if let Some(filter) = &params.filter {
let filter_sql = match filter {
QueryFilter::Sql(sql) => sql.clone(),
QueryFilter::Datafusion(expr) => expr_to_sql_string(expr)?,
QueryFilter::Substrait(_) => {
return Err(Error::NotSupported {
message: "Substrait filters are not supported for remote queries"
.to_string(),
});
}
};
body["filter"] = serde_json::Value::String(filter_sql);
}
match &params.select {
@@ -1046,12 +945,12 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
let version = self.current_version().await;
if let Some(filter) = filter {
let filter_sql = match filter {
Filter::Sql(sql) => sql.clone(),
Filter::Datafusion(expr) => expr_to_sql_string(&expr)?,
};
request = request.json(&serde_json::json!({ "predicate": filter, "version": version }));
request =
request.json(&serde_json::json!({ "predicate": filter_sql, "version": version }));
} else {
let body = serde_json::json!({ "version": version });
request = request.json(&body);
@@ -1077,39 +976,75 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
status_code: None,
})
}
async fn add(&self, add: AddDataBuilder) -> Result<AddResult> {
use crate::remote::retry::RetryCounter;
self.check_mutable().await?;
let table_schema = self.schema().await?;
let table_def = TableDefinition::try_from_rich_schema(table_schema.clone())?;
let output = add.into_plan(&table_schema, &table_def)?;
let mut insert: Arc<dyn ExecutionPlan> = Arc::new(RemoteInsertExec::new(
self.name.clone(),
self.identifier.clone(),
self.client.clone(),
output.plan,
output.overwrite,
));
let mut retry_counter =
RetryCounter::new(&self.client.retry_config, uuid::Uuid::new_v4().to_string());
loop {
let stream = execute_plan(insert.clone(), Default::default())?;
let result: Result<Vec<_>> = stream.try_collect().await.map_err(Error::from);
match result {
Ok(_) => {
let add_result = insert
.as_any()
.downcast_ref::<RemoteInsertExec<S>>()
.and_then(|i| i.add_result())
.unwrap_or(AddResult { version: 0 });
if output.overwrite {
self.invalidate_schema_cache();
}
return Ok(add_result);
}
Err(err) if output.rescannable => {
let retryable = match &err {
Error::Http {
source,
status_code,
..
} => {
// Don't retry read errors (is_body/is_decode): the
// server may have committed the write already, and
// without an idempotency key we'd duplicate data.
source
.downcast_ref::<reqwest::Error>()
.is_some_and(|e| e.is_connect())
|| status_code
.is_some_and(|s| self.client.retry_config.statuses.contains(&s))
}
_ => false,
};
if retryable {
retry_counter.increment_from_error(err)?;
tokio::time::sleep(retry_counter.next_sleep_time()).await;
insert = insert.reset_state()?;
continue;
}
return Err(err);
}
Err(err) => return Err(err),
}
}
}
async fn create_plan(
@@ -1756,9 +1691,8 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
}
async fn table_definition(&self) -> Result<TableDefinition> {
let schema = self.schema().await?;
TableDefinition::try_from_rich_schema(schema)
}
async fn uri(&self) -> Result<String> {
// Check if we already have the location cached
@@ -1883,6 +1817,8 @@ mod tests {
use super::*;
use crate::table::AddDataMode;
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
use arrow_array::{record_batch, Int32Array, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, Schema};
@@ -2095,6 +2031,16 @@ mod tests {
body
}
/// Build a JSON describe response for the given schema.
fn describe_response(schema: &Schema) -> String {
let json_schema = JsonSchema::try_from(schema).unwrap();
serde_json::to_string(&json!({
"version": 1,
"schema": json_schema,
}))
.unwrap()
}
#[rstest]
#[case("", 0)]
#[case("{}", 0)]
@@ -2111,30 +2057,35 @@ mod tests {
// Clone response_body to give it 'static lifetime for the closure
let response_body = response_body.to_string();
let describe_body = describe_response(&data.schema());
let (sender, receiver) = std::sync::mpsc::channel();
let table = Table::new_with_handler("my_table", move |mut request| {
if request.url().path() == "/v1/table/my_table/insert/" {
assert_eq!(request.method(), "POST");
assert!(request
.url()
.query_pairs()
.filter(|(k, _)| k == "mode")
.all(|(_, v)| v == "append"));
assert_eq!(
request.headers().get("Content-Type").unwrap(),
ARROW_STREAM_CONTENT_TYPE
);
let mut body_out = reqwest::Body::from(Vec::new());
std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
sender.send(body_out).unwrap();
http::Response::builder()
let table =
Table::new_with_handler("my_table", move |mut request| match request.url().path() {
"/v1/table/my_table/describe/" => http::Response::builder()
.status(200)
.body(response_body.clone())
.unwrap()
} else {
panic!("Unexpected request path: {}", request.url().path());
}
});
.body(describe_body.clone())
.unwrap(),
"/v1/table/my_table/insert/" => {
assert_eq!(request.method(), "POST");
assert!(request
.url()
.query_pairs()
.filter(|(k, _)| k == "mode")
.all(|(_, v)| v == "append"));
assert_eq!(
request.headers().get("Content-Type").unwrap(),
ARROW_STREAM_CONTENT_TYPE
);
let mut body_out = reqwest::Body::from(Vec::new());
std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
sender.send(body_out).unwrap();
http::Response::builder()
.status(200)
.body(response_body.clone())
.unwrap()
}
path => panic!("Unexpected request path: {}", path),
});
let result = table.add(data.clone()).execute().await.unwrap();
// Check version matches expected value
@@ -2157,39 +2108,50 @@ mod tests {
)
.unwrap();
let describe_body = describe_response(&data.schema());
let (sender, receiver) = std::sync::mpsc::channel();
let table = Table::new_with_handler("my_table", move |mut request| {
assert_eq!(request.method(), "POST");
assert_eq!(request.url().path(), "/v1/table/my_table/insert/");
assert_eq!(
request
.url()
.query_pairs()
.find(|(k, _)| k == "mode")
.map(|kv| kv.1)
.as_deref(),
Some("overwrite"),
"Expected mode=overwrite"
);
assert_eq!(
request.headers().get("Content-Type").unwrap(),
ARROW_STREAM_CONTENT_TYPE
);
let mut body_out = reqwest::Body::from(Vec::new());
std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
sender.send(body_out).unwrap();
if old_server {
http::Response::builder().status(200).body("").unwrap()
} else {
http::Response::builder()
let table =
Table::new_with_handler("my_table", move |mut request| match request.url().path() {
"/v1/table/my_table/describe/" => http::Response::builder()
.status(200)
.body(r#"{"version": 43}"#)
.unwrap()
}
});
.body(describe_body.clone())
.unwrap(),
"/v1/table/my_table/insert/" => {
assert_eq!(request.method(), "POST");
assert_eq!(
request
.url()
.query_pairs()
.find(|(k, _)| k == "mode")
.map(|kv| kv.1)
.as_deref(),
Some("overwrite"),
"Expected mode=overwrite"
);
assert_eq!(
request.headers().get("Content-Type").unwrap(),
ARROW_STREAM_CONTENT_TYPE
);
let mut body_out = reqwest::Body::from(Vec::new());
std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
sender.send(body_out).unwrap();
if old_server {
http::Response::builder()
.status(200)
.body("".to_string())
.unwrap()
} else {
http::Response::builder()
.status(200)
.body(r#"{"version": 43}"#.to_string())
.unwrap()
}
}
path => panic!("Unexpected request path: {}", path),
});
let result = table
.add(data.clone())
@@ -2206,6 +2168,131 @@ mod tests {
assert_eq!(&body, &expected_body);
}
#[tokio::test]
async fn test_add_preprocessing() {
use crate::table::NaNVectorBehavior;
use arrow_array::{FixedSizeListArray, Float32Array, Int64Array};
// The table schema: {id: Int64, vec: FixedSizeList<Float32>[3]}
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new(
"vec",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 3),
false,
),
]);
let json_schema = JsonSchema::try_from(&table_schema).unwrap();
let describe_body = serde_json::to_string(&json!({
"version": 1,
"schema": json_schema,
}))
.unwrap();
// ---- Part 1: NaN vectors should be rejected by default ----
let nan_data = RecordBatch::try_new(
Arc::new(table_schema.clone()),
vec![
Arc::new(Int64Array::from(vec![1])),
Arc::new(
FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
3,
Arc::new(Float32Array::from(vec![1.0, f32::NAN, 3.0])),
None,
)
.unwrap(),
),
],
)
.unwrap();
let describe_body_clone = describe_body.clone();
let table =
Table::new_with_handler("my_table", move |request| match request.url().path() {
"/v1/table/my_table/describe/" => http::Response::builder()
.status(200)
.body(describe_body_clone.clone())
.unwrap(),
"/v1/table/my_table/insert/" => http::Response::builder()
.status(200)
.body(r#"{"version": 2}"#.to_string())
.unwrap(),
path => panic!("Unexpected path: {path}"),
});
let result = table.add(nan_data).execute().await;
assert!(result.is_err(), "NaN vectors should be rejected by default");
assert!(
result.unwrap_err().to_string().contains("NaN"),
"error should mention NaN"
);
// ---- Part 2: With Keep, should handle casting and missing columns ----
// Input: {id: Int32 (needs cast to Int64), vec: FixedSizeList<Float32>[3] with NaN}
// Table expects Int64 for id; NaN should be kept.
let input_schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new(
"vec",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 3),
false,
),
]);
let cast_data = RecordBatch::try_new(
Arc::new(input_schema),
vec![
Arc::new(Int32Array::from(vec![42])),
Arc::new(
FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
3,
Arc::new(Float32Array::from(vec![1.0, f32::NAN, 3.0])),
None,
)
.unwrap(),
),
],
)
.unwrap();
let (sender, receiver) = std::sync::mpsc::channel();
let table =
Table::new_with_handler("my_table", move |mut request| match request.url().path() {
"/v1/table/my_table/describe/" => http::Response::builder()
.status(200)
.body(describe_body.clone())
.unwrap(),
"/v1/table/my_table/insert/" => {
let mut body_out = reqwest::Body::from(Vec::new());
std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
sender.send(body_out).unwrap();
http::Response::builder()
.status(200)
.body(r#"{"version": 2}"#.to_string())
.unwrap()
}
path => panic!("Unexpected path: {path}"),
});
table
.add(cast_data)
.on_nan_vectors(NaNVectorBehavior::Keep)
.execute()
.await
.unwrap();
// Verify the data sent to the server was cast to the table schema.
let body = receiver.recv().unwrap();
let body = collect_body(body).await;
let cursor = std::io::Cursor::new(body);
let mut reader = arrow_ipc::reader::StreamReader::try_new(cursor, None).unwrap();
let batch = reader.next().unwrap().unwrap();
assert_eq!(batch.schema().field(0).data_type(), &DataType::Int64);
let ids: &Int64Array = batch.column(0).as_any().downcast_ref().unwrap();
assert_eq!(ids.value(0), 42);
}
#[rstest]
#[case(true)]
#[case(false)]
@@ -3572,23 +3659,29 @@ mod tests {
)
.unwrap();
let describe_body = describe_response(&data.schema());
let (sender, receiver) = std::sync::mpsc::channel();
let table = Table::new_with_handler("prod$metrics", move |mut request| {
if request.url().path() == "/v1/table/prod$metrics/insert/" {
assert_eq!(request.method(), "POST");
assert_eq!(
request.headers().get("Content-Type").unwrap(),
ARROW_STREAM_CONTENT_TYPE
);
let mut body_out = reqwest::Body::from(Vec::new());
std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
sender.send(body_out).unwrap();
http::Response::builder()
match request.url().path() {
"/v1/table/prod$metrics/describe/" => http::Response::builder()
.status(200)
.body(r#"{"version": 2}"#)
.unwrap()
} else {
panic!("Unexpected request path: {}", request.url().path());
.body(describe_body.clone())
.unwrap(),
"/v1/table/prod$metrics/insert/" => {
assert_eq!(request.method(), "POST");
assert_eq!(
request.headers().get("Content-Type").unwrap(),
ARROW_STREAM_CONTENT_TYPE
);
let mut body_out = reqwest::Body::from(Vec::new());
std::mem::swap(request.body_mut().as_mut().unwrap(), &mut body_out);
sender.send(body_out).unwrap();
http::Response::builder()
.status(200)
.body(r#"{"version": 2}"#.to_string())
.unwrap()
}
path => panic!("Unexpected request path: {}", path),
}
});
@@ -4480,93 +4573,126 @@ mod tests {
}
#[tokio::test]
async fn test_add_insert_fails() {
// Verify that an HTTP error from the insert endpoint is properly
// surfaced with the status code intact. Use 400 (non-retryable).
let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
let describe_body = describe_response(&batch.schema());
let table =
Table::new_with_handler("my_table", move |request| match request.url().path() {
"/v1/table/my_table/describe/" => http::Response::builder()
.status(200)
.body(describe_body.clone())
.unwrap(),
"/v1/table/my_table/insert/" => http::Response::builder()
.status(400)
.body("bad request".to_string())
.unwrap(),
path => panic!("Unexpected request path: {}", path),
});
let result = table.add(batch).execute().await;
let err = result.unwrap_err();
match &err {
Error::Http { status_code, .. } => {
assert_eq!(*status_code, Some(reqwest::StatusCode::BAD_REQUEST));
}
other => panic!("Expected Http error, got: {:?}", other),
}
}
#[tokio::test]
async fn test_add_retries_on_retryable_status() {
// Verify that rescannable data retries on retryable status codes (e.g. 502)
// and eventually succeeds.
let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
let describe_body = describe_response(&batch.schema());
let attempt = Arc::new(AtomicUsize::new(0));
let attempt_clone = attempt.clone();
let table =
Table::new_with_handler("my_table", move |request| match request.url().path() {
"/v1/table/my_table/describe/" => http::Response::builder()
.status(200)
.body(describe_body.clone())
.unwrap(),
"/v1/table/my_table/insert/" => {
let n = attempt_clone.fetch_add(1, Ordering::SeqCst);
if n < 2 {
http::Response::builder()
.status(502)
.body("bad gateway".to_string())
.unwrap()
} else {
http::Response::builder()
.status(200)
.body(r#"{"version": 3}"#.to_string())
.unwrap()
}
}
path => panic!("Unexpected request path: {}", path),
});
let result = table.add(batch).execute().await.unwrap();
assert_eq!(result.version, 3);
assert_eq!(attempt.load(Ordering::SeqCst), 3);
}
#[tokio::test]
async fn test_query_with_datafusion_filter() {
use datafusion_expr::{col, lit};
let expected_data = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
)
.unwrap();
let expected_data_ref = expected_data.clone();
let table = Table::new_with_handler("my_table", move |request| {
assert_eq!(request.method(), "POST");
assert_eq!(request.url().path(), "/v1/table/my_table/query/");
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
// The Datafusion expression should be serialized to SQL
let filter = body.get("filter").expect("filter should be present");
let filter_str = filter.as_str().expect("filter should be a string");
// col("x") > lit(10) AND col("status") = lit("active")
assert!(
filter_str.contains("x") && filter_str.contains("10"),
"Filter should contain 'x' and '10', got: {}",
filter_str
);
assert!(
filter_str.contains("status") && filter_str.contains("active"),
"Filter should contain 'status' and 'active', got: {}",
filter_str
);
let response_body = write_ipc_file(&expected_data_ref);
http::Response::builder()
.status(200)
.header(CONTENT_TYPE, ARROW_FILE_CONTENT_TYPE)
.body(response_body)
.unwrap()
});
// Use only_if_expr with a Datafusion expression
let expr = col("x").gt(lit(10)).and(col("status").eq(lit("active")));
let data = table
.query()
.only_if_expr(expr)
.execute()
.await
.unwrap()
.collect::<Vec<_>>()
.await;
assert_eq!(data.len(), 1);
assert_eq!(data[0].as_ref().unwrap(), &expected_data);
}
}

View File

@@ -8,7 +8,6 @@ use std::sync::{Arc, Mutex};
use arrow_array::{ArrayRef, RecordBatch, UInt64Array};
use arrow_ipc::CompressionType;
use arrow_schema::ArrowError;
use datafusion_common::{DataFusionError, Result as DataFusionResult};
use datafusion_execution::{SendableRecordBatchStream, TaskContext};
use datafusion_physical_expr::EquivalenceProperties;
@@ -76,7 +75,15 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
self.add_result.lock().unwrap().clone()
}
/// Stream the input into an HTTP body as an Arrow IPC stream, capturing any
/// stream errors into the provided channel. Errors from the input plan
/// (e.g. NaN rejection) would otherwise be swallowed inside the HTTP body
/// upload; by stashing them in the channel we can surface them with their
/// original message after the request completes.
fn stream_as_http_body(
data: SendableRecordBatchStream,
error_tx: tokio::sync::oneshot::Sender<DataFusionError>,
) -> DataFusionResult<reqwest::Body> {
let options = arrow_ipc::writer::IpcWriteOptions::default()
.try_with_compression(Some(CompressionType::LZ4_FRAME))?;
let writer = arrow_ipc::writer::StreamWriter::try_new_with_options(
@@ -85,26 +92,44 @@ impl<S: HttpSend + 'static> RemoteInsertExec<S> {
options,
)?;
let stream = futures::stream::try_unfold(
(data, writer, Some(error_tx), false),
move |(mut data, mut writer, error_tx, finished)| async move {
if finished {
return Ok(None);
}
match data.next().await {
Some(Ok(batch)) => {
writer
.write(&batch)
.map_err(|e| std::io::Error::other(e.to_string()))?;
let buffer = std::mem::take(writer.get_mut());
Ok(Some((buffer, (data, writer, error_tx, false))))
}
Some(Err(e)) => {
// Send the original error through the channel before
// returning a generic error to reqwest.
if let Some(tx) = error_tx {
let _ = tx.send(e);
}
Err(std::io::Error::other(
"input stream error (see error channel)",
))
}
None => {
writer
.finish()
.map_err(|e| std::io::Error::other(e.to_string()))?;
let buffer = std::mem::take(writer.get_mut());
if buffer.is_empty() {
Ok(None)
} else {
Ok(Some((buffer, (data, writer, None, true))))
}
}
}
},
);
Ok(reqwest::Body::wrap_stream(stream))
}
@@ -202,24 +227,41 @@ impl<S: HttpSend + 'static> ExecutionPlan for RemoteInsertExec<S> {
request = request.query(&[("mode", "overwrite")]);
}
let (error_tx, mut error_rx) = tokio::sync::oneshot::channel();
let body = Self::stream_as_http_body(input_stream, error_tx)?;
let request = request.body(body);
let result: DataFusionResult<(String, _)> = async {
let (request_id, response) = client
.send(request)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
let response = RemoteTable::<Sender>::handle_table_not_found(
&table_name,
response,
&request_id,
)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
let response = client
.check_response(&request_id, response)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
Ok((request_id, response))
}
.await;
// If the request failed due to an input stream error, surface the
// original error (e.g. NaN rejection) instead of the HTTP error.
if let Ok(stream_err) = error_rx.try_recv() {
return Err(stream_err);
}
let (request_id, response) = result?;
let body_text = response.text().await.map_err(|e| {
DataFusionError::External(Box::new(Error::Http {
source: Box::new(e),

View File

@@ -6,16 +6,19 @@
use arrow_array::{RecordBatch, RecordBatchReader};
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use async_trait::async_trait;
use datafusion_execution::TaskContext;
use datafusion_expr::Expr;
use datafusion_physical_plan::display::DisplayableExecutionPlan;
use datafusion_physical_plan::ExecutionPlan;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use lance::dataset::builder::DatasetBuilder;
pub use lance::dataset::ColumnAlteration;
pub use lance::dataset::NewColumnTransform;
pub use lance::dataset::ReadParams;
pub use lance::dataset::Version;
use lance::dataset::WriteMode;
use lance::dataset::{InsertBuilder, WriteParams};
use lance::index::vector::utils::infer_vector_dim;
use lance::index::vector::VectorIndexParams;
use lance::io::{ObjectStoreParams, WrappingObjectStore};
@@ -40,7 +43,7 @@ use std::format;
use std::path::Path;
use std::sync::Arc;
use crate::data::scannable::{estimate_write_partitions, PeekedScannable, Scannable};
use crate::database::Database;
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MemoryRegistry};
use crate::error::{Error, Result};
@@ -49,6 +52,7 @@ use crate::index::IndexStatistics;
use crate::index::{vector::suggested_num_sub_vectors, Index, IndexBuilder};
use crate::index::{IndexConfig, IndexStatisticsImpl};
use crate::query::{IntoQueryVector, Query, QueryExecutionOptions, TakeQuery, VectorQuery};
use crate::table::datafusion::insert::InsertExec;
use crate::utils::{
supported_bitmap_data_type, supported_btree_data_type, supported_fts_data_type,
supported_label_list_data_type, supported_vector_data_type, PatchReadParam, PatchWriteParam,
@@ -67,7 +71,7 @@ pub mod query;
pub mod schema_evolution;
pub mod update;
use crate::index::waiter::wait_for_index;
pub use add_data::{AddDataBuilder, AddDataMode, AddResult};
pub use add_data::{AddDataBuilder, AddDataMode, AddResult, NaNVectorBehavior};
pub use chrono::Duration;
pub use delete::DeleteResult;
use futures::future::join_all;
@@ -2109,29 +2113,81 @@ impl BaseTable for NativeTable {
}
}
async fn add(&self, mut add: AddDataBuilder) -> Result<AddResult> {
let table_def = self.table_definition().await?;
self.dataset.ensure_mutable()?;
let ds_wrapper = self.dataset.clone();
let ds = self.dataset.get().await?;
let table_schema = Schema::from(&ds.schema().clone());
// Peek at the first batch to estimate a good partition count for
// write parallelism.
let mut peeked = PeekedScannable::new(add.data);
let num_partitions = if let Some(first_batch) = peeked.peek().await {
let max_partitions = lance_core::utils::tokio::get_num_compute_intensive_cpus();
estimate_write_partitions(
first_batch.get_array_memory_size(),
first_batch.num_rows(),
peeked.num_rows(),
max_partitions,
)
} else {
1
};
add.data = Box::new(peeked);
let output = add.into_plan(&table_schema, &table_def)?;
let lance_params = output
.write_options
.lance_write_params
.unwrap_or(WriteParams {
mode: match output.mode {
AddDataMode::Append => WriteMode::Append,
AddDataMode::Overwrite => WriteMode::Overwrite,
},
..Default::default()
});
// Repartition for write parallelism if beneficial.
let plan = if num_partitions > 1 {
Arc::new(
datafusion_physical_plan::repartition::RepartitionExec::try_new(
output.plan,
datafusion_physical_plan::Partitioning::RoundRobinBatch(num_partitions),
)?,
) as Arc<dyn ExecutionPlan>
} else {
output.plan
};
let insert_exec = Arc::new(InsertExec::new(ds_wrapper.clone(), ds, plan, lance_params));
// Execute all partitions in parallel.
let task_ctx = Arc::new(TaskContext::default());
let handles = FuturesUnordered::new();
for partition in 0..num_partitions {
let exec = insert_exec.clone();
let ctx = task_ctx.clone();
handles.push(tokio::spawn(async move {
let mut stream = exec
.execute(partition, ctx)
.map_err(|e| -> Error { e.into() })?;
while let Some(batch) = stream.next().await {
batch.map_err(|e| -> Error { e.into() })?;
}
Ok::<_, Error>(())
}));
}
for handle in handles {
handle.await.map_err(|e| Error::Runtime {
message: format!("Insert task panicked: {}", e),
})??;
}
let version = ds_wrapper.get().await?.manifest().version;
Ok(AddResult { version })
}

View File

@@ -3,13 +3,19 @@
use std::sync::Arc;
use arrow_schema::{DataType, Fields, Schema};
use lance::dataset::WriteMode;
use serde::{Deserialize, Serialize};
use crate::data::scannable::scannable_with_embeddings;
use crate::data::scannable::Scannable;
use crate::embeddings::EmbeddingRegistry;
use crate::Result;
use crate::table::datafusion::cast::cast_to_table_schema;
use crate::table::datafusion::reject_nan::reject_nan_vectors;
use crate::table::datafusion::scannable_exec::ScannableExec;
use crate::{Error, Result};
use super::{BaseTable, WriteOptions};
use super::{BaseTable, TableDefinition, WriteOptions};
#[derive(Debug, Clone, Default)]
pub enum AddDataMode {
@@ -29,12 +35,22 @@ pub struct AddResult {
pub version: u64,
}
#[derive(Debug, Default, Clone, Copy)]
pub enum NaNVectorBehavior {
/// Reject any vectors containing NaN values (the default)
#[default]
Error,
/// Allow NaN values to be added, but they will not be indexed for search
Keep,
}
/// A builder for configuring a [`crate::table::Table::add`] operation
pub struct AddDataBuilder {
pub(crate) parent: Arc<dyn BaseTable>,
pub(crate) data: Box<dyn Scannable>,
pub(crate) mode: AddDataMode,
pub(crate) write_options: WriteOptions,
pub(crate) on_nan_vectors: NaNVectorBehavior,
pub(crate) embedding_registry: Option<Arc<dyn EmbeddingRegistry>>,
}
@@ -59,6 +75,7 @@ impl AddDataBuilder {
data,
mode: AddDataMode::Append,
write_options: WriteOptions::default(),
on_nan_vectors: NaNVectorBehavior::default(),
embedding_registry,
}
}
@@ -73,16 +90,123 @@ impl AddDataBuilder {
self
}
/// Configure how to handle NaN values in vector columns.
///
/// By default, any vectors containing NaN values will be rejected with an
/// error, since NaNs cannot be indexed for search. Setting this to `Keep`
/// will allow NaN values to be added to the table, but they will not be
/// indexed and will not be searchable.
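    ///
    /// # Example
    ///
    /// A minimal sketch, assuming an existing `table` handle and an Arrow
    /// record batch to insert:
    ///
    /// ```no_run
    /// # use lancedb::table::{NaNVectorBehavior, Table};
    /// # async fn example(table: Table, batch: arrow_array::RecordBatch) -> lancedb::Result<()> {
    /// table
    ///     .add(batch)
    ///     .on_nan_vectors(NaNVectorBehavior::Keep)
    ///     .execute()
    ///     .await?;
    /// # Ok(())
    /// # }
    /// ```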
pub fn on_nan_vectors(mut self, behavior: NaNVectorBehavior) -> Self {
self.on_nan_vectors = behavior;
self
}
pub async fn execute(self) -> Result<AddResult> {
self.parent.clone().add(self).await
}
/// Build a DataFusion execution plan that applies embeddings, casts data to
/// the table schema, and optionally rejects NaN vectors.
///
/// Returns the plan along with whether the input is rescannable (for retry
/// decisions) and whether this is an overwrite operation.
pub(crate) fn into_plan(
mut self,
table_schema: &Schema,
table_def: &TableDefinition,
) -> Result<PreprocessingOutput> {
let overwrite = self
.write_options
.lance_write_params
.as_ref()
.is_some_and(|p| matches!(p.mode, WriteMode::Overwrite))
|| matches!(self.mode, AddDataMode::Overwrite);
if !overwrite {
validate_schema(&self.data.schema(), table_schema)?;
}
self.data =
scannable_with_embeddings(self.data, table_def, self.embedding_registry.as_ref())?;
let rescannable = self.data.rescannable();
let plan: Arc<dyn datafusion_physical_plan::ExecutionPlan> =
Arc::new(ScannableExec::new(self.data));
// Skip casting when overwriting — the input schema replaces the table schema.
let plan = if overwrite {
plan
} else {
cast_to_table_schema(plan, table_schema)?
};
let plan = match self.on_nan_vectors {
NaNVectorBehavior::Error => reject_nan_vectors(plan)?,
NaNVectorBehavior::Keep => plan,
};
Ok(PreprocessingOutput {
plan,
overwrite,
rescannable,
write_options: self.write_options,
mode: self.mode,
})
}
}
pub struct PreprocessingOutput {
pub plan: Arc<dyn datafusion_physical_plan::ExecutionPlan>,
#[cfg_attr(not(feature = "remote"), allow(dead_code))]
pub overwrite: bool,
#[cfg_attr(not(feature = "remote"), allow(dead_code))]
pub rescannable: bool,
pub write_options: WriteOptions,
pub mode: AddDataMode,
}
/// Check that the input schema is valid for insert.
///
/// Fields can be in different orders, so match by name.
///
/// If a column exists in input but not in table, error (no extra columns allowed).
///
/// If a column exists in table but not in input, that is okay - it may be filled with nulls.
///
/// If the types are not exactly the same, we will attempt to cast later - so that is also okay at this stage.
///
/// If the nullability is different, that is also okay - we can relax nullability when casting.
fn validate_schema(input: &Schema, table: &Schema) -> Result<()> {
validate_fields(input.fields(), table.fields())
}
fn validate_fields(input: &Fields, table: &Fields) -> Result<()> {
for field in input {
match table.iter().find(|f| f.name() == field.name()) {
None => {
return Err(Error::InvalidInput {
message: format!("field '{}' does not exist in table schema", field.name()),
});
}
Some(table_field) => {
if let (DataType::Struct(in_children), DataType::Struct(tbl_children)) =
(field.data_type(), table_field.data_type())
{
validate_fields(in_children, tbl_children)?;
}
}
}
}
Ok(())
}
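
These rules are easiest to see as a test. A hedged sketch of the kind of unit test that could sit in the module's test block below (`validate_schema` is crate-private, so this is an in-module sketch):

```rust
#[test]
fn validate_schema_rules_sketch() {
    use arrow_schema::{DataType, Field, Schema};

    let table = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("text", DataType::Utf8, true),
    ]);

    // A subset of columns is fine: missing ones may be filled with nulls.
    let subset = Schema::new(vec![Field::new("id", DataType::Int64, false)]);
    assert!(validate_schema(&subset, &table).is_ok());

    // A type mismatch is fine at this stage: the cast is attempted later.
    let castable = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
    assert!(validate_schema(&castable, &table).is_ok());

    // Extra columns are an error: there is nothing in the table to map them to.
    let extra = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("bonus", DataType::Utf8, true),
    ]);
    assert!(validate_schema(&extra, &table).is_err());
}
```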
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_array::{record_batch, RecordBatch, RecordBatchIterator};
use arrow::datatypes::Float64Type;
use arrow_array::{
record_batch, FixedSizeListArray, Float32Array, Int32Array, LargeStringArray, ListArray,
RecordBatch, RecordBatchIterator,
};
use arrow_schema::{ArrowError, DataType, Field, Schema};
use futures::TryStreamExt;
use lance::dataset::{WriteMode, WriteParams};
@@ -94,8 +218,10 @@ mod tests {
EmbeddingDefinition, EmbeddingFunction, EmbeddingRegistry, MemoryRegistry,
};
use crate::query::{ExecutableQuery, QueryBase, Select};
use crate::table::add_data::NaNVectorBehavior;
use crate::table::{ColumnDefinition, ColumnKind, Table, TableDefinition, WriteOptions};
use crate::test_utils::embeddings::MockEmbed;
use crate::test_utils::TestCustomError;
use crate::Error;
use super::AddDataMode;
@@ -160,17 +286,20 @@ mod tests {
test_add_with_data(stream).await;
}
#[derive(Debug)]
struct MyError;
impl std::fmt::Display for MyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "MyError occurred")
}
fn assert_preserves_external_error(err: &Error) {
assert!(
matches!(err, Error::External { source } if source.downcast_ref::<TestCustomError>().is_some()),
"Expected Error::External, got: {err:?}"
);
// The original TestCustomError message should be preserved through the
// error chain, even if the error gets wrapped multiple times by
// lance's insert pipeline.
assert!(
err.to_string().contains("TestCustomError occurred"),
"Expected original error message to be preserved, got: {err}"
);
}
impl std::error::Error for MyError {}
#[tokio::test]
async fn test_add_preserves_reader_error() {
let table = create_test_table().await;
@@ -178,7 +307,7 @@ mod tests {
let schema = first_batch.schema();
let iterator = vec![
Ok(first_batch),
Err(ArrowError::ExternalError(Box::new(MyError))),
Err(ArrowError::ExternalError(Box::new(TestCustomError))),
];
let reader: Box<dyn arrow_array::RecordBatchReader + Send> = Box::new(
RecordBatchIterator::new(iterator.into_iter(), schema.clone()),
@@ -186,7 +315,7 @@ mod tests {
let result = table.add(reader).execute().await;
assert!(result.is_err());
assert_preserves_external_error(&result.unwrap_err());
}
#[tokio::test]
@@ -197,7 +326,7 @@ mod tests {
let iterator = vec![
Ok(first_batch),
Err(Error::External {
source: Box::new(MyError),
source: Box::new(TestCustomError),
}),
];
let stream = futures::stream::iter(iterator);
@@ -208,7 +337,7 @@ mod tests {
let result = table.add(stream).execute().await;
assert!(result.is_err());
assert_preserves_external_error(&result.unwrap_err());
}
#[tokio::test]
@@ -340,4 +469,248 @@ mod tests {
assert_eq!(embedding_col.null_count(), 0);
}
}
#[tokio::test]
async fn test_add_casts_to_table_schema() {
let table_schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("text", DataType::Utf8, false),
Field::new(
"embedding",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4),
false,
),
]));
let input_schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false), // Upcast integer
Field::new("text", DataType::LargeUtf8, false), // Re-encode string
// Cast list of float64 to fixed-size list of float32
// (This will only work if the list size is correct. See the next test.)
Field::new(
"embedding",
DataType::List(Arc::new(Field::new("item", DataType::Float64, true))),
false,
),
]));
let db = connect("memory://").execute().await.unwrap();
let table = db
.create_empty_table("cast_test", table_schema.clone())
.execute()
.await
.unwrap();
let batch = RecordBatch::try_new(
input_schema,
vec![
Arc::new(Int32Array::from(vec![1, 2])),
Arc::new(LargeStringArray::from(vec!["hello", "world"])),
Arc::new(ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
Some(vec![0.1, 0.2, 0.3, 0.4].into_iter().map(Some)),
Some(vec![0.5, 0.6, 0.7, 0.8].into_iter().map(Some)),
])),
],
)
.unwrap();
table.add(batch).execute().await.unwrap();
let row_count = table.count_rows(None).await.unwrap();
assert_eq!(row_count, 2);
}
#[tokio::test]
async fn test_add_rejects_bad_vector_dimensions() {
let table_schema = Arc::new(Schema::new(vec![Field::new(
"embedding",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4),
false,
)]));
let input_schema = Arc::new(Schema::new(vec![Field::new(
"embedding",
DataType::List(Arc::new(Field::new("item", DataType::Float64, true))),
false,
)]));
let db = connect("memory://").execute().await.unwrap();
let table = db
.create_empty_table("cast_test", table_schema.clone())
.execute()
.await
.unwrap();
let batch = RecordBatch::try_new(
input_schema,
vec![Arc::new(
ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
Some(vec![0.1, 0.2, 0.3, 0.4].into_iter().map(Some)),
Some(vec![0.5, 0.6, 0.8].into_iter().map(Some)),
]),
)],
)
.unwrap();
let res = table.add(batch).execute().await;
// TODO: to recover the specific error, we will need a fix upstream in Lance.
// assert!(
// matches!(res, Err(Error::Arrow { source: ArrowError::CastError(_) })),
// "Expected schema mismatch error due to wrong vector dimensions, but got: {res:?}"
// );
assert!(
res.is_err(),
"Expected error due to wrong vector dimensions, but got success"
);
}
#[tokio::test]
async fn test_add_rejects_nan_vectors() {
let schema = Arc::new(Schema::new(vec![Field::new(
"embedding",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4),
false,
)]));
let db = connect("memory://").execute().await.unwrap();
let table = db
.create_empty_table("nan_test", schema.clone())
.execute()
.await
.unwrap();
let batch = RecordBatch::try_new(
schema,
vec![Arc::new(
FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
4,
Arc::new(Float32Array::from(vec![0.1, 0.2, f32::NAN, 0.4])),
None,
)
.unwrap(),
)],
)
.unwrap();
let res = table.add(batch.clone()).execute().await;
let err = res.unwrap_err();
assert!(
err.to_string().contains("NaN"),
"Expected error mentioning NaN values, but got: {err:?}"
);
table
.add(batch)
.on_nan_vectors(NaNVectorBehavior::Keep)
.execute()
.await
.unwrap();
let row_count = table.count_rows(None).await.unwrap();
assert_eq!(row_count, 1);
}
#[tokio::test]
async fn test_add_subschema() {
let data = record_batch!(("id", Int64, [4, 5]), ("text", Utf8, ["foo", "bar"])).unwrap();
let db = connect("memory://").execute().await.unwrap();
let table = db
.create_table("test", data.clone())
.execute()
.await
.unwrap();
let new_data = record_batch!(("id", Int64, [6, 7])).unwrap();
table.add(new_data).execute().await.unwrap();
assert_eq!(table.count_rows(None).await.unwrap(), 4);
assert_eq!(
table
.count_rows(Some("id IS NOT NULL".to_string()))
.await
.unwrap(),
4
);
assert_eq!(
table
.count_rows(Some("text IS NOT NULL".to_string()))
.await
.unwrap(),
2
);
// We can still cast
let new_data = record_batch!(("text", LargeUtf8, ["baz", "qux"])).unwrap();
table.add(new_data).execute().await.unwrap();
assert_eq!(table.count_rows(None).await.unwrap(), 6);
assert_eq!(
table
.count_rows(Some("id IS NOT NULL".to_string()))
.await
.unwrap(),
4
);
assert_eq!(
table
.count_rows(Some("text IS NOT NULL".to_string()))
.await
.unwrap(),
4
);
// Extra columns mean an error
let new_data =
record_batch!(("id", Int64, [8, 9]), ("extra", Utf8, ["extra1", "extra2"])).unwrap();
let res = table.add(new_data).execute().await;
assert!(
res.is_err(),
"Expected error due to extra column, but got: {res:?}"
);
// Insert with a subset of struct sub-fields
let struct_schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new(
"metadata",
DataType::Struct(
vec![
Field::new("a", DataType::Int64, true),
Field::new("b", DataType::Utf8, true),
]
.into(),
),
true,
),
]));
let db2 = connect("memory://").execute().await.unwrap();
let table2 = db2
.create_empty_table("struct_test", struct_schema)
.execute()
.await
.unwrap();
// Insert with only the "a" sub-field of the struct
let sub_struct_schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new(
"metadata",
DataType::Struct(vec![Field::new("a", DataType::Int64, true)].into()),
true,
),
]));
let struct_batch = RecordBatch::try_new(
sub_struct_schema,
vec![
Arc::new(arrow_array::Int64Array::from(vec![1, 2])),
Arc::new(arrow_array::StructArray::from(vec![(
Arc::new(Field::new("a", DataType::Int64, true)),
Arc::new(arrow_array::Int64Array::from(vec![10, 20]))
as Arc<dyn arrow_array::Array>,
)])),
],
)
.unwrap();
table2.add(struct_batch).execute().await.unwrap();
assert_eq!(table2.count_rows(None).await.unwrap(), 2);
}
}

View File

@@ -3,7 +3,10 @@
//! This module contains adapters to allow LanceDB tables to be used as DataFusion table providers.
pub mod cast;
pub mod insert;
pub mod reject_nan;
pub mod scannable_exec;
pub mod udtf;
use std::{collections::HashMap, sync::Arc};

View File

@@ -0,0 +1,498 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use std::sync::Arc;
use arrow_schema::{DataType, Field, FieldRef, Fields, Schema};
use datafusion::functions::core::{get_field, named_struct};
use datafusion_common::config::ConfigOptions;
use datafusion_common::ScalarValue;
use datafusion_physical_expr::expressions::{cast, Literal};
use datafusion_physical_expr::ScalarFunctionExpr;
use datafusion_physical_plan::expressions::Column;
use datafusion_physical_plan::projection::ProjectionExec;
use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr};
use crate::{Error, Result};
pub fn cast_to_table_schema(
input: Arc<dyn ExecutionPlan>,
table_schema: &Schema,
) -> Result<Arc<dyn ExecutionPlan>> {
let input_schema = input.schema();
if input_schema.fields() == table_schema.fields() {
return Ok(input);
}
let exprs = build_field_exprs(
input_schema.fields(),
table_schema.fields(),
&|idx| Arc::new(Column::new(input_schema.field(idx).name(), idx)) as Arc<dyn PhysicalExpr>,
&input_schema,
)?;
let exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = exprs
.into_iter()
.map(|(expr, field)| (expr, field.name().clone()))
.collect();
let projection = ProjectionExec::try_new(exprs, input).map_err(crate::Error::from)?;
Ok(Arc::new(projection))
}
/// Build expressions to project input fields to match the table schema.
///
/// For each table field that exists in the input, produce an expression that
/// reads from the input and casts if needed. Fields in the table but not in the
/// input are omitted (the storage layer handles missing columns).
fn build_field_exprs(
input_fields: &Fields,
table_fields: &Fields,
get_input_expr: &dyn Fn(usize) -> Arc<dyn PhysicalExpr>,
input_schema: &Schema,
) -> Result<Vec<(Arc<dyn PhysicalExpr>, FieldRef)>> {
let config = Arc::new(ConfigOptions::default());
let mut result = Vec::new();
for table_field in table_fields {
let Some(input_idx) = input_fields
.iter()
.position(|f| f.name() == table_field.name())
else {
continue;
};
let input_field = &input_fields[input_idx];
let input_expr = get_input_expr(input_idx);
let expr = match (input_field.data_type(), table_field.data_type()) {
// Both are structs: recurse into sub-fields to handle subschemas and casts.
(DataType::Struct(in_children), DataType::Struct(tbl_children))
if in_children != tbl_children =>
{
let sub_exprs = build_field_exprs(
in_children,
tbl_children,
&|child_idx| {
let child_name = in_children[child_idx].name();
Arc::new(ScalarFunctionExpr::new(
&format!("get_field({child_name})"),
get_field(),
vec![
input_expr.clone(),
Arc::new(Literal::new(ScalarValue::from(child_name.as_str()))),
],
Arc::new(in_children[child_idx].as_ref().clone()),
config.clone(),
)) as Arc<dyn PhysicalExpr>
},
input_schema,
)?;
let output_struct_fields: Fields = sub_exprs
.iter()
.map(|(_, f)| f.clone())
.collect::<Vec<_>>()
.into();
let output_field: FieldRef = Arc::new(Field::new(
table_field.name(),
DataType::Struct(output_struct_fields),
table_field.is_nullable(),
));
// Build named_struct(lit("a"), expr_a, lit("b"), expr_b, ...)
let mut ns_args: Vec<Arc<dyn PhysicalExpr>> = Vec::new();
for (sub_expr, sub_field) in &sub_exprs {
ns_args.push(Arc::new(Literal::new(ScalarValue::from(
sub_field.name().as_str(),
))));
ns_args.push(sub_expr.clone());
}
let ns_expr: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
&format!("named_struct({})", table_field.name()),
named_struct(),
ns_args,
output_field.clone(),
config.clone(),
));
result.push((ns_expr, output_field));
continue;
}
// Types match: pass through.
(inp, tbl) if inp == tbl => input_expr,
// Types differ: cast.
_ => cast(input_expr, input_schema, table_field.data_type().clone()).map_err(|e| {
Error::InvalidInput {
message: format!(
"cannot cast field '{}' from {} to {}: {}",
table_field.name(),
input_field.data_type(),
table_field.data_type(),
e
),
}
})?,
};
let output_field = Arc::new(Field::new(
table_field.name(),
table_field.data_type().clone(),
table_field.is_nullable(),
));
result.push((expr, output_field));
}
Ok(result)
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_array::{
Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, StructArray,
};
use arrow_schema::{DataType, Field, Schema};
use datafusion::prelude::SessionContext;
use datafusion_catalog::MemTable;
use futures::TryStreamExt;
use super::cast_to_table_schema;
async fn plan_from_batch(
batch: RecordBatch,
) -> Arc<dyn datafusion_physical_plan::ExecutionPlan> {
let schema = batch.schema();
let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
let ctx = SessionContext::new();
ctx.register_table("t", Arc::new(table)).unwrap();
let df = ctx.table("t").await.unwrap();
df.create_physical_plan().await.unwrap()
}
async fn collect(plan: Arc<dyn datafusion_physical_plan::ExecutionPlan>) -> RecordBatch {
let ctx = SessionContext::new();
let stream = plan.execute(0, ctx.task_ctx()).unwrap();
let batches: Vec<RecordBatch> = stream.try_collect().await.unwrap();
arrow_select::concat::concat_batches(&plan.schema(), &batches).unwrap()
}
#[tokio::test]
async fn test_noop_when_schemas_match() {
let schema = Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("b", DataType::Utf8, false),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Int32Array::from(vec![1, 2])),
Arc::new(StringArray::from(vec!["x", "y"])),
],
)
.unwrap();
let input = plan_from_batch(batch).await;
let input_ptr = Arc::as_ptr(&input);
let result = cast_to_table_schema(input, &schema).unwrap();
assert_eq!(Arc::as_ptr(&result), input_ptr);
}
#[tokio::test]
async fn test_simple_type_cast() {
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("val", DataType::Float32, false),
])),
vec![
Arc::new(Int32Array::from(vec![1, 2, 3])),
Arc::new(Float32Array::from(vec![1.5, 2.5, 3.5])),
],
)
.unwrap();
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("val", DataType::Float64, false),
]);
let plan = plan_from_batch(input_batch).await;
let casted = cast_to_table_schema(plan, &table_schema).unwrap();
let result = collect(casted).await;
assert_eq!(result.schema().field(0).data_type(), &DataType::Int64);
assert_eq!(result.schema().field(1).data_type(), &DataType::Float64);
let ids: &Int64Array = result.column(0).as_any().downcast_ref().unwrap();
assert_eq!(ids.values(), &[1, 2, 3]);
let vals: &Float64Array = result.column(1).as_any().downcast_ref().unwrap();
assert!((vals.value(0) - 1.5).abs() < 1e-6);
assert!((vals.value(1) - 2.5).abs() < 1e-6);
assert!((vals.value(2) - 3.5).abs() < 1e-6);
}
#[tokio::test]
async fn test_missing_table_field_skipped() {
// Input has "a", table expects "a" and "b". "b" is omitted from the
// projection since the storage layer fills in missing columns.
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
vec![Arc::new(Int32Array::from(vec![10, 20]))],
)
.unwrap();
let table_schema = Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("b", DataType::Utf8, true),
]);
let plan = plan_from_batch(input_batch).await;
let casted = cast_to_table_schema(plan, &table_schema).unwrap();
let result = collect(casted).await;
assert_eq!(result.num_columns(), 1);
assert_eq!(result.schema().field(0).name(), "a");
}
#[tokio::test]
async fn test_extra_input_fields_dropped() {
// Input has "a" and "extra"; table only expects "a".
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("extra", DataType::Utf8, false),
])),
vec![
Arc::new(Int32Array::from(vec![1, 2])),
Arc::new(StringArray::from(vec!["x", "y"])),
],
)
.unwrap();
let table_schema = Schema::new(vec![Field::new("a", DataType::Int64, false)]);
let plan = plan_from_batch(input_batch).await;
let casted = cast_to_table_schema(plan, &table_schema).unwrap();
let result = collect(casted).await;
assert_eq!(result.num_columns(), 1);
assert_eq!(result.schema().field(0).name(), "a");
assert_eq!(result.schema().field(0).data_type(), &DataType::Int64);
}
#[tokio::test]
async fn test_reorders_to_table_schema() {
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("b", DataType::Utf8, false),
Field::new("a", DataType::Int32, false),
])),
vec![
Arc::new(StringArray::from(vec!["x", "y"])),
Arc::new(Int32Array::from(vec![1, 2])),
],
)
.unwrap();
let table_schema = Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("b", DataType::Utf8, false),
]);
let plan = plan_from_batch(input_batch).await;
let casted = cast_to_table_schema(plan, &table_schema).unwrap();
let result = collect(casted).await;
assert_eq!(result.schema().field(0).name(), "a");
assert_eq!(result.schema().field(1).name(), "b");
let a: &Int32Array = result.column(0).as_any().downcast_ref().unwrap();
assert_eq!(a.values(), &[1, 2]);
let b: &StringArray = result.column(1).as_any().downcast_ref().unwrap();
assert_eq!(b.value(0), "x");
}
#[tokio::test]
async fn test_struct_subfield_cast() {
// Input struct has {x: Int32, y: Int32}, table expects {x: Int64, y: Int64}.
let inner_fields = vec![
Field::new("x", DataType::Int32, false),
Field::new("y", DataType::Int32, false),
];
let struct_array = StructArray::from(vec![
(
Arc::new(inner_fields[0].clone()),
Arc::new(Int32Array::from(vec![1, 2])) as _,
),
(
Arc::new(inner_fields[1].clone()),
Arc::new(Int32Array::from(vec![3, 4])) as _,
),
]);
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new(
"s",
DataType::Struct(inner_fields.into()),
false,
)])),
vec![Arc::new(struct_array)],
)
.unwrap();
let table_inner = vec![
Field::new("x", DataType::Int64, false),
Field::new("y", DataType::Int64, false),
];
let table_schema = Schema::new(vec![Field::new(
"s",
DataType::Struct(table_inner.into()),
false,
)]);
let plan = plan_from_batch(input_batch).await;
let casted = cast_to_table_schema(plan, &table_schema).unwrap();
let result = collect(casted).await;
let struct_col = result
.column(0)
.as_any()
.downcast_ref::<StructArray>()
.unwrap();
assert_eq!(struct_col.column(0).data_type(), &DataType::Int64);
assert_eq!(struct_col.column(1).data_type(), &DataType::Int64);
let x: &Int64Array = struct_col.column(0).as_any().downcast_ref().unwrap();
assert_eq!(x.values(), &[1, 2]);
let y: &Int64Array = struct_col.column(1).as_any().downcast_ref().unwrap();
assert_eq!(y.values(), &[3, 4]);
}
#[tokio::test]
async fn test_struct_subschema() {
// Input struct has {x, y, z}, table only expects {x, z}.
let inner_fields = vec![
Field::new("x", DataType::Int32, false),
Field::new("y", DataType::Int32, false),
Field::new("z", DataType::Int32, false),
];
let struct_array = StructArray::from(vec![
(
Arc::new(inner_fields[0].clone()),
Arc::new(Int32Array::from(vec![1, 2])) as _,
),
(
Arc::new(inner_fields[1].clone()),
Arc::new(Int32Array::from(vec![10, 20])) as _,
),
(
Arc::new(inner_fields[2].clone()),
Arc::new(Int32Array::from(vec![100, 200])) as _,
),
]);
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new(
"s",
DataType::Struct(inner_fields.into()),
false,
)])),
vec![Arc::new(struct_array)],
)
.unwrap();
let table_inner = vec![
Field::new("x", DataType::Int32, false),
Field::new("z", DataType::Int32, false),
];
let table_schema = Schema::new(vec![Field::new(
"s",
DataType::Struct(table_inner.into()),
false,
)]);
let plan = plan_from_batch(input_batch).await;
let casted = cast_to_table_schema(plan, &table_schema).unwrap();
let result = collect(casted).await;
let struct_col = result
.column(0)
.as_any()
.downcast_ref::<StructArray>()
.unwrap();
assert_eq!(struct_col.num_columns(), 2);
let x: &Int32Array = struct_col
.column_by_name("x")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(x.values(), &[1, 2]);
let z: &Int32Array = struct_col
.column_by_name("z")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(z.values(), &[100, 200]);
}
#[tokio::test]
async fn test_incompatible_cast_errors() {
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Binary, false)])),
vec![Arc::new(arrow_array::BinaryArray::from_vec(vec![b"hi"]))],
)
.unwrap();
let table_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
let plan = plan_from_batch(input_batch).await;
let result = cast_to_table_schema(plan, &table_schema);
assert!(result.is_err());
let err_msg = result.unwrap_err().to_string();
assert!(
err_msg.contains("cannot cast field 'a'"),
"unexpected error: {err_msg}"
);
}
#[tokio::test]
async fn test_mixed_cast_and_passthrough() {
// "a" needs cast (Int32→Int64), "b" passes through unchanged.
let input_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("b", DataType::Utf8, false),
])),
vec![
Arc::new(Int32Array::from(vec![7, 8])),
Arc::new(StringArray::from(vec!["hello", "world"])),
],
)
.unwrap();
let table_schema = Schema::new(vec![
Field::new("a", DataType::Int64, false),
Field::new("b", DataType::Utf8, false),
]);
let plan = plan_from_batch(input_batch).await;
let casted = cast_to_table_schema(plan, &table_schema).unwrap();
let result = collect(casted).await;
assert_eq!(result.schema().field(0).data_type(), &DataType::Int64);
assert_eq!(result.schema().field(1).data_type(), &DataType::Utf8);
let a: &Int64Array = result.column(0).as_any().downcast_ref().unwrap();
assert_eq!(a.values(), &[7, 8]);
let b: &StringArray = result.column(1).as_any().downcast_ref().unwrap();
assert_eq!(b.value(0), "hello");
assert_eq!(b.value(1), "world");
}
}

View File

@@ -0,0 +1,269 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! A DataFusion projection that rejects vectors containing NaN values.
use std::any::Any;
use std::sync::{Arc, LazyLock};
use arrow_array::{Array, FixedSizeListArray};
use arrow_schema::{DataType, Field, FieldRef};
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
use datafusion_physical_expr::ScalarFunctionExpr;
use datafusion_physical_plan::expressions::Column;
use datafusion_physical_plan::projection::ProjectionExec;
use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr};
use crate::{Error, Result};
static REJECT_NAN_UDF: LazyLock<Arc<datafusion_expr::ScalarUDF>> =
LazyLock::new(|| Arc::new(datafusion_expr::ScalarUDF::from(RejectNanUdf::new())));
/// Returns true if the field is a vector column: FixedSizeList<Float16/32/64>.
fn is_vector_field(field: &Field) -> bool {
if let DataType::FixedSizeList(child, _) = field.data_type() {
matches!(
child.data_type(),
DataType::Float16 | DataType::Float32 | DataType::Float64
)
} else {
false
}
}
/// Wraps the input plan with a projection that checks vector columns for NaN values.
///
/// Non-vector columns pass through unchanged. Vector columns are wrapped with a
/// UDF that returns the column as-is if no NaNs are present, or errors otherwise.
pub fn reject_nan_vectors(input: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
let schema = input.schema();
let config = Arc::new(ConfigOptions::default());
let udf = REJECT_NAN_UDF.clone();
let mut has_vector_cols = false;
let mut exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = Vec::new();
for (idx, field) in schema.fields().iter().enumerate() {
let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new(field.name(), idx));
if is_vector_field(field) {
has_vector_cols = true;
let wrapped: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
&format!("reject_nan({})", field.name()),
udf.clone(),
vec![col_expr],
Arc::clone(field) as FieldRef,
config.clone(),
));
exprs.push((wrapped, field.name().clone()));
} else {
exprs.push((col_expr, field.name().clone()));
}
}
if !has_vector_cols {
return Ok(input);
}
let projection = ProjectionExec::try_new(exprs, input).map_err(Error::from)?;
Ok(Arc::new(projection))
}
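
For context, a hedged end-to-end sketch in the style of this PR's cast tests: build a single-batch DataFusion plan, wrap it with `reject_nan_vectors`, and observe the NaN surfacing as an execution-time error. The test name and wiring are illustrative:

```rust
// Illustrative in-module test sketch (name and wiring hypothetical):
// exercise reject_nan_vectors end to end using the same MemTable helpers
// as the cast tests in this PR.
#[tokio::test]
async fn reject_nan_plan_sketch() {
    use std::sync::Arc;

    use arrow_array::{FixedSizeListArray, Float32Array, RecordBatch};
    use arrow_schema::{DataType, Field, Schema};
    use datafusion::prelude::SessionContext;
    use datafusion_catalog::MemTable;
    use futures::TryStreamExt;

    let item = Arc::new(Field::new("item", DataType::Float32, true));
    let schema = Arc::new(Schema::new(vec![Field::new(
        "v",
        DataType::FixedSizeList(item.clone(), 2),
        false,
    )]));
    let vector = FixedSizeListArray::try_new(
        item,
        2,
        Arc::new(Float32Array::from(vec![1.0, f32::NAN])),
        None,
    )
    .unwrap();
    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(vector)]).unwrap();

    let ctx = SessionContext::new();
    let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
    ctx.register_table("t", Arc::new(table)).unwrap();
    let plan = ctx
        .table("t")
        .await
        .unwrap()
        .create_physical_plan()
        .await
        .unwrap();

    // The NaN is only detected when the wrapped projection executes.
    let wrapped = reject_nan_vectors(plan).unwrap();
    let stream = wrapped.execute(0, ctx.task_ctx()).unwrap();
    let collected: datafusion_common::Result<Vec<RecordBatch>> = stream.try_collect().await;
    assert!(collected.is_err());
}
```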
/// A scalar UDF that passes through FixedSizeList arrays unchanged, but errors
/// if any float values in the list are NaN.
#[derive(Debug, Hash, PartialEq, Eq)]
struct RejectNanUdf {
signature: Signature,
}
impl RejectNanUdf {
fn new() -> Self {
Self {
signature: Signature::any(1, Volatility::Immutable),
}
}
}
impl ScalarUDFImpl for RejectNanUdf {
fn as_any(&self) -> &dyn Any {
self
}
fn name(&self) -> &str {
"reject_nan"
}
fn signature(&self) -> &Signature {
&self.signature
}
fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(arg_types[0].clone())
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let arg = &args.args[0];
match arg {
ColumnarValue::Array(array) => {
check_no_nans(array)?;
Ok(ColumnarValue::Array(array.clone()))
}
ColumnarValue::Scalar(_) => Ok(arg.clone()),
}
}
}
fn check_no_nans(array: &dyn Array) -> datafusion_common::Result<()> {
let fsl = array
.as_any()
.downcast_ref::<FixedSizeListArray>()
.ok_or_else(|| {
datafusion_common::DataFusionError::Internal(
"reject_nan expected FixedSizeList".to_string(),
)
})?;
// Only inspect elements that are both in a valid parent row and non-null
// themselves. Values backing null parent rows or null child elements may
// contain garbage (including NaN) per the Arrow spec.
let has_nan = (0..fsl.len()).filter(|i| fsl.is_valid(*i)).any(|i| {
let row = fsl.value(i);
match row.data_type() {
DataType::Float16 => row
.as_any()
.downcast_ref::<arrow_array::Float16Array>()
.unwrap()
.iter()
.any(|v| v.is_some_and(|v| v.is_nan())),
DataType::Float32 => row
.as_any()
.downcast_ref::<arrow_array::Float32Array>()
.unwrap()
.iter()
.any(|v| v.is_some_and(|v| v.is_nan())),
DataType::Float64 => row
.as_any()
.downcast_ref::<arrow_array::Float64Array>()
.unwrap()
.iter()
.any(|v| v.is_some_and(|v| v.is_nan())),
_ => false,
}
});
if has_nan {
return Err(datafusion_common::DataFusionError::ArrowError(
Box::new(arrow_schema::ArrowError::ComputeError(
"Vector column contains NaN values".to_string(),
)),
None,
));
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use arrow_array::Float32Array;
#[test]
fn test_passes_clean_vectors() {
let fsl = FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
2,
Arc::new(Float32Array::from(vec![1.0, 2.0, 3.0, 4.0])),
None,
)
.unwrap();
assert!(check_no_nans(&fsl).is_ok());
}
#[test]
fn test_rejects_nan_vectors() {
let fsl = FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
2,
Arc::new(Float32Array::from(vec![1.0, f32::NAN, 3.0, 4.0])),
None,
)
.unwrap();
assert!(check_no_nans(&fsl).is_err());
}
#[test]
fn test_skips_null_rows() {
// Values backing null rows may contain NaN per the Arrow spec.
// We should not reject a batch just because of garbage in null slots.
let values = Float32Array::from(vec![1.0, 2.0, f32::NAN, f32::NAN]);
let fsl = FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
2,
Arc::new(values),
// Row 0 is valid [1.0, 2.0], row 1 is null [NAN, NAN]
Some(vec![true, false].into()),
)
.unwrap();
assert!(fsl.is_null(1));
assert!(check_no_nans(&fsl).is_ok());
}
#[test]
fn test_skips_null_elements_within_valid_row() {
// A valid row with null child elements: the underlying buffer may hold
// NaN but the null bitmap says they're absent — should not reject.
let values = Float32Array::from(vec![
Some(1.0),
None, // null element — buffer may contain NaN
Some(3.0),
None, // null element
]);
let fsl = FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
2,
Arc::new(values),
None, // both rows are valid
)
.unwrap();
assert!(check_no_nans(&fsl).is_ok());
}
#[test]
fn test_rejects_nan_in_valid_row_with_nulls_present() {
// Row 0 is null, row 1 is valid but contains NaN — should reject.
let values = Float32Array::from(vec![0.0, 0.0, 1.0, f32::NAN]);
let fsl = FixedSizeListArray::try_new(
Arc::new(Field::new("item", DataType::Float32, true)),
2,
Arc::new(values),
Some(vec![false, true].into()),
)
.unwrap();
assert!(check_no_nans(&fsl).is_err());
}
#[test]
fn test_is_vector_field() {
assert!(is_vector_field(&Field::new(
"v",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4),
false,
)));
assert!(is_vector_field(&Field::new(
"v",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float64, true)), 4),
false,
)));
assert!(!is_vector_field(&Field::new("id", DataType::Int32, false)));
assert!(!is_vector_field(&Field::new(
"v",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 4),
false,
)));
}
}

View File

@@ -0,0 +1,118 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use core::fmt;
use std::sync::{Arc, Mutex};
use datafusion_common::{stats::Precision, DataFusionError, Result as DFResult, Statistics};
use datafusion_execution::{SendableRecordBatchStream, TaskContext};
use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
use datafusion_physical_plan::{
execution_plan::EmissionType, DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
};
use crate::{arrow::SendableRecordBatchStreamExt, data::scannable::Scannable};
pub struct ScannableExec {
// We don't require Scannable to be Sync, so we wrap it in a Mutex to allow safe concurrent access.
source: Mutex<Box<dyn Scannable>>,
num_rows: Option<usize>,
properties: PlanProperties,
}
impl std::fmt::Debug for ScannableExec {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ScannableExec")
.field("schema", &self.schema())
.field("num_rows", &self.num_rows)
.finish()
}
}
impl ScannableExec {
pub fn new(source: Box<dyn Scannable>) -> Self {
let schema = source.schema();
let eq_properties = EquivalenceProperties::new(schema);
let properties = PlanProperties::new(
eq_properties,
Partitioning::UnknownPartitioning(1),
EmissionType::Incremental,
datafusion_physical_plan::execution_plan::Boundedness::Bounded,
);
let num_rows = source.num_rows();
let source = Mutex::new(source);
Self {
source,
num_rows,
properties,
}
}
}
impl DisplayAs for ScannableExec {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ScannableExec: num_rows={:?}", self.num_rows)
}
}
impl ExecutionPlan for ScannableExec {
fn name(&self) -> &str {
"ScannableExec"
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn properties(&self) -> &PlanProperties {
&self.properties
}
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
vec![]
}
fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> DFResult<Arc<dyn ExecutionPlan>> {
if !children.is_empty() {
return Err(DataFusionError::Internal(
"ScannableExec does not have children".to_string(),
));
}
Ok(self)
}
fn execute(
&self,
partition: usize,
_context: Arc<TaskContext>,
) -> DFResult<SendableRecordBatchStream> {
if partition != 0 {
return Err(DataFusionError::Internal(format!(
"ScannableExec only supports partition 0, got {}",
partition
)));
}
let stream = match self.source.lock() {
Ok(mut guard) => guard.scan_as_stream(),
Err(poison) => poison.into_inner().scan_as_stream(),
};
Ok(stream.into_df_stream())
}
fn partition_statistics(&self, _partition: Option<usize>) -> DFResult<Statistics> {
Ok(Statistics {
num_rows: self
.num_rows
.map(Precision::Exact)
.unwrap_or(Precision::Absent),
total_byte_size: Precision::Absent,
column_statistics: vec![],
})
}
}

View File

@@ -5,6 +5,7 @@ use std::sync::Arc;
use super::NativeTable;
use crate::error::{Error, Result};
use crate::expr::expr_to_sql_string;
use crate::query::{
QueryExecutionOptions, QueryFilter, QueryRequest, Select, VectorQueryRequest, DEFAULT_TOP_K,
};
@@ -452,14 +453,12 @@ fn convert_to_namespace_query(query: &AnyQuery) -> Result<NsQueryTableRequest> {
fn filter_to_sql(filter: &QueryFilter) -> Result<String> {
match filter {
QueryFilter::Sql(sql) => Ok(sql.clone()),
QueryFilter::Substrait(_) => Err(Error::NotSupported {
message: "Substrait filters are not supported for server-side queries".to_string(),
}),
QueryFilter::Datafusion(_) => Err(Error::NotSupported {
message: "Datafusion expression filters are not supported for server-side queries. Use SQL filter instead.".to_string(),
}),
}
QueryFilter::Sql(sql) => Ok(sql.clone()),
QueryFilter::Substrait(_) => Err(Error::NotSupported {
message: "Substrait filters are not supported for server-side queries".to_string(),
}),
QueryFilter::Datafusion(expr) => expr_to_sql_string(expr),
}
}
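
`expr_to_sql_string` itself lives in `crate::expr` and is not shown in this diff. DataFusion's unparser provides this kind of conversion, so a plausible standalone sketch (assuming `datafusion::sql::unparser`; the crate's actual helper may differ in dialect or quoting) is:

```rust
use datafusion::sql::unparser::expr_to_sql;
use datafusion_expr::{col, lit, Expr};

// Hedged sketch: render a typed DataFusion filter back to SQL text, the
// way the Datafusion arm above needs for server-side queries.
fn filter_sql_sketch() -> datafusion_common::Result<String> {
    let filter: Expr = col("id").gt(lit(10)).and(col("text").eq(lit("foo")));
    // Produces something along the lines of: (id > 10) AND (text = 'foo')
    Ok(expr_to_sql(&filter)?.to_string())
}
```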
/// Extract query vector(s) from Arrow arrays into the namespace format.

View File

@@ -4,3 +4,14 @@
pub mod connection;
pub mod datagen;
pub mod embeddings;
#[derive(Debug)]
pub struct TestCustomError;
impl std::fmt::Display for TestCustomError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "TestCustomError occurred")
}
}
impl std::error::Error for TestCustomError {}