Compare commits

..

1 Commits

Author SHA1 Message Date
lancedb automation
2974b7e5c6 chore: update lance dependency to v6.0.0-beta.6 2026-04-29 07:27:24 +00:00
127 changed files with 16454 additions and 20805 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.29.1-beta.0"
current_version = "0.28.0-beta.10"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -45,9 +45,7 @@ jobs:
- name: Set up Node.js
uses: actions/setup-node@v4
with:
# pnpm 11 (used by the nodejs install step below) requires
# Node >= 22.13; use 24 since 22 hits EOL in October.
node-version: 24
node-version: 20
- name: Install Codex CLI
run: npm install -g @openai/codex
@@ -81,14 +79,10 @@ jobs:
java-version: '11'
cache: maven
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Install Node.js dependencies for TypeScript bindings
run: |
cd nodejs
pnpm install --frozen-lockfile
npm ci
- name: Configure git user
run: |
@@ -143,7 +137,7 @@ jobs:
- For Rust test failures: Run the specific test with "cargo test -p <crate> <test_name>"
- For Python test failures: Build with "cd python && maturin develop" then run "pytest <specific_test_file>::<test_name>"
- For Java test failures: Run "cd java && mvn test -Dtest=<TestClass>#<testMethod>"
- For TypeScript test failures: Run "cd nodejs && pnpm build && pnpm test -- --testNamePattern='<test_name>'"
- For TypeScript test failures: Run "cd nodejs && npm run build && npm test -- --testNamePattern='<test_name>'"
- Do NOT run the full test suite - only run the tests that were failing
7. If the additional guidelines are provided, follow them as well.

View File

@@ -43,7 +43,7 @@ jobs:
server-username: SONATYPE_USER
server-password: SONATYPE_TOKEN
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
gpg-passphrase: MAVEN_GPG_PASSPHRASE
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
- name: Set git config
run: |
git config --global user.email "dev+gha@lancedb.com"
@@ -58,11 +58,10 @@ jobs:
echo "use-agent" >> ~/.gnupg/gpg.conf
echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
export GPG_TTY=$(tty)
./mvnw --batch-mode -DskipTests -DpushChanges=false deploy -pl lancedb-core -am -P deploy-to-ossrh
./mvnw --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -pl lancedb-core -am -P deploy-to-ossrh
env:
SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
report-failure:
name: Report Workflow Failure

View File

@@ -42,17 +42,11 @@ jobs:
with:
fetch-depth: 0
lfs: true
- uses: pnpm/action-setup@v4
with:
version: 11.1.1
- uses: actions/setup-node@v4
with:
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October. The library itself still supports Node >= 18
# (see test matrix below).
node-version: 24
cache: 'pnpm'
cache-dependency-path: nodejs/pnpm-lock.yaml
node-version: 20
cache: 'npm'
cache-dependency-path: nodejs/package-lock.json
- uses: actions-rust-lang/setup-rust-toolchain@v1
with:
components: rustfmt, clippy
@@ -67,13 +61,11 @@ jobs:
run: cargo clippy --profile ci --all --all-features -- -D warnings
- name: Lint Typescript
run: |
pnpm install --frozen-lockfile
pnpm lint-ci
npm ci
npm run lint-ci
- name: Lint examples
working-directory: nodejs/examples
# The `@lancedb/lancedb` dep points at file:../dist; pnpm errors if
# that dir is missing, so create an empty one for lint-only runs.
run: mkdir -p ../dist && pnpm install --frozen-lockfile && pnpm lint-ci
run: npm ci && npm run lint-ci
linux:
name: Linux (NodeJS ${{ matrix.node-version }})
timeout-minutes: 30
@@ -90,18 +82,14 @@ jobs:
with:
fetch-depth: 0
lfs: true
- uses: pnpm/action-setup@v4
with:
version: 11.1.1
- uses: actions/setup-node@v4
name: Setup Node.js 24 for build
name: Setup Node.js 20 for build
with:
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October. Build/install runs on Node 24; tests run on the
# matrix version below using direct jest invocation.
node-version: 24
cache: 'pnpm'
cache-dependency-path: nodejs/pnpm-lock.yaml
# @napi-rs/cli v3 requires Node >= 20.12 (via @inquirer/prompts@8).
# Build always on Node 20; tests run on the matrix version below.
node-version: 20
cache: 'npm'
cache-dependency-path: nodejs/package-lock.json
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
@@ -109,52 +97,45 @@ jobs:
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
pnpm install --frozen-lockfile
# No `--` separator: pnpm forwards it literally, which would
# make napi-rs treat `--profile ci` as a cargo passthrough arg.
pnpm build:debug --profile ci
pnpm tsc
- name: Setup examples
working-directory: nodejs/examples
run: pnpm install --frozen-lockfile
- name: Check docs
run: |
# We run this as part of the job because the binary needs to be built
# first to export the types of the native code.
set -e
# `pnpm docs` would invoke pnpm's built-in `docs` command, not
# the script — use `pnpm run docs`.
pnpm run docs
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
echo "Docs need to be updated"
echo "Run 'pnpm run docs', fix any warnings, and commit the changes."
exit 1
fi
npm ci --include=optional
npm run build:debug -- --profile ci
- uses: actions/setup-node@v4
name: Setup Node.js ${{ matrix.node-version }} for test
with:
node-version: ${{ matrix.node-version }}
- name: Compile TypeScript
run: npm run tsc
- name: Setup localstack
working-directory: .
run: docker compose up --detach --wait
- name: Test
env:
S3_TEST: "1"
# Newer @smithy/core uses dynamic ESM imports.
NODE_OPTIONS: "--experimental-vm-modules"
# Invoke jest directly because pnpm 11 itself requires Node 22+
# while the matrix tests on older Node versions.
run: npx jest --verbose
run: npm run test
- name: Setup examples
working-directory: nodejs/examples
run: npm ci
- name: Test examples
working-directory: ./
env:
OPENAI_API_KEY: test
OPENAI_BASE_URL: http://0.0.0.0:8000
NODE_OPTIONS: "--experimental-vm-modules"
run: |
python ci/mock_openai.py &
cd nodejs/examples
npx jest --testEnvironment jest-environment-node-single-context --verbose
npm test
- name: Check docs
run: |
# We run this as part of the job because the binary needs to be built
# first to export the types of the native code.
set -e
npm ci
npm run docs
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
echo "Docs need to be updated"
echo "Run 'npm run docs', fix any warnings, and commit the changes."
exit 1
fi
macos:
timeout-minutes: 30
runs-on: "macos-14"
@@ -167,28 +148,20 @@ jobs:
with:
fetch-depth: 0
lfs: true
- uses: pnpm/action-setup@v4
with:
version: 11.1.1
- uses: actions/setup-node@v4
with:
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October.
node-version: 24
cache: 'pnpm'
cache-dependency-path: nodejs/pnpm-lock.yaml
- uses: dtolnay/rust-toolchain@stable
node-version: 20
cache: 'npm'
cache-dependency-path: nodejs/package-lock.json
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
brew install protobuf
- name: Build
run: |
pnpm install --frozen-lockfile
# No `--` separator: pnpm forwards it literally, which would
# make napi-rs treat `--profile ci` as a cargo passthrough arg.
pnpm build:debug --profile ci
pnpm tsc
npm ci --include=optional
npm run build:debug -- --profile ci
npm run tsc
- name: Test
run: |
pnpm test
npm run test

View File

@@ -171,18 +171,13 @@ jobs:
working-directory: nodejs
steps:
- uses: actions/checkout@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Setup node
uses: actions/setup-node@v4
if: ${{ !matrix.settings.docker }}
with:
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October.
node-version: 24
cache: pnpm
cache-dependency-path: nodejs/pnpm-lock.yaml
node-version: 20
cache: npm
cache-dependency-path: nodejs/package-lock.json
- name: Install
uses: dtolnay/rust-toolchain@stable
if: ${{ !matrix.settings.docker }}
@@ -200,7 +195,7 @@ jobs:
target/
key: nodejs-${{ matrix.settings.target }}-cargo-${{ matrix.settings.host }}
- name: Install dependencies
run: pnpm install --frozen-lockfile
run: npm ci
- name: Install Zig
uses: mlugg/setup-zig@v2
if: ${{ contains(matrix.settings.target, 'musl') }}
@@ -253,7 +248,7 @@ jobs:
# one to do the upload.
- name: Make generic artifacts
if: ${{ matrix.settings.target == 'aarch64-apple-darwin' }}
run: pnpm tsc
run: npm run tsc
- name: Upload Generic Artifacts
if: ${{ matrix.settings.target == 'aarch64-apple-darwin' }}
uses: actions/upload-artifact@v4
@@ -288,24 +283,14 @@ jobs:
working-directory: nodejs
steps:
- uses: actions/checkout@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Setup Node.js 24 for install
uses: actions/setup-node@v4
with:
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October.
node-version: 24
cache: pnpm
cache-dependency-path: nodejs/pnpm-lock.yaml
- name: Install dependencies
run: pnpm install --frozen-lockfile
- name: Setup Node.js ${{ matrix.node }} for test
- name: Setup node
uses: actions/setup-node@v4
with:
node-version: ${{ matrix.node }}
cache: npm
cache-dependency-path: nodejs/package-lock.json
- name: Install dependencies
run: npm ci
- name: Download artifacts
uses: actions/download-artifact@v4
with:
@@ -326,9 +311,7 @@ jobs:
- name: Move built files
run: cp dist/native.d.ts dist/native.js dist/*.node lancedb/
- name: Test bindings
# Invoke jest directly because pnpm 11 itself requires Node 22+
# while the matrix tests on older Node versions.
run: npx jest --verbose
run: npm test
publish:
name: Publish
runs-on: ubuntu-latest
@@ -340,19 +323,15 @@ jobs:
- test-lancedb
steps:
- uses: actions/checkout@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Setup node
uses: actions/setup-node@v4
with:
node-version: 24
cache: pnpm
cache-dependency-path: nodejs/pnpm-lock.yaml
cache: npm
cache-dependency-path: nodejs/package-lock.json
registry-url: "https://registry.npmjs.org"
- name: Install dependencies
run: pnpm install --frozen-lockfile
run: npm ci
- uses: actions/download-artifact@v4
with:
name: nodejs-dist
@@ -372,7 +351,7 @@ jobs:
- name: Display structure of downloaded files
run: find dist && find nodejs-artifacts
- name: Move artifacts
run: pnpm exec napi artifacts -d nodejs-artifacts
run: npx napi artifacts -d nodejs-artifacts
- name: List packages
run: find npm
- name: Publish

2606
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -13,40 +13,40 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=6.0.0-beta.6", default-features = false, "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=6.0.0-beta.6", default-features = false, "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=6.0.0-beta.6", default-features = false, "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=6.0.0-beta.6", "tag" = "v6.0.0-beta.6", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false }
arrow-array = "58.0.0"
arrow-data = "58.0.0"
arrow-ipc = "58.0.0"
arrow-ord = "58.0.0"
arrow-schema = "58.0.0"
arrow-select = "58.0.0"
arrow-cast = "58.0.0"
arrow = { version = "58.1", optional = false }
arrow-array = "58.1"
arrow-data = "58.1"
arrow-ipc = "58.1"
arrow-ord = "58.1"
arrow-schema = "58.1"
arrow-select = "58.1"
arrow-cast = "58.1"
async-trait = "0"
datafusion = { version = "53.0.0", default-features = false }
datafusion-catalog = "53.0.0"
datafusion-common = { version = "53.0.0", default-features = false }
datafusion-execution = "53.0.0"
datafusion-expr = "53.0.0"
datafusion-functions = "53.0.0"
datafusion-physical-plan = "53.0.0"
datafusion-physical-expr = "53.0.0"
datafusion-sql = "53.0.0"
datafusion = { version = "53.1", default-features = false }
datafusion-catalog = "53.1"
datafusion-common = { version = "53.1", default-features = false }
datafusion-execution = "53.1"
datafusion-expr = "53.1"
datafusion-functions = "53.1"
datafusion-physical-plan = "53.1"
datafusion-physical-expr = "53.1"
datafusion-sql = "53.1"
env_logger = "0.11"
half = { "version" = "2.7.1", default-features = false, features = [
"num-traits",
@@ -54,7 +54,7 @@ half = { "version" = "2.7.1", default-features = false, features = [
futures = "0"
log = "0.4"
moka = { version = "0.12", features = ["future"] }
object_store = "0.13.2"
object_store = "0.12.0"
pin-project = "1.0.7"
rand = "0.9"
snafu = "0.8"

View File

@@ -51,18 +51,6 @@ ignore = [
# https://rustsec.org/advisories/RUSTSEC-2024-0436
{ id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" },
# encoding: unmaintained. Reached through lindera-dictionary, which is
# required by the native Lindera tokenizer path. Lindera has not migrated
# off this crate yet.
# https://rustsec.org/advisories/RUSTSEC-2021-0153
{ id = "RUSTSEC-2021-0153", reason = "transitive via lindera-dictionary for native Lindera tokenizer" },
# fast-float: unsound and unmaintained. Reached only through polars-arrow
# from the optional Polars integration; replacement requires a Polars
# dependency upgrade.
# https://rustsec.org/advisories/RUSTSEC-2024-0379
{ id = "RUSTSEC-2024-0379", reason = "transitive via polars-arrow; waiting on Polars migration" },
# tantivy: segfault on malformed input due to missing bounds check.
# Pulled in via lance for full-text search. We only feed tantivy
# documents we construct ourselves, not attacker-controlled bytes.
@@ -80,17 +68,11 @@ ignore = [
# https://rustsec.org/advisories/RUSTSEC-2025-0119
{ id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" },
# bincode: unmaintained. Reached through lindera and lindera-dictionary,
# which are required by the native Lindera tokenizer path. Lindera has not
# migrated to another serialization format yet.
# https://rustsec.org/advisories/RUSTSEC-2025-0141
{ id = "RUSTSEC-2025-0141", reason = "transitive via lindera/lindera-dictionary for native Lindera tokenizer" },
# lru: soundness issue in IterMut. Reached only through aws-sdk-s3 in
# LanceDB's dev-dependency graph; LanceDB does not use that iterator
# directly. Clearing this requires the AWS SDK chain to update lru.
# https://rustsec.org/advisories/RUSTSEC-2026-0002
{ id = "RUSTSEC-2026-0002", reason = "transitive via aws-sdk-s3 dev-dependency; waiting on AWS SDK lru upgrade" },
# rustls-pemfile: unmaintained. Reached from two separate chains:
# rustls-native-certs 0.6 (via hyper-rustls 0.24) and object_store 0.12.
# Both upstream dependencies need to move before we can drop it.
# https://rustsec.org/advisories/RUSTSEC-2025-0134
{ id = "RUSTSEC-2025-0134", reason = "transitive via rustls-native-certs/object_store; waiting on upstream migration" },
# rustls-webpki 0.101.7 (old major line): name-constraint checks for
# URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain
@@ -107,12 +89,6 @@ ignore = [
# we actively use is upgraded to 0.103.13 which contains the fix.
# https://rustsec.org/advisories/RUSTSEC-2026-0104
{ id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
# rand 0.8.5: soundness issue only when ThreadRng reseeds inside a custom
# logger. Reached through several transitive chains. LanceDB does not use
# rand from a custom logger; upgrade once all pinned chains accept 0.8.6+.
# https://rustsec.org/advisories/RUSTSEC-2026-0097
{ id = "RUSTSEC-2026-0097", reason = "transitive rand 0.8.5; LanceDB does not call ThreadRng from custom logging" },
]
# ---------------------------------------------------------------------------

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId>
<version>0.29.1-beta.0</version>
<version>0.28.0-beta.10</version>
</dependency>
```

View File

@@ -12,22 +12,20 @@ Typescript.
* `src/`: Rust bindings source code
* `lancedb/`: Typescript package source code
* `__test__/`: Unit tests
* `examples/`: A pnpm package with the examples shown in the documentation
* `examples/`: An npm package with the examples shown in the documentation
## Development environment
To set up your development environment, you will need to install the following:
1. Node.js 22 or later (required by pnpm 11)
2. [pnpm](https://pnpm.io/installation) 11 or later (or run via `corepack enable`,
which uses the `packageManager` field in `package.json`)
3. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
4. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
1. Node.js 20.12 or later (required by `@napi-rs/cli` v3, which is used to build the native bindings)
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
Initial setup:
```shell
pnpm install
npm install
```
### Commit Hooks
@@ -41,38 +39,38 @@ pre-commit install
## Development
Most common development commands can be run using the pnpm scripts.
Most common development commands can be run using the npm scripts.
Build the package
```shell
pnpm install
pnpm build
npm install
npm run build
```
Lint:
```shell
pnpm lint
npm run lint
```
Format and fix lints:
```shell
pnpm lint-fix
npm run lint-fix
```
Run tests:
```shell
pnpm test
npm test
```
To run a single test:
```shell
# Single file: table.test.ts
pnpm test -- table.test.ts
npm test -- table.test.ts
# Single test: 'merge insert' in table.test.ts
pnpm test -- table.test.ts --testNamePattern=merge\ insert
npm test -- table.test.ts --testNamePattern=merge\ insert
```

View File

@@ -148,33 +148,6 @@ Creates a new empty Table
***
### createNamespace()
```ts
abstract createNamespace(namespacePath, options?): Promise<CreateNamespaceResponse>
```
Create a new namespace at the given path.
#### Parameters
* **namespacePath**: `string`[]
The namespace path to create.
* **options?**: `Partial`&lt;[`CreateNamespaceOptions`](../interfaces/CreateNamespaceOptions.md)&gt;
Creation `mode`
("create" | "exist_ok" | "overwrite") and optional `properties`
to attach to the namespace.
#### Returns
`Promise`&lt;[`CreateNamespaceResponse`](../interfaces/CreateNamespaceResponse.md)&gt;
The properties of the
created namespace and an optional transaction id.
***
### createTable()
#### createTable(options, namespacePath)
@@ -257,29 +230,6 @@ Creates a new Table and initialize it with new data.
***
### describeNamespace()
```ts
abstract describeNamespace(namespacePath): Promise<DescribeNamespaceResponse>
```
Describe a namespace, returning its properties.
#### Parameters
* **namespacePath**: `string`[]
The namespace path to describe, in
parent → child order, e.g. `["analytics", "sales"]`.
#### Returns
`Promise`&lt;[`DescribeNamespaceResponse`](../interfaces/DescribeNamespaceResponse.md)&gt;
The namespace's properties
(may be undefined if the namespace has none).
***
### display()
```ts
@@ -313,36 +263,6 @@ Drop all tables in the database.
***
### dropNamespace()
```ts
abstract dropNamespace(namespacePath, options?): Promise<DropNamespaceResponse>
```
Drop a namespace.
Use `behavior: "cascade"` to also drop everything contained in the
namespace (sub-namespaces and tables). The default `"restrict"`
behavior refuses to drop a non-empty namespace.
#### Parameters
* **namespacePath**: `string`[]
The namespace path to drop.
* **options?**: `Partial`&lt;[`DropNamespaceOptions`](../interfaces/DropNamespaceOptions.md)&gt;
`mode` ("skip" | "fail"
for missing-namespace handling) and `behavior` ("restrict" | "cascade").
#### Returns
`Promise`&lt;[`DropNamespaceResponse`](../interfaces/DropNamespaceResponse.md)&gt;
Any properties returned by
the server and an optional transaction id.
***
### dropTable()
```ts
@@ -379,36 +299,6 @@ Return true if the connection has not been closed
***
### listNamespaces()
```ts
abstract listNamespaces(namespacePath?, options?): Promise<ListNamespacesResponse>
```
List the immediate child namespaces under the given parent.
Results may be paginated. To retrieve subsequent pages, pass the
`pageToken` returned by a previous call.
#### Parameters
* **namespacePath?**: `string`[]
The parent namespace path. Defaults
to the root namespace if omitted.
* **options?**: `Partial`&lt;[`ListNamespacesOptions`](../interfaces/ListNamespacesOptions.md)&gt;
Pagination options
(`pageToken`, `limit`).
#### Returns
`Promise`&lt;[`ListNamespacesResponse`](../interfaces/ListNamespacesResponse.md)&gt;
Child namespace names and
an optional token for fetching the next page.
***
### openTable()
```ts
@@ -437,29 +327,6 @@ Open a table in the database.
***
### renameTable()
```ts
abstract renameTable(
oldName,
newName,
namespacePath?): Promise<void>
```
#### Parameters
* **oldName**: `string`
* **newName**: `string`
* **namespacePath?**: `string`[]
#### Returns
`Promise`&lt;`void`&gt;
***
### tableNames()
#### tableNames(options)

View File

@@ -343,30 +343,6 @@ This is useful for pagination.
***
### orderBy()
```ts
orderBy(ordering): this
```
Sort the results by the specified column(s).
#### Parameters
* **ordering**: [`ColumnOrdering`](../interfaces/ColumnOrdering.md) \| [`ColumnOrdering`](../interfaces/ColumnOrdering.md)[]
#### Returns
`this`
This query builder.
#### Inherited from
`StandardQueryBase.orderBy`
***
### outputSchema()
```ts

View File

@@ -1,173 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / Scannable
# Class: Scannable
A data source that can be scanned as a stream of Arrow `RecordBatch`es.
`Scannable` wraps the schema + optional row count + rescannable flag and
a callback that yields batches one at a time. It is passed to consumers
(e.g. `Table.add`, `createTable`, `mergeInsert` — follow-up work) that
need to pull data without materializing the full dataset in JS memory.
Batches cross the JS↔Rust boundary as Arrow IPC Stream messages; a fresh
writer serializes each batch, and the Rust side decodes it with
`arrow_ipc::reader::StreamReader`. One batch is in flight at a time.
## Properties
### numRows
```ts
readonly numRows: null | number;
```
***
### rescannable
```ts
readonly rescannable: boolean;
```
***
### schema
```ts
readonly schema: Schema<any>;
```
## Methods
### fromFactory()
```ts
static fromFactory(
schema,
factory,
opts): Promise<Scannable>
```
Build a Scannable from an explicit schema and a factory that returns a
fresh batch iterator on each call.
The factory is invoked once per scan. Each iterator yields
`RecordBatch`es matching the declared schema. Use this when you need
direct control over the pull loop — for example, to wrap a streaming
source whose batches are produced lazily.
#### Parameters
* **schema**: `Schema`&lt;`any`&gt;
The Arrow schema of the produced batches.
* **factory**
Called at the start of each scan to produce a batch
iterator. Must be idempotent when `rescannable` is true.
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
Optional hints. `rescannable` defaults to `true`; set to
`false` if calling `factory()` twice would not reproduce the same data.
#### Returns
`Promise`&lt;[`Scannable`](Scannable.md)&gt;
***
### fromIterable()
```ts
static fromIterable(
schema,
iter,
opts): Promise<Scannable>
```
Build a Scannable from an iterable of `RecordBatch`es. `rescannable`
defaults to `false`. Pass an explicit schema so the consumer can
validate before any batch is pulled.
`opts.rescannable: true` is valid for replayable iterables (Arrays,
Sets, or custom iterables whose `[Symbol.iterator]()` returns a fresh
iterator each call). It is rejected for one-shot iterables (generators,
async generators, or already-an-iterator inputs) because their
`[Symbol.iterator]()` returns the same exhausted object on the second
scan. For replayable sources outside this shape, use
`fromFactory(schema, () => createIter(), { rescannable: true })`.
Note: when `opts.rescannable` is `true`, the constructor calls
`[Symbol.iterator]()` once on the input to perform the structural check.
#### Parameters
* **schema**: `Schema`&lt;`any`&gt;
* **iter**: `Iterable`&lt;`RecordBatch`&lt;`any`&gt;&gt; \| `AsyncIterable`&lt;`RecordBatch`&lt;`any`&gt;&gt;
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
#### Returns
`Promise`&lt;[`Scannable`](Scannable.md)&gt;
***
### fromRecordBatchReader()
```ts
static fromRecordBatchReader(reader, opts): Promise<Scannable>
```
Build a Scannable from an Arrow `RecordBatchReader`. A reader can only
be consumed once; `rescannable` defaults to `false`.
The reader must already be opened (via `.open()`) so its `.schema` is
populated. `RecordBatchReader.from(...)` returns an unopened reader.
`opts.rescannable: true` is rejected because `RecordBatchReader` is a
self-iterator (its `[Symbol.iterator]()` returns itself), and this
constructor does not call `reader.reset()` between scans, so a second
scan would always see an exhausted reader. For genuinely replayable
sources, use
`fromFactory(schema, () => openReader(), { rescannable: true })`,
which mints a fresh reader on each scan.
#### Parameters
* **reader**: `RecordBatchReader`&lt;`any`&gt;
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
#### Returns
`Promise`&lt;[`Scannable`](Scannable.md)&gt;
***
### fromTable()
```ts
static fromTable(table, opts): Promise<Scannable>
```
Build a Scannable from an in-memory Arrow `Table`. Always rescannable;
the table's batches are replayed on each scan.
The table's row count is authoritative: `opts.numRows` must either be
omitted or equal to `table.numRows`. `opts.rescannable` of `false` is
rejected because in-memory Tables are always rescannable.
#### Parameters
* **table**: `Table`&lt;`any`&gt;
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
#### Returns
`Promise`&lt;[`Scannable`](Scannable.md)&gt;

View File

@@ -501,34 +501,6 @@ Modeled after ``VACUUM`` in PostgreSQL.
***
### prewarmData()
```ts
abstract prewarmData(columns?): Promise<void>
```
Prewarm one or more columns of data in the table.
#### Parameters
* **columns?**: `string`[]
The columns to prewarm. If undefined, all columns are prewarmed.
This will load the column data into the page cache so that future queries that
read those columns avoid the initial cold-start latency. This call initiates
prewarming and returns once the request is accepted; the warming itself may
continue in the background. Calling it on already-prewarmed columns is a
no-op on the server.
Prewarming is generally useful for columns used in filters or projections.
Large columns (e.g. high-dimensional vectors or binary data) may not be
practical to prewarm.
This feature is currently only supported on remote tables.
#### Returns
`Promise`&lt;`void`&gt;
***
### prewarmIndex()
```ts
@@ -690,74 +662,6 @@ of the given query
***
### setLsmWriteSpec()
```ts
abstract setLsmWriteSpec(spec): Promise<void>
```
Install an [LsmWriteSpec](../interfaces/LsmWriteSpec.md) on this table, selecting Lance's MemWAL
LSM-style write path for future `mergeInsert` calls.
`LsmWriteSpec` chooses one of three sharding strategies via `specType`:
- `"bucket"` — hash-bucket writes by the single-column unenforced primary
key (`column` and `numBuckets` required).
- `"identity"` — shard by the raw value of a scalar `column`.
- `"unsharded"` — route every write to a single shard.
All variants require the table to have an unenforced primary key
([Table#setUnenforcedPrimaryKey](Table.md#setunenforcedprimarykey)); bucket sharding additionally
requires it to be the single column being bucketed.
#### Parameters
* **spec**: [`LsmWriteSpec`](../interfaces/LsmWriteSpec.md)
The sharding spec to install.
#### Returns
`Promise`&lt;`void`&gt;
#### Example
```ts
await table.setUnenforcedPrimaryKey("id");
await table.setLsmWriteSpec({
specType: "bucket",
column: "id",
numBuckets: 16,
maintainedIndexes: ["id_idx"],
});
```
***
### setUnenforcedPrimaryKey()
```ts
abstract setUnenforcedPrimaryKey(columns): Promise<void>
```
Set the unenforced primary key for this table to a single column.
"Unenforced" means LanceDB does not check uniqueness on writes; the
column is recorded in the schema as the primary key for use by features
such as `mergeInsert`. Only single-column primary keys are supported,
and the key cannot be changed once set.
#### Parameters
* **columns**: `string` \| `string`[]
The primary key column. A one-element
array is also accepted; passing more than one column is rejected.
#### Returns
`Promise`&lt;`void`&gt;
***
### stats()
```ts
@@ -861,23 +765,6 @@ Return the table as an arrow table
***
### unsetLsmWriteSpec()
```ts
abstract unsetLsmWriteSpec(): Promise<void>
```
Remove the [LsmWriteSpec](../interfaces/LsmWriteSpec.md) from this table, reverting to the standard
`mergeInsert` write path.
Errors if no spec is currently set.
#### Returns
`Promise`&lt;`void`&gt;
***
### update()
#### update(opts)

View File

@@ -498,30 +498,6 @@ This is useful for pagination.
***
### orderBy()
```ts
orderBy(ordering): this
```
Sort the results by the specified column(s).
#### Parameters
* **ordering**: [`ColumnOrdering`](../interfaces/ColumnOrdering.md) \| [`ColumnOrdering`](../interfaces/ColumnOrdering.md)[]
#### Returns
`this`
This query builder.
#### Inherited from
`StandardQueryBase.orderBy`
***
### outputSchema()
```ts

View File

@@ -1,131 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / connectNamespace
# Function: connectNamespace()
## connectNamespace(implName, config, options)
```ts
function connectNamespace(
implName,
config,
options?): Promise<Connection>
```
Connect to a LanceDB database through a namespace.
Unlike [connect](connect.md), which routes by URI scheme (local path vs.
`db://` cloud), `connectNamespace` always returns a namespace-backed
connection. The `implName` selects the namespace implementation:
- `"dir"` — directory namespace, configured with [DirNamespaceConfig](../interfaces/DirNamespaceConfig.md).
- `"rest"` — remote REST catalog, configured with [RestNamespaceConfig](../interfaces/RestNamespaceConfig.md).
- Any other string — full module path for a custom implementation,
configured with a free-form string-keyed `properties` map.
### Parameters
* **implName**: `"dir"`
* **config**: [`DirNamespaceConfig`](../interfaces/DirNamespaceConfig.md)
* **options?**: `Partial`&lt;[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)&gt;
### Returns
`Promise`&lt;[`Connection`](../classes/Connection.md)&gt;
### Examples
```ts
const db = await connectNamespace("dir", { root: "/path/to/db" });
await db.createTable("users", [{ id: 1 }]);
```
```ts
const db = await connectNamespace("rest", {
uri: "https://catalog.example.com",
headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
});
```
```ts
const db = await connectNamespace("my.custom.Namespace", {
endpoint: "...",
});
```
## connectNamespace(implName, config, options)
```ts
function connectNamespace(
implName,
config,
options?): Promise<Connection>
```
Connect through the built-in REST namespace.
Configured with [RestNamespaceConfig](../interfaces/RestNamespaceConfig.md). See the function-level
documentation above for the full surface, examples, and how this
relates to [connect](connect.md).
### Parameters
* **implName**: `"rest"`
* **config**: [`RestNamespaceConfig`](../interfaces/RestNamespaceConfig.md)
* **options?**: `Partial`&lt;[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)&gt;
### Returns
`Promise`&lt;[`Connection`](../classes/Connection.md)&gt;
### Example
```ts
const db = await connectNamespace("rest", {
uri: "https://catalog.example.com",
headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
});
```
## connectNamespace(implName, properties, options)
```ts
function connectNamespace(
implName,
properties,
options?): Promise<Connection>
```
Connect through a custom namespace implementation by full module path,
configured with a free-form string-keyed `properties` map. Use the
typed overloads above for the built-in `"dir"` and `"rest"` impls.
See the function-level documentation above for examples and how this
relates to [connect](connect.md).
### Parameters
* **implName**: `string`
* **properties**: `Record`&lt;`string`, `string`&gt;
* **options?**: `Partial`&lt;[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)&gt;
### Returns
`Promise`&lt;[`Connection`](../classes/Connection.md)&gt;
### Example
```ts
const db = await connectNamespace("my.custom.Namespace", {
endpoint: "...",
});
```

View File

@@ -32,7 +32,6 @@
- [PhraseQuery](classes/PhraseQuery.md)
- [Query](classes/Query.md)
- [QueryBase](classes/QueryBase.md)
- [Scannable](classes/Scannable.md)
- [Session](classes/Session.md)
- [StaticHeaderProvider](classes/StaticHeaderProvider.md)
- [Table](classes/Table.md)
@@ -51,19 +50,11 @@
- [AlterColumnsResult](interfaces/AlterColumnsResult.md)
- [ClientConfig](interfaces/ClientConfig.md)
- [ColumnAlteration](interfaces/ColumnAlteration.md)
- [ColumnOrdering](interfaces/ColumnOrdering.md)
- [CompactionStats](interfaces/CompactionStats.md)
- [ConnectNamespaceOptions](interfaces/ConnectNamespaceOptions.md)
- [ConnectionOptions](interfaces/ConnectionOptions.md)
- [CreateNamespaceOptions](interfaces/CreateNamespaceOptions.md)
- [CreateNamespaceResponse](interfaces/CreateNamespaceResponse.md)
- [CreateTableOptions](interfaces/CreateTableOptions.md)
- [DeleteResult](interfaces/DeleteResult.md)
- [DescribeNamespaceResponse](interfaces/DescribeNamespaceResponse.md)
- [DirNamespaceConfig](interfaces/DirNamespaceConfig.md)
- [DropColumnsResult](interfaces/DropColumnsResult.md)
- [DropNamespaceOptions](interfaces/DropNamespaceOptions.md)
- [DropNamespaceResponse](interfaces/DropNamespaceResponse.md)
- [ExecutableQuery](interfaces/ExecutableQuery.md)
- [FragmentStatistics](interfaces/FragmentStatistics.md)
- [FragmentSummaryStats](interfaces/FragmentSummaryStats.md)
@@ -78,18 +69,13 @@
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
- [IvfPqOptions](interfaces/IvfPqOptions.md)
- [IvfRqOptions](interfaces/IvfRqOptions.md)
- [ListNamespacesOptions](interfaces/ListNamespacesOptions.md)
- [ListNamespacesResponse](interfaces/ListNamespacesResponse.md)
- [LsmWriteSpec](interfaces/LsmWriteSpec.md)
- [MergeResult](interfaces/MergeResult.md)
- [OpenTableOptions](interfaces/OpenTableOptions.md)
- [OptimizeOptions](interfaces/OptimizeOptions.md)
- [OptimizeStats](interfaces/OptimizeStats.md)
- [QueryExecutionOptions](interfaces/QueryExecutionOptions.md)
- [RemovalStats](interfaces/RemovalStats.md)
- [RestNamespaceConfig](interfaces/RestNamespaceConfig.md)
- [RetryConfig](interfaces/RetryConfig.md)
- [ScannableOptions](interfaces/ScannableOptions.md)
- [ShuffleOptions](interfaces/ShuffleOptions.md)
- [SplitCalculatedOptions](interfaces/SplitCalculatedOptions.md)
- [SplitHashOptions](interfaces/SplitHashOptions.md)
@@ -121,7 +107,6 @@
- [RecordBatchIterator](functions/RecordBatchIterator.md)
- [connect](functions/connect.md)
- [connectNamespace](functions/connectNamespace.md)
- [makeArrowTable](functions/makeArrowTable.md)
- [packBits](functions/packBits.md)
- [permutationBuilder](functions/permutationBuilder.md)

View File

@@ -1,31 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ColumnOrdering
# Interface: ColumnOrdering
## Properties
### ascending?
```ts
optional ascending: boolean;
```
***
### columnName
```ts
columnName: string;
```
***
### nullsFirst?
```ts
optional nullsFirst: boolean;
```

View File

@@ -1,54 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ConnectNamespaceOptions
# Interface: ConnectNamespaceOptions
## Properties
### namespaceClientProperties?
```ts
optional namespaceClientProperties: Record<string, string>;
```
Extra properties for the backing namespace client.
***
### readConsistencyInterval?
```ts
optional readConsistencyInterval: number;
```
The interval, in seconds, at which to check for updates to the table
from other processes. If not set, consistency is not checked; this is
the default, chosen for performance reasons. For strong consistency, set
this to zero seconds, so that every read checks for updates from other
processes. As a compromise, you can set this to a non-zero value for
eventual consistency.
***
### session?
```ts
optional session: Session;
```
The session to use for this connection. Holds shared caches and other
session-specific state.
***
### storageOptions?
```ts
optional storageOptions: Record<string, string>;
```
Configuration for object storage. The available options are described
at https://docs.lancedb.com/storage/

View File

@@ -41,29 +41,6 @@ for testing purposes.
***
### manifestEnabled?
```ts
optional manifestEnabled: boolean;
```
(For LanceDB OSS only): use directory namespace manifests as the source
of truth for table metadata. Existing directory-listed root tables are
migrated into the manifest on access.
***
### namespaceClientProperties?
```ts
optional namespaceClientProperties: Record<string, string>;
```
(For LanceDB OSS only): extra properties for the backing namespace
client used by manifest-enabled native connections.
***
### readConsistencyInterval?
```ts

View File

@@ -1,27 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / CreateNamespaceOptions
# Interface: CreateNamespaceOptions
## Properties
### mode?
```ts
optional mode: "overwrite" | "create" | "exist_ok";
```
Creation mode.
***
### properties?
```ts
optional properties: Record<string, string>;
```
Properties to set on the new namespace.

View File

@@ -1,23 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / CreateNamespaceResponse
# Interface: CreateNamespaceResponse
## Properties
### properties?
```ts
optional properties: Record<string, string>;
```
***
### transactionId?
```ts
optional transactionId: string;
```

View File

@@ -1,15 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DescribeNamespaceResponse
# Interface: DescribeNamespaceResponse
## Properties
### properties?
```ts
optional properties: Record<string, string>;
```

View File

@@ -1,47 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DirNamespaceConfig
# Interface: DirNamespaceConfig
Configuration for the built-in directory namespace (`"dir"`).
The directory namespace stores tables under a single root path (local
filesystem or object storage URI). See
[https://docs.lancedb.com/namespaces](https://docs.lancedb.com/namespaces) for the documented surface;
less-common knobs live under [DirNamespaceConfig.extraProperties](DirNamespaceConfig.md#extraproperties).
## Properties
### extraProperties?
```ts
optional extraProperties: Record<string, string>;
```
Additional raw properties passed verbatim to the namespace
implementation (e.g. `storage.*`, `credential_vendor.*`). The typed
fields (`root`, `manifestEnabled`) take precedence on key collision.
***
### manifestEnabled?
```ts
optional manifestEnabled: boolean;
```
Whether to maintain a namespace manifest at the root. Required for
child namespaces. Defaults to true on the impl side.
***
### root
```ts
root: string;
```
Root path or URI containing the LanceDB tables.

View File

@@ -1,27 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DropNamespaceOptions
# Interface: DropNamespaceOptions
## Properties
### behavior?
```ts
optional behavior: "restrict" | "cascade";
```
Refuse to drop if non-empty (restrict) or drop recursively (cascade).
***
### mode?
```ts
optional mode: "fail" | "skip";
```
Whether to skip if the namespace doesn't exist, or fail.

View File

@@ -1,23 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DropNamespaceResponse
# Interface: DropNamespaceResponse
## Properties
### properties?
```ts
optional properties: Record<string, string>;
```
***
### transactionId?
```ts
optional transactionId: string[];
```

View File

@@ -1,27 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ListNamespacesOptions
# Interface: ListNamespacesOptions
## Properties
### limit?
```ts
optional limit: number;
```
An optional limit to the number of results to return.
***
### pageToken?
```ts
optional pageToken: string;
```
Token from a previous response for pagination.

View File

@@ -1,23 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ListNamespacesResponse
# Interface: ListNamespacesResponse
## Properties
### namespaces
```ts
namespaces: string[];
```
***
### pageToken?
```ts
optional pageToken: string;
```

View File

@@ -1,64 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / LsmWriteSpec
# Interface: LsmWriteSpec
Specification selecting Lance's MemWAL LSM-style write path for
`mergeInsert`.
`specType` is `"bucket"`, `"identity"`, or `"unsharded"`. For `"bucket"`,
`column` and `numBuckets` are required; for `"identity"`, `column` is
required.
## Properties
### column?
```ts
optional column: string;
```
Bucket and identity variants: the sharding column.
***
### maintainedIndexes?
```ts
optional maintainedIndexes: string[];
```
Names of indexes the MemWAL should keep up to date during writes.
***
### numBuckets?
```ts
optional numBuckets: number;
```
Bucket variant: the number of buckets, in `[1, 1024]`.
***
### specType
```ts
specType: "bucket" | "identity" | "unsharded";
```
One of `"bucket"`, `"identity"`, or `"unsharded"`.
***
### writerConfigDefaults?
```ts
optional writerConfigDefaults: Record<string, string>;
```
Default `ShardWriter` configuration recorded in the MemWAL index.

View File

@@ -1,47 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / RestNamespaceConfig
# Interface: RestNamespaceConfig
Configuration for the built-in REST namespace (`"rest"`).
The REST namespace talks to a remote catalog server over HTTP. See
[https://docs.lancedb.com/namespaces](https://docs.lancedb.com/namespaces) for the documented surface;
less-common knobs (TLS, metrics) live under
[RestNamespaceConfig.extraProperties](RestNamespaceConfig.md#extraproperties).
## Properties
### extraProperties?
```ts
optional extraProperties: Record<string, string>;
```
Additional raw properties passed verbatim to the namespace
implementation (e.g. `tls.*`, `ops_metrics_enabled`, `delimiter`).
The typed fields (`uri`, `headers`) take precedence on key collision.
***
### headers?
```ts
optional headers: Record<string, string>;
```
HTTP headers forwarded with each request. Keys are passed through
as-is (e.g. `"x-api-key"`, `"Authorization"`).
***
### uri
```ts
uri: string;
```
Catalog endpoint URL.

View File

@@ -1,29 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ScannableOptions
# Interface: ScannableOptions
## Properties
### numRows?
```ts
optional numRows: number;
```
Hint about the number of rows. Not validated against the stream.
***
### rescannable?
```ts
optional rescannable: boolean;
```
Whether the source can be scanned more than once. Defaults to `true` for
`fromTable` / `fromFactory` and `false` for `fromIterable` /
`fromRecordBatchReader`.

View File

@@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and
## Full text search
Use [lancedb.table.Table.create_fts_index][] for the synchronous API or
[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the
asynchronous API.
::: lancedb.fts.create_index
::: lancedb.index.FTS
::: lancedb.fts.populate_index
::: lancedb.fts.search_index
## Utilities

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.29.1-beta.0</version>
<version>0.28.0-beta.10</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.29.1-beta.0</version>
<version>0.28.0-beta.10</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>7.0.0-beta.13</lance-core.version>
<lance-core.version>6.0.0-beta.6</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -3,11 +3,11 @@ The core Rust library is in the `../rust/lancedb` directory, the rust binding
code is in the `src/` directory and the typescript bindings are in
the `lancedb/` directory.
Whenever you change the Rust code, you will need to recompile: `pnpm build`.
Whenever you change the Rust code, you will need to recompile: `npm run build`.
Common commands:
* Build: `pnpm build`
* Lint: `pnpm lint`
* Fix lints: `pnpm lint-fix`
* Test: `pnpm test`
* Run single test file: `pnpm test __test__/arrow.test.ts`
* Build: `npm run build`
* Lint: `npm run lint`
* Fix lints: `npm run lint-fix`
* Test: `npm test`
* Run single test file: `npm test __test__/arrow.test.ts`

View File

@@ -12,22 +12,20 @@ Typescript.
* `src/`: Rust bindings source code
* `lancedb/`: Typescript package source code
* `__test__/`: Unit tests
* `examples/`: A pnpm package with the examples shown in the documentation
* `examples/`: An npm package with the examples shown in the documentation
## Development environment
To set up your development environment, you will need to install the following:
1. Node.js 22 or later (required by pnpm 11)
2. [pnpm](https://pnpm.io/installation) 11 or later (or run via `corepack enable`,
which uses the `packageManager` field in `package.json`)
3. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
4. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
1. Node.js 14 or later
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
Initial setup:
```shell
pnpm install
npm install
```
### Commit Hooks
@@ -41,38 +39,38 @@ pre-commit install
## Development
Most common development commands can be run using the pnpm scripts.
Most common development commands can be run using the npm scripts.
Build the package
```shell
pnpm install
pnpm build
npm install
npm run build
```
Lint:
```shell
pnpm lint
npm run lint
```
Format and fix lints:
```shell
pnpm lint-fix
npm run lint-fix
```
Run tests:
```shell
pnpm test
npm test
```
To run a single test:
```shell
# Single file: table.test.ts
pnpm test -- table.test.ts
npm test -- table.test.ts
# Single test: 'merge insert' in table.test.ts
pnpm test -- table.test.ts --testNamePattern=merge\ insert
npm test -- table.test.ts --testNamePattern=merge\ insert
```

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.29.1-beta.0"
version = "0.28.0-beta.10"
publish = false
license.workspace = true
description.workspace = true
@@ -16,13 +16,12 @@ crate-type = ["cdylib"]
async-trait.workspace = true
arrow-ipc.workspace = true
arrow-array.workspace = true
arrow-buffer = "58.0.0"
arrow-buffer = "58.1"
half.workspace = true
arrow-schema.workspace = true
env_logger.workspace = true
futures.workspace = true
lancedb = { path = "../rust/lancedb", default-features = false }
lance-namespace.workspace = true
napi = { version = "3.8.3", default-features = false, features = [
"napi9",
"async"

View File

@@ -4,7 +4,7 @@
import { readdirSync } from "fs";
import { Field, Float64, Schema } from "apache-arrow";
import * as tmp from "tmp";
import { Connection, Table, connect, connectNamespace } from "../lancedb";
import { Connection, Table, connect } from "../lancedb";
import { LocalTable } from "../lancedb/table";
describe("when connecting", () => {
@@ -81,16 +81,6 @@ describe("given a connection", () => {
await db.createTable("test4", [{ id: 1 }, { id: 2 }]);
});
it("should expose renameTable and reject on OSS listing DB", async () => {
  // The rename call must reject with the "not supported" message, and the
  // table must still be listed under its original name afterwards.
  const originalName = "old_name";
  await db.createTable(originalName, [{ id: 1 }]);

  const renameAttempt = db.renameTable(originalName, "new_name");
  await expect(renameAttempt).rejects.toThrow(
    "rename_table is not supported in LanceDB OSS",
  );

  const remainingTables = db.tableNames();
  await expect(remainingTables).resolves.toEqual([originalName]);
});
it("should fail if creating table twice, unless overwrite is true", async () => {
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
await expect(tbl.countRows()).resolves.toBe(2);
@@ -316,186 +306,3 @@ describe("clone table functionality", () => {
).rejects.toThrow("Deep clone is not yet implemented");
});
});
// Namespace management through a local connection: create, describe, list
// and drop, plus the errors raised for closed connections, missing or
// duplicate namespaces, and unrecognized option values.
describe("namespaces", () => {
let tmpDir: tmp.DirResult;
let db: Connection;
// Every test gets its own temporary directory and a fresh connection to it.
beforeEach(async () => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
// The local DirectoryNamespace backend only supports child namespaces
// when manifest mode is enabled (see lance-namespace-impls/src/dir.rs).
db = await connect(tmpDir.name, {
// biome-ignore lint/style/useNamingConvention: opaque backend property key, must match Rust
namespaceClientProperties: { manifest_enabled: "true" },
});
});
afterEach(() => tmpDir.removeCallback());
// Only checks that describing a freshly created namespace yields a value;
// the contents of the response are not asserted.
it("should create and describe a namespace", async () => {
await db.createNamespace(["myns"]);
const desc = await db.describeNamespace(["myns"]);
expect(desc).toBeDefined();
});
// Calling listNamespaces() without arguments lists the root level.
it("should list namespaces created at the root", async () => {
await db.createNamespace(["alpha"]);
await db.createNamespace(["beta"]);
const list = await db.listNamespaces();
expect(list.namespaces).toEqual(expect.arrayContaining(["alpha", "beta"]));
});
// Listing under a parent path reports the child by its own name ("child").
it("should list child namespaces under a parent", async () => {
await db.createNamespace(["parent"]);
await db.createNamespace(["parent", "child"]);
const list = await db.listNamespaces(["parent"]);
expect(list.namespaces).toContain("child");
});
it("should drop a namespace", async () => {
await db.createNamespace(["ephemeral"]);
await db.dropNamespace(["ephemeral"]);
const list = await db.listNamespaces();
expect(list.namespaces).not.toContain("ephemeral");
});
// After close(), all four namespace operations must reject with the same
// "Connection is closed" message.
it("should raise an error on any namespace op after close", async () => {
await db.close();
await expect(db.describeNamespace(["foo"])).rejects.toThrow(
"Connection is closed",
);
await expect(db.listNamespaces()).rejects.toThrow("Connection is closed");
await expect(db.createNamespace(["foo"])).rejects.toThrow(
"Connection is closed",
);
await expect(db.dropNamespace(["foo"])).rejects.toThrow(
"Connection is closed",
);
});
// The message is matched case-insensitively against "not found".
it("should raise an understandable error when describing a non-existent namespace", async () => {
await expect(db.describeNamespace(["does-not-exist"])).rejects.toThrow(
/not found/i,
);
});
// Only the rejection itself is asserted; no error message is pinned.
it("should raise an error when creating a namespace that already exists", async () => {
await db.createNamespace(["dup"]);
await expect(db.createNamespace(["dup"])).rejects.toThrow();
});
// The next three tests cast an invalid literal to `any` so the value reaches
// runtime validation, and check that the error names the offending value.
it("should reject an unrecognized createNamespace mode with a clear error", async () => {
await expect(
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
db.createNamespace(["x"], { mode: "frobnicate" as any }),
).rejects.toThrow(/Invalid mode 'frobnicate'/);
});
it("should reject an unrecognized dropNamespace mode with a clear error", async () => {
await db.createNamespace(["x"]);
await expect(
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
db.dropNamespace(["x"], { mode: "frobnicate" as any }),
).rejects.toThrow(/Invalid mode 'frobnicate'/);
});
it("should reject an unrecognized dropNamespace behavior with a clear error", async () => {
await db.createNamespace(["x"]);
await expect(
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
db.dropNamespace(["x"], { behavior: "frobnicate" as any }),
).rejects.toThrow(/Invalid behavior 'frobnicate'/);
});
});
// Tests for the connectNamespace() entry point: the built-in "dir"
// implementation, argument validation, option pass-through, and how typed
// config fields interact with raw extra properties.
describe("connectNamespace", () => {
let tmpDir: tmp.DirResult;
// Every test gets its own temporary directory to use as the namespace root.
beforeEach(() => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
});
afterEach(() => tmpDir.removeCallback());
it("connects via the dir implementation and supports table ops", async () => {
const db = await connectNamespace("dir", { root: tmpDir.name });
await db.createTable("users", [{ id: 1 }, { id: 2 }]);
await expect(db.tableNames()).resolves.toContain("users");
});
it("throws a clear error when implName is empty", async () => {
await expect(connectNamespace("", {})).rejects.toThrow(
"implName must be a non-empty string",
);
});
// Only the rejection itself is asserted; no error message is pinned.
it("throws when the namespace implementation is unknown", async () => {
await expect(connectNamespace("not-a-real-impl", {})).rejects.toThrow();
});
// Checks only that table creation still succeeds when storageOptions are
// supplied; the effect of the option itself is not observed here.
it("passes storage options through to the namespace", async () => {
const db = await connectNamespace(
"dir",
{ root: tmpDir.name },
{ storageOptions: { newTableDataStorageVersion: "stable" } },
);
await db.createTable("plumbing", [{ id: 1 }]);
await expect(db.tableNames()).resolves.toContain("plumbing");
});
// Writes through one connection, closes it, then verifies that a second
// connection to the same root sees the child namespace and its table.
it("supports child namespaces when manifestEnabled is true on the dir config", async () => {
const writer = await connectNamespace("dir", {
root: tmpDir.name,
manifestEnabled: true,
});
await writer.createNamespace(["analytics"]);
await writer.createTable("orders", [{ id: 1 }, { id: 2 }], ["analytics"]);
await writer.close();
const reader = await connectNamespace("dir", {
root: tmpDir.name,
manifestEnabled: true,
});
await expect(reader.tableNames(["analytics"])).resolves.toContain("orders");
const orders = await reader.openTable("orders", ["analytics"]);
await expect(orders.countRows()).resolves.toBe(2);
});
it("merges extraProperties into the dir config and is overridden by typed fields", async () => {
// Two observable assertions:
// - Typed `root` overrides extraProperties.root: createTable would fail
// under the bogus path if the override didn't happen.
// - extraProperties.manifest_enabled="false" is honored end-to-end. Child
// namespaces require manifest mode (default true), so explicitly
// disabling it via extraProperties must make createNamespace reject. If
// extraProperties pass-through were silently broken, the default would
// let createNamespace succeed.
const db = await connectNamespace("dir", {
root: tmpDir.name,
extraProperties: {
root: "/should/be/overridden",
// biome-ignore lint/style/useNamingConvention: backend property key
manifest_enabled: "false",
},
});
await db.createTable("base", [{ id: 1 }]);
await expect(db.tableNames()).resolves.toContain("base");
await expect(db.createNamespace(["analytics"])).rejects.toThrow();
});
it("flows unknown top-level keys through when implName is dynamic (no silent drop)", async () => {
// Routes via the third overload because `impl` is `string`, not the
// literal `"dir"`. The dispatcher still notices the runtime value is
// "dir", but unknown keys like `manifest_enabled` must not be silently
// dropped during the conversion.
//
// Asserting a *negative* outcome (manifest disabled -> createNamespace
// rejects) is required for observability, since the backend default for
// `manifest_enabled` is true.
const impl: string = "dir";
const db = await connectNamespace(impl, {
root: tmpDir.name,
// biome-ignore lint/style/useNamingConvention: backend property key
manifest_enabled: "false",
});
await expect(db.createNamespace(["mixed"])).rejects.toThrow();
});
});

View File

@@ -109,209 +109,3 @@ describe("Query outputSchema", () => {
expect(schema.fields.length).toBe(3);
});
});
// Tests for Query.orderBy(): single- and multi-column ordering, default
// direction, null placement, and chaining with where/limit/offset/select.
//
// Float comparisons use `toBeCloseTo(expected, 3)`. The second argument of
// Jest's `toBeCloseTo` is the number of decimal digits to check (the
// assertion passes when |expected - received| < 10^-digits / 2), not an
// absolute tolerance. Three digits gives a tolerance of 0.0005, which covers
// float32 rounding of the fixture values without hiding ordering mistakes.
describe("Query orderBy", () => {
  let tmpDir: tmp.DirResult;
  let table: Table;

  beforeEach(async () => {
    tmpDir = tmp.dirSync({ unsafeCleanup: true });
    const db = await connect(tmpDir.name);

    // Create table with numeric data for sorting
    const schema = new Schema([
      new Field("id", new Int64(), true),
      new Field("score", new Float32(), true),
      new Field("name", new Utf8(), true),
    ]);
    const data = makeArrowTable(
      [
        { id: 1n, score: 3.5, name: "charlie" },
        { id: 2n, score: 1.2, name: "alice" },
        { id: 3n, score: 2.8, name: "bob" },
        { id: 4n, score: 0.5, name: "david" },
        { id: 5n, score: 4.1, name: "eve" },
      ],
      { schema },
    );
    table = await db.createTable("test", data);
  });

  afterEach(() => {
    tmpDir.removeCallback();
  });

  it("should sort by single column ascending", async () => {
    const results = await table
      .query()
      .orderBy({ columnName: "score", ascending: true, nullsFirst: false })
      .toArray();
    expect(results.length).toBe(5);
    // Verify ascending order
    expect(results[0].score).toBeCloseTo(0.5, 3);
    expect(results[1].score).toBeCloseTo(1.2, 3);
    expect(results[2].score).toBeCloseTo(2.8, 3);
    expect(results[3].score).toBeCloseTo(3.5, 3);
    expect(results[4].score).toBeCloseTo(4.1, 3);
  });

  it("should sort by single column descending", async () => {
    const results = await table
      .query()
      .orderBy({ columnName: "score", ascending: false, nullsFirst: false })
      .toArray();
    expect(results.length).toBe(5);
    // Verify descending order
    expect(results[0].score).toBeCloseTo(4.1, 3);
    expect(results[1].score).toBeCloseTo(3.5, 3);
    expect(results[2].score).toBeCloseTo(2.8, 3);
    expect(results[3].score).toBeCloseTo(1.2, 3);
    expect(results[4].score).toBeCloseTo(0.5, 3);
  });

  it("should use ascending as default direction", async () => {
    const results = await table
      .query()
      .orderBy({ columnName: "score" })
      .toArray();
    expect(results.length).toBe(5);
    // Verify ascending order (default)
    expect(results[0].score).toBeCloseTo(0.5, 3);
    expect(results[1].score).toBeCloseTo(1.2, 3);
    expect(results[2].score).toBeCloseTo(2.8, 3);
    expect(results[3].score).toBeCloseTo(3.5, 3);
    expect(results[4].score).toBeCloseTo(4.1, 3);
  });

  it("should sort by string column", async () => {
    const results = await table
      .query()
      .orderBy({ columnName: "name" })
      .toArray();
    expect(results.length).toBe(5);
    // Verify alphabetical order
    expect(results[0].name).toBe("alice");
    expect(results[1].name).toBe("bob");
    expect(results[2].name).toBe("charlie");
    expect(results[3].name).toBe("david");
    expect(results[4].name).toBe("eve");
  });

  it("should support method chaining with where", async () => {
    const results = await table
      .query()
      .where("score > 2.0")
      .orderBy({ columnName: "score" })
      .toArray();
    expect(results.length).toBe(3);
    // Verify filtered and sorted
    expect(results[0].score).toBeCloseTo(2.8, 3);
    expect(results[1].score).toBeCloseTo(3.5, 3);
    expect(results[2].score).toBeCloseTo(4.1, 3);
  });

  it("should support method chaining with limit", async () => {
    const results = await table
      .query()
      .orderBy({ columnName: "score", ascending: false })
      .limit(3)
      .toArray();
    expect(results.length).toBe(3);
    // Verify top 3 in descending order
    expect(results[0].score).toBeCloseTo(4.1, 3);
    expect(results[1].score).toBeCloseTo(3.5, 3);
    expect(results[2].score).toBeCloseTo(2.8, 3);
  });

  it("should support method chaining with offset", async () => {
    const results = await table
      .query()
      .orderBy({ columnName: "score" })
      .offset(2)
      .limit(2)
      .toArray();
    expect(results.length).toBe(2);
    // Verify results skip first 2 and take next 2
    expect(results[0].score).toBeCloseTo(2.8, 3);
    expect(results[1].score).toBeCloseTo(3.5, 3);
  });

  it("should support method chaining with select", async () => {
    const results = await table
      .query()
      .orderBy({ columnName: "name" })
      .select(["name", "score"])
      .toArray();
    expect(results.length).toBe(5);
    // Verify only selected columns are present
    expect(Object.keys(results[0])).toEqual(["name", "score"]);
    expect(Object.keys(results[4])).toEqual(["name", "score"]);
    // Verify sorted by name
    expect(results[0].name).toBe("alice");
    expect(results[4].name).toBe("eve");
  });

  it("should support complex method chaining", async () => {
    const results = await table
      .query()
      .where("score > 1.0")
      .orderBy({ columnName: "score", ascending: false })
      .limit(3)
      .select(["id", "score", "name"])
      .toArray();
    expect(results.length).toBe(3);
    // Verify filtered, sorted, limited, and projected
    expect(results[0].score).toBeCloseTo(4.1, 3);
    expect(results[1].score).toBeCloseTo(3.5, 3);
    expect(results[2].score).toBeCloseTo(2.8, 3);
    expect(Object.keys(results[0])).toEqual(["id", "score", "name"]);
  });

  it("should support multi-column ordering and null placement", async () => {
    // Separate fixture: a null score plus tied (group, score) pairs, so the
    // expected order depends on `nullsFirst` and on the third sort key.
    const schema = new Schema([
      new Field("group", new Int64(), true),
      new Field("score", new Float32(), true),
      new Field("name", new Utf8(), true),
    ]);
    const data = makeArrowTable(
      [
        { group: 1n, score: null, name: "z" },
        { group: 1n, score: 1.0, name: "b" },
        { group: 1n, score: 1.0, name: "a" },
        { group: 2n, score: 0.5, name: "c" },
      ],
      { schema },
    );
    const nullTable = await (await connect(tmpDir.name)).createTable(
      "test_multi_order",
      data,
      { mode: "overwrite" },
    );
    const results = await nullTable
      .query()
      .orderBy([
        { columnName: "group", ascending: true, nullsFirst: false },
        { columnName: "score", ascending: true, nullsFirst: true },
        { columnName: "name", ascending: true, nullsFirst: false },
      ])
      .toArray();
    expect(results.map((r) => [r.group, r.score, r.name])).toEqual([
      [1n, null, "z"],
      [1n, 1.0, "a"],
      [1n, 1.0, "b"],
      [2n, 0.5, "c"],
    ]);
  });
});

View File

@@ -1,438 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import {
Field,
Float16,
Int32,
type RecordBatch,
RecordBatchReader,
Schema,
tableToIPC,
} from "apache-arrow";
import { makeArrowTable, makeEmptyTable } from "../lancedb/arrow";
import { Scannable } from "../lancedb/scannable";
/** Build the three-row (`id`, `name`) Arrow table shared by these tests. */
function makeTable() {
  const rows = [
    { id: 1, name: "a" },
    { id: 2, name: "b" },
    { id: 3, name: "c" },
  ];
  // An empty `vectorColumns` map is passed explicitly.
  return makeArrowTable(rows, { vectorColumns: {} });
}
/** Serialize the fixture table to IPC and hand back an opened reader over it. */
async function makeReader(): Promise<RecordBatchReader> {
  const ipcBytes = tableToIPC(makeTable());
  // A reader from `RecordBatchReader.from()` starts unopened and has no
  // `.schema` until `.open()` runs. Opening a sync reader is synchronous.
  const opened = RecordBatchReader.from(ipcBytes).open();
  return opened as RecordBatchReader;
}
describe("Scannable", () => {
describe("fromTable", () => {
  test("reflects schema, numRows, and defaults rescannable=true", async () => {
    const source = makeTable();
    const wrapped = await Scannable.fromTable(source);
    // Schema is compared by identity, not by structural equality.
    expect(wrapped.schema).toBe(source.schema);
    expect(wrapped.numRows).toBe(source.numRows);
    expect(wrapped.rescannable).toBe(true);
  });

  test("throws when opts.numRows does not match table.numRows", async () => {
    const pending = Scannable.fromTable(makeTable(), { numRows: 42 });
    await expect(pending).rejects.toThrow(/does not match table\.numRows/);
  });

  test("throws when opts.rescannable is false", async () => {
    const pending = Scannable.fromTable(makeTable(), { rescannable: false });
    await expect(pending).rejects.toThrow(/always rescannable/);
  });
});
describe("fromRecordBatchReader", () => {
  test("reflects schema and defaults numRows=null, rescannable=false", async () => {
    const source = await makeReader();
    const wrapped = await Scannable.fromRecordBatchReader(source);
    expect(wrapped.schema).toBe(source.schema);
    expect(wrapped.numRows).toBeNull();
    expect(wrapped.rescannable).toBe(false);
  });

  test("honors numRows override", async () => {
    const source = await makeReader();
    const wrapped = await Scannable.fromRecordBatchReader(source, {
      numRows: 3,
    });
    expect(wrapped.numRows).toBe(3);
    expect(wrapped.rescannable).toBe(false);
  });

  test("rescannable: false explicit does not throw", async () => {
    const source = await makeReader();
    const wrapped = await Scannable.fromRecordBatchReader(source, {
      rescannable: false,
    });
    expect(wrapped.rescannable).toBe(false);
  });

  test("throws when opts.rescannable is true", async () => {
    const source = await makeReader();
    const pending = Scannable.fromRecordBatchReader(source, {
      rescannable: true,
    });
    await expect(pending).rejects.toThrow(/does not accept rescannable/);
  });

  test("throws when opts.rescannable is true even alongside numRows", async () => {
    const source = await makeReader();
    const pending = Scannable.fromRecordBatchReader(source, {
      numRows: 3,
      rescannable: true,
    });
    await expect(pending).rejects.toThrow(/does not accept rescannable/);
  });
});
describe("fromIterable", () => {
test("accepts a sync iterable of batches", async () => {
  const source = makeTable();
  // The table's batch array serves as the sync iterable.
  const { schema, batches } = source;
  const wrapped = await Scannable.fromIterable(schema, batches);
  expect(wrapped.schema).toBe(source.schema);
  expect(wrapped.numRows).toBeNull();
  expect(wrapped.rescannable).toBe(false);
});
test("accepts an async iterable of batches", async () => {
  const source = makeTable();
  // Re-emit the table's batches through an async generator.
  const emitBatches = async function* (): AsyncGenerator<RecordBatch> {
    yield* source.batches;
  };
  const wrapped = await Scannable.fromIterable(source.schema, emitBatches());
  expect(wrapped.schema).toBe(source.schema);
  expect(wrapped.rescannable).toBe(false);
});
// Construction-time checks that `fromIterable` applies when the caller passes
// `rescannable: true`. Three groups follow: inputs that must construct,
// inputs that must be rejected with a /one-shot/ error, and cases where the
// check must not run or must not crash.
describe("rescannable: true detection", () => {
// Replayable inputs: [Symbol.iterator]() / [Symbol.asyncIterator]()
// returns a fresh iterator each call. Must NOT throw.
test("Array passes (fresh ArrayIterator each call)", async () => {
const table = makeTable();
const scannable = await Scannable.fromIterable(
table.schema,
table.batches,
{ rescannable: true },
);
expect(scannable.rescannable).toBe(true);
});
test("Set passes (fresh SetIterator each call)", async () => {
const table = makeTable();
const set = new Set<RecordBatch>(table.batches);
const scannable = await Scannable.fromIterable(table.schema, set, {
rescannable: true,
});
expect(scannable.rescannable).toBe(true);
});
test("custom Iterable returning a fresh iterator passes", async () => {
const table = makeTable();
// Each call delegates to the batch array, so every scan gets a new iterator.
const replayable: Iterable<RecordBatch> = {
[Symbol.iterator]() {
return table.batches[Symbol.iterator]();
},
};
const scannable = await Scannable.fromIterable(
table.schema,
replayable,
{ rescannable: true },
);
expect(scannable.rescannable).toBe(true);
});
test("object with generator method passes (fresh generator each call)", async () => {
const table = makeTable();
// The generator *method* is re-invoked per iteration, unlike a generator
// *object*, which is the one-shot case tested further down.
const replayable: Iterable<RecordBatch> = {
*[Symbol.iterator]() {
for (const batch of table.batches) yield batch;
},
};
const scannable = await Scannable.fromIterable(
table.schema,
replayable,
{ rescannable: true },
);
expect(scannable.rescannable).toBe(true);
});
test("empty Array passes (replayable degenerate case)", async () => {
const schema = makeTable().schema;
const scannable = await Scannable.fromIterable(
schema,
[] as RecordBatch[],
{ rescannable: true },
);
expect(scannable.rescannable).toBe(true);
});
// One-shot inputs: [Symbol.iterator]() / [Symbol.asyncIterator]()
// returns the same object, or the input is already-an-iterator.
// Must throw with a /one-shot/ message.
test("sync generator throws", async () => {
const table = makeTable();
function* generator(): Generator<RecordBatch> {
for (const batch of table.batches) yield batch;
}
await expect(
Scannable.fromIterable(table.schema, generator(), {
rescannable: true,
}),
).rejects.toThrow(/one-shot/);
});
test("async generator throws", async () => {
const table = makeTable();
async function* generator(): AsyncGenerator<RecordBatch> {
for (const batch of table.batches) yield batch;
}
await expect(
Scannable.fromIterable(table.schema, generator(), {
rescannable: true,
}),
).rejects.toThrow(/one-shot/);
});
test("empty generator throws (one-shot degenerate case)", async () => {
const schema = makeTable().schema;
function* generator(): Generator<RecordBatch> {
// intentionally empty; yields nothing.
}
await expect(
Scannable.fromIterable(schema, generator(), { rescannable: true }),
).rejects.toThrow(/one-shot/);
});
test("custom self-iterator throws", async () => {
const table = makeTable();
const batches = table.batches;
// Cursor shared by every iteration of `oneShot`, because
// `[Symbol.iterator]()` returns the object itself.
let i = 0;
const oneShot: Iterable<RecordBatch> & Iterator<RecordBatch> = {
[Symbol.iterator]() {
return this;
},
next() {
if (i >= batches.length) {
return { done: true, value: undefined };
}
return { done: false, value: batches[i++] };
},
};
await expect(
Scannable.fromIterable(table.schema, oneShot, { rescannable: true }),
).rejects.toThrow(/one-shot/);
});
test("Array.values() (IterableIterator) throws", async () => {
const table = makeTable();
const iter = table.batches.values();
await expect(
Scannable.fromIterable(table.schema, iter, { rescannable: true }),
).rejects.toThrow(/one-shot/);
});
test("raw iterator (only `.next`) throws", async () => {
const table = makeTable();
const batches = table.batches;
let i = 0;
// Has `next` but no `[Symbol.iterator]`; the cast below only exists to get
// it past the type checker.
const rawIter = {
next(): IteratorResult<RecordBatch> {
if (i >= batches.length) {
return { done: true, value: undefined };
}
return { done: false, value: batches[i++] };
},
};
await expect(
Scannable.fromIterable(
table.schema,
rawIter as unknown as Iterable<RecordBatch>,
{ rescannable: true },
),
).rejects.toThrow(/one-shot/);
});
// Edge: null/undefined must not crash the detection helper. The
// null check belongs to `normalizeIterator` and only fires when a
// scan starts.
test("null input does not crash detection at construction", async () => {
const schema = makeTable().schema;
await expect(
Scannable.fromIterable(
schema,
null as unknown as Iterable<RecordBatch>,
{
rescannable: true,
},
),
).resolves.toBeDefined();
});
test("undefined input does not crash detection at construction", async () => {
const schema = makeTable().schema;
await expect(
Scannable.fromIterable(
schema,
undefined as unknown as Iterable<RecordBatch>,
{ rescannable: true },
),
).resolves.toBeDefined();
});
// Default (rescannable omitted) skips the check entirely, so even
// pathological inputs construct without throwing here.
test("rescannable omitted skips detection entirely (generator passes)", async () => {
const table = makeTable();
function* generator(): Generator<RecordBatch> {
for (const batch of table.batches) yield batch;
}
const scannable = await Scannable.fromIterable(
table.schema,
generator(),
);
expect(scannable.rescannable).toBe(false);
});
test("rescannable: false explicit skips detection entirely (generator passes)", async () => {
const table = makeTable();
function* generator(): Generator<RecordBatch> {
for (const batch of table.batches) yield batch;
}
const scannable = await Scannable.fromIterable(
table.schema,
generator(),
{ rescannable: false },
);
expect(scannable.rescannable).toBe(false);
});
});
});
describe("fromFactory", () => {
  test("defaults rescannable=true and does not invoke the factory eagerly", async () => {
    const source = makeTable();
    // A jest mock lets the test observe whether construction calls the factory.
    const batchFactory = jest.fn(() => source.batches);
    const wrapped = await Scannable.fromFactory(source.schema, batchFactory);
    expect(wrapped.schema).toBe(source.schema);
    expect(wrapped.rescannable).toBe(true);
    expect(batchFactory).not.toHaveBeenCalled();
  });

  test("honors rescannable and numRows overrides", async () => {
    const source = makeTable();
    const overrides = { numRows: 7, rescannable: false };
    const wrapped = await Scannable.fromFactory(
      source.schema,
      () => source.batches,
      overrides,
    );
    expect(wrapped.numRows).toBe(7);
    expect(wrapped.rescannable).toBe(false);
  });
});
describe("validation", () => {
  test("throws when numRows is negative", async () => {
    const pending = Scannable.fromFactory(makeTable().schema, () => [], {
      numRows: -1,
    });
    await expect(pending).rejects.toThrow(/non-negative/);
  });

  test("throws when numRows is not an integer", async () => {
    const pending = Scannable.fromFactory(makeTable().schema, () => [], {
      numRows: 3.5,
    });
    await expect(pending).rejects.toThrow(/integer/);
  });
});
describe("native handle", () => {
  test("exposes a native handle via inner", async () => {
    const { inner } = await Scannable.fromTable(makeTable());
    // `typeof null` is also "object", hence the separate null check.
    expect(inner).toBeDefined();
    expect(typeof inner).toBe("object");
    expect(inner).not.toBeNull();
  });
});
// Construction tests over richer Arrow schemas. Each one only asserts that
// construction succeeds, which transitively exercises schema serialization
// and the Rust-side `ipc_file_to_schema` for types beyond flat primitives.
describe("schema variety", () => {
  test("accepts an empty table", async () => {
    const idField = new Field("id", new Int32(), true);
    const emptyTable = makeEmptyTable(new Schema([idField]));
    const wrapped = await Scannable.fromTable(emptyTable);
    expect(wrapped.numRows).toBe(0);
    expect(wrapped.schema).toBe(emptyTable.schema);
  });

  test("accepts nested struct and list columns", async () => {
    const rows = [
      { id: 1, point: { x: 0, y: 0 }, tags: ["a", "b"] },
      { id: 2, point: { x: 1, y: 2 }, tags: ["c"] },
    ];
    const source = makeArrowTable(rows, { vectorColumns: {} });
    const wrapped = await Scannable.fromTable(source);
    expect(wrapped.schema).toBe(source.schema);
    expect(wrapped.numRows).toBe(2);
  });

  test("accepts a FixedSizeList (vector) column", async () => {
    const rows = [
      { id: 1, vec: [1, 2, 3] },
      { id: 2, vec: [4, 5, 6] },
    ];
    const source = makeArrowTable(rows, {
      vectorColumns: { vec: { type: new Float16() } },
    });
    const wrapped = await Scannable.fromTable(source);
    expect(wrapped.schema).toBe(source.schema);
    expect(wrapped.numRows).toBe(2);
  });

  test("accepts a table with many columns", async () => {
    // Fifty numeric columns named c0..c49; the same row object is used twice.
    const wideRow: Record<string, number> = {};
    let col = 0;
    while (col < 50) {
      wideRow[`c${col}`] = col;
      col += 1;
    }
    const source = makeArrowTable([wideRow, wideRow], { vectorColumns: {} });
    const wrapped = await Scannable.fromTable(source);
    expect(wrapped.schema.fields.length).toBe(50);
    expect(wrapped.numRows).toBe(2);
  });
});
});

View File

@@ -1870,25 +1870,6 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(results.length).toBe(3);
});
test("prewarmData errors on local tables", async () => {
  const rows = [
    { text: "alpha", vector: [0.1, 0.2, 0.3] },
    { text: "beta", vector: [0.4, 0.5, 0.6] },
  ];
  const db = await connect(tmpDir.name);
  const table = await db.createTable("prewarm_data_test", rows);

  // prewarmData is only supported on remote tables. Both argument shapes
  // (no argument and a string[]) must pass through napi and surface the
  // same error.
  const noColumns = table.prewarmData();
  await expect(noColumns).rejects.toThrow(
    "prewarm_data is currently only supported on remote tables",
  );
  const withColumns = table.prewarmData(["text"]);
  await expect(withColumns).rejects.toThrow(
    "prewarm_data is currently only supported on remote tables",
  );
});
test("full text index on list", async () => {
const db = await connect(tmpDir.name);
const data = [
@@ -2348,130 +2329,3 @@ describe("when creating a table with Float32Array vectors", () => {
expect((fsl.children[0].type as Float32).precision).toBe(1);
});
});
describe("setUnenforcedPrimaryKey", () => {
  let tmpDir: tmp.DirResult;

  beforeEach(() => {
    tmpDir = tmp.dirSync({ unsafeCleanup: true });
  });
  afterEach(() => tmpDir.removeCallback());

  // Two non-nullable columns, used by the tests that need a second column.
  function idNameSchema() {
    return new arrow.Schema([
      new arrow.Field("id", new arrow.Int64(), false),
      new arrow.Field("name", new arrow.Utf8(), false),
    ]);
  }

  it("sets a single-column primary key (string or one-element array)", async () => {
    const conn = await connect(tmpDir.name);
    const idOnly = new arrow.Schema([
      new arrow.Field("id", new arrow.Int64(), false),
    ]);

    // Bare string form.
    const byString = await conn.createEmptyTable("t1", idOnly);
    await byString.setUnenforcedPrimaryKey("id");

    // One-element array form.
    const byArray = await conn.createEmptyTable("t2", idOnly);
    await byArray.setUnenforcedPrimaryKey(["id"]);
  });

  it("rejects a compound primary key", async () => {
    const conn = await connect(tmpDir.name);
    const table = await conn.createEmptyTable("t", idNameSchema());
    const pending = table.setUnenforcedPrimaryKey(["id", "name"]);
    await expect(pending).rejects.toThrow();
  });

  it("rejects changing the primary key once set", async () => {
    const conn = await connect(tmpDir.name);
    const table = await conn.createEmptyTable("t", idNameSchema());
    await table.setUnenforcedPrimaryKey("id");
    // Both a different column and the same column are rejected afterwards.
    await expect(table.setUnenforcedPrimaryKey("name")).rejects.toThrow();
    await expect(table.setUnenforcedPrimaryKey("id")).rejects.toThrow();
  });
});
describe("setLsmWriteSpec / unsetLsmWriteSpec", () => {
  let tmpDir: tmp.DirResult;

  beforeEach(() => {
    tmpDir = tmp.dirSync({ unsafeCleanup: true });
  });
  afterEach(() => tmpDir.removeCallback());

  // Every test starts from the same state: an empty single-column table "t"
  // in a fresh connection, with "id" already set as the primary key.
  async function tableWithPrimaryKey(): Promise<Table> {
    const conn: Connection = await connect(tmpDir.name);
    const idOnly = new arrow.Schema([
      new arrow.Field("id", new arrow.Int64(), false),
    ]);
    const table = await conn.createEmptyTable("t", idOnly);
    await table.setUnenforcedPrimaryKey("id");
    return table;
  }

  it("installs and removes a bucket spec", async () => {
    const table = await tableWithPrimaryKey();
    await table.setLsmWriteSpec({
      specType: "bucket",
      column: "id",
      numBuckets: 4,
    });
    await table.unsetLsmWriteSpec();
    // A second unset errors — there is no spec left to remove.
    await expect(table.unsetLsmWriteSpec()).rejects.toThrow();
    // A fresh spec can be installed after unset.
    await table.setLsmWriteSpec({
      specType: "bucket",
      column: "id",
      numBuckets: 8,
    });
  });

  it("installs an unsharded spec", async () => {
    const table = await tableWithPrimaryKey();
    await table.setLsmWriteSpec({ specType: "unsharded" });
    await table.unsetLsmWriteSpec();
  });

  it("installs an identity spec", async () => {
    const table = await tableWithPrimaryKey();
    await table.setLsmWriteSpec({ specType: "identity", column: "id" });
    await table.unsetLsmWriteSpec();
  });

  it("rejects an invalid spec", async () => {
    const table = await tableWithPrimaryKey();

    // num_buckets out of range.
    const zeroBuckets = table.setLsmWriteSpec({
      specType: "bucket",
      column: "id",
      numBuckets: 0,
    });
    await expect(zeroBuckets).rejects.toThrow();

    // Column mismatch.
    const unknownColumn = table.setLsmWriteSpec({
      specType: "bucket",
      column: "missing",
      numBuckets: 4,
    });
    await expect(unknownColumn).rejects.toThrow();
  });
});

View File

@@ -38,14 +38,5 @@ test("filtering examples", async () => {
// --8<-- [start:sql_search]
await tbl.query().where("id = 10").limit(10).toArray();
// --8<-- [end:sql_search]
// --8<-- [start:orderby_search]
await tbl
.query()
.where("id > 10")
.orderBy({ columnName: "id", ascending: false })
.limit(5)
.toArray();
// --8<-- [end:orderby_search]
});
});

4810
nodejs/examples/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -11,17 +11,16 @@
"test": "node --experimental-vm-modules node_modules/.bin/jest --testEnvironment jest-environment-node-single-context --verbose",
"lint": "biome check *.ts && biome format *.ts",
"lint-ci": "biome ci .",
"lint-fix": "biome check --write *.ts && pnpm format",
"lint-fix": "biome check --write *.ts && npm run format",
"format": "biome format --write *.ts"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"packageManager": "pnpm@11.1.1",
"dependencies": {
"@huggingface/transformers": "3.0.2",
"@huggingface/transformers": "^3.0.2",
"@lancedb/lancedb": "file:../dist",
"openai": "4.29.2",
"sharp": "0.33.5"
"openai": "^4.29.2",
"sharp": "^0.33.5"
},
"devDependencies": {
"@biomejs/biome": "^1.7.3",

File diff suppressed because it is too large Load Diff

View File

@@ -1,13 +0,0 @@
# Block resolution of versions less than 24h old (Shai-Hulud window).
# This is the pnpm 11 default but pinned here so it's visible to
# reviewers and survives a future pnpm major flipping the default.
minimumReleaseAge: 1440
# Fail install if a transitive dep tries to run an unapproved script.
strictDepBuilds: true
allowBuilds:
'@biomejs/biome': true
onnxruntime-node: true
protobufjs: true
sharp: true

View File

@@ -1291,18 +1291,6 @@ export async function fromRecordBatchToBuffer(
return Buffer.from(await writer.toUint8Array());
}
/**
 * Serialize one record batch into a buffer using the Arrow IPC Stream format.
 *
 * Every call yields a self-contained Stream message (schema + batch + EOS),
 * so it can be decoded incrementally by `arrow_ipc::reader::StreamReader`.
 */
export async function fromRecordBatchToStreamBuffer(
  batch: RecordBatch,
): Promise<Buffer> {
  const streamWriter = RecordBatchStreamWriter.writeAll([batch]);
  const bytes = await streamWriter.toUint8Array();
  return Buffer.from(bytes);
}
/**
* Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
*

View File

@@ -16,18 +16,6 @@ import {
} from "./arrow";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { Connection as LanceDbConnection } from "./native";
import type {
CreateNamespaceResponse,
DescribeNamespaceResponse,
DropNamespaceResponse,
ListNamespacesResponse,
} from "./native";
export type {
CreateNamespaceResponse,
DescribeNamespaceResponse,
DropNamespaceResponse,
ListNamespacesResponse,
};
import { sanitizeTable } from "./sanitize";
import { LocalTable, Table } from "./table";
@@ -122,28 +110,6 @@ export interface TableNamesOptions {
/** An optional limit to the number of results to return. */
limit?: number;
}
export interface ListNamespacesOptions {
/** Token from a previous response for pagination. */
pageToken?: string;
/** An optional limit to the number of results to return. */
limit?: number;
}
export interface CreateNamespaceOptions {
/** Creation mode. */
mode?: "create" | "exist_ok" | "overwrite";
/** Properties to set on the new namespace. */
properties?: Record<string, string>;
}
export interface DropNamespaceOptions {
/** Whether to skip if the namespace doesn't exist, or fail. */
mode?: "skip" | "fail";
/** Refuse to drop if non-empty (restrict) or drop recursively (cascade). */
behavior?: "restrict" | "cascade";
}
/**
* A LanceDB Connection that allows you to open tables and create new ones.
*
@@ -296,81 +262,12 @@ export abstract class Connection {
*/
abstract dropTable(name: string, namespacePath?: string[]): Promise<void>;
/**
 * Rename the table `oldName` to `newName`.
 * @param {string} oldName The current name of the table.
 * @param {string} newName The name the table should have afterwards.
 * @param {string[]} namespacePath The namespace path of the table
 * (LocalConnection passes an empty path when this is omitted).
 */
abstract renameTable(
oldName: string,
newName: string,
namespacePath?: string[],
): Promise<void>;
/**
* Drop all tables in the database.
* @param {string[]} namespacePath The namespace path to drop tables from (defaults to root namespace).
*/
abstract dropAllTables(namespacePath?: string[]): Promise<void>;
/**
* Describe a namespace, returning its properties.
*
* @param {string[]} namespacePath - The namespace path to describe, in
* parent → child order, e.g. `["analytics", "sales"]`.
* @returns {Promise<DescribeNamespaceResponse>} The namespace's properties
* (may be undefined if the namespace has none).
*/
abstract describeNamespace(
namespacePath: string[],
): Promise<DescribeNamespaceResponse>;
/**
* List the immediate child namespaces under the given parent.
*
* Results may be paginated. To retrieve subsequent pages, pass the
* `pageToken` returned by a previous call.
*
* @param {string[]} namespacePath - The parent namespace path. Defaults
* to the root namespace if omitted.
* @param {Partial<ListNamespacesOptions>} options - Pagination options
* (`pageToken`, `limit`).
* @returns {Promise<ListNamespacesResponse>} Child namespace names and
* an optional token for fetching the next page.
*/
abstract listNamespaces(
namespacePath?: string[],
options?: Partial<ListNamespacesOptions>,
): Promise<ListNamespacesResponse>;
/**
* Create a new namespace at the given path.
*
* @param {string[]} namespacePath - The namespace path to create.
* @param {Partial<CreateNamespaceOptions>} options - Creation `mode`
* ("create" | "exist_ok" | "overwrite") and optional `properties`
* to attach to the namespace.
* @returns {Promise<CreateNamespaceResponse>} The properties of the
* created namespace and an optional transaction id.
*/
abstract createNamespace(
namespacePath: string[],
options?: Partial<CreateNamespaceOptions>,
): Promise<CreateNamespaceResponse>;
/**
* Drop a namespace.
*
* Use `behavior: "cascade"` to also drop everything contained in the
* namespace (sub-namespaces and tables). The default `"restrict"`
* behavior refuses to drop a non-empty namespace.
*
* @param {string[]} namespacePath - The namespace path to drop.
* @param {Partial<DropNamespaceOptions>} options - `mode` ("skip" | "fail"
* for missing-namespace handling) and `behavior` ("restrict" | "cascade").
* @returns {Promise<DropNamespaceResponse>} Any properties returned by
* the server and an optional transaction id.
*/
abstract dropNamespace(
namespacePath: string[],
options?: Partial<DropNamespaceOptions>,
): Promise<DropNamespaceResponse>;
/**
* Clone a table from a source table.
*
@@ -615,56 +512,9 @@ export class LocalConnection extends Connection {
return this.inner.dropTable(name, namespacePath ?? []);
}
// Delegates to the native connection; an omitted namespace path becomes [].
async renameTable(
  oldName: string,
  newName: string,
  namespacePath?: string[],
): Promise<void> {
  const path = namespacePath ?? [];
  return this.inner.renameTable(oldName, newName, path);
}
// Delegates to the native connection; an omitted namespace path becomes [].
async dropAllTables(namespacePath?: string[]): Promise<void> {
  const path = namespacePath ?? [];
  return this.inner.dropAllTables(path);
}
// Forwards the path unchanged to the native connection.
describeNamespace(
  namespacePath: string[],
): Promise<DescribeNamespaceResponse> {
  const response = this.inner.describeNamespace(namespacePath);
  return response;
}
// Unpacks the options into the positional arguments of the native call;
// an omitted namespace path becomes [].
listNamespaces(
  namespacePath?: string[],
  options?: Partial<ListNamespacesOptions>,
): Promise<ListNamespacesResponse> {
  const parent = namespacePath ?? [];
  const { pageToken, limit } = options ?? {};
  return this.inner.listNamespaces(parent, pageToken, limit);
}
// Unpacks the options into the positional arguments of the native call.
createNamespace(
  namespacePath: string[],
  options?: Partial<CreateNamespaceOptions>,
): Promise<CreateNamespaceResponse> {
  const { mode, properties } = options ?? {};
  return this.inner.createNamespace(namespacePath, mode, properties);
}
// Unpacks the options into the positional arguments of the native call.
dropNamespace(
  namespacePath: string[],
  options?: Partial<DropNamespaceOptions>,
): Promise<DropNamespaceResponse> {
  const { mode, behavior } = options ?? {};
  return this.inner.dropNamespace(namespacePath, mode, behavior);
}
}
/**

View File

@@ -8,7 +8,6 @@ import {
} from "./connection";
import {
ConnectNamespaceOptions,
ConnectionOptions,
Connection as LanceDbConnection,
JsHeaderProvider as NativeJsHeaderProvider,
@@ -23,7 +22,6 @@ export { JsHeaderProvider as NativeJsHeaderProvider } from "./native.js";
export {
AddColumnsSql,
ConnectionOptions,
ConnectNamespaceOptions,
IndexStatistics,
IndexConfig,
ClientConfig,
@@ -64,13 +62,6 @@ export {
CreateTableOptions,
TableNamesOptions,
OpenTableOptions,
ListNamespacesOptions,
CreateNamespaceOptions,
DropNamespaceOptions,
ListNamespacesResponse,
CreateNamespaceResponse,
DropNamespaceResponse,
DescribeNamespaceResponse,
} from "./connection";
export { Session } from "./native.js";
@@ -82,7 +73,6 @@ export {
VectorQuery,
TakeQuery,
QueryExecutionOptions,
ColumnOrdering,
FullTextSearchOptions,
RecordBatchIterator,
FullTextQuery,
@@ -113,7 +103,6 @@ export {
UpdateOptions,
OptimizeOptions,
Version,
LsmWriteSpec,
ColumnAlteration,
} from "./table";
@@ -128,7 +117,6 @@ export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";
export * as embedding from "./embedding";
export { permutationBuilder, PermutationBuilder } from "./permutation";
export { Scannable, ScannableOptions } from "./scannable";
export * as rerankers from "./rerankers";
export {
SchemaLike,
@@ -305,197 +293,3 @@ export async function connect(
);
return new LocalConnection(nativeConn);
}
/**
* Configuration for the built-in directory namespace (`"dir"`).
*
* The directory namespace stores tables under a single root path (local
* filesystem or object storage URI). See
* {@link https://docs.lancedb.com/namespaces} for the documented surface;
* less-common knobs live under {@link DirNamespaceConfig.extraProperties}.
*/
export interface DirNamespaceConfig {
/** Root path or URI containing the LanceDB tables. */
root: string;
/**
* Whether to maintain a namespace manifest at the root. Required for
* child namespaces. Defaults to true on the impl side.
*/
manifestEnabled?: boolean;
/**
* Additional raw properties passed verbatim to the namespace
* implementation (e.g. `storage.*`, `credential_vendor.*`). Typed
* fields above take precedence on key collision.
*/
extraProperties?: Record<string, string>;
}
/**
* Configuration for the built-in REST namespace (`"rest"`).
*
* The REST namespace talks to a remote catalog server over HTTP. See
* {@link https://docs.lancedb.com/namespaces} for the documented surface;
* less-common knobs (TLS, metrics) live under
* {@link RestNamespaceConfig.extraProperties}.
*/
export interface RestNamespaceConfig {
/** Catalog endpoint URL. */
uri: string;
/**
* HTTP headers forwarded with each request. Keys are passed through
* as-is (e.g. `"x-api-key"`, `"Authorization"`).
*/
headers?: Record<string, string>;
/**
* Additional raw properties passed verbatim to the namespace
* implementation (e.g. `tls.*`, `ops_metrics_enabled`, `delimiter`).
* Typed fields above take precedence on key collision.
*/
extraProperties?: Record<string, string>;
}
/**
 * Flatten a directory-namespace config into the string-keyed property map
 * handed to the namespace implementation.
 *
 * Later sources win on key collision: `extraProperties` first, then every
 * other key of `config`, then the derived `manifest_enabled`.
 */
function dirConfigToProperties(
  config: DirNamespaceConfig,
): Record<string, string> {
  const { manifestEnabled, extraProperties, ...passthrough } = config;
  // `passthrough` keeps every key not pulled out above, so unknown keys (for
  // example a raw `manifest_enabled` arriving through the dynamic-impl path)
  // are forwarded rather than dropped.
  const merged: Record<string, string> = {
    ...(extraProperties ?? {}),
    ...(passthrough as Record<string, string>),
  };
  if (manifestEnabled === undefined) {
    return merged;
  }
  merged.manifest_enabled = String(manifestEnabled);
  return merged;
}
/**
 * Flatten a REST-namespace config into the string-keyed property map handed
 * to the namespace implementation.
 *
 * Later sources win on key collision: `extraProperties` first, then every
 * other key of `config`, then one `headers.<name>` entry per header.
 */
function restConfigToProperties(
  config: RestNamespaceConfig,
): Record<string, string> {
  const { headers, extraProperties, ...passthrough } = config;
  const merged: Record<string, string> = {
    ...(extraProperties ?? {}),
    ...(passthrough as Record<string, string>),
  };
  // Header names are used verbatim after the `headers.` prefix.
  Object.entries(headers ?? {}).forEach(([name, value]) => {
    merged[`headers.${name}`] = value;
  });
  return merged;
}
/**
* Connect to a LanceDB database through a namespace.
*
* Unlike {@link connect}, which routes by URI scheme (local path vs.
* `db://` cloud), `connectNamespace` always returns a namespace-backed
* connection. The `implName` selects the namespace implementation:
*
* - `"dir"` — directory namespace, configured with {@link DirNamespaceConfig}.
* - `"rest"` — remote REST catalog, configured with {@link RestNamespaceConfig}.
* - Any other string — full module path for a custom implementation,
* configured with a free-form string-keyed `properties` map.
*
* @example Typed dir namespace
* ```ts
* const db = await connectNamespace("dir", { root: "/path/to/db" });
* await db.createTable("users", [{ id: 1 }]);
* ```
*
* @example Typed REST namespace with auth headers
* ```ts
* const db = await connectNamespace("rest", {
* uri: "https://catalog.example.com",
* headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
* });
* ```
*
* @example Custom implementation with raw properties
* ```ts
* const db = await connectNamespace("my.custom.Namespace", {
* endpoint: "...",
* });
* ```
*/
export function connectNamespace(
implName: "dir",
config: DirNamespaceConfig,
options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection>;
/**
* Connect through the built-in REST namespace.
*
* Configured with {@link RestNamespaceConfig}. See the function-level
* documentation above for the full surface, examples, and how this
* relates to {@link connect}.
*
* @example
* ```ts
* const db = await connectNamespace("rest", {
* uri: "https://catalog.example.com",
* headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
* });
* ```
*/
export function connectNamespace(
implName: "rest",
config: RestNamespaceConfig,
options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection>;
/**
* Connect through a custom namespace implementation by full module path,
* configured with a free-form string-keyed `properties` map. Use the
* typed overloads above for the built-in `"dir"` and `"rest"` impls.
*
* See the function-level documentation above for examples and how this
* relates to {@link connect}.
*
* @example
* ```ts
* const db = await connectNamespace("my.custom.Namespace", {
* endpoint: "...",
* });
* ```
*/
export function connectNamespace(
implName: string,
properties: Record<string, string>,
options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection>;
// Implementation shared by the `connectNamespace` overloads declared above.
// Translates the typed config of the built-in "dir" / "rest" namespaces into
// a flat property map (any other `implName` is taken to already be one),
// cleanses the storage options, and opens a namespace-backed native
// connection wrapped in a LocalConnection.
export async function connectNamespace(
  implName: string,
  configOrProperties:
    | DirNamespaceConfig
    | RestNamespaceConfig
    | Record<string, string>,
  options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection> {
  let properties: Record<string, string>;
  if (implName === "dir") {
    properties = dirConfigToProperties(
      configOrProperties as DirNamespaceConfig,
    );
  } else if (implName === "rest") {
    properties = restConfigToProperties(
      configOrProperties as RestNamespaceConfig,
    );
  } else {
    properties = configOrProperties as Record<string, string>;
  }

  // Work on a shallow copy: assigning the cleansed storage options to the
  // caller's own `options` object would silently mutate their argument.
  const finalOptions = { ...(options ?? {}) } as ConnectNamespaceOptions;
  finalOptions.storageOptions = cleanseStorageOptions(
    finalOptions.storageOptions,
  );

  const nativeConn = await LanceDbConnection.newWithNamespace(
    implName,
    properties,
    finalOptions,
  );
  return new LocalConnection(nativeConn);
}

View File

@@ -79,12 +79,6 @@ export interface QueryExecutionOptions {
timeoutMs?: number;
}
/**
 * One sort key for `orderBy`: the column to sort on, plus the direction and
 * where nulls are placed.
 */
export interface ColumnOrdering {
  /** Name of the column to sort by. */
  columnName: string;
  /** Sort ascending when `true`. `orderBy` treats an omitted value as `true`. */
  ascending?: boolean;
  /** Place nulls first when `true`. `orderBy` treats an omitted value as `false`. */
  nullsFirst?: boolean;
}
/**
* Options that control the behavior of a full text search
*/
@@ -423,21 +417,6 @@ export class StandardQueryBase<
return this;
}
/**
 * Order the query results by one or more columns.
 *
 * Accepts a single ordering or an array of orderings. An omitted `ascending`
 * is sent as `true` and an omitted `nullsFirst` is sent as `false`.
 * @returns This query builder.
 */
orderBy(ordering: ColumnOrdering | ColumnOrdering[]): this {
  const requested = Array.isArray(ordering) ? ordering : [ordering];
  const resolved: Required<ColumnOrdering>[] = [];
  for (const entry of requested) {
    // Fill in the defaults so the native side always gets all three fields.
    resolved.push({
      columnName: entry.columnName,
      ascending: entry.ascending ?? true,
      nullsFirst: entry.nullsFirst ?? false,
    });
  }
  this.doCall((inner) => inner.orderBy(resolved));
  return this;
}
/**
* Skip searching un-indexed data. This can make search faster, but will miss
* any data that is not yet indexed.

View File

@@ -1,274 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import {
Table as ArrowTable,
RecordBatch,
RecordBatchReader,
Schema,
} from "apache-arrow";
import {
fromRecordBatchToStreamBuffer,
fromTableToBuffer,
makeEmptyTable,
} from "./arrow";
import { NapiScannable } from "./native.js";
/**
 * Optional hints accepted by the `Scannable` factory methods
 * (`fromFactory`, `fromTable`, `fromIterable`, `fromRecordBatchReader`).
 */
export interface ScannableOptions {
  /** Hint about the number of rows. Not validated against the stream. */
  numRows?: number;
  /**
   * Whether the source can be scanned more than once. Defaults to `true` for
   * `fromTable` / `fromFactory` and `false` for `fromIterable` /
   * `fromRecordBatchReader`.
   */
  rescannable?: boolean;
}
/**
* A data source that can be scanned as a stream of Arrow `RecordBatch`es.
*
* `Scannable` wraps the schema + optional row count + rescannable flag and
* a callback that yields batches one at a time. It is passed to consumers
* (e.g. `Table.add`, `createTable`, `mergeInsert` — follow-up work) that
* need to pull data without materializing the full dataset in JS memory.
*
* Batches cross the JS↔Rust boundary as Arrow IPC Stream messages; a fresh
* writer serializes each batch, and the Rust side decodes it with
* `arrow_ipc::reader::StreamReader`. One batch is in flight at a time.
*/
export class Scannable {
  /** Arrow schema declared for the batches this source produces. */
  readonly schema: Schema;
  /** Row-count hint, or `null` when none was supplied. */
  readonly numRows: number | null;
  /** Whether the source was declared safe to scan more than once. */
  readonly rescannable: boolean;
  /** @hidden */
  private readonly native: NapiScannable;

  private constructor(
    native: NapiScannable,
    schema: Schema,
    numRows: number | null,
    rescannable: boolean,
  ) {
    this.native = native;
    this.schema = schema;
    this.numRows = numRows;
    this.rescannable = rescannable;
  }

  /** @hidden Access the native handle for passing through to Rust consumers. */
  get inner(): NapiScannable {
    return this.native;
  }

  /**
   * Build a Scannable from an explicit schema and a factory that returns a
   * fresh batch iterator on each call.
   *
   * The factory is invoked once per scan. Each iterator yields
   * `RecordBatch`es matching the declared schema. Use this when you need
   * direct control over the pull loop — for example, to wrap a streaming
   * source whose batches are produced lazily.
   *
   * @param schema - The Arrow schema of the produced batches.
   * @param factory - Called at the start of each scan to produce a batch
   *   iterator. Must be idempotent when `rescannable` is true.
   * @param opts - Optional hints. `rescannable` defaults to `true`; set to
   *   `false` if calling `factory()` twice would not reproduce the same data.
   * @throws TypeError if `opts.numRows` is not a non-negative integer.
   */
  static async fromFactory(
    schema: Schema,
    factory: () =>
      | AsyncIterable<RecordBatch>
      | Iterable<RecordBatch>
      | AsyncIterator<RecordBatch>
      | Iterator<RecordBatch>,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    const numRows = opts.numRows ?? null;
    if (numRows != null && !Number.isInteger(numRows)) {
      throw new TypeError("numRows must be an integer");
    }
    // A row count cannot be negative; reject it here with a clear message
    // rather than handing it to the native layer.
    if (numRows != null && numRows < 0) {
      throw new TypeError("numRows must be non-negative");
    }
    const rescannable = opts.rescannable ?? true;

    let iter: AsyncIterator<RecordBatch> | Iterator<RecordBatch> | null = null;

    const getNextBatch = async (isStart: boolean): Promise<Buffer | null> => {
      // `isStart` is true on the first pull of every new scan_as_stream.
      // Drop any cached iterator so factory() is re-invoked for the next scan
      if (isStart) {
        iter = null;
      }
      if (iter === null) {
        iter = normalizeIterator(factory());
      }
      const result = await iter.next();
      if (result.done) {
        // Exhausted: forget the iterator so a later pull starts a new one.
        iter = null;
        return null;
      }
      return fromRecordBatchToStreamBuffer(result.value);
    };

    // The native constructor receives the schema as an empty-table buffer.
    const schemaBuf = await fromTableToBuffer(makeEmptyTable(schema));
    const native = new NapiScannable(
      schemaBuf,
      numRows,
      rescannable,
      getNextBatch,
    );
    return new Scannable(native, schema, numRows, rescannable);
  }

  /**
   * Build a Scannable from an in-memory Arrow `Table`. Always rescannable;
   * the table's batches are replayed on each scan.
   *
   * The table's row count is authoritative: `opts.numRows` must either be
   * omitted or equal to `table.numRows`. `opts.rescannable` of `false` is
   * rejected because in-memory Tables are always rescannable.
   *
   * @throws TypeError if `opts.numRows` disagrees with `table.numRows` or
   *   `opts.rescannable` is `false`.
   */
  static async fromTable(
    table: ArrowTable,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    if (opts.numRows != null && opts.numRows !== table.numRows) {
      throw new TypeError(
        `opts.numRows (${opts.numRows}) does not match table.numRows (${table.numRows}). ` +
          `The table's row count is authoritative; omit numRows or pass the matching value.`,
      );
    }
    if (opts.rescannable === false) {
      throw new TypeError(
        `fromTable does not accept rescannable: false. ` +
          `In-memory Arrow Tables are always rescannable; omit the option or pass true.`,
      );
    }
    return Scannable.fromFactory(table.schema, () => table.batches, {
      numRows: table.numRows,
      rescannable: true,
    });
  }

  /**
   * Build a Scannable from an iterable of `RecordBatch`es. `rescannable`
   * defaults to `false`. Pass an explicit schema so the consumer can
   * validate before any batch is pulled.
   *
   * `opts.rescannable: true` is honest for replayable iterables (Arrays,
   * Sets, or custom iterables whose `[Symbol.iterator]()` returns a fresh
   * iterator each call). It is rejected for one-shot iterables (generators,
   * async generators, or already-an-iterator inputs) because their
   * `[Symbol.iterator]()` returns the same exhausted object on the second
   * scan. For replayable sources outside this shape, use
   * `fromFactory(schema, () => createIter(), { rescannable: true })`.
   *
   * Note: when `opts.rescannable` is `true`, the constructor calls
   * `[Symbol.iterator]()` once on the input to perform the structural check.
   *
   * @throws TypeError if `opts.rescannable` is `true` for a one-shot iterable.
   */
  static async fromIterable(
    schema: Schema,
    iter: AsyncIterable<RecordBatch> | Iterable<RecordBatch>,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    if (opts.rescannable === true && isOneShotIterable(iter)) {
      throw new TypeError(
        `fromIterable: rescannable: true is not honest for one-shot iterables ` +
          `(generators, async generators, or iterators where [Symbol.iterator]() ` +
          `returns the same object). The source would be exhausted after the first scan. ` +
          `Use fromFactory(schema, () => createIter(), { rescannable: true }) for sources ` +
          `where each call mints a fresh iterator.`,
      );
    }
    return Scannable.fromFactory(schema, () => iter, {
      numRows: opts.numRows,
      rescannable: opts.rescannable ?? false,
    });
  }

  /**
   * Build a Scannable from an Arrow `RecordBatchReader`. A reader can only
   * be consumed once; `rescannable` defaults to `false`.
   *
   * The reader must already be opened (via `.open()`) so its `.schema` is
   * populated. `RecordBatchReader.from(...)` returns an unopened reader.
   *
   * `opts.rescannable: true` is rejected because `RecordBatchReader` is a
   * self-iterator (its `[Symbol.iterator]()` returns itself), and this
   * constructor does not call `reader.reset()` between scans, so a second
   * scan would always see an exhausted reader. For genuinely replayable
   * sources, use
   * `fromFactory(schema, () => openReader(), { rescannable: true })`,
   * which mints a fresh reader on each scan.
   *
   * @throws TypeError if `opts.rescannable` is `true`, or if the reader has
   *   no schema yet (it was not opened).
   */
  static async fromRecordBatchReader(
    reader: RecordBatchReader,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    if (opts.rescannable === true) {
      throw new TypeError(
        `fromRecordBatchReader does not accept rescannable: true. ` +
          `RecordBatchReader is a self-iterator (its [Symbol.iterator]() ` +
          `returns itself) and would be exhausted after the first scan. ` +
          `Use fromFactory(schema, () => openReader(), { rescannable: true }) ` +
          `for sources where each call mints a fresh reader.`,
      );
    }
    // Fail fast on an unopened reader instead of passing a missing schema
    // into the schema-serialization step, where the error would be obscure.
    const schema = reader.schema;
    if (schema == null) {
      throw new TypeError(
        `fromRecordBatchReader requires an opened reader: reader.schema is not populated. ` +
          `Call reader.open() before passing it in.`,
      );
    }
    return Scannable.fromFactory(schema, () => reader, {
      numRows: opts.numRows,
      rescannable: false,
    });
  }
}
/**
 * Turn whatever a Scannable factory returned into something with `.next()`.
 *
 * Preference order: async-iterable, then sync-iterable, then a value that is
 * already an iterator. Anything else is rejected with a `TypeError`.
 */
function normalizeIterator<T>(
  source: AsyncIterable<T> | Iterable<T> | AsyncIterator<T> | Iterator<T>,
): AsyncIterator<T> | Iterator<T> {
  if (source == null) {
    throw new TypeError("Scannable factory returned null/undefined");
  }

  const openAsync = (source as Partial<AsyncIterable<T>>)[Symbol.asyncIterator];
  if (typeof openAsync === "function") {
    return openAsync.call(source);
  }

  const openSync = (source as Partial<Iterable<T>>)[Symbol.iterator];
  if (typeof openSync === "function") {
    return openSync.call(source);
  }

  // No iterable protocol: accept it only if it is itself an iterator.
  const alreadyIterator =
    typeof (source as Partial<Iterator<T>>).next === "function";
  if (!alreadyIterator) {
    throw new TypeError("Scannable factory returned a non-iterable value");
  }
  return source as Iterator<T>;
}
// Reports whether `source` can only be walked once.
//
// A source is one-shot when asking it for an iterator hands back the source
// itself (generators and other self-iterators do this), or when it is a bare
// iterator with `.next` but no iterable protocol. Replayable iterables such
// as Array or Set return a fresh iterator each time and are not one-shot.
// The checks run in the same order `normalizeIterator` uses (async protocol,
// then sync protocol, then bare iterator) so the verdict matches what a scan
// would actually consume. Note that the check calls the iterator method once.
function isOneShotIterable(
  source: AsyncIterable<unknown> | Iterable<unknown>,
): boolean {
  // null/undefined is left for `normalizeIterator` to reject at scan time.
  if (source == null) {
    return false;
  }
  const self: unknown = source;

  const openAsync = (source as Partial<AsyncIterable<unknown>>)[
    Symbol.asyncIterator
  ];
  if (typeof openAsync === "function") {
    const produced: unknown = openAsync.call(source);
    return produced === self;
  }

  const openSync = (source as Partial<Iterable<unknown>>)[Symbol.iterator];
  if (typeof openSync === "function") {
    const produced: unknown = openSync.call(source);
    return produced === self;
  }

  // A bare iterator cannot be restarted, so it is one-shot by definition.
  return typeof (source as { next?: unknown }).next === "function";
}

View File

@@ -106,27 +106,6 @@ export interface Version {
metadata: Record<string, string>;
}
/**
* Specification selecting Lance's MemWAL LSM-style write path for
* `mergeInsert`.
*
* `specType` is `"bucket"`, `"identity"`, or `"unsharded"`. For `"bucket"`,
* `column` and `numBuckets` are required; for `"identity"`, `column` is
* required.
*/
export interface LsmWriteSpec {
/** One of `"bucket"`, `"identity"`, or `"unsharded"`. */
specType: "bucket" | "identity" | "unsharded";
/** Bucket and identity variants: the sharding column. */
column?: string;
/** Bucket variant: the number of buckets, in `[1, 1024]`. */
numBuckets?: number;
/** Names of indexes the MemWAL should keep up to date during writes. */
maintainedIndexes?: string[];
/** Default `ShardWriter` configuration recorded in the MemWAL index. */
writerConfigDefaults?: Record<string, string>;
}
/**
* A Table is a collection of Records in a LanceDB Database.
*
@@ -306,25 +285,6 @@ export abstract class Table {
*/
abstract prewarmIndex(name: string): Promise<void>;
/**
 * Prewarm one or more columns of data in the table.
 *
 * This will load the column data into the page cache so that future queries that
 * read those columns avoid the initial cold-start latency. This call initiates
 * prewarming and returns once the request is accepted; the warming itself may
 * continue in the background. Calling it on already-prewarmed columns is a
 * no-op on the server.
 *
 * Prewarming is generally useful for columns used in filters or projections.
 * Large columns (e.g. high-dimensional vectors or binary data) may not be
 * practical to prewarm.
 *
 * This feature is currently only supported on remote tables.
 *
 * @param columns The columns to prewarm. If undefined, all columns are prewarmed.
 */
abstract prewarmData(columns?: string[]): Promise<void>;
/**
* Waits for asynchronous indexing to complete on the table.
*
@@ -470,54 +430,6 @@ export abstract class Table {
* containing the new version number of the table after dropping the columns.
*/
abstract dropColumns(columnNames: string[]): Promise<DropColumnsResult>;
/**
 * Set the unenforced primary key for this table to a single column.
 *
 * "Unenforced" means LanceDB does not check uniqueness on writes; the
 * column is recorded in the schema as the primary key for use by features
 * such as `mergeInsert`. Only single-column primary keys are supported,
 * and the key cannot be changed once set.
 * @param {string | string[]} columns The primary key column. A one-element
 * array is also accepted; passing more than one column is rejected.
 * @returns {Promise<void>}
 */
abstract setUnenforcedPrimaryKey(columns: string | string[]): Promise<void>;
/**
* Install an {@link LsmWriteSpec} on this table, selecting Lance's MemWAL
* LSM-style write path for future `mergeInsert` calls.
*
* `LsmWriteSpec` chooses one of three sharding strategies via `specType`:
*
* - `"bucket"` — hash-bucket writes by the single-column unenforced primary
* key (`column` and `numBuckets` required).
* - `"identity"` — shard by the raw value of a scalar `column`.
* - `"unsharded"` — route every write to a single shard.
*
* All variants require the table to have an unenforced primary key
* ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally
* requires it to be the single column being bucketed.
* @param {LsmWriteSpec} spec The sharding spec to install.
* @returns {Promise<void>}
* @example
* ```ts
* await table.setUnenforcedPrimaryKey("id");
* await table.setLsmWriteSpec({
* specType: "bucket",
* column: "id",
* numBuckets: 16,
* maintainedIndexes: ["id_idx"],
* });
* ```
*/
abstract setLsmWriteSpec(spec: LsmWriteSpec): Promise<void>;
/**
* Remove the {@link LsmWriteSpec} from this table, reverting to the standard
* `mergeInsert` write path.
*
* Errors if no spec is currently set.
* @returns {Promise<void>}
*/
abstract unsetLsmWriteSpec(): Promise<void>;
/** Retrieve the version of the table */
abstract version(): Promise<number>;
@@ -798,10 +710,6 @@ export class LocalTable extends Table {
await this.inner.prewarmIndex(name);
}
/** Forward the prewarm request for `columns` to the native table handle. */
async prewarmData(columns?: string[]): Promise<void> {
  const request = this.inner.prewarmData(columns);
  await request;
}
async waitForIndex(
indexNames: string[],
timeoutSeconds: number,
@@ -966,19 +874,6 @@ export class LocalTable extends Table {
return await this.inner.dropColumns(columnNames);
}
/**
 * Record the unenforced primary key via the native table handle, which is
 * always given a list of column names.
 */
async setUnenforcedPrimaryKey(columns: string | string[]): Promise<void> {
  let keyColumns: string[];
  if (typeof columns === "string") {
    // A lone column name is wrapped so the native call always gets an array.
    keyColumns = [columns];
  } else {
    keyColumns = columns;
  }
  const outcome = await this.inner.setUnenforcedPrimaryKey(keyColumns);
  return outcome;
}
/** Pass `spec` through unchanged to the native table handle. */
async setLsmWriteSpec(spec: LsmWriteSpec): Promise<void> {
  const outcome = await this.inner.setLsmWriteSpec(spec);
  return outcome;
}
/** Ask the native table handle to remove the installed write spec. */
async unsetLsmWriteSpec(): Promise<void> {
  const outcome = await this.inner.unsetLsmWriteSpec();
  return outcome;
}
/** Read the table version from the native table handle. */
async version(): Promise<number> {
  const current = await this.inner.version();
  return current;
}

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

10452
nodejs/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.29.1-beta.0",
"version": "0.28.0-beta.10",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",
@@ -38,15 +38,15 @@
"url": "https://github.com/lancedb/lancedb"
},
"devDependencies": {
"@aws-sdk/client-dynamodb": "3.1003.0",
"@aws-sdk/client-kms": "3.1003.0",
"@aws-sdk/client-s3": "3.1003.0",
"@aws-sdk/client-dynamodb": "^3.33.0",
"@aws-sdk/client-kms": "^3.33.0",
"@aws-sdk/client-s3": "^3.33.0",
"@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0",
"@napi-rs/cli": "3.5.1",
"@napi-rs/cli": "^3.5.1",
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2",
"@types/node": "22.7.4",
"@types/node": "^22.7.4",
"@types/tmp": "^0.2.6",
"apache-arrow-15": "npm:apache-arrow@15.0.0",
"apache-arrow-16": "npm:apache-arrow@16.0.0",
@@ -57,9 +57,9 @@
"shx": "^0.3.4",
"tmp": "^0.2.3",
"ts-jest": "^29.1.2",
"typedoc": "0.26.4",
"typedoc-plugin-markdown": "4.2.1",
"typescript": "5.5.4",
"typedoc": "^0.26.4",
"typedoc-plugin-markdown": "^4.2.1",
"typescript": "^5.5.4",
"typescript-eslint": "^7.1.0"
},
"ava": {
@@ -68,16 +68,16 @@
"engines": {
"node": ">= 18"
},
"packageManager": "pnpm@11.1.1",
"cpu": ["x64", "arm64"],
"os": ["darwin", "linux", "win32"],
"scripts": {
"artifacts": "napi artifacts",
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/ && node -e \"require('fs').writeFileSync('dist/package.json', JSON.stringify({name:'@lancedb/lancedb',type:'commonjs'}))\"",
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
"build": "pnpm build:debug && pnpm tsc",
"build-release": "pnpm build:release && pnpm tsc",
"postbuild:release": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
"build": "npm run build:debug && npm run tsc",
"build-release": "npm run build:release && npm run tsc",
"tsc": "tsc -b",
"posttsc": "shx cp lancedb/native.d.ts dist/native.d.ts",
"lint-ci": "biome ci .",
@@ -87,7 +87,7 @@
"lint-fix": "biome check --write . && biome format --write .",
"prepublishOnly": "napi prepublish -t npm",
"test": "jest --verbose",
"integration": "S3_TEST=1 pnpm test",
"integration": "S3_TEST=1 npm run test",
"universal": "napi universalize",
"version": "napi version"
},
@@ -95,8 +95,8 @@
"reflect-metadata": "^0.2.2"
},
"optionalDependencies": {
"@huggingface/transformers": "3.0.2",
"openai": "4.29.2"
"@huggingface/transformers": "^3.0.2",
"openai": "^4.29.2"
},
"peerDependencies": {
"apache-arrow": ">=15.0.0 <=18.1.0"

7317
nodejs/pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,18 +0,0 @@
# Flat node_modules layout. The @napi-rs/cli build step fails to locate
# the cdylib artifact under pnpm's isolated layout; the hoisted linker
# mirrors npm's structure and unblocks the native build.
nodeLinker: hoisted
# Block resolution of versions less than 24h old (Shai-Hulud window).
# This is the pnpm 11 default but pinned here so it's visible to
# reviewers and survives a future pnpm major flipping the default.
minimumReleaseAge: 1440
# Fail install if a transitive dep tries to run an unapproved script.
strictDepBuilds: true
allowBuilds:
'@biomejs/biome': true
onnxruntime-node: true
protobufjs: true
sharp: true

View File

@@ -8,16 +8,12 @@ use lancedb::database::{CreateTableMode, Database};
use napi::bindgen_prelude::*;
use napi_derive::*;
use crate::ConnectNamespaceOptions;
use crate::ConnectionOptions;
use crate::error::NapiErrorExt;
use crate::header::JsHeaderProvider;
use crate::table::Table;
use lancedb::connection::{ConnectBuilder, Connection as LanceDBConnection, connect_namespace};
use lancedb::connection::{ConnectBuilder, Connection as LanceDBConnection};
use lance_namespace::models::{
CreateNamespaceRequest, DescribeNamespaceRequest, DropNamespaceRequest, ListNamespacesRequest,
};
use lancedb::ipc::{ipc_file_to_batches, ipc_file_to_schema};
#[napi]
@@ -25,29 +21,6 @@ pub struct Connection {
inner: Option<LanceDBConnection>,
}
#[napi(object)]
/// Value returned to JS by `Connection::describe_namespace`.
pub struct DescribeNamespaceResponse {
    /// Properties copied from the underlying describe response, if it carried any.
    pub properties: Option<HashMap<String, String>>,
}
#[napi(object)]
/// Value returned to JS by `Connection::list_namespaces`.
pub struct ListNamespacesResponse {
    /// Namespaces copied from the underlying list response.
    pub namespaces: Vec<String>,
    /// Page token copied from the underlying response, if any.
    /// NOTE(review): presumably passed back as `page_token` to fetch the next
    /// page — confirm against the namespace API.
    pub page_token: Option<String>,
}
#[napi(object)]
/// Value returned to JS by `Connection::create_namespace`.
pub struct CreateNamespaceResponse {
    /// Properties copied from the underlying create response, if it carried any.
    pub properties: Option<HashMap<String, String>>,
    /// Transaction id copied unchanged from the underlying response, if any.
    pub transaction_id: Option<String>,
}
#[napi(object)]
/// Value returned to JS by `Connection::drop_namespace`.
pub struct DropNamespaceResponse {
    /// Properties copied from the underlying drop response, if it carried any.
    pub properties: Option<HashMap<String, String>>,
    /// Transaction ids copied unchanged from the underlying response, if any.
    /// Unlike `CreateNamespaceResponse`, this is a list.
    pub transaction_id: Option<Vec<String>>,
}
impl Connection {
pub(crate) fn inner_new(inner: LanceDBConnection) -> Self {
Self { inner: Some(inner) }
@@ -94,12 +67,6 @@ impl Connection {
builder = builder.storage_option(key, value);
}
}
if let Some(manifest_enabled) = options.manifest_enabled {
builder = builder.manifest_enabled(manifest_enabled);
}
if let Some(namespace_client_properties) = options.namespace_client_properties {
builder = builder.namespace_client_properties(namespace_client_properties);
}
// Create client config, optionally with header provider
let client_config = options.client_config.unwrap_or_default();
@@ -133,39 +100,6 @@ impl Connection {
Ok(Self::inner_new(builder.execute().await.default_error()?))
}
/// Create a new Connection instance backed by a namespace implementation.
#[napi(factory)]
pub async fn new_with_namespace(
impl_name: String,
properties: HashMap<String, String>,
options: ConnectNamespaceOptions,
) -> napi::Result<Self> {
if impl_name.is_empty() {
return Err(napi::Error::from_reason(
"implName must be a non-empty string",
));
}
let mut builder = connect_namespace(&impl_name, properties);
if let Some(interval) = options.read_consistency_interval {
builder =
builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));
}
if let Some(storage_options) = options.storage_options {
for (key, value) in storage_options {
builder = builder.storage_option(key, value);
}
}
if let Some(namespace_client_properties) = options.namespace_client_properties {
builder = builder.namespace_client_properties(namespace_client_properties);
}
if let Some(session) = options.session {
builder = builder.session(session.inner.clone());
}
Ok(Self::inner_new(builder.execute().await.default_error()?))
}
#[napi]
pub fn display(&self) -> napi::Result<String> {
Ok(self.get_inner()?.to_string())
@@ -328,149 +262,9 @@ impl Connection {
.default_error()
}
/// Rename a table. The same namespace path (empty when omitted) is passed as
/// both the current and the new namespace.
#[napi(catch_unwind)]
pub async fn rename_table(
    &self,
    old_name: String,
    new_name: String,
    namespace_path: Option<Vec<String>>,
) -> napi::Result<()> {
    let namespace = namespace_path.unwrap_or_default();
    let connection = self.get_inner()?;
    let renamed = connection
        .rename_table(&old_name, &new_name, &namespace, &namespace)
        .await;
    renamed.default_error()
}
/// Drop all tables under the given namespace path (empty path when omitted).
#[napi(catch_unwind)]
pub async fn drop_all_tables(&self, namespace_path: Option<Vec<String>>) -> napi::Result<()> {
    let namespace = namespace_path.unwrap_or_default();
    let connection = self.get_inner()?;
    let dropped = connection.drop_all_tables(&namespace).await;
    dropped.default_error()
}
/// Describe a namespace and return its properties.
#[napi(catch_unwind)]
pub async fn describe_namespace(
    &self,
    namespace_path: Vec<String>,
) -> napi::Result<DescribeNamespaceResponse> {
    let request = DescribeNamespaceRequest {
        id: Some(namespace_path),
        ..Default::default()
    };
    let connection = self.get_inner()?;
    let described = connection
        .describe_namespace(request)
        .await
        .default_error()?;
    let properties = described.properties;
    Ok(DescribeNamespaceResponse { properties })
}
#[napi(catch_unwind)]
/// List child namespaces under the given namespace path
pub async fn list_namespaces(
&self,
namespace_path: Option<Vec<String>>,
page_token: Option<String>,
limit: Option<u32>,
) -> napi::Result<ListNamespacesResponse> {
let req = ListNamespacesRequest {
id: namespace_path,
page_token,
limit: limit.map(|l| l as i32),
..Default::default()
};
let resp = self
.get_inner()?
.list_namespaces(req)
.await
.default_error()?;
Ok(ListNamespacesResponse {
namespaces: resp.namespaces,
page_token: resp.page_token,
})
}
#[napi(catch_unwind)]
/// Create a new namespace with optional properties.
pub async fn create_namespace(
&self,
namespace_path: Vec<String>,
mode: Option<String>,
properties: Option<HashMap<String, String>>,
) -> napi::Result<CreateNamespaceResponse> {
let mode_str = mode
.map(|m| match m.to_lowercase().as_str() {
"create" => Ok("Create".to_string()),
"exist_ok" => Ok("ExistOk".to_string()),
"overwrite" => Ok("Overwrite".to_string()),
_ => Err(napi::Error::from_reason(format!(
"Invalid mode '{}': expected one of 'create', 'exist_ok', 'overwrite'",
m
))),
})
.transpose()?;
let req = CreateNamespaceRequest {
id: Some(namespace_path),
mode: mode_str,
properties,
..Default::default()
};
let resp = self
.get_inner()?
.create_namespace(req)
.await
.default_error()?;
Ok(CreateNamespaceResponse {
properties: resp.properties,
transaction_id: resp.transaction_id,
})
}
#[napi(catch_unwind)]
/// Drop a namespace.
pub async fn drop_namespace(
&self,
namespace_path: Vec<String>,
mode: Option<String>,
behavior: Option<String>,
) -> napi::Result<DropNamespaceResponse> {
let mode_str = mode
.map(|m| match m.to_lowercase().as_str() {
"skip" => Ok("Skip".to_string()),
"fail" => Ok("Fail".to_string()),
_ => Err(napi::Error::from_reason(format!(
"Invalid mode '{}': expected one of 'skip', 'fail'",
m
))),
})
.transpose()?;
let behavior_str = behavior
.map(|b| match b.to_lowercase().as_str() {
"restrict" => Ok("Restrict".to_string()),
"cascade" => Ok("Cascade".to_string()),
_ => Err(napi::Error::from_reason(format!(
"Invalid behavior '{}': expected one of 'restrict', 'cascade'",
b
))),
})
.transpose()?;
let req = DropNamespaceRequest {
id: Some(namespace_path),
mode: mode_str,
behavior: behavior_str,
..Default::default()
};
let resp = self
.get_inner()?
.drop_namespace(req)
.await
.default_error()?;
Ok(DropNamespaceResponse {
properties: resp.properties,
transaction_id: resp.transaction_id,
})
}
}

View File

@@ -16,7 +16,6 @@ pub mod permutation;
mod query;
pub mod remote;
mod rerankers;
mod scannable;
mod session;
mod table;
mod util;
@@ -38,13 +37,6 @@ pub struct ConnectionOptions {
///
/// The available options are described at https://docs.lancedb.com/storage/
pub storage_options: Option<HashMap<String, String>>,
/// (For LanceDB OSS only): use directory namespace manifests as the source
/// of truth for table metadata. Existing directory-listed root tables are
/// migrated into the manifest on access.
pub manifest_enabled: Option<bool>,
/// (For LanceDB OSS only): extra properties for the backing namespace
/// client used by manifest-enabled native connections.
pub namespace_client_properties: Option<HashMap<String, String>>,
/// (For LanceDB OSS only): the session to use for this connection. Holds
/// shared caches and other session-specific state.
pub session: Option<session::Session>,
@@ -68,26 +60,6 @@ pub struct OpenTableOptions {
pub storage_options: Option<HashMap<String, String>>,
}
#[napi(object)]
#[derive(Debug)]
pub struct ConnectNamespaceOptions {
/// The interval, in seconds, at which to check for updates to the table
/// from other processes. If None, then consistency is not checked. For
/// performance reasons, this is the default. For strong consistency, set
/// this to zero seconds. Then every read will check for updates from other
/// processes. As a compromise, you can set this to a non-zero value for
/// eventual consistency.
pub read_consistency_interval: Option<f64>,
/// Configuration for object storage. The available options are described
/// at https://docs.lancedb.com/storage/
pub storage_options: Option<HashMap<String, String>>,
/// Extra properties for the backing namespace client.
pub namespace_client_properties: Option<HashMap<String, String>>,
/// The session to use for this connection. Holds shared caches and other
/// session-specific state.
pub session: Option<session::Session>,
}
#[napi_derive::module_init]
fn init() {
let env = Env::new()

View File

@@ -3,12 +3,6 @@
use std::sync::Arc;
use crate::error::NapiErrorExt;
use crate::error::convert_error;
use crate::iterator::RecordBatchIterator;
use crate::rerankers::RerankHybridCallbackArgs;
use crate::rerankers::Reranker;
use crate::util::{parse_distance_type, schema_to_buffer};
use arrow_array::{
Array, Float16Array as ArrowFloat16Array, Float32Array as ArrowFloat32Array,
Float64Array as ArrowFloat64Array, UInt8Array as ArrowUInt8Array,
@@ -25,27 +19,16 @@ use lancedb::query::QueryBase;
use lancedb::query::QueryExecutionOptions;
use lancedb::query::Select;
use lancedb::query::TakeQuery as LanceDbTakeQuery;
use lancedb::query::{ColumnOrdering as LanceDbColumnOrdering, VectorQuery as LanceDbVectorQuery};
use lancedb::query::VectorQuery as LanceDbVectorQuery;
use napi::bindgen_prelude::*;
use napi_derive::napi;
#[napi(object)]
/// One sort key received from JS; converted into the LanceDB column ordering
/// by the `From` impl that follows.
pub struct ColumnOrdering {
    /// `true` selects an ascending ordering, `false` a descending one.
    pub ascending: bool,
    /// `true` selects the nulls-first variant, `false` the nulls-last one.
    pub nulls_first: bool,
    /// Name of the column to order by.
    pub column_name: String,
}
impl From<ColumnOrdering> for LanceDbColumnOrdering {
fn from(value: ColumnOrdering) -> Self {
match (value.ascending, value.nulls_first) {
(true, true) => Self::asc_nulls_first(value.column_name),
(true, false) => Self::asc_nulls_last(value.column_name),
(false, true) => Self::desc_nulls_first(value.column_name),
(false, false) => Self::desc_nulls_last(value.column_name),
}
}
}
use crate::error::NapiErrorExt;
use crate::error::convert_error;
use crate::iterator::RecordBatchIterator;
use crate::rerankers::RerankHybridCallbackArgs;
use crate::rerankers::Reranker;
use crate::util::{parse_distance_type, schema_to_buffer};
fn bytes_to_arrow_array(data: Uint8Array, dtype: String) -> napi::Result<Arc<dyn Array>> {
let buf = arrow_buffer::Buffer::from(data.to_vec());
@@ -145,18 +128,6 @@ impl Query {
self.inner = self.inner.clone().with_row_id();
}
/// Replace the query's ordering. Each JS ordering is converted to its LanceDB
/// equivalent; `None` is forwarded as `None`.
#[napi]
pub fn order_by(&mut self, ordering: Option<Vec<ColumnOrdering>>) -> napi::Result<()> {
    let converted = match ordering {
        Some(columns) => Some(
            columns
                .into_iter()
                .map(LanceDbColumnOrdering::from)
                .collect(),
        ),
        None => None,
    };
    // The builder consumes itself, so work on a clone and store the result.
    let updated = self.inner.clone().order_by(converted);
    self.inner = updated;
    Ok(())
}
#[napi(catch_unwind)]
pub async fn output_schema(&self) -> napi::Result<Buffer> {
let schema = self.inner.output_schema().await.default_error()?;
@@ -357,18 +328,6 @@ impl VectorQuery {
Ok(())
}
/// Replace the vector query's ordering. Each JS ordering is converted to its
/// LanceDB equivalent; `None` is forwarded as `None`.
#[napi]
pub fn order_by(&mut self, ordering: Option<Vec<ColumnOrdering>>) -> napi::Result<()> {
    let converted = match ordering {
        Some(columns) => Some(
            columns
                .into_iter()
                .map(LanceDbColumnOrdering::from)
                .collect(),
        ),
        None => None,
    };
    // The builder consumes itself, so work on a clone and store the result.
    let updated = self.inner.clone().order_by(converted);
    self.inner = updated;
    Ok(())
}
#[napi(catch_unwind)]
pub async fn output_schema(&self) -> napi::Result<Buffer> {
let schema = self.inner.output_schema().await.default_error()?;

View File

@@ -1,253 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! NodeJS binding for the [`lancedb::data::scannable::Scannable`] trait.
//!
//! The JS side supplies a `getNextBatch(isStart)` callback that returns the
//! next Arrow `RecordBatch` encoded as a self-contained Arrow IPC Stream
//! message (schema message + record batch message + EOS marker) wrapped in a
//! `Buffer`, or `null` when the stream is exhausted. The Rust side parses
//! each buffer with `arrow_ipc::reader::StreamReader`, validates every
//! standalone batch stream against the declared schema, and yields decoded
//! `RecordBatch`es as a [`SendableRecordBatchStream`].
//!
//! `isStart` is `true` on the first `getNextBatch` call of each new
//! `scan_as_stream` and `false` thereafter. JS uses it to drop any cached
//! iterator and re-invoke its factory at scan boundaries, so retries
//! triggered by mid-stream failures restart at batch 0.
use std::io::Cursor;
use std::sync::Arc;
use arrow_array::RecordBatch;
use arrow_ipc::reader::StreamReader;
use arrow_schema::SchemaRef;
use futures::stream::once;
use lancedb::arrow::{SendableRecordBatchStream, SimpleRecordBatchStream};
use lancedb::data::scannable::Scannable as LanceScannable;
use lancedb::ipc::ipc_file_to_schema;
use lancedb::{Error, Result as LanceResult};
use napi::bindgen_prelude::*;
use napi::threadsafe_function::ThreadsafeFunction;
use napi_derive::napi;
/// Threadsafe handle to the JS `getNextBatch` callback. The callback takes a
/// single boolean `isStart` (`true` on the first call of each new scan) and
/// returns a Promise that resolves to a `Buffer` containing one IPC Stream
/// message, or `null` at end-of-stream.
///
/// Rust invokes the handle through `call_async` and then awaits the returned
/// `Promise` (see `pull_next`).
// NOTE(review): the trailing `bool, Status, false` generic arguments look like
// napi's call-argument type, error-status type and `CalleeHandled` flag —
// confirm against the napi version pinned by this crate.
type GetNextBatchFn = ThreadsafeFunction<bool, Promise<Option<Buffer>>, bool, Status, false>;
/// A Rust-side view of a JS-constructed `Scannable`.
///
/// Held in JS as the return value of the `Scannable` class constructor. When
/// passed to a consumer that accepts `impl lancedb::data::scannable::Scannable`,
/// the consumer invokes `scan_as_stream()` to pull batches through the JS
/// callback.
#[napi]
pub struct NapiScannable {
// Schema decoded from the constructor's `schema_buf`. Returned by `schema()`
// and compared against the schema of every decoded batch stream.
schema: SchemaRef,
// Optional row-count hint supplied by the caller; reported as-is by
// `num_rows()` and never checked against the batches actually produced.
num_rows: Option<usize>,
// Whether a second `scan_as_stream` call is allowed after the first one.
rescannable: bool,
// `ThreadsafeFunction` is not `Clone`; wrap in `Arc` so the stream
// returned by `scan_as_stream` can own a handle independent of `self`.
get_next_batch: Arc<GetNextBatchFn>,
// Tracks whether a scan has already started; used to enforce one-shot
// semantics on non-rescannable sources.
scanned: bool,
}
#[napi]
impl NapiScannable {
/// Construct a new `NapiScannable`.
///
/// - `schema_buf` — Arrow IPC File buffer carrying only the schema (no batches).
/// - `num_rows` — optional row count hint; not validated against the stream.
/// - `rescannable` — whether `get_next_batch` may be re-driven after the
/// scan completes.
/// - `get_next_batch` -- JS callback that yields the next batch as an Arrow
/// IPC Stream message wrapped in a `Buffer`, or `null` at EOF. The
/// `isStart` argument is `true` on the first call of each new scan;
/// JS uses it to discard any cached iterator before pulling.
#[napi(constructor)]
pub fn new(
schema_buf: Buffer,
num_rows: Option<i64>,
rescannable: bool,
get_next_batch: Function<bool, Promise<Option<Buffer>>>,
) -> napi::Result<Self> {
let schema = ipc_file_to_schema(schema_buf.to_vec())
.map_err(|e| napi::Error::from_reason(format!("Invalid schema buffer: {}", e)))?;
let num_rows = num_rows
.map(|n| {
usize::try_from(n)
.map_err(|_| napi::Error::from_reason("num_rows must be non-negative"))
})
.transpose()?;
let get_next_batch = Arc::new(get_next_batch.build_threadsafe_function().build()?);
Ok(Self {
schema,
num_rows,
rescannable,
get_next_batch,
scanned: false,
})
}
}
impl std::fmt::Debug for NapiScannable {
    /// Prints the schema, row-count hint and rescannable flag. The JS
    /// callback handle and the internal `scanned` flag are left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("NapiScannable");
        builder.field("schema", &self.schema);
        builder.field("num_rows", &self.num_rows);
        builder.field("rescannable", &self.rescannable);
        builder.finish()
    }
}
impl LanceScannable for NapiScannable {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn scan_as_stream(&mut self) -> SendableRecordBatchStream {
let schema = self.schema.clone();
// One-shot enforcement for non-rescannable sources: return a stream
// whose first item is an error.
if self.scanned && !self.rescannable {
let err_stream = once(async {
Err(Error::InvalidInput {
message: "Scannable has already been consumed (non-rescannable source)"
.to_string(),
})
});
return Box::pin(SimpleRecordBatchStream::new(err_stream, schema));
}
self.scanned = true;
let tsfn = Arc::clone(&self.get_next_batch);
let declared_schema = schema.clone();
// State threaded through the unfold. `is_first_pull` starts true so
// the first call into JS signals a new-scan boundary; JS uses it to
// reset any cached iterator before factory()-ing a fresh one.
let initial = State {
tsfn,
batch_index: 0,
declared_schema,
errored: false,
is_first_pull: true,
};
let stream = futures::stream::unfold(initial, |mut state| async move {
if state.errored {
return None;
}
// Pull the next IPC Stream buffer from JS. `is_first_pull` is
// consumed here and cleared so subsequent pulls continue the
// same scan rather than restarting it.
let is_start = state.is_first_pull;
state.is_first_pull = false;
let buf = match pull_next(&state.tsfn, is_start).await {
Ok(Some(buf)) => buf,
Ok(None) => return None,
Err(e) => {
state.errored = true;
return Some((Err(e), state));
}
};
match decode_one_batch(buf.as_ref(), &state.declared_schema) {
Ok(batch) => {
state.batch_index += 1;
Some((Ok(batch), state))
}
Err(e) => {
let tagged = Error::Runtime {
message: format!(
"[scannable/rust-bridge] failure at batch index {}: {}",
state.batch_index, e
),
};
state.errored = true;
Some((Err(tagged), state))
}
}
});
Box::pin(SimpleRecordBatchStream::new(stream, schema))
}
fn num_rows(&self) -> Option<usize> {
self.num_rows
}
fn rescannable(&self) -> bool {
self.rescannable
}
}
// Per-scan state threaded through the `futures::stream::unfold` in
// `scan_as_stream`; a fresh value is created for every scan.
struct State {
// Handle to the JS `getNextBatch` callback used for every pull.
tsfn: Arc<GetNextBatchFn>,
// Number of batches decoded successfully so far; quoted in the error message
// when decoding a batch fails.
batch_index: usize,
// Schema every decoded batch stream is compared against.
declared_schema: SchemaRef,
// Set once an error item has been emitted; the next poll then ends the stream.
errored: bool,
/// True for the very first pull of a new scan. Forwarded to JS so the
/// callback can drop any cached iterator and call its factory fresh,
/// which makes rescannable sources restart at batch 0 even when the
/// previous scan ended mid-stream.
is_first_pull: bool,
}
/// Invoke the JS callback and await its Promise. `is_start` is forwarded to
/// the JS side as the `isStart` argument so it can reset its iterator at the
/// scan boundary. Errors on the JS side surface here as rejected promises
/// and are tunneled back as `lancedb::Error::Runtime`.
async fn pull_next(tsfn: &GetNextBatchFn, is_start: bool) -> LanceResult<Option<Buffer>> {
let promise = tsfn
.call_async(is_start)
.await
.map_err(|e| Error::Runtime {
message: format!(
"[scannable/js-factory] napi error status={}, reason={}",
e.status, e.reason
),
})?;
promise.await.map_err(|e| Error::Runtime {
message: format!(
"[scannable/js-iterator] napi error status={}, reason={}",
e.status, e.reason
),
})
}
/// Decode one IPC Stream buffer (schema + batch + EOS) into a `RecordBatch`.
/// Each buffer is a standalone IPC stream, so every decoded stream schema must
/// match the one declared at construction.
fn decode_one_batch(buf: &[u8], declared: &SchemaRef) -> LanceResult<RecordBatch> {
let reader = StreamReader::try_new(Cursor::new(buf), None).map_err(|e| Error::Runtime {
message: format!("failed to open IPC stream reader: {}", e),
})?;
let actual = reader.schema();
if actual.as_ref() != declared.as_ref() {
return Err(Error::InvalidInput {
message: format!(
"declared schema does not match stream schema: declared={:?} actual={:?}",
declared, actual
),
});
}
let mut iter = reader;
let batch = iter
.next()
.ok_or_else(|| Error::Runtime {
message: "IPC stream contained schema but no record batch".to_string(),
})?
.map_err(|e| Error::Runtime {
message: format!("failed to decode record batch: {}", e),
})?;
Ok(batch)
}

View File

@@ -159,14 +159,6 @@ impl Table {
.default_error()
}
/// Forwards to the core table's `prewarm_data` with the given optional
/// column list. Fails if the inner table handle cannot be obtained or the
/// core call returns an error.
#[napi(catch_unwind)]
pub async fn prewarm_data(&self, columns: Option<Vec<String>>) -> napi::Result<()> {
    let table = self.inner_ref()?;
    let outcome = table.prewarm_data(columns).await;
    outcome.default_error()
}
#[napi(catch_unwind)]
pub async fn wait_for_index(&self, index_names: Vec<String>, timeout_s: i64) -> Result<()> {
let timeout = std::time::Duration::from_secs(timeout_s.try_into().unwrap());
@@ -344,31 +336,6 @@ impl Table {
Ok(res.into())
}
/// Forwards `columns` to the core table's `set_unenforced_primary_key`.
/// Fails if the inner table handle cannot be obtained or the core call
/// returns an error.
#[napi(catch_unwind)]
pub async fn set_unenforced_primary_key(&self, columns: Vec<String>) -> napi::Result<()> {
    let table = self.inner_ref()?;
    let outcome = table.set_unenforced_primary_key(columns).await;
    outcome.default_error()
}
/// Converts the binding-level `spec` into the core `LsmWriteSpec` and hands
/// it to the core table's `set_lsm_write_spec`.
///
/// The conversion runs first, so an invalid spec is rejected before the
/// inner table handle is requested.
#[napi(catch_unwind)]
pub async fn set_lsm_write_spec(&self, spec: LsmWriteSpec) -> napi::Result<()> {
    let native_spec: lancedb::table::LsmWriteSpec = spec.try_into()?;
    let table = self.inner_ref()?;
    let outcome = table.set_lsm_write_spec(native_spec).await;
    outcome.default_error()
}
/// Forwards to the core table's `unset_lsm_write_spec`. Fails if the inner
/// table handle cannot be obtained or the core call returns an error.
#[napi(catch_unwind)]
pub async fn unset_lsm_write_spec(&self) -> napi::Result<()> {
    let table = self.inner_ref()?;
    let outcome = table.unset_lsm_write_spec().await;
    outcome.default_error()
}
#[napi(catch_unwind)]
pub async fn version(&self) -> napi::Result<i64> {
self.inner_ref()?
@@ -563,63 +530,6 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
}
}
/// Specification selecting Lance's MemWAL LSM-style write path for
/// `mergeInsert`.
///
/// `specType` must be `"bucket"`, `"identity"`, or `"unsharded"`. For
/// `"bucket"`, `column` and `numBuckets` are required; for `"identity"`,
/// `column` is required.
// The `TryFrom<LsmWriteSpec>` conversion in this file turns this plain object
// into `lancedb::table::LsmWriteSpec` and reports a missing required field or
// an unknown `specType` as a `napi::Error`.
#[napi(object)]
#[derive(Clone, Debug)]
pub struct LsmWriteSpec {
/// One of `"bucket"`, `"identity"`, or `"unsharded"`.
pub spec_type: String,
/// Bucket and identity variants: the sharding column.
// The conversion does not read this field for the "unsharded" variant.
pub column: Option<String>,
/// Bucket variant: the number of buckets, in `[1, 1024]`.
// The conversion reads this field for the "bucket" variant only.
// NOTE(review): the `[1, 1024]` range is not checked by the conversion in this
// file — presumably enforced downstream; confirm.
pub num_buckets: Option<u32>,
/// Names of indexes the MemWAL should keep up to date during writes.
// Converted with `unwrap_or_default()` when omitted.
pub maintained_indexes: Option<Vec<String>>,
/// Default `ShardWriter` configuration recorded in the MemWAL index.
// Converted with `unwrap_or_default()` when omitted.
pub writer_config_defaults: Option<HashMap<String, String>>,
}
impl TryFrom<LsmWriteSpec> for lancedb::table::LsmWriteSpec {
type Error = napi::Error;
fn try_from(value: LsmWriteSpec) -> napi::Result<Self> {
let maintained = value.maintained_indexes.unwrap_or_default();
let writer_config_defaults = value.writer_config_defaults.unwrap_or_default();
let spec = match value.spec_type.as_str() {
"bucket" => {
let column = value.column.ok_or_else(|| {
napi::Error::from_reason("LsmWriteSpec bucket requires `column`")
})?;
let num_buckets = value.num_buckets.ok_or_else(|| {
napi::Error::from_reason("LsmWriteSpec bucket requires `numBuckets`")
})?;
Self::bucket(column, num_buckets)
}
"identity" => {
let column = value.column.ok_or_else(|| {
napi::Error::from_reason("LsmWriteSpec identity requires `column`")
})?;
Self::identity(column)
}
"unsharded" => Self::unsharded(),
other => {
return Err(napi::Error::from_reason(format!(
"LsmWriteSpec `specType` must be 'bucket', 'identity', or 'unsharded', got '{}'",
other
)));
}
};
Ok(spec
.with_maintained_indexes(maintained)
.with_writer_config_defaults(writer_config_defaults))
}
}
/// Statistics about a compaction operation.
#[napi(object)]
#[derive(Clone, Debug)]

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.32.1-beta.0"
current_version = "0.31.0-beta.10"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.32.1-beta.0"
version = "0.31.0-beta.10"
publish = false
edition.workspace = true
description = "Python bindings for LanceDB"
@@ -15,11 +15,10 @@ name = "_lancedb"
crate-type = ["cdylib"]
[dependencies]
arrow = { version = "58.0.0", features = ["pyarrow"] }
arrow = { workspace = true, features = ["pyarrow"] }
async-trait = "0.1"
bytes = "1"
lancedb = { path = "../rust/lancedb", default-features = false }
datafusion-common.workspace = true
lance-core.workspace = true
lance-namespace.workspace = true
lance-namespace-impls.workspace = true
@@ -36,8 +35,7 @@ futures.workspace = true
serde = "1"
serde_json = "1"
snafu.workspace = true
tokio = { version = "1.40", features = ["sync", "rt-multi-thread"] }
libc = "0.2"
tokio = { version = "1.40", features = ["sync"] }
[build-dependencies]
pyo3-build-config = { version = "0.28", features = [

View File

@@ -7,6 +7,7 @@ import os
from concurrent.futures import ThreadPoolExecutor
from datetime import timedelta
from typing import Dict, Optional, Union, Any, List
import warnings
__version__ = importlib.metadata.version("lancedb")
@@ -72,7 +73,6 @@ def connect(
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
manifest_enabled: bool = False,
namespace_client_impl: Optional[str] = None,
namespace_client_properties: Optional[Dict[str, str]] = None,
namespace_client_pushdown_operations: Optional[List[str]] = None,
@@ -111,10 +111,6 @@ def connect(
storage_options: dict, optional
Additional options for the storage backend. See available options at
<https://docs.lancedb.com/storage/>
manifest_enabled : bool, default False
When true for local/native connections, use directory namespace
manifests as the source of truth for table metadata. Existing
directory-listed root tables are migrated into the manifest on access.
session: Session, optional
(For LanceDB OSS only)
A session to use for this connection. Sessions allow you to configure
@@ -162,11 +158,11 @@ def connect(
conn : DBConnection
A connection to a LanceDB database.
"""
if namespace_client_impl is not None:
if namespace_client_properties is None:
if namespace_client_impl is not None or namespace_client_properties is not None:
if namespace_client_impl is None or namespace_client_properties is None:
raise ValueError(
"namespace_client_properties must be provided when "
"namespace_client_impl is set"
"Both namespace_client_impl and "
"namespace_client_properties must be provided"
)
if kwargs:
raise ValueError(f"Unknown keyword arguments: {kwargs}")
@@ -179,12 +175,6 @@ def connect(
namespace_client_pushdown_operations=namespace_client_pushdown_operations,
)
if namespace_client_properties is not None and not manifest_enabled:
raise ValueError(
"namespace_client_impl must be provided when using "
"namespace_client_properties unless manifest_enabled=True"
)
if namespace_client_pushdown_operations is not None:
raise ValueError(
"namespace_client_pushdown_operations is only valid when "
@@ -222,8 +212,6 @@ def connect(
read_consistency_interval=read_consistency_interval,
storage_options=storage_options,
session=session,
manifest_enabled=manifest_enabled,
namespace_client_properties=namespace_client_properties,
)
@@ -301,8 +289,6 @@ def deserialize_conn(
parsed["uri"],
read_consistency_interval=rci,
storage_options=storage_options,
manifest_enabled=parsed.get("manifest_enabled", False),
namespace_client_properties=parsed.get("namespace_client_properties"),
)
else:
raise ValueError(f"Unknown connection_type: {connection_type}")
@@ -318,8 +304,6 @@ async def connect_async(
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
manifest_enabled: bool = False,
namespace_client_properties: Optional[Dict[str, str]] = None,
) -> AsyncConnection:
"""Connect to a LanceDB database.
@@ -359,13 +343,6 @@ async def connect_async(
cache sizes for index and metadata caches, which can significantly
impact memory use and performance. They can also be re-used across
multiple connections to share the same cache state.
manifest_enabled : bool, default False
When true for local/native connections, use directory namespace
manifests as the source of truth for table metadata. Existing
directory-listed root tables are migrated into the manifest on access.
namespace_client_properties : dict, optional
Additional directory namespace client properties to use with
``manifest_enabled=True``.
Examples
--------
@@ -408,8 +385,6 @@ async def connect_async(
client_config,
storage_options,
session,
manifest_enabled,
namespace_client_properties,
)
)
@@ -437,3 +412,13 @@ __all__ = [
"Table",
"__version__",
]
def __warn_on_fork():
    """Warn that lance is not fork-safe.

    Registered below as a ``before`` fork hook. The warning is emitted with
    the default category and stack level.
    """
    message = (
        "lance is not fork-safe. If you are using multiprocessing, use spawn instead."
    )
    warnings.warn(message)


# Guarded because the `os` module may not expose `register_at_fork`.
if hasattr(os, "register_at_fork"):
    os.register_at_fork(before=__warn_on_fork)  # type: ignore[attr-defined]

View File

@@ -12,7 +12,6 @@ from .index import (
LabelList,
HnswPq,
HnswSq,
HnswFlat,
FTS,
)
from lance_namespace import (
@@ -26,7 +25,6 @@ from .remote import ClientConfig
IvfHnswPq: type[HnswPq] = HnswPq
IvfHnswSq: type[HnswSq] = HnswSq
IvfHnswFlat: type[HnswFlat] = HnswFlat
class PyExpr:
"""A type-safe DataFusion expression node (Rust-side handle)."""
@@ -51,7 +49,7 @@ class PyExpr:
def to_sql(self) -> str: ...
def expr_col(name: str) -> PyExpr: ...
def expr_lit(value: Union[bool, int, float, str, bytes]) -> PyExpr: ...
def expr_lit(value: Union[bool, int, float, str]) -> PyExpr: ...
def expr_func(name: str, args: List[PyExpr]) -> PyExpr: ...
class Session:
@@ -182,7 +180,6 @@ class Table:
IvfPq,
HnswPq,
HnswSq,
HnswFlat,
BTree,
Bitmap,
LabelList,
@@ -217,9 +214,6 @@ class Table:
async def uri(self) -> str: ...
async def initial_storage_options(self) -> Optional[Dict[str, str]]: ...
async def latest_storage_options(self) -> Optional[Dict[str, str]]: ...
async def set_unenforced_primary_key(self, columns: List[str]) -> None: ...
async def set_lsm_write_spec(self, spec: LsmWriteSpec) -> None: ...
async def unset_lsm_write_spec(self) -> None: ...
@property
def tags(self) -> Tags: ...
def query(self) -> Query: ...
@@ -248,8 +242,6 @@ async def connect(
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
storage_options: Optional[Dict[str, str]],
session: Optional[Session],
manifest_enabled: bool = False,
namespace_client_properties: Optional[Dict[str, str]] = None,
) -> Connection: ...
class RecordBatchStream:
@@ -258,11 +250,6 @@ class RecordBatchStream:
def __aiter__(self) -> "RecordBatchStream": ...
async def __anext__(self) -> pa.RecordBatch: ...
class ColumnOrdering(TypedDict):
column_name: str
ascending: bool
nulls_first: bool
class Query:
def where(self, filter: str): ...
def where_expr(self, expr: PyExpr): ...
@@ -276,7 +263,6 @@ class Query:
def postfilter(self): ...
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
def nearest_to_text(self, query: dict) -> FTSQuery: ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
async def output_schema(self) -> pa.Schema: ...
async def execute(
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
@@ -305,7 +291,6 @@ class FTSQuery:
def get_query(self) -> str: ...
def add_query_vector(self, query_vec: pa.Array) -> None: ...
def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
async def output_schema(self) -> pa.Schema: ...
async def execute(
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
@@ -331,7 +316,6 @@ class VectorQuery:
def maximum_nprobes(self, maximum_nprobes: int): ...
def bypass_vector_index(self): ...
def nearest_to_text(self, query: dict) -> HybridQuery: ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
def to_query_request(self) -> PyQueryRequest: ...
class HybridQuery:
@@ -350,7 +334,6 @@ class HybridQuery:
def minimum_nprobes(self, minimum_nprobes: int): ...
def maximum_nprobes(self, maximum_nprobes: int): ...
def bypass_vector_index(self): ...
def order_by(self, ordering: Optional[List[ColumnOrdering]]): ...
def to_vector_query(self) -> VectorQuery: ...
def to_fts_query(self) -> FTSQuery: ...
def get_limit(self) -> int: ...
@@ -380,7 +363,6 @@ class PyQueryRequest:
bypass_vector_index: Optional[bool]
postfilter: Optional[bool]
norm: Optional[str]
order_by: Optional[List[ColumnOrdering]]
class CompactionStats:
fragments_removed: int
@@ -421,37 +403,6 @@ class MergeResult:
num_deleted_rows: int
num_attempts: int
class LsmWriteSpec:
"""Specification selecting Lance's MemWAL LSM-style write path for
`merge_insert`."""
@staticmethod
def bucket(column: str, num_buckets: int) -> "LsmWriteSpec": ...
@staticmethod
def identity(column: str) -> "LsmWriteSpec": ...
@staticmethod
def unsharded() -> "LsmWriteSpec": ...
def with_maintained_indexes(self, indexes: List[str]) -> "LsmWriteSpec":
"""Return a copy of this spec asking the MemWAL to keep the named
indexes up to date as rows are appended."""
...
def with_writer_config_defaults(self, defaults: Dict[str, str]) -> "LsmWriteSpec":
"""Return a copy of this spec recording the given default
`ShardWriter` configuration in the MemWAL index."""
...
@property
def spec_type(self) -> str:
"""One of 'bucket', 'identity', or 'unsharded'."""
...
@property
def column(self) -> Optional[str]: ...
@property
def num_buckets(self) -> Optional[int]: ...
@property
def maintained_indexes(self) -> List[str]: ...
@property
def writer_config_defaults(self) -> Dict[str, str]: ...
class AddColumnsResult:
version: int
@@ -489,7 +440,7 @@ class AsyncPermutationBuilder:
async def execute(self) -> Table: ...
def async_permutation_builder(
table: Table,
table: Table, dest_table_name: str
) -> AsyncPermutationBuilder: ...
def fts_query_to_json(query: Any) -> str: ...

View File

@@ -2,9 +2,7 @@
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import asyncio
import os
import threading
import warnings
class BackgroundEventLoop:
@@ -15,9 +13,6 @@ class BackgroundEventLoop:
"""
def __init__(self):
self._start()
def _start(self):
self.loop = asyncio.new_event_loop()
self.thread = threading.Thread(
target=self.loop.run_forever,
@@ -36,30 +31,3 @@ class BackgroundEventLoop:
LOOP = BackgroundEventLoop()
# Set to True after the fork warning has been emitted once in this process.
_FORK_WARNED = False


def _reset_after_fork():
    """Re-initialize the background loop singleton inside a forked child.

    Registered below as an ``after_in_child`` fork hook. It restarts ``LOOP``
    in place and emits an experimental-support ``RuntimeWarning`` unless the
    ``_FORK_WARNED`` flag is already set.
    """
    global _FORK_WARNED

    # Threads do not survive fork(), so the asyncio loop in LOOP.thread is
    # dead in the child. Restarting the singleton in place (rather than
    # rebinding the name) keeps existing
    # `from .background_loop import LOOP` references in other modules valid.
    # The Rust-side tokio runtime is reset analogously by a pthread_atfork
    # hook installed in the _lancedb extension.
    LOOP._start()

    if _FORK_WARNED:
        return

    _FORK_WARNED = True
    warnings.warn(
        "lancedb fork support is experimental: the internal async "
        "runtime has been reset in the forked child, but a small chance "
        "of deadlock remains if other state was mid-operation at fork "
        "time. The 'forkserver' or 'spawn' multiprocessing start method "
        "is likely a safer alternative.",
        RuntimeWarning,
        stacklevel=2,
    )


# Guarded because the `os` module may not expose `register_at_fork`.
if hasattr(os, "register_at_fork"):
    os.register_at_fork(after_in_child=_reset_after_fork)

View File

@@ -590,13 +590,8 @@ class LanceDBConnection(DBConnection):
read_consistency_interval: Optional[timedelta] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
manifest_enabled: bool = False,
namespace_client_properties: Optional[Dict[str, str]] = None,
_inner: Optional[LanceDbConnection] = None,
):
self.storage_options = storage_options
self._manifest_enabled = manifest_enabled
self._namespace_client_properties = namespace_client_properties
if _inner is not None:
self._conn = _inner
self._cached_namespace_client = None
@@ -638,8 +633,6 @@ class LanceDBConnection(DBConnection):
None,
storage_options,
session,
manifest_enabled,
namespace_client_properties,
)
# TODO: It would be nice if we didn't store self.storage_options but it is
@@ -647,6 +640,7 @@ class LanceDBConnection(DBConnection):
# work because some paths like LanceDBConnection.from_inner will lose the
# storage_options. Also, this class really shouldn't be holding any state
# beyond _conn.
self.storage_options = storage_options
self._conn = AsyncConnection(LOOP.run(do_connect()))
self._cached_namespace_client: Optional[LanceNamespace] = None
@@ -683,8 +677,6 @@ class LanceDBConnection(DBConnection):
"connection_type": "local",
"uri": self.uri,
"storage_options": self.storage_options,
"manifest_enabled": self._manifest_enabled,
"namespace_client_properties": self._namespace_client_properties,
"read_consistency_interval_seconds": (
rci.total_seconds() if rci else None
),

View File

@@ -63,7 +63,7 @@ def _coerce(value: "ExprLike") -> "Expr":
# Type alias used in annotations.
ExprLike = Union["Expr", bool, int, float, str, bytes]
ExprLike = Union["Expr", bool, int, float, str]
class Expr:
@@ -261,13 +261,13 @@ def col(name: str) -> Expr:
return Expr(expr_col(name))
def lit(value: Union[bool, int, float, str, bytes]) -> Expr:
def lit(value: Union[bool, int, float, str]) -> Expr:
"""Create a literal (constant) value expression.
Parameters
----------
value:
A Python ``bool``, ``int``, ``float``, ``str``, or ``bytes``.
A Python ``bool``, ``int``, ``float``, or ``str``.
Examples
--------

View File

@@ -7,7 +7,6 @@ from typing import Literal, Optional
from ._lancedb import (
IndexConfig,
)
from .types import BaseTokenizerType
lang_mapping = {
"ar": "Arabic",
@@ -112,12 +111,8 @@ class FTS:
- "simple": Splits text by whitespace and punctuation.
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
- "ngram": N-gram tokenizer for substring-style matching.
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
language : str, default "English"
The language to use for stemming and stop-word removal. This is not the
primary way to enable CJK tokenization.
The language to use for tokenization.
max_token_length : int, default 40
The maximum token length to index. Tokens longer than this length will be
ignored.
@@ -132,17 +127,10 @@ class FTS:
ascii_folding : bool, default True
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
Notes
-----
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
require tokenizer models in Lance's language model home. Set
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
directory under ``lance/language_models``.
"""
with_position: bool = False
base_tokenizer: BaseTokenizerType = "simple"
base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
language: str = "English"
max_token_length: Optional[int] = 40
lower_case: bool = True
@@ -388,98 +376,9 @@ class HnswSq:
target_partition_size: Optional[int] = None
@dataclass
class HnswFlat:
    """Configuration for an HNSW-FLAT index.

    HNSW-FLAT is a Hierarchical Navigable Small World index without
    quantization: the HNSW graph stores the raw vectors. Within the IVF_HNSW
    family this gives the highest recall, at the cost of more memory and disk
    space than :class:`HnswSq` or :class:`HnswPq`.

    Parameters
    ----------
    distance_type: str, default "l2"
        The distance metric used to train the index. Available values:

        "l2" - Euclidean distance. A very common metric that takes both
        magnitude and direction into account when measuring the distance
        between vectors. Range: [0, ∞).

        "cosine" - Cosine distance, derived from the cosine similarity of two
        vectors. Cosine similarity measures the similarity of two non-zero
        vectors of an inner product space and equals the cosine of the angle
        between them. Unlike l2, it is not affected by vector magnitude.
        Range: [0, 2].

        "dot" - Dot product of the two vectors. Range: (-∞, ∞). For
        normalized vectors (l2 norm of 1) dot distance is equivalent to
        cosine distance.
    num_partitions, default sqrt(num_rows)
        The number of IVF partitions to create.

        A small number of partitions is recommended for HNSW, and 1 works
        well for most tables. On very large tables a single HNSW graph needs
        too much memory to train. Every partition becomes its own HNSW graph,
        so a higher value lowers the peak memory use of training.
    max_iterations, default 50
        Max iterations to train kmeans.

        Kmeans is used to compute the partitions when training an IVF index;
        this controls how many kmeans iterations are run.
    sample_rate, default 256
        The rate used to calculate the number of training vectors for kmeans.
    m, default 20
        The number of neighbors to select for each vector in the HNSW graph.

        Trades search speed against accuracy: a higher value gives a more
        accurate but slower search.
    ef_construction, default 300
        The number of candidates to evaluate while constructing the HNSW
        graph.

        Trades build speed against accuracy: a higher value gives a more
        accurate but slower build. 150 to 300 is the typical range and 100 is
        a minimum for good quality search results. In most cases nothing is
        gained above 500. The value should not be less than the `ef` used in
        the search phase.
    target_partition_size, default is 1,048,576
        The target size of each partition.
    """

    # Metric used to train the index.
    distance_type: Literal["l2", "cosine", "dot"] = "l2"
    # IVF partition count; None leaves the choice to the default documented above.
    num_partitions: Optional[int] = None
    # Upper bound on kmeans training iterations.
    max_iterations: int = 50
    # Rate used to size the kmeans training sample.
    sample_rate: int = 256
    # Neighbors selected per vector in the HNSW graph.
    m: int = 20
    # Candidates evaluated while building the HNSW graph.
    ef_construction: int = 300
    # Target partition size; None leaves the choice to the default documented above.
    target_partition_size: Optional[int] = None
# Backwards-compatible aliases
IvfHnswPq = HnswPq
IvfHnswSq = HnswSq
IvfHnswFlat = HnswFlat
@dataclass
@@ -799,13 +698,11 @@ __all__ = [
"IvfPq",
"IvfHnswPq",
"IvfHnswSq",
"IvfHnswFlat",
"IvfSq",
"IvfRq",
"IvfFlat",
"HnswPq",
"HnswSq",
"HnswFlat",
"IndexConfig",
"FTS",
"Bitmap",

View File

@@ -6,44 +6,22 @@
from typing import Optional
_CREATE_NAMESPACE_MODES = frozenset({"create", "exist_ok", "overwrite"})
_DROP_NAMESPACE_MODES = frozenset({"SKIP", "FAIL"})
_DROP_NAMESPACE_BEHAVIORS = frozenset({"RESTRICT", "CASCADE"})
def _normalize_create_namespace_mode(mode: Optional[str]) -> Optional[str]:
"""Normalize create namespace mode to lowercase (API expects lowercase)."""
if mode is None:
return None
normalized = mode.lower()
if normalized not in _CREATE_NAMESPACE_MODES:
raise ValueError(
f"Invalid create namespace mode {mode!r}: "
f"expected one of 'create', 'exist_ok', 'overwrite'"
)
return normalized
return mode.lower()
def _normalize_drop_namespace_mode(mode: Optional[str]) -> Optional[str]:
"""Normalize drop namespace mode to uppercase (API expects uppercase)."""
if mode is None:
return None
normalized = mode.upper()
if normalized not in _DROP_NAMESPACE_MODES:
raise ValueError(
f"Invalid drop namespace mode {mode!r}: expected one of 'skip', 'fail'"
)
return normalized
return mode.upper()
def _normalize_drop_namespace_behavior(behavior: Optional[str]) -> Optional[str]:
"""Normalize drop namespace behavior to uppercase (API expects uppercase)."""
if behavior is None:
return None
normalized = behavior.upper()
if normalized not in _DROP_NAMESPACE_BEHAVIORS:
raise ValueError(
f"Invalid drop namespace behavior {behavior!r}: "
f"expected one of 'restrict', 'cascade'"
)
return normalized
return behavior.upper()

View File

@@ -1,11 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import copy
import json
from deprecation import deprecated
from lancedb import AsyncConnection, DBConnection
import pyarrow as pa
import json
from ._lancedb import async_permutation_builder, PermutationReader
from .table import LanceTable
@@ -37,7 +36,10 @@ class PermutationBuilder:
be referenced by name in the future. If names are not provided then they can only
be referenced by their ordinal index. There is no requirement to name every split.
The permutation is stored in memory and will be lost when the program exits.
By default, the permutation will be stored in memory and will be lost when the
program exits. To persist the permutation (for very large datasets or to share
the permutation across multiple workers) use the [persist](#persist) method to
create a permanent table.
"""
def __init__(self, table: LanceTable):
@@ -49,6 +51,15 @@ class PermutationBuilder:
"""
self._async = async_permutation_builder(table)
def persist(
self, database: Union[DBConnection, AsyncConnection], table_name: str
) -> "PermutationBuilder":
"""
Persist the permutation to the given database.
"""
self._async.persist(database, table_name)
return self
def split_random(
self,
*,
@@ -369,44 +380,20 @@ class Permutation:
def __init__(
self,
base_table: LanceTable,
permutation_table: Optional[LanceTable],
split: int,
reader: PermutationReader,
selection: dict[str, str],
batch_size: int,
transform_fn: Callable[pa.RecordBatch, Any],
offset: Optional[int] = None,
limit: Optional[int] = None,
connection_factory: Optional[Callable[[str], LanceTable]] = None,
_reader: Optional[PermutationReader] = None,
):
"""
Internal constructor. Use [from_tables](#from_tables) instead.
"""
assert base_table is not None, "base_table is required"
assert reader is not None, "reader is required"
assert selection is not None, "selection is required"
self.base_table = base_table
self.permutation_table = permutation_table
self.split = split
self.reader = reader
self.selection = selection
self.transform_fn = transform_fn
self.batch_size = batch_size
self.offset = offset
self.limit = limit
self.connection_factory = connection_factory
if _reader is None:
_reader = LOOP.run(self._build_reader())
self.reader: PermutationReader = _reader
async def _build_reader(self) -> PermutationReader:
reader = await PermutationReader.from_tables(
self.base_table, self.permutation_table, self.split
)
if self.offset is not None:
reader = await reader.with_offset(self.offset)
if self.limit is not None:
reader = await reader.with_limit(self.limit)
return reader
def _with_selection(self, selection: dict[str, str]) -> "Permutation":
"""
@@ -415,97 +402,21 @@ class Permutation:
Does not validation of the selection and it replaces it entirely. This is not
intended for public use.
"""
new = copy.copy(self)
new.selection = selection
return new
return Permutation(self.reader, selection, self.batch_size, self.transform_fn)
def _with_reader(self, reader: PermutationReader) -> "Permutation":
"""
Creates a new permutation with the given reader
This is an internal method and should not be used directly.
"""
return Permutation(reader, self.selection, self.batch_size, self.transform_fn)
def with_batch_size(self, batch_size: int) -> "Permutation":
"""
Creates a new permutation with the given batch size
"""
new = copy.copy(self)
new.batch_size = batch_size
return new
def with_connection_factory(
self, connection_factory: Callable[[str], LanceTable]
) -> "Permutation":
"""
Creates a new permutation that will use ``connection_factory`` to reopen
the base table when this permutation is unpickled in a worker process.
The factory is a callable that takes a single argument — the base table
name — and returns a [LanceTable]. It must be picklable; the worker
will pickle it via standard ``pickle`` and call it to recover the base
table. Picklable callables in practice means top-level (module-level)
functions, ``functools.partial`` of such functions, or instances of
picklable classes implementing ``__call__``. Lambdas and closures over
local variables don't pickle with the default protocol.
Setting a factory is necessary when the URI alone is not enough to
re-open the connection — most importantly for LanceDB Cloud (``db://``)
connections, where ``api_key`` and ``region`` aren't recoverable from
the connection object after construction.
For local file or cloud-storage paths the factory is optional: if not
set, ``__getstate__`` falls back to capturing
``(uri, storage_options, namespace_path)`` and re-opening via
``lancedb.connect(uri, storage_options=...)``.
Examples
--------
Basic native (file-system path), parameterized via ``functools.partial``::
import functools, lancedb
from lancedb.permutation import Permutation
def open_native_table(uri: str, table_name: str):
return lancedb.connect(uri).open_table(table_name)
factory = functools.partial(open_native_table, "/data/lance_db")
permutation = Permutation.identity(
factory("training")
).with_connection_factory(factory)
Native via :func:`lancedb.connect_namespace` (e.g. a directory- or
REST-backed namespace client). The factory takes the
implementation name and properties dict as partial-bound args so
the worker can rebuild the same namespace connection::
def open_via_namespace(
impl: str, properties: dict[str, str], table_name: str,
):
return lancedb.connect_namespace(impl, properties).open_table(
table_name,
)
factory = functools.partial(
open_via_namespace,
"dir",
{"root": "/data/lance_db"},
)
LanceDB Cloud, reading credentials from env vars at worker startup
so secrets aren't pickled into the dataset::
import os, lancedb
def open_remote_table(table_name: str):
db = lancedb.connect(
"db://my-database",
api_key=os.environ["LANCEDB_API_KEY"],
region=os.environ.get("LANCEDB_REGION", "us-east-1"),
)
return db.open_table(table_name)
permutation = Permutation.identity(
open_remote_table("training")
).with_connection_factory(open_remote_table)
"""
assert connection_factory is not None, "connection_factory is required"
new = copy.copy(self)
new.connection_factory = connection_factory
return new
return Permutation(self.reader, self.selection, batch_size, self.transform_fn)
@classmethod
def identity(cls, table: LanceTable) -> "Permutation":
@@ -578,126 +489,11 @@ class Permutation:
schema = await reader.output_schema(None)
initial_selection = {name: name for name in schema.names}
return cls(
base_table,
permutation_table,
split,
initial_selection,
DEFAULT_BATCH_SIZE,
Transforms.arrow2python,
_reader=reader,
reader, initial_selection, DEFAULT_BATCH_SIZE, Transforms.arrow2python
)
return LOOP.run(do_from_tables())
def __getstate__(self) -> dict[str, Any]:
"""Build a picklable state dict for this permutation.
The base table is captured either via a user-supplied
``connection_factory`` (see [with_connection_factory]) or, as a
fallback, by introspecting ``(uri, storage_options, namespace_path)``
on the connection. The permutation table — always an in-memory
LanceDB table — is captured as a pyarrow Table (which pickles via
Arrow IPC natively). The reader is dropped from the wire format;
``__setstate__`` rebuilds it from the restored tables.
"""
permutation_data: Optional[pa.Table] = None
if self.permutation_table is not None:
permutation_data = self.permutation_table.to_arrow()
common = {
"base_table_name": self.base_table.name,
"permutation_data": permutation_data,
"split": self.split,
"selection": self.selection,
"batch_size": self.batch_size,
"transform_fn": self.transform_fn,
"offset": self.offset,
"limit": self.limit,
"connection_factory": self.connection_factory,
}
if self.connection_factory is not None:
# The factory carries enough state to recover the base table on
# its own; we don't need to capture the URI / storage options /
# namespace from the existing connection.
return common
# URI-introspection fallback: only viable for native (OSS) connections
# where (uri, storage_options) is enough to reopen. Remote / cloud
# connections don't expose recoverable api_key / region — those users
# must call with_connection_factory().
try:
base_uri = self.base_table._conn.uri
storage_options = self.base_table._conn.storage_options
except AttributeError as e:
raise ValueError(
"Cannot pickle this Permutation: the base table's connection "
"does not expose a uri/storage_options, which usually means it "
"is a remote (LanceDB Cloud) connection. Call "
"Permutation.with_connection_factory(...) first to provide a "
"picklable callable that re-opens the base table from a worker "
"process."
) from e
if base_uri.startswith("memory://"):
# In-memory base tables don't exist in any worker process by
# default, so dump the entire base table into the pickle. This
# can be expensive for large datasets — users with large
# in-memory base tables should either persist them or set a
# connection_factory.
return {
**common,
"base_table_data": self.base_table.to_arrow(),
}
return {
**common,
"base_table_uri": base_uri,
"base_table_namespace": self.base_table._namespace_path,
"base_table_storage_options": storage_options,
}
def __setstate__(self, state: dict[str, Any]) -> None:
from . import connect
connection_factory = state["connection_factory"]
if connection_factory is not None:
base_table = connection_factory(state["base_table_name"])
elif "base_table_data" in state:
# In-memory base table inlined into the pickle; rebuild the same
# way we rebuild the in-memory permutation table.
mem_db = connect("memory://")
base_table = mem_db.create_table(
state["base_table_name"], state["base_table_data"]
)
else:
base_db = connect(
state["base_table_uri"],
storage_options=state["base_table_storage_options"],
)
base_table = base_db.open_table(
state["base_table_name"],
namespace_path=state["base_table_namespace"] or None,
)
permutation_table: Optional[LanceTable] = None
if state["permutation_data"] is not None:
mem_db = connect("memory://")
permutation_table = mem_db.create_table(
"permutation", state["permutation_data"]
)
self.base_table = base_table
self.permutation_table = permutation_table
self.split = state["split"]
self.selection = state["selection"]
self.batch_size = state["batch_size"]
self.transform_fn = state["transform_fn"]
self.offset = state["offset"]
self.limit = state["limit"]
self.connection_factory = connection_factory
self.reader = LOOP.run(self._build_reader())
@property
def schema(self) -> pa.Schema:
async def do_output_schema():
@@ -964,36 +760,24 @@ class Permutation:
for expensive operations such as image decoding.
"""
assert transform is not None, "transform is required"
new = copy.copy(self)
new.transform_fn = transform
return new
def take_offsets(self, offsets: list[int]) -> Any:
"""
Take rows from the permutation by offset
The returned value is passed through the permutation's current transform,
so `with_format` and `with_transform` affect this method in the same way
they affect iteration.
"""
async def do_take_offsets():
return await self.reader.take_offsets(offsets, selection=self.selection)
batch = LOOP.run(do_take_offsets())
return self.transform_fn(batch)
return Permutation(self.reader, self.selection, self.batch_size, transform)
def __getitem__(self, index: int) -> Any:
"""
Returns a single row from the permutation by offset
"""
return self.take_offsets([index])
return self.__getitems__([index])
def __getitems__(self, indices: list[int]) -> Any:
"""
Returns rows from the permutation by offset
"""
return self.take_offsets(indices)
async def do_getitems():
return await self.reader.take_offsets(indices, selection=self.selection)
batch = LOOP.run(do_getitems())
return self.transform_fn(batch)
@deprecated(details="Use with_skip instead")
def skip(self, skip: int) -> "Permutation":
@@ -1011,10 +795,12 @@ class Permutation:
"""
Skip the first `skip` rows of the permutation
"""
new = copy.copy(self)
new.offset = skip
new.reader = LOOP.run(new._build_reader())
return new
async def do_with_skip():
reader = await self.reader.with_offset(skip)
return self._with_reader(reader)
return LOOP.run(do_with_skip())
@deprecated(details="Use with_take instead")
def take(self, limit: int) -> "Permutation":
@@ -1032,10 +818,12 @@ class Permutation:
"""
Limit the permutation to `limit` rows (following any `skip`)
"""
new = copy.copy(self)
new.limit = limit
new.reader = LOOP.run(new._build_reader())
return new
async def do_with_take():
reader = await self.reader.with_limit(limit)
return self._with_reader(reader)
return LOOP.run(do_with_take())
@deprecated(details="Use with_repeat instead")
def repeat(self, times: int) -> "Permutation":

View File

@@ -92,12 +92,6 @@ def ensure_vector_query(
return val
class ColumnOrdering(pydantic.BaseModel):
column_name: str
ascending: bool = True
nulls_first: bool = False
class FullTextQueryType(str, Enum):
MATCH = "match"
MATCH_PHRASE = "match_phrase"
@@ -510,8 +504,6 @@ class Query(pydantic.BaseModel):
# Bypass the vector index and use a brute force search
bypass_vector_index: Optional[bool] = None
order_by: Optional[List[ColumnOrdering]] = None
@classmethod
def from_inner(cls, req: PyQueryRequest) -> Self:
query = cls()
@@ -532,8 +524,6 @@ class Query(pydantic.BaseModel):
query.refine_factor = req.refine_factor
query.bypass_vector_index = req.bypass_vector_index
query.postfilter = req.postfilter
if req.order_by is not None:
query.order_by = [ColumnOrdering(**o) for o in req.order_by]
if req.full_text_search is not None:
query.full_text_query = FullTextSearchQuery(
columns=None,
@@ -582,22 +572,9 @@ class LanceQueryBuilder(ABC):
If "auto", the query type is inferred based on the query.
vector_column_name: str
The name of the vector column to use for vector search.
ordering_field_name: Optional[str]
.. deprecated:: 0.27.0
Use ``order_by()`` method instead.
fts_columns: Optional[Union[str, List[str]]]
The columns to search in for full text search.
fast_search: bool
Skip flat search of unindexed data.
"""
if ordering_field_name is not None:
import warnings
warnings.warn(
"ordering_field_name is deprecated, use .order_by() method instead.",
DeprecationWarning,
stacklevel=2,
)
# Check hybrid search first as it supports empty query pattern
if query_type == "hybrid":
# hybrid fts and vector query
@@ -694,7 +671,6 @@ class LanceQueryBuilder(ABC):
self._text = None
self._ef = None
self._bypass_vector_index = None
self._order_by = None
@deprecation.deprecated(
deprecated_in="0.3.1",
@@ -971,24 +947,6 @@ class LanceQueryBuilder(ABC):
""" # noqa: E501
return self._table._explain_plan(self.to_query_object(), verbose=verbose)
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> Self:
"""
Set the ordering for the results.
Parameters
----------
ordering: Optional[List[ColumnOrdering]]
The ordering to use for the results. If None, then the default ordering
will be used.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
self._order_by = ordering
return self
def analyze_plan(self) -> str:
"""
Run the query and return its execution plan with runtime metrics.
@@ -1356,7 +1314,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
fast_search=self._fast_search,
ef=self._ef,
bypass_vector_index=self._bypass_vector_index,
order_by=self._order_by,
)
def to_batches(
@@ -1508,9 +1465,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
super().__init__(table)
self._query = query
self._phrase_query = False
# Deprecated compatibility parameter. Native FTS ordering is now
# configured through order_by(); LanceQueryBuilder.create emits the warning.
_ = ordering_field_name
self.ordering_field_name = ordering_field_name
self._reranker = None
self._fast_search = fast_search
if isinstance(fts_columns, str):
@@ -1559,7 +1514,6 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
),
offset=self._offset,
fast_search=self._fast_search,
order_by=self._order_by,
)
def output_schema(self) -> pa.Schema:
@@ -1625,7 +1579,6 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
limit=self._limit,
with_row_id=self._with_row_id,
offset=self._offset,
order_by=self._order_by,
)
def output_schema(self) -> pa.Schema:
@@ -1690,7 +1643,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
def _validate_query(self, query, vector=None, text=None):
if query is not None and (vector is not None or text is not None):
raise ValueError(
"You can either provide a string query in search() method "
"You can either provide a string query in search() method"
"or set `vector()` and `text()` explicitly for hybrid search."
"But not both."
)
@@ -2549,27 +2502,6 @@ class AsyncStandardQuery(AsyncQueryBase):
self._inner.offset(offset)
return self
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> Self:
"""
Set the ordering for the results.
Parameters
----------
ordering: Optional[List[ColumnOrdering]]
The ordering to use for the results. If None, then the default ordering
will be used.
"""
if ordering is None:
self._inner.order_by(None)
else:
self._inner.order_by(
[
o.model_dump() if hasattr(o, "model_dump") else o.dict()
for o in ordering
]
)
return self
def fast_search(self) -> Self:
"""
Skip searching un-indexed data.

View File

@@ -14,7 +14,6 @@ from lancedb._lancedb import (
DeleteResult,
DropColumnsResult,
IndexConfig,
LsmWriteSpec,
MergeResult,
UpdateResult,
)
@@ -23,7 +22,6 @@ from lancedb.index import (
FTS,
BTree,
Bitmap,
HnswFlat,
HnswSq,
IvfFlat,
IvfPq,
@@ -41,7 +39,6 @@ from lancedb.table import _normalize_progress
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
from ..types import BaseTokenizerType
class RemoteTable(Table):
@@ -170,7 +167,7 @@ class RemoteTable(Table):
wait_timeout: Optional[timedelta] = None,
with_position: bool = False,
# tokenizer configs:
base_tokenizer: BaseTokenizerType = "simple",
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
@@ -287,15 +284,13 @@ class RemoteTable(Table):
)
elif index_type == "IVF_HNSW_SQ":
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
elif index_type == "IVF_HNSW_FLAT":
config = HnswFlat(distance_type=metric, num_partitions=num_partitions)
elif index_type == "IVF_FLAT":
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
else:
raise ValueError(
f"Unknown vector index type: {index_type}. Valid options are"
" 'IVF_FLAT', 'IVF_PQ', 'IVF_RQ', 'IVF_SQ',"
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ', 'IVF_HNSW_FLAT'"
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
)
LOOP.run(
@@ -656,18 +651,6 @@ class RemoteTable(Table):
def drop_columns(self, columns: Iterable[str]) -> DropColumnsResult:
return LOOP.run(self._table.drop_columns(columns))
def set_unenforced_primary_key(self, columns: Union[str, Iterable[str]]) -> None:
"""Not supported on LanceDB Cloud."""
return LOOP.run(self._table.set_unenforced_primary_key(columns))
def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
"""Not supported on LanceDB Cloud."""
return LOOP.run(self._table.set_lsm_write_spec(spec))
def unset_lsm_write_spec(self) -> None:
"""Not supported on LanceDB Cloud."""
return LOOP.run(self._table.unset_lsm_write_spec())
def drop_index(self, index_name: str):
return LOOP.run(self._table.drop_index(index_name))

View File

@@ -57,7 +57,6 @@ from .index import (
LabelList,
HnswPq,
HnswSq,
HnswFlat,
FTS,
)
from .merge import LanceMergeInsertBuilder
@@ -87,59 +86,6 @@ from .util import (
)
from .index import lang_mapping
_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera")
_MODEL_BACKED_TOKENIZER_ERRORS = (
"unknown base tokenizer",
"Invalid directory path:",
"Failed to load Jieba",
"Failed to load tokenizer config",
"Failed to initialize default tokenizer",
)
def _add_unique_note(exception: BaseException, note: str) -> None:
existing_notes = getattr(exception, "__notes__", ()) or ()
message = (
exception.args[0]
if exception.args and isinstance(exception.args[0], str)
else ""
)
if note not in existing_notes and note not in message:
add_note(exception, note)
def _is_model_backed_tokenizer(base_tokenizer: str) -> bool:
return any(
base_tokenizer == prefix or base_tokenizer.startswith(f"{prefix}/")
for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES
)
def _maybe_add_fts_error_note(
exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None
) -> None:
message = str(exception)
if language is not None and "not support the requested language" in message:
supported_langs = ", ".join(lang_mapping.values())
_add_unique_note(exception, f"Supported languages: {supported_langs}")
return
if not _is_model_backed_tokenizer(base_tokenizer):
return
if not any(marker in message for marker in _MODEL_BACKED_TOKENIZER_ERRORS):
return
_add_unique_note(
exception,
"Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' "
"require tokenizer models in Lance's language model home. Set "
"LANCE_LANGUAGE_MODEL_HOME to override the default platform data "
"directory under 'lance/language_models'. Expected layouts include "
"'<model-home>/jieba/default/...' and "
"'<model-home>/lindera/ipadic/...'.",
)
if TYPE_CHECKING:
from .db import LanceDBConnection
@@ -154,7 +100,6 @@ if TYPE_CHECKING:
AlterColumnsResult,
DeleteResult,
DropColumnsResult,
LsmWriteSpec,
MergeResult,
UpdateResult,
)
@@ -1013,10 +958,7 @@ class Table(ABC):
tokenizer_name: str, default "default"
A compatibility alias for native tokenizer configs. Can be "raw",
"default" or the 2 letter language code followed by "_stem". So
for english it would be "en_stem". For new native FTS indexes, use
``base_tokenizer`` directly; ``tokenizer_name`` is a legacy
compatibility alias and does not expose model-backed tokenizer names
such as ``jieba/default`` or ``lindera/ipadic``.
for english it would be "en_stem".
use_tantivy: bool, default False
Deprecated legacy Tantivy parameter. Setting this to True raises an
error.
@@ -1030,11 +972,8 @@ class Table(ABC):
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
- "ngram": N-Gram tokenizer.
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
language : str, default "English"
The language to use for stemming and stop-word removal. This is not
the primary way to enable CJK tokenization.
The language to use for tokenization.
max_token_length : int, default 40
The maximum token length to index. Tokens longer than this length will be
ignored.
@@ -1060,13 +999,6 @@ class Table(ABC):
The timeout to wait if indexing is asynchronous.
name: str, optional
The name of the index. If not provided, a default name will be generated.
Notes
-----
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
require tokenizer models in Lance's language model home. Set
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
directory under ``lance/language_models``.
"""
raise NotImplementedError
@@ -2238,13 +2170,7 @@ class LanceTable(Table):
index_cache_size: Optional[int] = None,
num_bits: int = 8,
index_type: Literal[
"IVF_FLAT",
"IVF_SQ",
"IVF_PQ",
"IVF_RQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
"IVF_FLAT", "IVF_SQ", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
] = "IVF_PQ",
max_iterations: int = 50,
sample_rate: int = 256,
@@ -2331,16 +2257,6 @@ class LanceTable(Table):
ef_construction=ef_construction,
target_partition_size=target_partition_size,
)
elif index_type == "IVF_HNSW_FLAT":
config = HnswFlat(
distance_type=metric,
num_partitions=num_partitions,
max_iterations=max_iterations,
sample_rate=sample_rate,
m=m,
ef_construction=ef_construction,
target_partition_size=target_partition_size,
)
else:
raise ValueError(f"Unknown index type {index_type}")
@@ -2546,22 +2462,14 @@ class LanceTable(Table):
**tokenizer_configs,
)
try:
LOOP.run(
self._table.create_index(
field_names,
replace=replace,
config=config,
name=name,
)
LOOP.run(
self._table.create_index(
field_names,
replace=replace,
config=config,
name=name,
)
except (ValueError, RuntimeError) as e:
_maybe_add_fts_error_note(
e,
base_tokenizer=config.base_tokenizer,
language=config.language,
)
raise e
)
@staticmethod
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
@@ -3264,21 +3172,6 @@ class LanceTable(Table):
def drop_columns(self, columns: Iterable[str]) -> DropColumnsResult:
return LOOP.run(self._table.drop_columns(columns))
def set_unenforced_primary_key(self, columns: Union[str, Iterable[str]]) -> None:
"""Set the unenforced primary key. See
[`AsyncTable.set_unenforced_primary_key`][lancedb.AsyncTable.set_unenforced_primary_key]."""
return LOOP.run(self._table.set_unenforced_primary_key(columns))
def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
"""Install an LsmWriteSpec. See
[`AsyncTable.set_lsm_write_spec`][lancedb.AsyncTable.set_lsm_write_spec]."""
return LOOP.run(self._table.set_lsm_write_spec(spec))
def unset_lsm_write_spec(self) -> None:
"""Remove the LsmWriteSpec. See
[`AsyncTable.unset_lsm_write_spec`][lancedb.AsyncTable.unset_lsm_write_spec]."""
return LOOP.run(self._table.unset_lsm_write_spec())
def uses_v2_manifest_paths(self) -> bool:
"""
Check if the table is using the new v2 manifest paths.
@@ -3824,69 +3717,6 @@ class AsyncTable:
Any attempt to use the table after it has been closed will raise an error."""
return self._inner.close()
async def set_unenforced_primary_key(
self, columns: Union[str, Iterable[str]]
) -> None:
"""Set the unenforced primary key for this table to the given
ordered list of columns.
"Unenforced" means LanceDB does not check uniqueness on writes; the
columns are recorded in the schema as the primary key so that
features such as `merge_insert` can use them. Calling this again
replaces any previously-set primary key.
Parameters
----------
columns : str or Iterable[str]
Either a single column name (single-column key) or an ordered
iterable of column names (composite key). Each column dtype
must be one of: int32, int64, utf8, large_utf8, binary,
large_binary, fixed_size_binary.
"""
if isinstance(columns, str):
columns = [columns]
else:
columns = list(columns)
await self._inner.set_unenforced_primary_key(columns)
async def set_lsm_write_spec(self, spec: "LsmWriteSpec") -> None:
"""Install an LsmWriteSpec on this table.
The spec selects Lance's MemWAL LSM-style write path for future
`merge_insert` calls. ``LsmWriteSpec`` chooses one of three sharding
strategies:
- ``LsmWriteSpec.bucket(column, num_buckets)`` — hash-bucket writes by
the single-column unenforced primary key.
- ``LsmWriteSpec.identity(column)`` — shard by the raw value of a
scalar column.
- ``LsmWriteSpec.unsharded()`` — route every write to a single shard.
All variants require the table to have an unenforced primary key set
via [`set_unenforced_primary_key`]; bucket sharding additionally
requires it to be the single column being bucketed.
Parameters
----------
spec : LsmWriteSpec
The sharding spec to install.
Examples
--------
>>> from lancedb._lancedb import LsmWriteSpec
>>> # table.set_unenforced_primary_key("id")
>>> # table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 16))
"""
await self._inner.set_lsm_write_spec(spec)
async def unset_lsm_write_spec(self) -> None:
"""Remove the LsmWriteSpec from this table.
Reverts to the standard `merge_insert` write path. Errors if no spec
is currently set.
"""
await self._inner.unset_lsm_write_spec()
@property
def name(self) -> str:
"""The name of the table."""
@@ -3969,18 +3799,7 @@ class AsyncTable:
*,
replace: Optional[bool] = None,
config: Optional[
Union[
IvfFlat,
IvfPq,
IvfRq,
HnswPq,
HnswSq,
HnswFlat,
BTree,
Bitmap,
LabelList,
FTS,
]
Union[IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
] = None,
wait_timeout: Optional[timedelta] = None,
name: Optional[str] = None,
@@ -4027,7 +3846,6 @@ class AsyncTable:
IvfRq,
HnswPq,
HnswSq,
HnswFlat,
BTree,
Bitmap,
LabelList,
@@ -4047,13 +3865,11 @@ class AsyncTable:
name=name,
train=train,
)
except (ValueError, RuntimeError) as e:
if isinstance(config, FTS):
_maybe_add_fts_error_note(
e,
base_tokenizer=config.base_tokenizer,
language=config.language,
)
except ValueError as e:
if "not support the requested language" in str(e):
supported_langs = ", ".join(lang_mapping.values())
help_msg = f"Supported languages: {supported_langs}"
add_note(e, help_msg)
raise e
async def drop_index(self, name: str) -> None:
@@ -4591,8 +4407,6 @@ class AsyncTable:
async_query = async_query.fast_search()
if query.with_row_id:
async_query = async_query.with_row_id()
if query.order_by:
async_query = async_query.order_by(query.order_by)
if query.vector:
async_query = async_query.nearest_to(query.vector).distance_range(
@@ -5200,7 +5014,6 @@ class IndexStatistics:
"IVF_RQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
"FTS",
"BTREE",
"BITMAP",

View File

@@ -24,7 +24,6 @@ VectorIndexType = Literal[
"IVF_PQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
"IVF_RQ",
]
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
@@ -32,7 +31,6 @@ IndexType = Literal[
"IVF_PQ",
"IVF_HNSW_PQ",
"IVF_HNSW_SQ",
"IVF_HNSW_FLAT",
"IVF_SQ",
"FTS",
"BTREE",
@@ -42,5 +40,4 @@ IndexType = Literal[
]
# Tokenizer literals
BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
BaseTokenizerType = BuiltinTokenizerType | str
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]

View File

@@ -1,8 +0,0 @@
我们 98740 r
都 202780 d
有 423765 v
光明 1219 n
的 318825 uj
前途 1263 n
前 62779 f
途 857 n

View File

@@ -1,3 +0,0 @@
segmenter:
mode: "normal"
dictionary: "./python/tests/models/lindera/ipadic/main"

View File

@@ -914,29 +914,6 @@ def test_local_namespace_operations(tmp_path):
assert db.list_namespaces().namespaces == []
def test_create_namespace_invalid_mode_raises(tmp_path):
"""Unrecognized create namespace modes raise a clear error."""
db = lancedb.connect(tmp_path)
with pytest.raises(ValueError, match="Invalid create namespace mode"):
db.create_namespace(["child"], mode="frobnicate")
def test_drop_namespace_invalid_mode_raises(tmp_path):
"""Unrecognized drop namespace modes raise a clear error."""
db = lancedb.connect(tmp_path)
db.create_namespace(["child"])
with pytest.raises(ValueError, match="Invalid drop namespace mode"):
db.drop_namespace(["child"], mode="frobnicate")
def test_drop_namespace_invalid_behavior_raises(tmp_path):
"""Unrecognized drop namespace behaviors raise a clear error."""
db = lancedb.connect(tmp_path)
db.create_namespace(["child"])
with pytest.raises(ValueError, match="Invalid drop namespace behavior"):
db.drop_namespace(["child"], behavior="frobnicate")
def test_clone_table_latest_version(tmp_path):
"""Test cloning a table with the latest version (default behavior)"""
import os

View File

@@ -15,10 +15,7 @@
# limitations under the License.
import os
import random
import shutil
from unittest import mock
from pathlib import Path
import zipfile
import lancedb as ldb
from lancedb.db import DBConnection
@@ -29,7 +26,6 @@ from lancedb.query import (
MultiMatchQuery,
PhraseQuery,
BooleanQuery,
ColumnOrdering,
Occur,
LanceFtsQueryBuilder,
)
@@ -40,8 +36,6 @@ import pytest
import pytest_asyncio
from utils import exception_output
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
@@ -95,39 +89,6 @@ def table(tmp_path) -> ldb.table.LanceTable:
return table
@pytest.fixture
def language_model_home(monkeypatch, tmp_path):
model_home = tmp_path / "language-models"
shutil.copytree(TEST_LANGUAGE_MODEL_HOME, model_home)
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(model_home))
return model_home
@pytest.fixture
def lindera_ipadic(language_model_home):
model_path = language_model_home / "lindera" / "ipadic"
extracted_model = model_path / "main"
config_path = model_path / "config.yml"
if extracted_model.exists():
shutil.rmtree(extracted_model)
with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
zip_ref.extractall(model_path)
config_path.write_text(
"segmenter:\n"
' mode: "normal"\n'
f' dictionary: "{extracted_model.resolve().as_posix()}"\n',
encoding="utf-8",
)
try:
yield
finally:
if extracted_model.exists():
shutil.rmtree(extracted_model)
@pytest_asyncio.fixture
async def async_table(tmp_path) -> ldb.table.AsyncTable:
# Use local random state to avoid affecting other tests
@@ -500,36 +461,6 @@ async def test_search_fts_specify_column_async(async_table):
pass
def test_search_order_by_descending(table):
table.create_fts_index("text")
rows = (
table.search("puppy")
.order_by([ColumnOrdering(column_name="count", ascending=False)])
.limit(20)
.select(["text", "count"])
.to_list()
)
for r in rows:
assert "puppy" in r["text"]
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
def test_search_order_by_ascending(table):
table.create_fts_index("text")
rows = (
table.search("puppy")
.order_by([ColumnOrdering(column_name="count", ascending=True)])
.limit(20)
.select(["text", "count"])
.to_list()
)
for r in rows:
assert "puppy" in r["text"]
assert sorted(rows, key=lambda x: x["count"]) == rows
def test_create_index_from_table(tmp_path, table):
table.create_fts_index("text")
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
@@ -753,90 +684,6 @@ def test_fts_ngram(mem_db: DBConnection):
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
    """An FTS index built with the ``jieba/default`` tokenizer matches only the
    row containing the searched term."""
    documents = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
    table = mem_db.create_table("test_jieba", data=documents)
    index_options = dict(
        base_tokenizer="jieba/default",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )
    table.create_fts_index("text", **index_options)

    hits = table.search("我们", query_type="fts").limit(10).to_list()
    matched_texts = [hit["text"] for hit in hits]
    assert matched_texts == ["我们都有光明的前途"]
def test_fts_jieba_missing_language_model_note(
    mem_db: DBConnection, monkeypatch, tmp_path
):
    """Creating a jieba FTS index with ``LANCE_LANGUAGE_MODEL_HOME`` pointing at
    a path that was never created raises an error whose output names the
    problem, the environment variable and the tokenizer."""
    monkeypatch.setenv(
        "LANCE_LANGUAGE_MODEL_HOME", str(tmp_path / "missing-language-models")
    )
    documents = pa.table({"text": ["我们都有光明的前途"]})
    table = mem_db.create_table("test_missing_jieba_model", data=documents)
    index_options = dict(
        base_tokenizer="jieba/default",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )

    with pytest.raises((ValueError, RuntimeError)) as e:
        table.create_fts_index("text", **index_options)

    output = exception_output(e)
    expected_fragments = (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    )
    for fragment in expected_fragments:
        assert fragment in output
@pytest.mark.asyncio
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
    """Async counterpart: building a jieba FTS index through ``create_index``
    with a non-existent ``LANCE_LANGUAGE_MODEL_HOME`` raises an error whose
    output names the problem, the environment variable and the tokenizer."""
    monkeypatch.setenv(
        "LANCE_LANGUAGE_MODEL_HOME", str(tmp_path / "missing-language-models")
    )
    db = await ldb.connect_async(tmp_path / "async-db")
    documents = pa.table({"text": ["我们都有光明的前途"]})
    table = await db.create_table("test_missing_jieba_model_async", data=documents)

    with pytest.raises((ValueError, RuntimeError)) as e:
        fts_config = FTS(
            base_tokenizer="jieba/default",
            stem=False,
            remove_stop_words=False,
            ascii_folding=False,
        )
        await table.create_index("text", config=fts_config)

    output = exception_output(e)
    expected_fragments = (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    )
    for fragment in expected_fragments:
        assert fragment in output
def test_fts_lindera_tokenizer(
    mem_db: DBConnection, language_model_home, lindera_ipadic
):
    """An FTS index built with the ``lindera/ipadic`` tokenizer matches only the
    row containing the searched term."""
    documents = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
    table = mem_db.create_table("test_lindera", data=documents)
    index_options = dict(
        base_tokenizer="lindera/ipadic",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )
    table.create_fts_index("text", **index_options)

    hits = table.search("成田", query_type="fts").limit(10).to_list()
    matched_texts = [hit["text"] for hit in hits]
    assert matched_texts == ["成田国際空港"]
def test_fts_query_to_json():
"""Test that FTS query to_json() produces valid JSON strings with exact format."""

View File

@@ -16,13 +16,11 @@ from lancedb.index import (
IvfSq,
IvfHnswPq,
IvfHnswSq,
IvfHnswFlat,
IvfRq,
Bitmap,
LabelList,
HnswPq,
HnswSq,
HnswFlat,
FTS,
)
from lancedb.table import IndexStatistics
@@ -252,21 +250,6 @@ async def test_create_hnswpq_alias_index(some_table: AsyncTable):
assert indices[0].index_type in {"HnswPq", "IvfHnswPq"}
@pytest.mark.asyncio
async def test_create_hnswflat_index(some_table: AsyncTable):
    """Creating an ``HnswFlat`` index leaves exactly one index on the table."""
    config = HnswFlat(num_partitions=10)
    await some_table.create_index("vector", config=config)
    listed = await some_table.list_indices()
    assert len(listed) == 1
@pytest.mark.asyncio
async def test_create_hnswflat_alias_index(some_table: AsyncTable):
    """Creating an index via ``IvfHnswFlat`` yields one index reported under
    either the ``HnswFlat`` or the ``IvfHnswFlat`` type name."""
    config = IvfHnswFlat(num_partitions=5)
    await some_table.create_index("vector", config=config)
    listed = await some_table.list_indices()
    assert len(listed) == 1
    accepted_types = {"HnswFlat", "IvfHnswFlat"}
    assert listed[0].index_type in accepted_types
@pytest.mark.asyncio
async def test_create_ivfsq_index(some_table: AsyncTable):
await some_table.create_index("vector", config=IvfSq(num_partitions=10))
@@ -312,7 +295,6 @@ def test_index_statistics_index_type_lists_all_supported_values():
"IVF_RQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
"FTS",
"BTREE",
"BITMAP",

View File

@@ -1,149 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
"""Tests for installing and clearing an LsmWriteSpec via
`Table.set_lsm_write_spec` / `Table.unset_lsm_write_spec`.
"""
from datetime import timedelta
import lancedb
import pyarrow as pa
import pytest
from lancedb._lancedb import LsmWriteSpec
# Two non-nullable columns shared by every test here: utf8 "id", int32 "v".
SCHEMA = pa.schema(
    [
        pa.field(column_name, column_type, nullable=False)
        for column_name, column_type in (("id", pa.utf8()), ("v", pa.int32()))
    ]
)
def _batch(ids, vs):
    """Build a single ``SCHEMA`` record batch from parallel ``ids`` (utf8) and
    ``vs`` (int32) values."""
    id_column = pa.array(ids, type=pa.utf8())
    value_column = pa.array(vs, type=pa.int32())
    return pa.RecordBatch.from_arrays([id_column, value_column], schema=SCHEMA)
def _reader(ids, vs):
    """Wrap one ``_batch(ids, vs)`` in a ``RecordBatchReader`` over ``SCHEMA``."""
    batches = [_batch(ids, vs)]
    return pa.RecordBatchReader.from_batches(SCHEMA, batches)
def _make_table(tmp_path):
    """Open a database at ``tmp_path`` with a zero read-consistency interval and
    create table ``"t"`` seeded with one row; returns ``(db, table)``."""
    db = lancedb.connect(tmp_path, read_consistency_interval=timedelta(seconds=0))
    seed_rows = _reader(["seed"], [0])
    return db, db.create_table("t", seed_rows)
def test_set_lsm_write_spec_validates(tmp_path):
    """``set_lsm_write_spec`` rejects invalid specs and refuses to replace an
    installed one."""
    _db, table = _make_table(tmp_path)

    def _expect_rejected(spec, pattern):
        # Every rejection in this test differs only in spec and message pattern.
        with pytest.raises(Exception, match=pattern):
            table.set_lsm_write_spec(spec)

    # Before any primary key has been set.
    _expect_rejected(LsmWriteSpec.bucket("id", 4), "primary key")

    table.set_unenforced_primary_key("id")

    # Spec column differs from the primary key column.
    _expect_rejected(LsmWriteSpec.bucket("v", 4), "match")

    # num_buckets values the table refuses: 0 and 1025.
    _expect_rejected(LsmWriteSpec.bucket("id", 0), "num_buckets")
    _expect_rejected(LsmWriteSpec.bucket("id", 1025), "num_buckets")

    # A valid spec installs; a different spec afterwards is rejected.
    table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
    _expect_rejected(LsmWriteSpec.bucket("id", 8), "mutation")
def test_unset_lsm_write_spec(tmp_path):
    """``unset_lsm_write_spec`` errors when nothing is installed and otherwise
    clears the spec so a new one can be set."""
    _db, table = _make_table(tmp_path)
    missing_spec_message = "no LSM write spec"

    # Nothing installed yet, so unset must fail.
    with pytest.raises(Exception, match=missing_spec_message):
        table.unset_lsm_write_spec()

    # Install, then remove.
    table.set_unenforced_primary_key("id")
    table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
    table.unset_lsm_write_spec()

    # Removing again fails because the spec is already gone.
    with pytest.raises(Exception, match=missing_spec_message):
        table.unset_lsm_write_spec()

    # A fresh spec with a different bucket count is accepted after the unset.
    table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 8))
def test_set_unsharded_spec(tmp_path):
    """An unsharded spec can be installed and removed once a primary key is set."""
    _db, table = _make_table(tmp_path)
    # Lance MemWAL still requires a primary key on the dataset; Unsharded
    # just skips per-row hashing.
    table.set_unenforced_primary_key("id")
    unsharded_spec = LsmWriteSpec.unsharded()
    table.set_lsm_write_spec(unsharded_spec)
    table.unset_lsm_write_spec()
def test_lsm_write_spec_repr():
    """Bucket and unsharded specs expose their attributes and mention their
    key facts in ``repr``."""
    bucket_spec = LsmWriteSpec.bucket("id", 4)
    assert bucket_spec.spec_type == "bucket"
    assert bucket_spec.column == "id"
    assert bucket_spec.num_buckets == 4
    assert bucket_spec.maintained_indexes == []
    for fragment in ("bucket", "id", "4"):
        assert fragment in repr(bucket_spec)

    unsharded_spec = LsmWriteSpec.unsharded()
    assert unsharded_spec.spec_type == "unsharded"
    assert unsharded_spec.column is None
    assert unsharded_spec.num_buckets is None
    assert "unsharded" in repr(unsharded_spec)
def test_lsm_write_spec_with_maintained_indexes():
    """``with_maintained_indexes`` returns a spec exposing the given names."""
    index_names = ["idx_a", "idx_b"]
    spec = LsmWriteSpec.bucket("id", 4).with_maintained_indexes(index_names)
    assert spec.maintained_indexes == ["idx_a", "idx_b"]
@pytest.mark.asyncio
async def test_async_set_unset_lsm_write_spec(tmp_path):
    """Async tables can set and unset a spec; a second unset raises."""
    db = await lancedb.connect_async(
        tmp_path, read_consistency_interval=timedelta(seconds=0)
    )
    seed_rows = pa.RecordBatchReader.from_batches(SCHEMA, [_batch(["seed"], [0])])
    table = await db.create_table("t", seed_rows)

    await table.set_unenforced_primary_key("id")
    await table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
    await table.unset_lsm_write_spec()

    # The spec is already gone, so unsetting again must fail.
    with pytest.raises(Exception, match="no LSM write spec"):
        await table.unset_lsm_write_spec()
def test_set_identity_spec(tmp_path):
    """An identity spec on column ``v`` can be installed and removed."""
    _db, table = _make_table(tmp_path)
    # Identity sharding still requires an unenforced primary key on the
    # table; it shards by the raw value of the given column.
    table.set_unenforced_primary_key("id")
    identity_spec = LsmWriteSpec.identity("v")
    table.set_lsm_write_spec(identity_spec)
    table.unset_lsm_write_spec()
def test_lsm_write_spec_identity_and_writer_config_defaults():
    """Identity specs expose their attributes, and writer config defaults are
    both readable and reflected in ``repr``."""
    identity_spec = LsmWriteSpec.identity("v")
    assert identity_spec.spec_type == "identity"
    assert identity_spec.column == "v"
    assert identity_spec.num_buckets is None
    assert "identity" in repr(identity_spec)

    defaults = {"durable_write": "false"}
    configured_spec = identity_spec.with_writer_config_defaults(defaults)
    assert configured_spec.writer_config_defaults == {"durable_write": "false"}
    assert "durable_write" in repr(configured_spec)

View File

@@ -9,6 +9,21 @@ from lancedb import DBConnection, Table, connect
from lancedb.permutation import Permutation, Permutations, permutation_builder
def test_permutation_persistence(tmp_path):
    """A persisted permutation table can be reopened by name with identical
    contents."""
    db = connect(tmp_path)
    source = pa.table({"x": range(100), "y": range(100)})
    tbl = db.create_table("test_table", source)

    builder = permutation_builder(tbl).shuffle().persist(db, "test_permutation")
    permutation_tbl = builder.execute()
    assert permutation_tbl.count_rows() == 100

    reopened = db.open_table("test_permutation")
    assert reopened.count_rows() == 100
    assert permutation_tbl.to_arrow() == reopened.to_arrow()
def test_split_random_ratios(mem_db):
"""Test random splitting with ratios."""
tbl = mem_db.create_table(
@@ -1080,29 +1095,3 @@ def test_getitems_invalid_offset(some_permutation: Permutation):
"""Test __getitems__ with an out-of-range offset raises an error."""
with pytest.raises(Exception):
some_permutation.__getitems__([999999])
def test_take_offsets(some_permutation: Permutation):
    """``take_offsets`` returns a list with one row per requested offset, each
    row carrying the ``id`` and ``value`` keys."""
    rows = some_permutation.take_offsets([0, 1, 2])
    assert isinstance(rows, list)
    first_row = rows[0]
    assert "id" in first_row
    assert "value" in first_row
    assert len(rows) == 3
def test_take_offsets_empty_identity_permutation(mem_db):
    """An identity permutation returns ``[]`` for an empty offset list."""
    source = pa.table({"id": range(10), "value": range(10)})
    tbl = mem_db.create_table("test_table", source)
    identity = Permutation.identity(tbl)
    assert identity.take_offsets([]) == []
def test_take_offsets_empty_permutation(some_permutation: Permutation):
    """The ``some_permutation`` fixture returns ``[]`` for an empty offset list."""
    no_offsets = []
    assert some_permutation.take_offsets(no_offsets) == []

View File

@@ -1,79 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
"""Tests for Table.set_unenforced_primary_key."""
from datetime import timedelta
import lancedb
import pyarrow as pa
import pytest
def _empty_table(path, schema):
    """Create table ``"t"`` with ``schema`` and no data in a database at
    ``path`` opened with a zero read-consistency interval."""
    connection = lancedb.connect(
        path, read_consistency_interval=timedelta(seconds=0)
    )
    return connection.create_table("t", schema=schema)
def test_set_unenforced_primary_key_accepts_string_or_one_element_list(tmp_path):
    """The key may be given as a bare string or as a one-element list."""
    schema = pa.schema([pa.field("id", pa.int64(), nullable=False)])
    # Each form gets its own fresh table: "s" for the bare string, "l" for
    # the one-element list.
    accepted_forms = (("s", "id"), ("l", ["id"]))
    for subdirectory, key in accepted_forms:
        table = _empty_table(tmp_path / subdirectory, schema)
        table.set_unenforced_primary_key(key)
def test_set_unenforced_primary_key_rejects_compound_and_empty(tmp_path):
    """Multi-column and empty key lists are both rejected."""
    schema = pa.schema(
        [
            pa.field("a", pa.utf8(), nullable=False),
            pa.field("b", pa.int64(), nullable=False),
        ]
    )
    table = _empty_table(tmp_path, schema)

    # Each case pairs a rejected key list with the expected message pattern:
    # a two-column (compound) key, then an empty list.
    rejected_cases = ((["a", "b"], "compound"), ([], "required"))
    for key, pattern in rejected_cases:
        with pytest.raises(Exception, match=pattern):
            table.set_unenforced_primary_key(key)
def test_set_unenforced_primary_key_is_immutable(tmp_path):
    """Once a key is set, setting any key again (even the same one) fails."""
    schema = pa.schema(
        [
            pa.field("a", pa.utf8(), nullable=False),
            pa.field("b", pa.int64(), nullable=False),
        ]
    )
    table = _empty_table(tmp_path, schema)
    table.set_unenforced_primary_key("a")

    # First a different column, then the column that is already the key.
    for column in ("b", "a"):
        with pytest.raises(Exception, match="already set"):
            table.set_unenforced_primary_key(column)
def test_set_unenforced_primary_key_validates(tmp_path):
    """Unknown columns and a float32 key column are rejected."""
    utf8_schema = pa.schema([pa.field("id", pa.utf8(), nullable=False)])
    table = _empty_table(tmp_path / "t", utf8_schema)
    # The schema has no column with this name.
    with pytest.raises(Exception, match="not found"):
        table.set_unenforced_primary_key("nonexistent")

    # Float32 is not among the supported key dtypes.
    float_schema = pa.schema([pa.field("id", pa.float32(), nullable=False)])
    bad = _empty_table(tmp_path / "bad", float_schema)
    with pytest.raises(Exception, match="not supported"):
        bad.set_unenforced_primary_key("id")

View File

@@ -25,7 +25,6 @@ from lancedb.query import (
AsyncHybridQuery,
AsyncQueryBase,
AsyncVectorQuery,
ColumnOrdering,
LanceVectorQueryBuilder,
MatchQuery,
PhraseQuery,
@@ -165,71 +164,6 @@ def test_offset(table):
assert len(results_with_offset.to_pandas()) == 1
def test_order_by_plain_query(mem_db):
    """A plain query honours a three-column ordering, including the per-column
    ``nulls_first`` setting."""
    rows = pa.table(
        {
            "group": [1, 1, 1, 2],
            "score": [None, 1.0, 1.0, 0.5],
            "name": ["z", "b", "a", "c"],
        }
    )
    table = mem_db.create_table("test_order_by", rows)

    ordering = [
        ColumnOrdering(column_name="group", ascending=True, nulls_first=False),
        ColumnOrdering(column_name="score", ascending=True, nulls_first=True),
        ColumnOrdering(column_name="name", ascending=True, nulls_first=False),
    ]
    res = table.search().order_by(ordering).to_arrow()

    expected = [
        {"group": 1, "score": None, "name": "z"},
        {"group": 1, "score": 1.0, "name": "a"},
        {"group": 1, "score": 1.0, "name": "b"},
        {"group": 2, "score": 0.5, "name": "c"},
    ]
    assert res.select(["group", "score", "name"]).to_pylist() == expected
@pytest.mark.asyncio
async def test_order_by_async_query(mem_db_async: AsyncConnection):
    """An async query honours a three-column ordering, including the
    per-column ``nulls_first`` setting."""
    rows = pa.table(
        {
            "group": [1, 1, 1, 2],
            "score": [None, 1.0, 1.0, 0.5],
            "name": ["z", "b", "a", "c"],
        }
    )
    table = await mem_db_async.create_table("test_order_by_async", rows)

    ordering = [
        ColumnOrdering(column_name="group", ascending=True, nulls_first=False),
        ColumnOrdering(column_name="score", ascending=True, nulls_first=True),
        ColumnOrdering(column_name="name", ascending=True, nulls_first=False),
    ]
    res = await table.query().order_by(ordering).to_arrow()

    expected = [
        {"group": 1, "score": None, "name": "z"},
        {"group": 1, "score": 1.0, "name": "a"},
        {"group": 1, "score": 1.0, "name": "b"},
        {"group": 2, "score": 0.5, "name": "c"},
    ]
    assert res.select(["group", "score", "name"]).to_pylist() == expected
def test_query_builder(table):
rs = (
LanceVectorQueryBuilder(table, [0, 0], "vector")

View File

@@ -6,8 +6,6 @@ import contextlib
from datetime import timedelta
import http.server
import json
import multiprocessing as mp
import sys
import threading
import time
from unittest.mock import MagicMock, patch
@@ -16,7 +14,6 @@ from packaging.version import Version
import lancedb
from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.query import ColumnOrdering
from lancedb.remote import ClientConfig
from lancedb.remote.errors import HttpError, RetryError
import pytest
@@ -661,18 +658,6 @@ def test_query_sync_maximal():
"ef": None,
"filter": "id > 0",
"columns": ["id", "name"],
"order_by": [
{
"column_name": "score",
"ascending": False,
"nulls_first": True,
},
{
"column_name": "id",
"ascending": True,
"nulls_first": False,
},
],
"vector_column": "vector2",
"fast_search": True,
"with_row_id": True,
@@ -690,14 +675,6 @@ def test_query_sync_maximal():
.refine_factor(10)
.nprobes(5)
.where("id > 0", prefilter=True)
.order_by(
[
ColumnOrdering(
column_name="score", ascending=False, nulls_first=True
),
ColumnOrdering(column_name="id", ascending=True, nulls_first=False),
]
)
.with_row_id(True)
.select(["id", "name"])
.to_list()
@@ -1253,82 +1230,3 @@ def test_background_loop_cancellation(exception):
with pytest.raises(exception):
loop.run(None)
mock_future.cancel.assert_called_once()
def _remote_fork_child(port: int, queue) -> None:
    """Child-process entry point: connect to the mock server on ``port`` and
    put the result of ``table_names()`` on ``queue``."""
    # Build a fresh Connection in the child so we exercise the at-fork-child
    # tokio runtime reset rather than relying on an inherited reqwest client.
    client_config = {
        "retry_config": {"retries": 0},
        "timeout_config": {"connect_timeout": 2, "read_timeout": 2},
    }
    connection = lancedb.connect(
        "db://dev",
        api_key="fake",
        host_override=f"http://localhost:{port}",
        client_config=client_config,
    )
    queue.put(connection.table_names())
@pytest.mark.skipif(
    sys.platform != "linux",
    reason=(
        "fork() is unavailable on Windows and unsafe on macOS "
        "(Apple frameworks/TLS are not fork-safe)"
    ),
)
def test_remote_connection_after_fork():
    """A freshly-built remote Connection in a forked child should not hang.

    The pyo3-async-runtimes tokio runtime would otherwise be inherited from
    the parent with dead worker threads; the at-fork-child handler in our
    runtime module rebuilds it on first use in the child.
    """

    def handler(request):
        # Every request gets an empty table listing.
        request.send_response(200)
        request.send_header("Content-Type", "application/json")
        request.end_headers()
        request.wfile.write(b'{"tables": []}')

    server = http.server.HTTPServer(("localhost", 0), make_mock_http_handler(handler))
    port = server.server_address[1]
    server_thread = threading.Thread(target=server.serve_forever)
    server_thread.start()

    try:
        # Hit the server in the parent first so the runtime + LOOP are warm
        # before fork; a fresh child must still succeed.
        client_config = {
            "retry_config": {"retries": 0},
            "timeout_config": {"connect_timeout": 2, "read_timeout": 2},
        }
        parent_db = lancedb.connect(
            "db://dev",
            api_key="fake",
            host_override=f"http://localhost:{port}",
            client_config=client_config,
        )
        assert parent_db.table_names() == []

        fork_ctx = mp.get_context("fork")
        results = fork_ctx.Queue()
        child = fork_ctx.Process(target=_remote_fork_child, args=(port, results))
        child.start()
        child.join(timeout=15)

        if child.is_alive():
            # Escalate from terminate to kill so a hung child cannot wedge
            # the test run.
            child.terminate()
            child.join(timeout=5)
            if child.is_alive():
                child.kill()
                child.join()
            pytest.fail("Remote connection hung after fork")

        assert child.exitcode == 0, f"child exited with code {child.exitcode}"
        assert not results.empty(), "child produced no result"
        assert results.get() == []

        # Parent connection must still be usable after the child returned.
        assert parent_db.table_names() == []
    finally:
        server.shutdown()
        server_thread.join()

View File

@@ -11,7 +11,7 @@ from unittest.mock import patch
import lancedb
from lancedb.dependencies import _PANDAS_AVAILABLE
from lancedb.index import HnswFlat, HnswPq, HnswSq, IvfPq
from lancedb.index import HnswPq, HnswSq, IvfPq
import numpy as np
import polars as pl
import pyarrow as pa
@@ -917,21 +917,6 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
"my_vector", replace=True, config=expected_config, name=None, train=True
)
table.create_index(
vector_column_name="my_vector",
metric="cosine",
index_type="IVF_HNSW_FLAT",
sample_rate=0.1,
m=29,
ef_construction=10,
)
expected_config = HnswFlat(
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
)
mock_create_index.assert_called_with(
"my_vector", replace=True, config=expected_config, name=None, train=True
)
@patch("lancedb.table.AsyncTable.create_index")
def test_create_index_name_and_train_parameters(

View File

@@ -1,29 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import functools
import multiprocessing as mp
import pickle
import sys
import lancedb
import pyarrow as pa
import pytest
from lancedb.permutation import Permutation, Permutations, permutation_builder
from lancedb.util import tbl_to_tensor
from lancedb.permutation import Permutation
torch = pytest.importorskip("torch")
def _open_native_table(uri: str, table_name: str):
    """Connection factory for the explicit-factory pickle test: connect to
    ``uri`` and open ``table_name``.

    Kept at module scope so pickle can resolve it by name in the worker /
    unpickling process.
    """
    connection = lancedb.connect(uri)
    return connection.open_table(table_name)
def test_table_dataloader(mem_db):
table = mem_db.create_table("test_table", pa.table({"a": range(1000)}))
dataloader = torch.utils.data.DataLoader(
@@ -55,156 +40,3 @@ def test_permutation_dataloader(mem_db):
for batch in dataloader:
assert batch.size(0) == 1
assert batch.size(1) == 10
def test_permutation_is_picklable(tmp_db):
    """A Permutation must survive a pickle round trip.

    PyTorch's DataLoader with ``num_workers > 0`` uses multiprocessing and
    pickles the dataset to hand it to worker processes.
    """
    source = pa.table({"a": range(1000)})
    table = tmp_db.create_table("test_table", source)
    original = Permutation.identity(table)

    restored = pickle.loads(pickle.dumps(original))

    assert len(restored) == 1000
    first_rows = restored.__getitems__([0, 1, 2])
    assert first_rows == [{"a": 0}, {"a": 1}, {"a": 2}]
def test_permutation_with_memory_base_is_picklable(mem_db):
    """A Permutation over an in-memory base table round-trips through pickle.

    The base table is inlined into the pickle as Arrow IPC bytes and rebuilt
    as an in-memory LanceTable on the other side, since the original database
    can't be reopened across processes.
    """
    source = pa.table({"a": range(50)})
    table = mem_db.create_table("test_table", source)
    original = Permutation.identity(table)

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    assert len(restored) == 50
    sampled = restored.__getitems__([0, 10, 49])
    assert sampled == [{"a": 0}, {"a": 10}, {"a": 49}]
def test_permutation_dataloader_multiprocessing(tmp_db):
    """A Permutation works end-to-end with a DataLoader using worker processes.

    Each worker process gets a pickled copy of the dataset and reads batches
    from it.
    """
    source = pa.table({"a": range(1000)})
    table = tmp_db.create_table("test_table", source)
    loader_options = dict(
        batch_size=10,
        shuffle=True,
        num_workers=2,
        multiprocessing_context="spawn",
    )
    dataloader = torch.utils.data.DataLoader(
        Permutation.identity(table), **loader_options
    )

    total_rows = 0
    for batch in dataloader:
        assert batch["a"].size(0) == 10
        total_rows += batch["a"].size(0)
    assert total_rows == 1000
def test_permutation_pickle_with_connection_factory(tmp_path):
    """With a user-supplied connection_factory, pickling round-trips through
    that factory rather than introspecting the connection URI.

    Useful for remote / cloud connections where the URI alone isn't
    reopenable.
    """
    lancedb.connect(tmp_path).create_table("test_table", pa.table({"a": range(50)}))

    factory = functools.partial(_open_native_table, str(tmp_path))
    base_table = factory("test_table")
    permutation = Permutation.identity(base_table).with_connection_factory(factory)

    restored = pickle.loads(pickle.dumps(permutation))
    assert len(restored) == 50

    # The factory survives pickling and is what powered base-table reopen.
    restored_factory = restored.connection_factory
    assert restored_factory is not None
    assert restored.connection_factory.func is _open_native_table
    assert restored.__getitems__([0, 1, 2]) == [{"a": 0}, {"a": 1}, {"a": 2}]
def test_permutation_with_builder_is_picklable(tmp_db):
    """A Permutation backed by a non-identity permutation table keeps its row
    order across a pickle round trip."""
    table = tmp_db.create_table("test_table", pa.table({"a": range(100)}))
    builder = permutation_builder(table)
    builder = builder.split_random(
        ratios=[0.8, 0.2], seed=42, split_names=["train", "test"]
    )
    perm_tbl = builder.shuffle(seed=42).execute()
    train_split = Permutations(table, perm_tbl)["train"]

    all_offsets = list(range(len(train_split)))
    rows_before = train_split.__getitems__(all_offsets)

    restored = pickle.loads(pickle.dumps(train_split))

    assert len(restored) == len(train_split)
    assert restored.__getitems__(all_offsets) == rows_before
def _multiworker_dataloader_target(db_uri: str, result_queue):
    """Child-process entry point: iterate ``test_table`` through a fork-based
    two-worker DataLoader and put the number of batches on ``result_queue``."""
    import lancedb
    from lancedb.permutation import Permutation

    table = lancedb.connect(db_uri).open_table("test_table")
    dataloader = torch.utils.data.DataLoader(
        Permutation.identity(table),
        batch_size=10,
        num_workers=2,
        multiprocessing_context="fork",
    )

    batches_seen = 0
    for batch in dataloader:
        assert batch["a"].size(0) == 10
        batches_seen += 1
    result_queue.put(batches_seen)
@pytest.mark.skipif(
    sys.platform != "linux",
    reason=(
        "fork() is unavailable on Windows and unsafe on macOS "
        "(Apple frameworks/TLS are not fork-safe)"
    ),
)
def test_permutation_dataloader_fork_workers(tmp_path):
    """A Permutation used by a fork-based DataLoader should not hang.

    PyTorch's DataLoader uses fork-based multiprocessing by default on Linux.
    LanceDB drives async work through a background asyncio thread that does
    not survive a fork, so any LOOP.run() in a worker blocks forever.
    """
    import lancedb

    db_uri = str(tmp_path / "db")
    rows = pa.table({"a": list(range(1000))})
    lancedb.connect(db_uri).create_table("test_table", rows)

    # The DataLoader runs inside a spawned child process (which itself uses
    # fork-based workers) so that a hang can be detected with a join timeout.
    spawn_ctx = mp.get_context("spawn")
    results = spawn_ctx.Queue()
    child = spawn_ctx.Process(
        target=_multiworker_dataloader_target, args=(db_uri, results)
    )
    child.start()
    child.join(timeout=30)

    if child.is_alive():
        # Escalate from terminate to kill so a hung child cannot wedge the run.
        child.terminate()
        child.join(timeout=5)
        if child.is_alive():
            child.kill()
            child.join()
        pytest.fail("Permutation hung when iterated in a fork-based DataLoader worker")

    assert child.exitcode == 0, f"child exited with code {child.exitcode}"
    assert not results.empty(), "child produced no batches"
    assert results.get() == 100

View File

@@ -3,8 +3,6 @@
use std::sync::Arc;
use crate::error::PythonErrorExt;
use crate::runtime::future_into_py;
use arrow::{
datatypes::SchemaRef,
pyarrow::{IntoPyArrow, ToPyArrow},
@@ -14,6 +12,9 @@ use lancedb::arrow::SendableRecordBatchStream;
use pyo3::{
Bound, Py, PyAny, PyRef, PyResult, Python, exceptions::PyStopAsyncIteration, pyclass, pymethods,
};
use pyo3_async_runtimes::tokio::future_into_py;
use crate::error::PythonErrorExt;
#[pyclass]
pub struct RecordBatchStream {

View File

@@ -7,12 +7,6 @@ use std::{
time::Duration,
};
use crate::{
error::PythonErrorExt,
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
runtime::future_into_py,
table::Table,
};
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
use lancedb::{
connection::Connection as LanceConnection,
@@ -26,6 +20,13 @@ use pyo3::{
pyclass, pyfunction, pymethods,
types::{PyDict, PyDictMethods},
};
use pyo3_async_runtimes::tokio::future_into_py;
use crate::{
error::PythonErrorExt,
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
table::Table,
};
#[pyclass]
pub struct Connection {
@@ -395,17 +396,12 @@ impl Connection {
future_into_py(py, async move {
use lance_namespace::models::CreateNamespaceRequest;
// Mode is now a string field
let mode_str = mode
.map(|m| match m.to_lowercase().as_str() {
"create" => Ok("Create".to_string()),
"exist_ok" => Ok("ExistOk".to_string()),
"overwrite" => Ok("Overwrite".to_string()),
_ => Err(PyValueError::new_err(format!(
"Invalid mode {:?}: expected one of 'create', 'exist_ok', 'overwrite'",
m
))),
})
.transpose()?;
let mode_str = mode.and_then(|m| match m.to_lowercase().as_str() {
"create" => Some("Create".to_string()),
"exist_ok" => Some("ExistOk".to_string()),
"overwrite" => Some("Overwrite".to_string()),
_ => None,
});
let request = CreateNamespaceRequest {
id: Some(namespace_path),
mode: mode_str,
@@ -433,26 +429,16 @@ impl Connection {
future_into_py(py, async move {
use lance_namespace::models::DropNamespaceRequest;
// Mode and Behavior are now string fields
let mode_str = mode
.map(|m| match m.to_uppercase().as_str() {
"SKIP" => Ok("Skip".to_string()),
"FAIL" => Ok("Fail".to_string()),
_ => Err(PyValueError::new_err(format!(
"Invalid mode {:?}: expected one of 'skip', 'fail'",
m
))),
})
.transpose()?;
let behavior_str = behavior
.map(|b| match b.to_uppercase().as_str() {
"RESTRICT" => Ok("Restrict".to_string()),
"CASCADE" => Ok("Cascade".to_string()),
_ => Err(PyValueError::new_err(format!(
"Invalid behavior {:?}: expected one of 'restrict', 'cascade'",
b
))),
})
.transpose()?;
let mode_str = mode.and_then(|m| match m.to_uppercase().as_str() {
"SKIP" => Some("Skip".to_string()),
"FAIL" => Some("Fail".to_string()),
_ => None,
});
let behavior_str = behavior.and_then(|b| match b.to_uppercase().as_str() {
"RESTRICT" => Some("Restrict".to_string()),
"CASCADE" => Some("Cascade".to_string()),
_ => None,
});
let request = DropNamespaceRequest {
id: Some(namespace_path),
mode: mode_str,
@@ -539,7 +525,7 @@ impl Connection {
}
#[pyfunction]
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None, manifest_enabled=false, namespace_client_properties=None))]
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None))]
#[allow(clippy::too_many_arguments)]
pub fn connect(
py: Python<'_>,
@@ -551,8 +537,6 @@ pub fn connect(
client_config: Option<PyClientConfig>,
storage_options: Option<HashMap<String, String>>,
session: Option<crate::session::Session>,
manifest_enabled: bool,
namespace_client_properties: Option<HashMap<String, String>>,
) -> PyResult<Bound<'_, PyAny>> {
future_into_py(py, async move {
let mut builder = lancedb::connect(&uri);
@@ -572,12 +556,6 @@ pub fn connect(
if let Some(storage_options) = storage_options {
builder = builder.storage_options(storage_options);
}
if manifest_enabled {
builder = builder.manifest_enabled(true);
}
if let Some(namespace_client_properties) = namespace_client_properties {
builder = builder.namespace_client_properties(namespace_client_properties);
}
#[cfg(feature = "remote")]
if let Some(client_config) = client_config {
builder = builder.client_config(client_config.into());

View File

@@ -8,9 +8,7 @@
//! DataFusion [`Expr`] nodes, bypassing SQL string parsing.
use arrow::{datatypes::DataType, pyarrow::PyArrowType};
use datafusion_common::ScalarValue;
use lancedb::expr::{DfExpr, col as ldb_col, contains, expr_cast, lit as df_lit, lower, upper};
use pyo3::types::PyBytes;
use pyo3::{Bound, PyAny, PyResult, exceptions::PyValueError, prelude::*, pyfunction};
/// A type-safe DataFusion expression.
@@ -143,7 +141,7 @@ pub fn expr_col(name: &str) -> PyExpr {
/// Create a literal value expression.
///
/// Supported Python types: `bool`, `int`, `float`, `str`, `bytes`.
/// Supported Python types: `bool`, `int`, `float`, `str`.
#[pyfunction]
pub fn expr_lit(value: Bound<'_, PyAny>) -> PyResult<PyExpr> {
// bool must be checked before int because bool is a subclass of int in Python
@@ -159,12 +157,8 @@ pub fn expr_lit(value: Bound<'_, PyAny>) -> PyResult<PyExpr> {
if let Ok(s) = value.extract::<String>() {
return Ok(PyExpr(df_lit(s)));
}
if value.is_instance_of::<PyBytes>() {
let bytes = value.extract::<Vec<u8>>()?;
return Ok(PyExpr(df_lit(ScalarValue::Binary(Some(bytes)))));
}
Err(PyValueError::new_err(format!(
"unsupported literal type: {}. Supported: bool, int, float, str, bytes",
"unsupported literal type: {}. Supported: bool, int, float, str",
value.get_type().name()?
)))
}

View File

@@ -1,13 +1,11 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use lancedb::index::vector::{
IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder,
IvfPqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder,
};
use lancedb::index::vector::{IvfFlatIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder};
use lancedb::index::{
Index as LanceDbIndex,
scalar::{BTreeIndexBuilder, FtsIndexBuilder},
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
};
use pyo3::IntoPyObject;
use pyo3::types::PyStringMethods;
@@ -15,7 +13,7 @@ use pyo3::{
Bound, FromPyObject, PyAny, PyResult, Python,
exceptions::{PyKeyError, PyValueError},
intern, pyclass, pymethods,
types::{PyAnyMethods, PyString},
types::PyAnyMethods,
};
use crate::util::parse_distance_type;
@@ -24,7 +22,7 @@ pub fn class_name(ob: &'_ Bound<'_, PyAny>) -> PyResult<String> {
let full_name = ob
.getattr(intern!(ob.py(), "__class__"))?
.getattr(intern!(ob.py(), "__name__"))?;
let full_name = full_name.cast::<PyString>()?.to_string_lossy();
let full_name = full_name.cast()?.to_string_lossy();
match full_name.rsplit_once('.') {
Some((_, name)) => Ok(name.to_string()),
@@ -164,26 +162,8 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
}
Ok(LanceDbIndex::IvfHnswSq(hnsw_sq_builder))
}
"HnswFlat" => {
let params = source.extract::<IvfHnswFlatParams>()?;
let distance_type = parse_distance_type(params.distance_type)?;
let mut hnsw_flat_builder = IvfHnswFlatIndexBuilder::default()
.distance_type(distance_type)
.max_iterations(params.max_iterations)
.sample_rate(params.sample_rate)
.num_edges(params.m)
.ef_construction(params.ef_construction);
if let Some(num_partitions) = params.num_partitions {
hnsw_flat_builder = hnsw_flat_builder.num_partitions(num_partitions);
}
if let Some(target_partition_size) = params.target_partition_size {
hnsw_flat_builder =
hnsw_flat_builder.target_partition_size(target_partition_size);
}
Ok(LanceDbIndex::IvfHnswFlat(hnsw_flat_builder))
}
not_supported => Err(PyValueError::new_err(format!(
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, or IvfHnswSq",
not_supported
))),
}
@@ -270,17 +250,6 @@ struct IvfHnswSqParams {
target_partition_size: Option<u32>,
}
/// Parameters extracted from the Python config object for the "HnswFlat"
/// index type. `extract_index_params` forwards each field to an
/// `IvfHnswFlatIndexBuilder`.
#[derive(FromPyObject)]
struct IvfHnswFlatParams {
// Distance metric name; converted with `parse_distance_type` before use.
distance_type: String,
// Optional; applied via the builder's `num_partitions` only when present.
num_partitions: Option<u32>,
// Passed to the builder's `max_iterations`.
max_iterations: u32,
// Passed to the builder's `sample_rate`.
sample_rate: u32,
// Passed to the builder's `num_edges`.
m: u32,
// Passed to the builder's `ef_construction`.
ef_construction: u32,
// Optional; applied via the builder's `target_partition_size` only when present.
target_partition_size: Option<u32>,
}
#[pyclass(get_all)]
/// A description of an index currently configured on a column
pub struct IndexConfig {

View File

@@ -15,8 +15,8 @@ use pyo3::{
use query::{FTSQuery, HybridQuery, Query, VectorQuery};
use session::Session;
use table::{
AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, LsmWriteSpec,
MergeResult, Table, UpdateResult,
AddColumnsResult, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, MergeResult,
Table, UpdateResult,
};
pub mod arrow;
@@ -28,7 +28,6 @@ pub mod index;
pub mod namespace;
pub mod permutation;
pub mod query;
pub mod runtime;
pub mod session;
pub mod table;
pub mod util;
@@ -52,7 +51,6 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<AlterColumnsResult>()?;
m.add_class::<AddResult>()?;
m.add_class::<MergeResult>()?;
m.add_class::<LsmWriteSpec>()?;
m.add_class::<DeleteResult>()?;
m.add_class::<DropColumnsResult>()?;
m.add_class::<UpdateResult>()?;

Some files were not shown because too many files have changed in this diff Show More