mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-23 15:00:39 +00:00
Compare commits
21 Commits
codex/upda
...
xuanwo/nat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
980536c6ef | ||
|
|
1700d618e5 | ||
|
|
3726491b27 | ||
|
|
13c6dae9a3 | ||
|
|
64aeee84a8 | ||
|
|
5b45e44ce3 | ||
|
|
f893589356 | ||
|
|
df4ad9f851 | ||
|
|
9330a9b851 | ||
|
|
02de07576e | ||
|
|
81617fd3d9 | ||
|
|
011fdd5c94 | ||
|
|
650f173236 | ||
|
|
9b21c136c6 | ||
|
|
694aa48e19 | ||
|
|
455ba5abbf | ||
|
|
5338aeb006 | ||
|
|
47a34f5cca | ||
|
|
a17c241e86 | ||
|
|
1fc23e5473 | ||
|
|
87b831bcae |
12
.github/workflows/codex-fix-ci.yml
vendored
12
.github/workflows/codex-fix-ci.yml
vendored
@@ -45,7 +45,9 @@ jobs:
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
# pnpm 11 (used by the nodejs install step below) requires
|
||||
# Node >= 22.13; use 24 since 22 hits EOL in October.
|
||||
node-version: 24
|
||||
|
||||
- name: Install Codex CLI
|
||||
run: npm install -g @openai/codex
|
||||
@@ -79,10 +81,14 @@ jobs:
|
||||
java-version: '11'
|
||||
cache: maven
|
||||
|
||||
- name: Setup pnpm
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 11.1.1
|
||||
- name: Install Node.js dependencies for TypeScript bindings
|
||||
run: |
|
||||
cd nodejs
|
||||
npm ci
|
||||
pnpm install --frozen-lockfile
|
||||
|
||||
- name: Configure git user
|
||||
run: |
|
||||
@@ -137,7 +143,7 @@ jobs:
|
||||
- For Rust test failures: Run the specific test with "cargo test -p <crate> <test_name>"
|
||||
- For Python test failures: Build with "cd python && maturin develop" then run "pytest <specific_test_file>::<test_name>"
|
||||
- For Java test failures: Run "cd java && mvn test -Dtest=<TestClass>#<testMethod>"
|
||||
- For TypeScript test failures: Run "cd nodejs && npm run build && npm test -- --testNamePattern='<test_name>'"
|
||||
- For TypeScript test failures: Run "cd nodejs && pnpm build && pnpm test -- --testNamePattern='<test_name>'"
|
||||
- Do NOT run the full test suite - only run the tests that were failing
|
||||
|
||||
7. If the additional guidelines are provided, follow them as well.
|
||||
|
||||
5
.github/workflows/java-publish.yml
vendored
5
.github/workflows/java-publish.yml
vendored
@@ -43,7 +43,7 @@ jobs:
|
||||
server-username: SONATYPE_USER
|
||||
server-password: SONATYPE_TOKEN
|
||||
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
|
||||
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
|
||||
gpg-passphrase: MAVEN_GPG_PASSPHRASE
|
||||
- name: Set git config
|
||||
run: |
|
||||
git config --global user.email "dev+gha@lancedb.com"
|
||||
@@ -58,10 +58,11 @@ jobs:
|
||||
echo "use-agent" >> ~/.gnupg/gpg.conf
|
||||
echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
|
||||
export GPG_TTY=$(tty)
|
||||
./mvnw --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -pl lancedb-core -am -P deploy-to-ossrh
|
||||
./mvnw --batch-mode -DskipTests -DpushChanges=false deploy -pl lancedb-core -am -P deploy-to-ossrh
|
||||
env:
|
||||
SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
|
||||
SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
|
||||
MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
|
||||
|
||||
report-failure:
|
||||
name: Report Workflow Failure
|
||||
|
||||
109
.github/workflows/nodejs.yml
vendored
109
.github/workflows/nodejs.yml
vendored
@@ -42,11 +42,17 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 11.1.1
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: 'npm'
|
||||
cache-dependency-path: nodejs/package-lock.json
|
||||
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
|
||||
# in October. The library itself still supports Node >= 18
|
||||
# (see test matrix below).
|
||||
node-version: 24
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: nodejs/pnpm-lock.yaml
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
components: rustfmt, clippy
|
||||
@@ -61,11 +67,13 @@ jobs:
|
||||
run: cargo clippy --profile ci --all --all-features -- -D warnings
|
||||
- name: Lint Typescript
|
||||
run: |
|
||||
npm ci
|
||||
npm run lint-ci
|
||||
pnpm install --frozen-lockfile
|
||||
pnpm lint-ci
|
||||
- name: Lint examples
|
||||
working-directory: nodejs/examples
|
||||
run: npm ci && npm run lint-ci
|
||||
# The `@lancedb/lancedb` dep points at file:../dist; pnpm errors if
|
||||
# that dir is missing, so create an empty one for lint-only runs.
|
||||
run: mkdir -p ../dist && pnpm install --frozen-lockfile && pnpm lint-ci
|
||||
linux:
|
||||
name: Linux (NodeJS ${{ matrix.node-version }})
|
||||
timeout-minutes: 30
|
||||
@@ -82,14 +90,18 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: actions/setup-node@v4
|
||||
name: Setup Node.js 20 for build
|
||||
- uses: pnpm/action-setup@v4
|
||||
with:
|
||||
# @napi-rs/cli v3 requires Node >= 20.12 (via @inquirer/prompts@8).
|
||||
# Build always on Node 20; tests run on the matrix version below.
|
||||
node-version: 20
|
||||
cache: 'npm'
|
||||
cache-dependency-path: nodejs/package-lock.json
|
||||
version: 11.1.1
|
||||
- uses: actions/setup-node@v4
|
||||
name: Setup Node.js 24 for build
|
||||
with:
|
||||
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
|
||||
# in October. Build/install runs on Node 24; tests run on the
|
||||
# matrix version below using direct jest invocation.
|
||||
node-version: 24
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: nodejs/pnpm-lock.yaml
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
@@ -97,45 +109,52 @@ jobs:
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
- name: Build
|
||||
run: |
|
||||
npm ci --include=optional
|
||||
npm run build:debug -- --profile ci
|
||||
pnpm install --frozen-lockfile
|
||||
# No `--` separator: pnpm forwards it literally, which would
|
||||
# make napi-rs treat `--profile ci` as a cargo passthrough arg.
|
||||
pnpm build:debug --profile ci
|
||||
pnpm tsc
|
||||
- name: Setup examples
|
||||
working-directory: nodejs/examples
|
||||
run: pnpm install --frozen-lockfile
|
||||
- name: Check docs
|
||||
run: |
|
||||
# We run this as part of the job because the binary needs to be built
|
||||
# first to export the types of the native code.
|
||||
set -e
|
||||
# `pnpm docs` would invoke pnpm's built-in `docs` command, not
|
||||
# the script — use `pnpm run docs`.
|
||||
pnpm run docs
|
||||
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
|
||||
echo "Docs need to be updated"
|
||||
echo "Run 'pnpm run docs', fix any warnings, and commit the changes."
|
||||
exit 1
|
||||
fi
|
||||
- uses: actions/setup-node@v4
|
||||
name: Setup Node.js ${{ matrix.node-version }} for test
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
- name: Compile TypeScript
|
||||
run: npm run tsc
|
||||
- name: Setup localstack
|
||||
working-directory: .
|
||||
run: docker compose up --detach --wait
|
||||
- name: Test
|
||||
env:
|
||||
S3_TEST: "1"
|
||||
run: npm run test
|
||||
- name: Setup examples
|
||||
working-directory: nodejs/examples
|
||||
run: npm ci
|
||||
# Newer @smithy/core uses dynamic ESM imports.
|
||||
NODE_OPTIONS: "--experimental-vm-modules"
|
||||
# Invoke jest directly because pnpm 11 itself requires Node 22+
|
||||
# while the matrix tests on older Node versions.
|
||||
run: npx jest --verbose
|
||||
- name: Test examples
|
||||
working-directory: ./
|
||||
env:
|
||||
OPENAI_API_KEY: test
|
||||
OPENAI_BASE_URL: http://0.0.0.0:8000
|
||||
NODE_OPTIONS: "--experimental-vm-modules"
|
||||
run: |
|
||||
python ci/mock_openai.py &
|
||||
cd nodejs/examples
|
||||
npm test
|
||||
- name: Check docs
|
||||
run: |
|
||||
# We run this as part of the job because the binary needs to be built
|
||||
# first to export the types of the native code.
|
||||
set -e
|
||||
npm ci
|
||||
npm run docs
|
||||
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
|
||||
echo "Docs need to be updated"
|
||||
echo "Run 'npm run docs', fix any warnings, and commit the changes."
|
||||
exit 1
|
||||
fi
|
||||
npx jest --testEnvironment jest-environment-node-single-context --verbose
|
||||
macos:
|
||||
timeout-minutes: 30
|
||||
runs-on: "macos-14"
|
||||
@@ -148,20 +167,28 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 11.1.1
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: 'npm'
|
||||
cache-dependency-path: nodejs/package-lock.json
|
||||
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
|
||||
# in October.
|
||||
node-version: 24
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: nodejs/pnpm-lock.yaml
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
brew install protobuf
|
||||
- name: Build
|
||||
run: |
|
||||
npm ci --include=optional
|
||||
npm run build:debug -- --profile ci
|
||||
npm run tsc
|
||||
pnpm install --frozen-lockfile
|
||||
# No `--` separator: pnpm forwards it literally, which would
|
||||
# make napi-rs treat `--profile ci` as a cargo passthrough arg.
|
||||
pnpm build:debug --profile ci
|
||||
pnpm tsc
|
||||
- name: Test
|
||||
run: |
|
||||
npm run test
|
||||
pnpm test
|
||||
|
||||
53
.github/workflows/npm-publish.yml
vendored
53
.github/workflows/npm-publish.yml
vendored
@@ -171,13 +171,18 @@ jobs:
|
||||
working-directory: nodejs
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup pnpm
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 11.1.1
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v4
|
||||
if: ${{ !matrix.settings.docker }}
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: nodejs/package-lock.json
|
||||
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
|
||||
# in October.
|
||||
node-version: 24
|
||||
cache: pnpm
|
||||
cache-dependency-path: nodejs/pnpm-lock.yaml
|
||||
- name: Install
|
||||
uses: dtolnay/rust-toolchain@stable
|
||||
if: ${{ !matrix.settings.docker }}
|
||||
@@ -195,7 +200,7 @@ jobs:
|
||||
target/
|
||||
key: nodejs-${{ matrix.settings.target }}-cargo-${{ matrix.settings.host }}
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
run: pnpm install --frozen-lockfile
|
||||
- name: Install Zig
|
||||
uses: mlugg/setup-zig@v2
|
||||
if: ${{ contains(matrix.settings.target, 'musl') }}
|
||||
@@ -248,7 +253,7 @@ jobs:
|
||||
# one to do the upload.
|
||||
- name: Make generic artifacts
|
||||
if: ${{ matrix.settings.target == 'aarch64-apple-darwin' }}
|
||||
run: npm run tsc
|
||||
run: pnpm tsc
|
||||
- name: Upload Generic Artifacts
|
||||
if: ${{ matrix.settings.target == 'aarch64-apple-darwin' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
@@ -283,14 +288,24 @@ jobs:
|
||||
working-directory: nodejs
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup node
|
||||
- name: Setup pnpm
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 11.1.1
|
||||
- name: Setup Node.js 24 for install
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
|
||||
# in October.
|
||||
node-version: 24
|
||||
cache: pnpm
|
||||
cache-dependency-path: nodejs/pnpm-lock.yaml
|
||||
- name: Install dependencies
|
||||
run: pnpm install --frozen-lockfile
|
||||
- name: Setup Node.js ${{ matrix.node }} for test
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: ${{ matrix.node }}
|
||||
cache: npm
|
||||
cache-dependency-path: nodejs/package-lock.json
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
- name: Download artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
@@ -311,7 +326,9 @@ jobs:
|
||||
- name: Move built files
|
||||
run: cp dist/native.d.ts dist/native.js dist/*.node lancedb/
|
||||
- name: Test bindings
|
||||
run: npm test
|
||||
# Invoke jest directly because pnpm 11 itself requires Node 22+
|
||||
# while the matrix tests on older Node versions.
|
||||
run: npx jest --verbose
|
||||
publish:
|
||||
name: Publish
|
||||
runs-on: ubuntu-latest
|
||||
@@ -323,15 +340,19 @@ jobs:
|
||||
- test-lancedb
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup pnpm
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 11.1.1
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 24
|
||||
cache: npm
|
||||
cache-dependency-path: nodejs/package-lock.json
|
||||
cache: pnpm
|
||||
cache-dependency-path: nodejs/pnpm-lock.yaml
|
||||
registry-url: "https://registry.npmjs.org"
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
run: pnpm install --frozen-lockfile
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: nodejs-dist
|
||||
@@ -351,7 +372,7 @@ jobs:
|
||||
- name: Display structure of downloaded files
|
||||
run: find dist && find nodejs-artifacts
|
||||
- name: Move artifacts
|
||||
run: npx napi artifacts -d nodejs-artifacts
|
||||
run: pnpm exec napi artifacts -d nodejs-artifacts
|
||||
- name: List packages
|
||||
run: find npm
|
||||
- name: Publish
|
||||
|
||||
1804
Cargo.lock
generated
1804
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
30
Cargo.toml
30
Cargo.toml
@@ -13,20 +13,20 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.91.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=7.0.0-beta.2", default-features = false, "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=7.0.0-beta.2", default-features = false, "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=7.0.0-beta.2", default-features = false, "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance = { "version" = "=7.0.0-beta.9", default-features = false, "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=7.0.0-beta.9", default-features = false, "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=7.0.0-beta.9", default-features = false, "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=7.0.0-beta.9", "tag" = "v7.0.0-beta.9", "git" = "https://github.com/lance-format/lance.git" }
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "58.0.0", optional = false }
|
||||
@@ -54,7 +54,7 @@ half = { "version" = "2.7.1", default-features = false, features = [
|
||||
futures = "0"
|
||||
log = "0.4"
|
||||
moka = { version = "0.12", features = ["future"] }
|
||||
object_store = "0.12.0"
|
||||
object_store = "0.13.2"
|
||||
pin-project = "1.0.7"
|
||||
rand = "0.9"
|
||||
snafu = "0.8"
|
||||
|
||||
34
deny.toml
34
deny.toml
@@ -51,6 +51,18 @@ ignore = [
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0436
|
||||
{ id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" },
|
||||
|
||||
# encoding: unmaintained. Reached through lindera-dictionary, which is
|
||||
# required by the native Lindera tokenizer path. Lindera has not migrated
|
||||
# off this crate yet.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2021-0153
|
||||
{ id = "RUSTSEC-2021-0153", reason = "transitive via lindera-dictionary for native Lindera tokenizer" },
|
||||
|
||||
# fast-float: unsound and unmaintained. Reached only through polars-arrow
|
||||
# from the optional Polars integration; replacement requires a Polars
|
||||
# dependency upgrade.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0379
|
||||
{ id = "RUSTSEC-2024-0379", reason = "transitive via polars-arrow; waiting on Polars migration" },
|
||||
|
||||
# tantivy: segfault on malformed input due to missing bounds check.
|
||||
# Pulled in via lance for full-text search. We only feed tantivy
|
||||
# documents we construct ourselves, not attacker-controlled bytes.
|
||||
@@ -68,11 +80,17 @@ ignore = [
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0119
|
||||
{ id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" },
|
||||
|
||||
# rustls-pemfile: unmaintained. Reached from two separate chains:
|
||||
# rustls-native-certs 0.6 (via hyper-rustls 0.24) and object_store 0.12.
|
||||
# Both upstream dependencies need to move before we can drop it.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0134
|
||||
{ id = "RUSTSEC-2025-0134", reason = "transitive via rustls-native-certs/object_store; waiting on upstream migration" },
|
||||
# bincode: unmaintained. Reached through lindera and lindera-dictionary,
|
||||
# which are required by the native Lindera tokenizer path. Lindera has not
|
||||
# migrated to another serialization format yet.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0141
|
||||
{ id = "RUSTSEC-2025-0141", reason = "transitive via lindera/lindera-dictionary for native Lindera tokenizer" },
|
||||
|
||||
# lru: soundness issue in IterMut. Reached only through aws-sdk-s3 in
|
||||
# LanceDB's dev-dependency graph; LanceDB does not use that iterator
|
||||
# directly. Clearing this requires the AWS SDK chain to update lru.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0002
|
||||
{ id = "RUSTSEC-2026-0002", reason = "transitive via aws-sdk-s3 dev-dependency; waiting on AWS SDK lru upgrade" },
|
||||
|
||||
# rustls-webpki 0.101.7 (old major line): name-constraint checks for
|
||||
# URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain
|
||||
@@ -89,6 +107,12 @@ ignore = [
|
||||
# we actively use is upgraded to 0.103.13 which contains the fix.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0104
|
||||
{ id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||
|
||||
# rand 0.8.5: soundness issue only when ThreadRng reseeds inside a custom
|
||||
# logger. Reached through several transitive chains. LanceDB does not use
|
||||
# rand from a custom logger; upgrade once all pinned chains accept 0.8.6+.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0097
|
||||
{ id = "RUSTSEC-2026-0097", reason = "transitive rand 0.8.5; LanceDB does not call ThreadRng from custom logging" },
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -12,20 +12,22 @@ Typescript.
|
||||
* `src/`: Rust bindings source code
|
||||
* `lancedb/`: Typescript package source code
|
||||
* `__test__/`: Unit tests
|
||||
* `examples/`: An npm package with the examples shown in the documentation
|
||||
* `examples/`: A pnpm package with the examples shown in the documentation
|
||||
|
||||
## Development environment
|
||||
|
||||
To set up your development environment, you will need to install the following:
|
||||
|
||||
1. Node.js 14 or later
|
||||
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
|
||||
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
|
||||
1. Node.js 22.13 or later (required by pnpm 11)
|
||||
2. [pnpm](https://pnpm.io/installation) 11 or later (or run via `corepack enable`,
|
||||
which uses the `packageManager` field in `package.json`)
|
||||
3. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
|
||||
4. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
|
||||
|
||||
Initial setup:
|
||||
|
||||
```shell
|
||||
npm install
|
||||
pnpm install
|
||||
```
|
||||
|
||||
### Commit Hooks
|
||||
@@ -39,38 +41,38 @@ pre-commit install
|
||||
|
||||
## Development
|
||||
|
||||
Most common development commands can be run using the npm scripts.
|
||||
Most common development commands can be run using the pnpm scripts.
|
||||
|
||||
Build the package
|
||||
|
||||
```shell
|
||||
npm install
|
||||
npm run build
|
||||
pnpm install
|
||||
pnpm build
|
||||
```
|
||||
|
||||
Lint:
|
||||
|
||||
```shell
|
||||
npm run lint
|
||||
pnpm lint
|
||||
```
|
||||
|
||||
Format and fix lints:
|
||||
|
||||
```shell
|
||||
npm run lint-fix
|
||||
pnpm lint-fix
|
||||
```
|
||||
|
||||
Run tests:
|
||||
|
||||
```shell
|
||||
npm test
|
||||
pnpm test
|
||||
```
|
||||
|
||||
To run a single test:
|
||||
|
||||
```shell
|
||||
# Single file: table.test.ts
|
||||
npm test -- table.test.ts
|
||||
pnpm test -- table.test.ts
|
||||
# Single test: 'merge insert' in table.test.ts
|
||||
npm test -- table.test.ts --testNamePattern=merge\ insert
|
||||
pnpm test -- table.test.ts --testNamePattern=merge\ insert
|
||||
```
|
||||
|
||||
@@ -148,6 +148,33 @@ Creates a new empty Table
|
||||
|
||||
***
|
||||
|
||||
### createNamespace()
|
||||
|
||||
```ts
|
||||
abstract createNamespace(namespacePath, options?): Promise<CreateNamespaceResponse>
|
||||
```
|
||||
|
||||
Create a new namespace at the given path.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **namespacePath**: `string`[]
|
||||
The namespace path to create.
|
||||
|
||||
* **options?**: `Partial`<[`CreateNamespaceOptions`](../interfaces/CreateNamespaceOptions.md)>
|
||||
Creation `mode`
|
||||
("create" | "exist_ok" | "overwrite") and optional `properties`
|
||||
to attach to the namespace.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`CreateNamespaceResponse`](../interfaces/CreateNamespaceResponse.md)>
|
||||
|
||||
The properties of the
|
||||
created namespace and an optional transaction id.
|
||||
|
||||
***
|
||||
|
||||
### createTable()
|
||||
|
||||
#### createTable(options, namespacePath)
|
||||
@@ -230,6 +257,29 @@ Creates a new Table and initialize it with new data.
|
||||
|
||||
***
|
||||
|
||||
### describeNamespace()
|
||||
|
||||
```ts
|
||||
abstract describeNamespace(namespacePath): Promise<DescribeNamespaceResponse>
|
||||
```
|
||||
|
||||
Describe a namespace, returning its properties.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **namespacePath**: `string`[]
|
||||
The namespace path to describe, in
|
||||
parent → child order, e.g. `["analytics", "sales"]`.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`DescribeNamespaceResponse`](../interfaces/DescribeNamespaceResponse.md)>
|
||||
|
||||
The namespace's properties
|
||||
(may be undefined if the namespace has none).
|
||||
|
||||
***
|
||||
|
||||
### display()
|
||||
|
||||
```ts
|
||||
@@ -263,6 +313,36 @@ Drop all tables in the database.
|
||||
|
||||
***
|
||||
|
||||
### dropNamespace()
|
||||
|
||||
```ts
|
||||
abstract dropNamespace(namespacePath, options?): Promise<DropNamespaceResponse>
|
||||
```
|
||||
|
||||
Drop a namespace.
|
||||
|
||||
Use `behavior: "cascade"` to also drop everything contained in the
|
||||
namespace (sub-namespaces and tables). The default `"restrict"`
|
||||
behavior refuses to drop a non-empty namespace.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **namespacePath**: `string`[]
|
||||
The namespace path to drop.
|
||||
|
||||
* **options?**: `Partial`<[`DropNamespaceOptions`](../interfaces/DropNamespaceOptions.md)>
|
||||
`mode` ("skip" | "fail"
|
||||
for missing-namespace handling) and `behavior` ("restrict" | "cascade").
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`DropNamespaceResponse`](../interfaces/DropNamespaceResponse.md)>
|
||||
|
||||
Any properties returned by
|
||||
the server and an optional transaction id.
|
||||
|
||||
***
|
||||
|
||||
### dropTable()
|
||||
|
||||
```ts
|
||||
@@ -299,6 +379,36 @@ Return true if the connection has not been closed
|
||||
|
||||
***
|
||||
|
||||
### listNamespaces()
|
||||
|
||||
```ts
|
||||
abstract listNamespaces(namespacePath?, options?): Promise<ListNamespacesResponse>
|
||||
```
|
||||
|
||||
List the immediate child namespaces under the given parent.
|
||||
|
||||
Results may be paginated. To retrieve subsequent pages, pass the
|
||||
`pageToken` returned by a previous call.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
The parent namespace path. Defaults
|
||||
to the root namespace if omitted.
|
||||
|
||||
* **options?**: `Partial`<[`ListNamespacesOptions`](../interfaces/ListNamespacesOptions.md)>
|
||||
Pagination options
|
||||
(`pageToken`, `limit`).
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`ListNamespacesResponse`](../interfaces/ListNamespacesResponse.md)>
|
||||
|
||||
Child namespace names and
|
||||
an optional token for fetching the next page.
|
||||
|
||||
***
|
||||
|
||||
### openTable()
|
||||
|
||||
```ts
|
||||
@@ -327,6 +437,29 @@ Open a table in the database.
|
||||
|
||||
***
|
||||
|
||||
### renameTable()
|
||||
|
||||
```ts
|
||||
abstract renameTable(
|
||||
oldName,
|
||||
newName,
|
||||
namespacePath?): Promise<void>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **oldName**: `string`
|
||||
|
||||
* **newName**: `string`
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
***
|
||||
|
||||
### tableNames()
|
||||
|
||||
#### tableNames(options)
|
||||
|
||||
173
docs/src/js/classes/Scannable.md
Normal file
173
docs/src/js/classes/Scannable.md
Normal file
@@ -0,0 +1,173 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / Scannable
|
||||
|
||||
# Class: Scannable
|
||||
|
||||
A data source that can be scanned as a stream of Arrow `RecordBatch`es.
|
||||
|
||||
`Scannable` wraps the schema + optional row count + rescannable flag and
|
||||
a callback that yields batches one at a time. It is passed to consumers
|
||||
(e.g. `Table.add`, `createTable`, `mergeInsert` — follow-up work) that
|
||||
need to pull data without materializing the full dataset in JS memory.
|
||||
|
||||
Batches cross the JS↔Rust boundary as Arrow IPC Stream messages; a fresh
|
||||
writer serializes each batch, and the Rust side decodes it with
|
||||
`arrow_ipc::reader::StreamReader`. One batch is in flight at a time.
|
||||
|
||||
## Properties
|
||||
|
||||
### numRows
|
||||
|
||||
```ts
|
||||
readonly numRows: null | number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### rescannable
|
||||
|
||||
```ts
|
||||
readonly rescannable: boolean;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### schema
|
||||
|
||||
```ts
|
||||
readonly schema: Schema<any>;
|
||||
```
|
||||
|
||||
## Methods
|
||||
|
||||
### fromFactory()
|
||||
|
||||
```ts
|
||||
static fromFactory(
|
||||
schema,
|
||||
factory,
|
||||
opts): Promise<Scannable>
|
||||
```
|
||||
|
||||
Build a Scannable from an explicit schema and a factory that returns a
|
||||
fresh batch iterator on each call.
|
||||
|
||||
The factory is invoked once per scan. Each iterator yields
|
||||
`RecordBatch`es matching the declared schema. Use this when you need
|
||||
direct control over the pull loop — for example, to wrap a streaming
|
||||
source whose batches are produced lazily.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **schema**: `Schema`<`any`>
|
||||
The Arrow schema of the produced batches.
|
||||
|
||||
* **factory**
|
||||
Called at the start of each scan to produce a batch
|
||||
iterator. Must be idempotent when `rescannable` is true.
|
||||
|
||||
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
|
||||
Optional hints. `rescannable` defaults to `true`; set to
|
||||
`false` if calling `factory()` twice would not reproduce the same data.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Scannable`](Scannable.md)>
|
||||
|
||||
***
|
||||
|
||||
### fromIterable()
|
||||
|
||||
```ts
|
||||
static fromIterable(
|
||||
schema,
|
||||
iter,
|
||||
opts): Promise<Scannable>
|
||||
```
|
||||
|
||||
Build a Scannable from an iterable of `RecordBatch`es. `rescannable`
|
||||
defaults to `false`. Pass an explicit schema so the consumer can
|
||||
validate before any batch is pulled.
|
||||
|
||||
`opts.rescannable: true` is accepted for replayable iterables (Arrays,
|
||||
Sets, or custom iterables whose `[Symbol.iterator]()` returns a fresh
|
||||
iterator each call). It is rejected for one-shot iterables (generators,
|
||||
async generators, or already-an-iterator inputs) because their
|
||||
`[Symbol.iterator]()` returns the same exhausted object on the second
|
||||
scan. For replayable sources outside this shape, use
|
||||
`fromFactory(schema, () => createIter(), { rescannable: true })`.
|
||||
|
||||
Note: when `opts.rescannable` is `true`, the constructor calls
|
||||
`[Symbol.iterator]()` once on the input to perform the structural check.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **schema**: `Schema`<`any`>
|
||||
|
||||
* **iter**: `Iterable`<`RecordBatch`<`any`>> \| `AsyncIterable`<`RecordBatch`<`any`>>
|
||||
|
||||
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Scannable`](Scannable.md)>
|
||||
|
||||
***
|
||||
|
||||
### fromRecordBatchReader()
|
||||
|
||||
```ts
|
||||
static fromRecordBatchReader(reader, opts): Promise<Scannable>
|
||||
```
|
||||
|
||||
Build a Scannable from an Arrow `RecordBatchReader`. A reader can only
|
||||
be consumed once; `rescannable` defaults to `false`.
|
||||
|
||||
The reader must already be opened (via `.open()`) so its `.schema` is
|
||||
populated. `RecordBatchReader.from(...)` returns an unopened reader.
|
||||
|
||||
`opts.rescannable: true` is rejected because `RecordBatchReader` is a
|
||||
self-iterator (its `[Symbol.iterator]()` returns itself), and this
|
||||
constructor does not call `reader.reset()` between scans, so a second
|
||||
scan would always see an exhausted reader. For genuinely replayable
|
||||
sources, use
|
||||
`fromFactory(schema, () => openReader(), { rescannable: true })`,
|
||||
which mints a fresh reader on each scan.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **reader**: `RecordBatchReader`<`any`>
|
||||
|
||||
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Scannable`](Scannable.md)>
|
||||
|
||||
***
|
||||
|
||||
### fromTable()
|
||||
|
||||
```ts
|
||||
static fromTable(table, opts): Promise<Scannable>
|
||||
```
|
||||
|
||||
Build a Scannable from an in-memory Arrow `Table`. Always rescannable;
|
||||
the table's batches are replayed on each scan.
|
||||
|
||||
The table's row count is authoritative: `opts.numRows` must either be
|
||||
omitted or equal to `table.numRows`. `opts.rescannable` of `false` is
|
||||
rejected because in-memory Tables are always rescannable.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **table**: `Table`<`any`>
|
||||
|
||||
* **opts**: [`ScannableOptions`](../interfaces/ScannableOptions.md) = `{}`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Scannable`](Scannable.md)>
|
||||
@@ -501,6 +501,34 @@ Modeled after ``VACUUM`` in PostgreSQL.
|
||||
|
||||
***
|
||||
|
||||
### prewarmData()
|
||||
|
||||
```ts
|
||||
abstract prewarmData(columns?): Promise<void>
|
||||
```
|
||||
|
||||
Prewarm one or more columns of data in the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columns?**: `string`[]
|
||||
The columns to prewarm. If undefined, all columns are prewarmed.
|
||||
This will load the column data into the page cache so that future queries that
|
||||
read those columns avoid the initial cold-start latency. This call initiates
|
||||
prewarming and returns once the request is accepted; the warming itself may
|
||||
continue in the background. Calling it on already-prewarmed columns is a
|
||||
no-op on the server.
|
||||
Prewarming is generally useful for columns used in filters or projections.
|
||||
Large columns (e.g. high-dimensional vectors or binary data) may not be
|
||||
practical to prewarm.
|
||||
This feature is currently only supported on remote tables.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
***
|
||||
|
||||
### prewarmIndex()
|
||||
|
||||
```ts
|
||||
|
||||
131
docs/src/js/functions/connectNamespace.md
Normal file
131
docs/src/js/functions/connectNamespace.md
Normal file
@@ -0,0 +1,131 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / connectNamespace
|
||||
|
||||
# Function: connectNamespace()
|
||||
|
||||
## connectNamespace(implName, config, options)
|
||||
|
||||
```ts
|
||||
function connectNamespace(
|
||||
implName,
|
||||
config,
|
||||
options?): Promise<Connection>
|
||||
```
|
||||
|
||||
Connect to a LanceDB database through a namespace.
|
||||
|
||||
Unlike [connect](connect.md), which routes by URI scheme (local path vs.
|
||||
`db://` cloud), `connectNamespace` always returns a namespace-backed
|
||||
connection. The `implName` selects the namespace implementation:
|
||||
|
||||
- `"dir"` — directory namespace, configured with [DirNamespaceConfig](../interfaces/DirNamespaceConfig.md).
|
||||
- `"rest"` — remote REST catalog, configured with [RestNamespaceConfig](../interfaces/RestNamespaceConfig.md).
|
||||
- Any other string — full module path for a custom implementation,
|
||||
configured with a free-form string-keyed `properties` map.
|
||||
|
||||
### Parameters
|
||||
|
||||
* **implName**: `"dir"`
|
||||
|
||||
* **config**: [`DirNamespaceConfig`](../interfaces/DirNamespaceConfig.md)
|
||||
|
||||
* **options?**: `Partial`<[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)>
|
||||
|
||||
### Returns
|
||||
|
||||
`Promise`<[`Connection`](../classes/Connection.md)>
|
||||
|
||||
### Examples
|
||||
|
||||
```ts
|
||||
const db = await connectNamespace("dir", { root: "/path/to/db" });
|
||||
await db.createTable("users", [{ id: 1 }]);
|
||||
```
|
||||
|
||||
```ts
|
||||
const db = await connectNamespace("rest", {
|
||||
uri: "https://catalog.example.com",
|
||||
headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
|
||||
});
|
||||
```
|
||||
|
||||
```ts
|
||||
const db = await connectNamespace("my.custom.Namespace", {
|
||||
endpoint: "...",
|
||||
});
|
||||
```
|
||||
|
||||
## connectNamespace(implName, config, options)
|
||||
|
||||
```ts
|
||||
function connectNamespace(
|
||||
implName,
|
||||
config,
|
||||
options?): Promise<Connection>
|
||||
```
|
||||
|
||||
Connect through the built-in REST namespace.
|
||||
|
||||
Configured with [RestNamespaceConfig](../interfaces/RestNamespaceConfig.md). See the function-level
|
||||
documentation above for the full surface, examples, and how this
|
||||
relates to [connect](connect.md).
|
||||
|
||||
### Parameters
|
||||
|
||||
* **implName**: `"rest"`
|
||||
|
||||
* **config**: [`RestNamespaceConfig`](../interfaces/RestNamespaceConfig.md)
|
||||
|
||||
* **options?**: `Partial`<[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)>
|
||||
|
||||
### Returns
|
||||
|
||||
`Promise`<[`Connection`](../classes/Connection.md)>
|
||||
|
||||
### Example
|
||||
|
||||
```ts
|
||||
const db = await connectNamespace("rest", {
|
||||
uri: "https://catalog.example.com",
|
||||
headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
|
||||
});
|
||||
```
|
||||
|
||||
## connectNamespace(implName, properties, options)
|
||||
|
||||
```ts
|
||||
function connectNamespace(
|
||||
implName,
|
||||
properties,
|
||||
options?): Promise<Connection>
|
||||
```
|
||||
|
||||
Connect through a custom namespace implementation by full module path,
|
||||
configured with a free-form string-keyed `properties` map. Use the
|
||||
typed overloads above for the built-in `"dir"` and `"rest"` impls.
|
||||
|
||||
See the function-level documentation above for examples and how this
|
||||
relates to [connect](connect.md).
|
||||
|
||||
### Parameters
|
||||
|
||||
* **implName**: `string`
|
||||
|
||||
* **properties**: `Record`<`string`, `string`>
|
||||
|
||||
* **options?**: `Partial`<[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)>
|
||||
|
||||
### Returns
|
||||
|
||||
`Promise`<[`Connection`](../classes/Connection.md)>
|
||||
|
||||
### Example
|
||||
|
||||
```ts
|
||||
const db = await connectNamespace("my.custom.Namespace", {
|
||||
endpoint: "...",
|
||||
});
|
||||
```
|
||||
@@ -32,6 +32,7 @@
|
||||
- [PhraseQuery](classes/PhraseQuery.md)
|
||||
- [Query](classes/Query.md)
|
||||
- [QueryBase](classes/QueryBase.md)
|
||||
- [Scannable](classes/Scannable.md)
|
||||
- [Session](classes/Session.md)
|
||||
- [StaticHeaderProvider](classes/StaticHeaderProvider.md)
|
||||
- [Table](classes/Table.md)
|
||||
@@ -51,10 +52,17 @@
|
||||
- [ClientConfig](interfaces/ClientConfig.md)
|
||||
- [ColumnAlteration](interfaces/ColumnAlteration.md)
|
||||
- [CompactionStats](interfaces/CompactionStats.md)
|
||||
- [ConnectNamespaceOptions](interfaces/ConnectNamespaceOptions.md)
|
||||
- [ConnectionOptions](interfaces/ConnectionOptions.md)
|
||||
- [CreateNamespaceOptions](interfaces/CreateNamespaceOptions.md)
|
||||
- [CreateNamespaceResponse](interfaces/CreateNamespaceResponse.md)
|
||||
- [CreateTableOptions](interfaces/CreateTableOptions.md)
|
||||
- [DeleteResult](interfaces/DeleteResult.md)
|
||||
- [DescribeNamespaceResponse](interfaces/DescribeNamespaceResponse.md)
|
||||
- [DirNamespaceConfig](interfaces/DirNamespaceConfig.md)
|
||||
- [DropColumnsResult](interfaces/DropColumnsResult.md)
|
||||
- [DropNamespaceOptions](interfaces/DropNamespaceOptions.md)
|
||||
- [DropNamespaceResponse](interfaces/DropNamespaceResponse.md)
|
||||
- [ExecutableQuery](interfaces/ExecutableQuery.md)
|
||||
- [FragmentStatistics](interfaces/FragmentStatistics.md)
|
||||
- [FragmentSummaryStats](interfaces/FragmentSummaryStats.md)
|
||||
@@ -69,13 +77,17 @@
|
||||
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
|
||||
- [IvfPqOptions](interfaces/IvfPqOptions.md)
|
||||
- [IvfRqOptions](interfaces/IvfRqOptions.md)
|
||||
- [ListNamespacesOptions](interfaces/ListNamespacesOptions.md)
|
||||
- [ListNamespacesResponse](interfaces/ListNamespacesResponse.md)
|
||||
- [MergeResult](interfaces/MergeResult.md)
|
||||
- [OpenTableOptions](interfaces/OpenTableOptions.md)
|
||||
- [OptimizeOptions](interfaces/OptimizeOptions.md)
|
||||
- [OptimizeStats](interfaces/OptimizeStats.md)
|
||||
- [QueryExecutionOptions](interfaces/QueryExecutionOptions.md)
|
||||
- [RemovalStats](interfaces/RemovalStats.md)
|
||||
- [RestNamespaceConfig](interfaces/RestNamespaceConfig.md)
|
||||
- [RetryConfig](interfaces/RetryConfig.md)
|
||||
- [ScannableOptions](interfaces/ScannableOptions.md)
|
||||
- [ShuffleOptions](interfaces/ShuffleOptions.md)
|
||||
- [SplitCalculatedOptions](interfaces/SplitCalculatedOptions.md)
|
||||
- [SplitHashOptions](interfaces/SplitHashOptions.md)
|
||||
@@ -107,6 +119,7 @@
|
||||
|
||||
- [RecordBatchIterator](functions/RecordBatchIterator.md)
|
||||
- [connect](functions/connect.md)
|
||||
- [connectNamespace](functions/connectNamespace.md)
|
||||
- [makeArrowTable](functions/makeArrowTable.md)
|
||||
- [packBits](functions/packBits.md)
|
||||
- [permutationBuilder](functions/permutationBuilder.md)
|
||||
|
||||
54
docs/src/js/interfaces/ConnectNamespaceOptions.md
Normal file
54
docs/src/js/interfaces/ConnectNamespaceOptions.md
Normal file
@@ -0,0 +1,54 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / ConnectNamespaceOptions
|
||||
|
||||
# Interface: ConnectNamespaceOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### namespaceClientProperties?
|
||||
|
||||
```ts
|
||||
optional namespaceClientProperties: Record<string, string>;
|
||||
```
|
||||
|
||||
Extra properties for the backing namespace client.
|
||||
|
||||
***
|
||||
|
||||
### readConsistencyInterval?
|
||||
|
||||
```ts
|
||||
optional readConsistencyInterval: number;
|
||||
```
|
||||
|
||||
The interval, in seconds, at which to check for updates to the table
|
||||
from other processes. If undefined, then consistency is not checked. For
|
||||
performance reasons, this is the default. For strong consistency, set
|
||||
this to zero seconds. Then every read will check for updates from other
|
||||
processes. As a compromise, you can set this to a non-zero value for
|
||||
eventual consistency.
|
||||
|
||||
***
|
||||
|
||||
### session?
|
||||
|
||||
```ts
|
||||
optional session: Session;
|
||||
```
|
||||
|
||||
The session to use for this connection. Holds shared caches and other
|
||||
session-specific state.
|
||||
|
||||
***
|
||||
|
||||
### storageOptions?
|
||||
|
||||
```ts
|
||||
optional storageOptions: Record<string, string>;
|
||||
```
|
||||
|
||||
Configuration for object storage. The available options are described
|
||||
at https://docs.lancedb.com/storage/
|
||||
27
docs/src/js/interfaces/CreateNamespaceOptions.md
Normal file
27
docs/src/js/interfaces/CreateNamespaceOptions.md
Normal file
@@ -0,0 +1,27 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / CreateNamespaceOptions
|
||||
|
||||
# Interface: CreateNamespaceOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### mode?
|
||||
|
||||
```ts
|
||||
optional mode: "overwrite" | "create" | "exist_ok";
|
||||
```
|
||||
|
||||
Creation mode.
|
||||
|
||||
***
|
||||
|
||||
### properties?
|
||||
|
||||
```ts
|
||||
optional properties: Record<string, string>;
|
||||
```
|
||||
|
||||
Properties to set on the new namespace.
|
||||
23
docs/src/js/interfaces/CreateNamespaceResponse.md
Normal file
23
docs/src/js/interfaces/CreateNamespaceResponse.md
Normal file
@@ -0,0 +1,23 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / CreateNamespaceResponse
|
||||
|
||||
# Interface: CreateNamespaceResponse
|
||||
|
||||
## Properties
|
||||
|
||||
### properties?
|
||||
|
||||
```ts
|
||||
optional properties: Record<string, string>;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### transactionId?
|
||||
|
||||
```ts
|
||||
optional transactionId: string;
|
||||
```
|
||||
15
docs/src/js/interfaces/DescribeNamespaceResponse.md
Normal file
15
docs/src/js/interfaces/DescribeNamespaceResponse.md
Normal file
@@ -0,0 +1,15 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / DescribeNamespaceResponse
|
||||
|
||||
# Interface: DescribeNamespaceResponse
|
||||
|
||||
## Properties
|
||||
|
||||
### properties?
|
||||
|
||||
```ts
|
||||
optional properties: Record<string, string>;
|
||||
```
|
||||
47
docs/src/js/interfaces/DirNamespaceConfig.md
Normal file
47
docs/src/js/interfaces/DirNamespaceConfig.md
Normal file
@@ -0,0 +1,47 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / DirNamespaceConfig
|
||||
|
||||
# Interface: DirNamespaceConfig
|
||||
|
||||
Configuration for the built-in directory namespace (`"dir"`).
|
||||
|
||||
The directory namespace stores tables under a single root path (local
|
||||
filesystem or object storage URI). See
|
||||
[https://docs.lancedb.com/namespaces](https://docs.lancedb.com/namespaces) for the documented surface;
|
||||
less-common knobs live under [DirNamespaceConfig.extraProperties](DirNamespaceConfig.md#extraproperties).
|
||||
|
||||
## Properties
|
||||
|
||||
### extraProperties?
|
||||
|
||||
```ts
|
||||
optional extraProperties: Record<string, string>;
|
||||
```
|
||||
|
||||
Additional raw properties passed verbatim to the namespace
|
||||
implementation (e.g. `storage.*`, `credential_vendor.*`). Typed
|
||||
fields above take precedence on key collision.
|
||||
|
||||
***
|
||||
|
||||
### manifestEnabled?
|
||||
|
||||
```ts
|
||||
optional manifestEnabled: boolean;
|
||||
```
|
||||
|
||||
Whether to maintain a namespace manifest at the root. Required for
|
||||
child namespaces. The namespace implementation defaults this to true.
|
||||
|
||||
***
|
||||
|
||||
### root
|
||||
|
||||
```ts
|
||||
root: string;
|
||||
```
|
||||
|
||||
Root path or URI containing the LanceDB tables.
|
||||
27
docs/src/js/interfaces/DropNamespaceOptions.md
Normal file
27
docs/src/js/interfaces/DropNamespaceOptions.md
Normal file
@@ -0,0 +1,27 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / DropNamespaceOptions
|
||||
|
||||
# Interface: DropNamespaceOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### behavior?
|
||||
|
||||
```ts
|
||||
optional behavior: "restrict" | "cascade";
|
||||
```
|
||||
|
||||
Refuse to drop if non-empty (restrict) or drop recursively (cascade).
|
||||
|
||||
***
|
||||
|
||||
### mode?
|
||||
|
||||
```ts
|
||||
optional mode: "fail" | "skip";
|
||||
```
|
||||
|
||||
Whether to fail (`fail`) or skip (`skip`) when the namespace doesn't exist.
|
||||
23
docs/src/js/interfaces/DropNamespaceResponse.md
Normal file
23
docs/src/js/interfaces/DropNamespaceResponse.md
Normal file
@@ -0,0 +1,23 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / DropNamespaceResponse
|
||||
|
||||
# Interface: DropNamespaceResponse
|
||||
|
||||
## Properties
|
||||
|
||||
### properties?
|
||||
|
||||
```ts
|
||||
optional properties: Record<string, string>;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### transactionId?
|
||||
|
||||
```ts
|
||||
optional transactionId: string[];
|
||||
```
|
||||
27
docs/src/js/interfaces/ListNamespacesOptions.md
Normal file
27
docs/src/js/interfaces/ListNamespacesOptions.md
Normal file
@@ -0,0 +1,27 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / ListNamespacesOptions
|
||||
|
||||
# Interface: ListNamespacesOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### limit?
|
||||
|
||||
```ts
|
||||
optional limit: number;
|
||||
```
|
||||
|
||||
An optional limit to the number of results to return.
|
||||
|
||||
***
|
||||
|
||||
### pageToken?
|
||||
|
||||
```ts
|
||||
optional pageToken: string;
|
||||
```
|
||||
|
||||
Token from a previous response for pagination.
|
||||
23
docs/src/js/interfaces/ListNamespacesResponse.md
Normal file
23
docs/src/js/interfaces/ListNamespacesResponse.md
Normal file
@@ -0,0 +1,23 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / ListNamespacesResponse
|
||||
|
||||
# Interface: ListNamespacesResponse
|
||||
|
||||
## Properties
|
||||
|
||||
### namespaces
|
||||
|
||||
```ts
|
||||
namespaces: string[];
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### pageToken?
|
||||
|
||||
```ts
|
||||
optional pageToken: string;
|
||||
```
|
||||
47
docs/src/js/interfaces/RestNamespaceConfig.md
Normal file
47
docs/src/js/interfaces/RestNamespaceConfig.md
Normal file
@@ -0,0 +1,47 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / RestNamespaceConfig
|
||||
|
||||
# Interface: RestNamespaceConfig
|
||||
|
||||
Configuration for the built-in REST namespace (`"rest"`).
|
||||
|
||||
The REST namespace talks to a remote catalog server over HTTP. See
|
||||
[https://docs.lancedb.com/namespaces](https://docs.lancedb.com/namespaces) for the documented surface;
|
||||
less-common knobs (TLS, metrics) live under
|
||||
[RestNamespaceConfig.extraProperties](RestNamespaceConfig.md#extraproperties).
|
||||
|
||||
## Properties
|
||||
|
||||
### extraProperties?
|
||||
|
||||
```ts
|
||||
optional extraProperties: Record<string, string>;
|
||||
```
|
||||
|
||||
Additional raw properties passed verbatim to the namespace
|
||||
implementation (e.g. `tls.*`, `ops_metrics_enabled`, `delimiter`).
|
||||
Typed fields above take precedence on key collision.
|
||||
|
||||
***
|
||||
|
||||
### headers?
|
||||
|
||||
```ts
|
||||
optional headers: Record<string, string>;
|
||||
```
|
||||
|
||||
HTTP headers forwarded with each request. Keys are passed through
|
||||
as-is (e.g. `"x-api-key"`, `"Authorization"`).
|
||||
|
||||
***
|
||||
|
||||
### uri
|
||||
|
||||
```ts
|
||||
uri: string;
|
||||
```
|
||||
|
||||
Catalog endpoint URL.
|
||||
29
docs/src/js/interfaces/ScannableOptions.md
Normal file
29
docs/src/js/interfaces/ScannableOptions.md
Normal file
@@ -0,0 +1,29 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / ScannableOptions
|
||||
|
||||
# Interface: ScannableOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### numRows?
|
||||
|
||||
```ts
|
||||
optional numRows: number;
|
||||
```
|
||||
|
||||
Hint about the number of rows. Not validated against the stream.
|
||||
|
||||
***
|
||||
|
||||
### rescannable?
|
||||
|
||||
```ts
|
||||
optional rescannable: boolean;
|
||||
```
|
||||
|
||||
Whether the source can be scanned more than once. Defaults to `true` for
|
||||
`fromTable` / `fromFactory` and `false` for `fromIterable` /
|
||||
`fromRecordBatchReader`.
|
||||
@@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and
|
||||
|
||||
## Full text search
|
||||
|
||||
::: lancedb.fts.create_index
|
||||
Use [lancedb.table.Table.create_fts_index][] for the synchronous API or
|
||||
[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the
|
||||
asynchronous API.
|
||||
|
||||
::: lancedb.fts.populate_index
|
||||
|
||||
::: lancedb.fts.search_index
|
||||
::: lancedb.index.FTS
|
||||
|
||||
## Utilities
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-core.version>7.0.0-beta.2</lance-core.version>
|
||||
<lance-core.version>7.0.0-beta.9</lance-core.version>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
|
||||
@@ -3,11 +3,11 @@ The core Rust library is in the `../rust/lancedb` directory, the rust binding
|
||||
code is in the `src/` directory and the typescript bindings are in
|
||||
the `lancedb/` directory.
|
||||
|
||||
Whenever you change the Rust code, you will need to recompile: `npm run build`.
|
||||
Whenever you change the Rust code, you will need to recompile: `pnpm build`.
|
||||
|
||||
Common commands:
|
||||
* Build: `npm run build`
|
||||
* Lint: `npm run lint`
|
||||
* Fix lints: `npm run lint-fix`
|
||||
* Test: `npm test`
|
||||
* Run single test file: `npm test __test__/arrow.test.ts`
|
||||
* Build: `pnpm build`
|
||||
* Lint: `pnpm lint`
|
||||
* Fix lints: `pnpm lint-fix`
|
||||
* Test: `pnpm test`
|
||||
* Run single test file: `pnpm test __test__/arrow.test.ts`
|
||||
|
||||
@@ -12,20 +12,22 @@ Typescript.
|
||||
* `src/`: Rust bindings source code
|
||||
* `lancedb/`: Typescript package source code
|
||||
* `__test__/`: Unit tests
|
||||
* `examples/`: An npm package with the examples shown in the documentation
|
||||
* `examples/`: A pnpm package with the examples shown in the documentation
|
||||
|
||||
## Development environment
|
||||
|
||||
To set up your development environment, you will need to install the following:
|
||||
|
||||
1. Node.js 14 or later
|
||||
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
|
||||
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
|
||||
1. Node.js 22.13 or later (required by pnpm 11)
|
||||
2. [pnpm](https://pnpm.io/installation) 11 or later (or run via `corepack enable`,
|
||||
which uses the `packageManager` field in `package.json`)
|
||||
3. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
|
||||
4. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
|
||||
|
||||
Initial setup:
|
||||
|
||||
```shell
|
||||
npm install
|
||||
pnpm install
|
||||
```
|
||||
|
||||
### Commit Hooks
|
||||
@@ -39,38 +41,38 @@ pre-commit install
|
||||
|
||||
## Development
|
||||
|
||||
Most common development commands can be run using the npm scripts.
|
||||
Most common development commands can be run using the pnpm scripts.
|
||||
|
||||
Build the package
|
||||
|
||||
```shell
|
||||
npm install
|
||||
npm run build
|
||||
pnpm install
|
||||
pnpm build
|
||||
```
|
||||
|
||||
Lint:
|
||||
|
||||
```shell
|
||||
npm run lint
|
||||
pnpm lint
|
||||
```
|
||||
|
||||
Format and fix lints:
|
||||
|
||||
```shell
|
||||
npm run lint-fix
|
||||
pnpm lint-fix
|
||||
```
|
||||
|
||||
Run tests:
|
||||
|
||||
```shell
|
||||
npm test
|
||||
pnpm test
|
||||
```
|
||||
|
||||
To run a single test:
|
||||
|
||||
```shell
|
||||
# Single file: table.test.ts
|
||||
npm test -- table.test.ts
|
||||
pnpm test -- table.test.ts
|
||||
# Single test: 'merge insert' in table.test.ts
|
||||
npm test -- table.test.ts --testNamePattern=merge\ insert
|
||||
pnpm test -- table.test.ts --testNamePattern=merge\ insert
|
||||
```
|
||||
|
||||
@@ -22,6 +22,7 @@ arrow-schema.workspace = true
|
||||
env_logger.workspace = true
|
||||
futures.workspace = true
|
||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||
lance-namespace.workspace = true
|
||||
napi = { version = "3.8.3", default-features = false, features = [
|
||||
"napi9",
|
||||
"async"
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
import { readdirSync } from "fs";
|
||||
import { Field, Float64, Schema } from "apache-arrow";
|
||||
import * as tmp from "tmp";
|
||||
import { Connection, Table, connect } from "../lancedb";
|
||||
import { Connection, Table, connect, connectNamespace } from "../lancedb";
|
||||
import { LocalTable } from "../lancedb/table";
|
||||
|
||||
describe("when connecting", () => {
|
||||
@@ -81,6 +81,16 @@ describe("given a connection", () => {
|
||||
await db.createTable("test4", [{ id: 1 }, { id: 2 }]);
|
||||
});
|
||||
|
||||
it("should expose renameTable and reject on OSS listing DB", async () => {
|
||||
await db.createTable("old_name", [{ id: 1 }]);
|
||||
|
||||
await expect(db.renameTable("old_name", "new_name")).rejects.toThrow(
|
||||
"rename_table is not supported in LanceDB OSS",
|
||||
);
|
||||
|
||||
await expect(db.tableNames()).resolves.toEqual(["old_name"]);
|
||||
});
|
||||
|
||||
it("should fail if creating table twice, unless overwrite is true", async () => {
|
||||
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
|
||||
await expect(tbl.countRows()).resolves.toBe(2);
|
||||
@@ -306,3 +316,186 @@ describe("clone table functionality", () => {
|
||||
).rejects.toThrow("Deep clone is not yet implemented");
|
||||
});
|
||||
});
|
||||
|
||||
describe("namespaces", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
let db: Connection;
|
||||
|
||||
beforeEach(async () => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
// The local DirectoryNamespace backend only supports child namespaces
|
||||
// when manifest mode is enabled (see lance-namespace-impls/src/dir.rs).
|
||||
db = await connect(tmpDir.name, {
|
||||
// biome-ignore lint/style/useNamingConvention: opaque backend property key, must match Rust
|
||||
namespaceClientProperties: { manifest_enabled: "true" },
|
||||
});
|
||||
});
|
||||
afterEach(() => tmpDir.removeCallback());
|
||||
|
||||
it("should create and describe a namespace", async () => {
|
||||
await db.createNamespace(["myns"]);
|
||||
const desc = await db.describeNamespace(["myns"]);
|
||||
expect(desc).toBeDefined();
|
||||
});
|
||||
|
||||
it("should list namespaces created at the root", async () => {
|
||||
await db.createNamespace(["alpha"]);
|
||||
await db.createNamespace(["beta"]);
|
||||
const list = await db.listNamespaces();
|
||||
expect(list.namespaces).toEqual(expect.arrayContaining(["alpha", "beta"]));
|
||||
});
|
||||
|
||||
it("should list child namespaces under a parent", async () => {
|
||||
await db.createNamespace(["parent"]);
|
||||
await db.createNamespace(["parent", "child"]);
|
||||
const list = await db.listNamespaces(["parent"]);
|
||||
expect(list.namespaces).toContain("child");
|
||||
});
|
||||
|
||||
it("should drop a namespace", async () => {
|
||||
await db.createNamespace(["ephemeral"]);
|
||||
await db.dropNamespace(["ephemeral"]);
|
||||
const list = await db.listNamespaces();
|
||||
expect(list.namespaces).not.toContain("ephemeral");
|
||||
});
|
||||
|
||||
it("should raise an error on any namespace op after close", async () => {
|
||||
await db.close();
|
||||
await expect(db.describeNamespace(["foo"])).rejects.toThrow(
|
||||
"Connection is closed",
|
||||
);
|
||||
await expect(db.listNamespaces()).rejects.toThrow("Connection is closed");
|
||||
await expect(db.createNamespace(["foo"])).rejects.toThrow(
|
||||
"Connection is closed",
|
||||
);
|
||||
await expect(db.dropNamespace(["foo"])).rejects.toThrow(
|
||||
"Connection is closed",
|
||||
);
|
||||
});
|
||||
|
||||
it("should raise an understandable error when describing a non-existent namespace", async () => {
|
||||
await expect(db.describeNamespace(["does-not-exist"])).rejects.toThrow(
|
||||
/not found/i,
|
||||
);
|
||||
});
|
||||
|
||||
it("should raise an error when creating a namespace that already exists", async () => {
|
||||
await db.createNamespace(["dup"]);
|
||||
await expect(db.createNamespace(["dup"])).rejects.toThrow();
|
||||
});
|
||||
|
||||
it("should reject an unrecognized createNamespace mode with a clear error", async () => {
|
||||
await expect(
|
||||
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
|
||||
db.createNamespace(["x"], { mode: "frobnicate" as any }),
|
||||
).rejects.toThrow(/Invalid mode 'frobnicate'/);
|
||||
});
|
||||
|
||||
it("should reject an unrecognized dropNamespace mode with a clear error", async () => {
|
||||
await db.createNamespace(["x"]);
|
||||
await expect(
|
||||
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
|
||||
db.dropNamespace(["x"], { mode: "frobnicate" as any }),
|
||||
).rejects.toThrow(/Invalid mode 'frobnicate'/);
|
||||
});
|
||||
|
||||
it("should reject an unrecognized dropNamespace behavior with a clear error", async () => {
|
||||
await db.createNamespace(["x"]);
|
||||
await expect(
|
||||
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
|
||||
db.dropNamespace(["x"], { behavior: "frobnicate" as any }),
|
||||
).rejects.toThrow(/Invalid behavior 'frobnicate'/);
|
||||
});
|
||||
});
|
||||
|
||||
describe("connectNamespace", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
beforeEach(() => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
});
|
||||
afterEach(() => tmpDir.removeCallback());
|
||||
|
||||
it("connects via the dir implementation and supports table ops", async () => {
|
||||
const db = await connectNamespace("dir", { root: tmpDir.name });
|
||||
await db.createTable("users", [{ id: 1 }, { id: 2 }]);
|
||||
await expect(db.tableNames()).resolves.toContain("users");
|
||||
});
|
||||
|
||||
it("throws a clear error when implName is empty", async () => {
|
||||
await expect(connectNamespace("", {})).rejects.toThrow(
|
||||
"implName must be a non-empty string",
|
||||
);
|
||||
});
|
||||
|
||||
it("throws when the namespace implementation is unknown", async () => {
|
||||
await expect(connectNamespace("not-a-real-impl", {})).rejects.toThrow();
|
||||
});
|
||||
|
||||
it("passes storage options through to the namespace", async () => {
|
||||
const db = await connectNamespace(
|
||||
"dir",
|
||||
{ root: tmpDir.name },
|
||||
{ storageOptions: { newTableDataStorageVersion: "stable" } },
|
||||
);
|
||||
await db.createTable("plumbing", [{ id: 1 }]);
|
||||
await expect(db.tableNames()).resolves.toContain("plumbing");
|
||||
});
|
||||
|
||||
it("supports child namespaces when manifestEnabled is true on the dir config", async () => {
|
||||
const writer = await connectNamespace("dir", {
|
||||
root: tmpDir.name,
|
||||
manifestEnabled: true,
|
||||
});
|
||||
await writer.createNamespace(["analytics"]);
|
||||
await writer.createTable("orders", [{ id: 1 }, { id: 2 }], ["analytics"]);
|
||||
await writer.close();
|
||||
|
||||
const reader = await connectNamespace("dir", {
|
||||
root: tmpDir.name,
|
||||
manifestEnabled: true,
|
||||
});
|
||||
await expect(reader.tableNames(["analytics"])).resolves.toContain("orders");
|
||||
const orders = await reader.openTable("orders", ["analytics"]);
|
||||
await expect(orders.countRows()).resolves.toBe(2);
|
||||
});
|
||||
|
||||
it("merges extraProperties into the dir config and is overridden by typed fields", async () => {
|
||||
// Two observable assertions:
|
||||
// - Typed `root` overrides extraProperties.root: createTable would fail
|
||||
// under the bogus path if the override didn't happen.
|
||||
// - extraProperties.manifest_enabled="false" is honored end-to-end. Child
|
||||
// namespaces require manifest mode (default true), so explicitly
|
||||
// disabling it via extraProperties must make createNamespace reject. If
|
||||
// extraProperties pass-through were silently broken, the default would
|
||||
// let createNamespace succeed.
|
||||
const db = await connectNamespace("dir", {
|
||||
root: tmpDir.name,
|
||||
extraProperties: {
|
||||
root: "/should/be/overridden",
|
||||
// biome-ignore lint/style/useNamingConvention: backend property key
|
||||
manifest_enabled: "false",
|
||||
},
|
||||
});
|
||||
await db.createTable("base", [{ id: 1 }]);
|
||||
await expect(db.tableNames()).resolves.toContain("base");
|
||||
await expect(db.createNamespace(["analytics"])).rejects.toThrow();
|
||||
});
|
||||
|
||||
it("flows unknown top-level keys through when implName is dynamic (no silent drop)", async () => {
|
||||
// Routes via the third overload because `impl` is `string`, not the
|
||||
// literal `"dir"`. The dispatcher still notices the runtime value is
|
||||
// "dir", but unknown keys like `manifest_enabled` must not be silently
|
||||
// dropped during the conversion.
|
||||
//
|
||||
// Asserting a *negative* outcome (manifest disabled -> createNamespace
|
||||
// rejects) is required for observability, since the backend default for
|
||||
// `manifest_enabled` is true.
|
||||
const impl: string = "dir";
|
||||
const db = await connectNamespace(impl, {
|
||||
root: tmpDir.name,
|
||||
// biome-ignore lint/style/useNamingConvention: backend property key
|
||||
manifest_enabled: "false",
|
||||
});
|
||||
await expect(db.createNamespace(["mixed"])).rejects.toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
438
nodejs/__test__/scannable.test.ts
Normal file
438
nodejs/__test__/scannable.test.ts
Normal file
@@ -0,0 +1,438 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import {
|
||||
Field,
|
||||
Float16,
|
||||
Int32,
|
||||
type RecordBatch,
|
||||
RecordBatchReader,
|
||||
Schema,
|
||||
tableToIPC,
|
||||
} from "apache-arrow";
|
||||
import { makeArrowTable, makeEmptyTable } from "../lancedb/arrow";
|
||||
import { Scannable } from "../lancedb/scannable";
|
||||
|
||||
/**
 * Build the small three-row `{ id, name }` Arrow table shared by the tests
 * in this file. An empty `vectorColumns` map is handed to `makeArrowTable`.
 */
function makeTable() {
  const rows = [
    { id: 1, name: "a" },
    { id: 2, name: "b" },
    { id: 3, name: "c" },
  ];
  const tableOptions = { vectorColumns: {} };
  return makeArrowTable(rows, tableOptions);
}
|
||||
|
||||
/**
 * Serialize the fixture table to Arrow IPC bytes and return an opened reader
 * over them.
 *
 * `RecordBatchReader.from()` returns an unopened reader, and `.schema` is only
 * populated after `.open()`. Opening sync readers is synchronous, so nothing
 * is awaited here.
 */
async function makeReader(): Promise<RecordBatchReader> {
  const ipcBytes = tableToIPC(makeTable());
  const unopened = RecordBatchReader.from(ipcBytes);
  return unopened.open() as RecordBatchReader;
}
|
||||
|
||||
describe("Scannable", () => {
|
||||
describe("fromTable", () => {
|
||||
test("reflects schema, numRows, and defaults rescannable=true", async () => {
|
||||
const table = makeTable();
|
||||
const scannable = await Scannable.fromTable(table);
|
||||
|
||||
expect(scannable.schema).toBe(table.schema);
|
||||
expect(scannable.numRows).toBe(table.numRows);
|
||||
expect(scannable.rescannable).toBe(true);
|
||||
});
|
||||
|
||||
test("throws when opts.numRows does not match table.numRows", async () => {
|
||||
await expect(
|
||||
Scannable.fromTable(makeTable(), { numRows: 42 }),
|
||||
).rejects.toThrow(/does not match table\.numRows/);
|
||||
});
|
||||
|
||||
test("throws when opts.rescannable is false", async () => {
|
||||
await expect(
|
||||
Scannable.fromTable(makeTable(), { rescannable: false }),
|
||||
).rejects.toThrow(/always rescannable/);
|
||||
});
|
||||
});
|
||||
|
||||
describe("fromRecordBatchReader", () => {
|
||||
test("reflects schema and defaults numRows=null, rescannable=false", async () => {
|
||||
const reader = await makeReader();
|
||||
const scannable = await Scannable.fromRecordBatchReader(reader);
|
||||
|
||||
expect(scannable.schema).toBe(reader.schema);
|
||||
expect(scannable.numRows).toBeNull();
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
|
||||
test("honors numRows override", async () => {
|
||||
const scannable = await Scannable.fromRecordBatchReader(
|
||||
await makeReader(),
|
||||
{ numRows: 3 },
|
||||
);
|
||||
|
||||
expect(scannable.numRows).toBe(3);
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
|
||||
test("rescannable: false explicit does not throw", async () => {
|
||||
const reader = await makeReader();
|
||||
const scannable = await Scannable.fromRecordBatchReader(reader, {
|
||||
rescannable: false,
|
||||
});
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
|
||||
test("throws when opts.rescannable is true", async () => {
|
||||
const reader = await makeReader();
|
||||
await expect(
|
||||
Scannable.fromRecordBatchReader(reader, { rescannable: true }),
|
||||
).rejects.toThrow(/does not accept rescannable/);
|
||||
});
|
||||
|
||||
test("throws when opts.rescannable is true even alongside numRows", async () => {
|
||||
const reader = await makeReader();
|
||||
await expect(
|
||||
Scannable.fromRecordBatchReader(reader, {
|
||||
numRows: 3,
|
||||
rescannable: true,
|
||||
}),
|
||||
).rejects.toThrow(/does not accept rescannable/);
|
||||
});
|
||||
});
|
||||
|
||||
describe("fromIterable", () => {
|
||||
test("accepts a sync iterable of batches", async () => {
|
||||
const table = makeTable();
|
||||
const scannable = await Scannable.fromIterable(
|
||||
table.schema,
|
||||
table.batches,
|
||||
);
|
||||
|
||||
expect(scannable.schema).toBe(table.schema);
|
||||
expect(scannable.numRows).toBeNull();
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
|
||||
test("accepts an async iterable of batches", async () => {
|
||||
const table = makeTable();
|
||||
async function* generator(): AsyncGenerator<RecordBatch> {
|
||||
for (const batch of table.batches) {
|
||||
yield batch;
|
||||
}
|
||||
}
|
||||
|
||||
const scannable = await Scannable.fromIterable(table.schema, generator());
|
||||
expect(scannable.schema).toBe(table.schema);
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
|
||||
describe("rescannable: true detection", () => {
|
||||
// Replayable inputs: [Symbol.iterator]() / [Symbol.asyncIterator]()
|
||||
// returns a fresh iterator each call. Must NOT throw.
|
||||
|
||||
test("Array passes (fresh ArrayIterator each call)", async () => {
|
||||
const table = makeTable();
|
||||
const scannable = await Scannable.fromIterable(
|
||||
table.schema,
|
||||
table.batches,
|
||||
{ rescannable: true },
|
||||
);
|
||||
expect(scannable.rescannable).toBe(true);
|
||||
});
|
||||
|
||||
test("Set passes (fresh SetIterator each call)", async () => {
|
||||
const table = makeTable();
|
||||
const set = new Set<RecordBatch>(table.batches);
|
||||
const scannable = await Scannable.fromIterable(table.schema, set, {
|
||||
rescannable: true,
|
||||
});
|
||||
expect(scannable.rescannable).toBe(true);
|
||||
});
|
||||
|
||||
test("custom Iterable returning a fresh iterator passes", async () => {
|
||||
const table = makeTable();
|
||||
const replayable: Iterable<RecordBatch> = {
|
||||
[Symbol.iterator]() {
|
||||
return table.batches[Symbol.iterator]();
|
||||
},
|
||||
};
|
||||
const scannable = await Scannable.fromIterable(
|
||||
table.schema,
|
||||
replayable,
|
||||
{ rescannable: true },
|
||||
);
|
||||
expect(scannable.rescannable).toBe(true);
|
||||
});
|
||||
|
||||
test("object with generator method passes (fresh generator each call)", async () => {
|
||||
const table = makeTable();
|
||||
const replayable: Iterable<RecordBatch> = {
|
||||
*[Symbol.iterator]() {
|
||||
for (const batch of table.batches) yield batch;
|
||||
},
|
||||
};
|
||||
const scannable = await Scannable.fromIterable(
|
||||
table.schema,
|
||||
replayable,
|
||||
{ rescannable: true },
|
||||
);
|
||||
expect(scannable.rescannable).toBe(true);
|
||||
});
|
||||
|
||||
test("empty Array passes (replayable degenerate case)", async () => {
|
||||
const schema = makeTable().schema;
|
||||
const scannable = await Scannable.fromIterable(
|
||||
schema,
|
||||
[] as RecordBatch[],
|
||||
{ rescannable: true },
|
||||
);
|
||||
expect(scannable.rescannable).toBe(true);
|
||||
});
|
||||
|
||||
// One-shot inputs: [Symbol.iterator]() / [Symbol.asyncIterator]()
|
||||
// returns the same object, or the input is already-an-iterator.
|
||||
// Must throw with a /one-shot/ message.
|
||||
|
||||
test("sync generator throws", async () => {
|
||||
const table = makeTable();
|
||||
function* generator(): Generator<RecordBatch> {
|
||||
for (const batch of table.batches) yield batch;
|
||||
}
|
||||
await expect(
|
||||
Scannable.fromIterable(table.schema, generator(), {
|
||||
rescannable: true,
|
||||
}),
|
||||
).rejects.toThrow(/one-shot/);
|
||||
});
|
||||
|
||||
test("async generator throws", async () => {
|
||||
const table = makeTable();
|
||||
async function* generator(): AsyncGenerator<RecordBatch> {
|
||||
for (const batch of table.batches) yield batch;
|
||||
}
|
||||
await expect(
|
||||
Scannable.fromIterable(table.schema, generator(), {
|
||||
rescannable: true,
|
||||
}),
|
||||
).rejects.toThrow(/one-shot/);
|
||||
});
|
||||
|
||||
test("empty generator throws (one-shot degenerate case)", async () => {
|
||||
const schema = makeTable().schema;
|
||||
function* generator(): Generator<RecordBatch> {
|
||||
// intentionally empty; yields nothing.
|
||||
}
|
||||
await expect(
|
||||
Scannable.fromIterable(schema, generator(), { rescannable: true }),
|
||||
).rejects.toThrow(/one-shot/);
|
||||
});
|
||||
|
||||
test("custom self-iterator throws", async () => {
|
||||
const table = makeTable();
|
||||
const batches = table.batches;
|
||||
let i = 0;
|
||||
const oneShot: Iterable<RecordBatch> & Iterator<RecordBatch> = {
|
||||
[Symbol.iterator]() {
|
||||
return this;
|
||||
},
|
||||
next() {
|
||||
if (i >= batches.length) {
|
||||
return { done: true, value: undefined };
|
||||
}
|
||||
return { done: false, value: batches[i++] };
|
||||
},
|
||||
};
|
||||
await expect(
|
||||
Scannable.fromIterable(table.schema, oneShot, { rescannable: true }),
|
||||
).rejects.toThrow(/one-shot/);
|
||||
});
|
||||
|
||||
test("Array.values() (IterableIterator) throws", async () => {
|
||||
const table = makeTable();
|
||||
const iter = table.batches.values();
|
||||
await expect(
|
||||
Scannable.fromIterable(table.schema, iter, { rescannable: true }),
|
||||
).rejects.toThrow(/one-shot/);
|
||||
});
|
||||
|
||||
test("raw iterator (only `.next`) throws", async () => {
|
||||
const table = makeTable();
|
||||
const batches = table.batches;
|
||||
let i = 0;
|
||||
const rawIter = {
|
||||
next(): IteratorResult<RecordBatch> {
|
||||
if (i >= batches.length) {
|
||||
return { done: true, value: undefined };
|
||||
}
|
||||
return { done: false, value: batches[i++] };
|
||||
},
|
||||
};
|
||||
await expect(
|
||||
Scannable.fromIterable(
|
||||
table.schema,
|
||||
rawIter as unknown as Iterable<RecordBatch>,
|
||||
{ rescannable: true },
|
||||
),
|
||||
).rejects.toThrow(/one-shot/);
|
||||
});
|
||||
|
||||
// Edge: null/undefined must not crash the detection helper. The
|
||||
// null check belongs to `normalizeIterator` and only fires when a
|
||||
// scan starts.
|
||||
|
||||
test("null input does not crash detection at construction", async () => {
|
||||
const schema = makeTable().schema;
|
||||
await expect(
|
||||
Scannable.fromIterable(
|
||||
schema,
|
||||
null as unknown as Iterable<RecordBatch>,
|
||||
{
|
||||
rescannable: true,
|
||||
},
|
||||
),
|
||||
).resolves.toBeDefined();
|
||||
});
|
||||
|
||||
test("undefined input does not crash detection at construction", async () => {
|
||||
const schema = makeTable().schema;
|
||||
await expect(
|
||||
Scannable.fromIterable(
|
||||
schema,
|
||||
undefined as unknown as Iterable<RecordBatch>,
|
||||
{ rescannable: true },
|
||||
),
|
||||
).resolves.toBeDefined();
|
||||
});
|
||||
|
||||
// Default (rescannable omitted) skips the check entirely, so even
|
||||
// pathological inputs construct without throwing here.
|
||||
|
||||
test("rescannable omitted skips detection entirely (generator passes)", async () => {
|
||||
const table = makeTable();
|
||||
function* generator(): Generator<RecordBatch> {
|
||||
for (const batch of table.batches) yield batch;
|
||||
}
|
||||
const scannable = await Scannable.fromIterable(
|
||||
table.schema,
|
||||
generator(),
|
||||
);
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
|
||||
test("rescannable: false explicit skips detection entirely (generator passes)", async () => {
|
||||
const table = makeTable();
|
||||
function* generator(): Generator<RecordBatch> {
|
||||
for (const batch of table.batches) yield batch;
|
||||
}
|
||||
const scannable = await Scannable.fromIterable(
|
||||
table.schema,
|
||||
generator(),
|
||||
{ rescannable: false },
|
||||
);
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("fromFactory", () => {
|
||||
test("defaults rescannable=true and does not invoke the factory eagerly", async () => {
|
||||
const table = makeTable();
|
||||
const factory = jest.fn(() => table.batches);
|
||||
|
||||
const scannable = await Scannable.fromFactory(table.schema, factory);
|
||||
|
||||
expect(scannable.schema).toBe(table.schema);
|
||||
expect(scannable.rescannable).toBe(true);
|
||||
expect(factory).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test("honors rescannable and numRows overrides", async () => {
|
||||
const table = makeTable();
|
||||
const scannable = await Scannable.fromFactory(
|
||||
table.schema,
|
||||
() => table.batches,
|
||||
{ numRows: 7, rescannable: false },
|
||||
);
|
||||
|
||||
expect(scannable.numRows).toBe(7);
|
||||
expect(scannable.rescannable).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("validation", () => {
|
||||
test("throws when numRows is negative", async () => {
|
||||
await expect(
|
||||
Scannable.fromFactory(makeTable().schema, () => [], { numRows: -1 }),
|
||||
).rejects.toThrow(/non-negative/);
|
||||
});
|
||||
|
||||
test("throws when numRows is not an integer", async () => {
|
||||
await expect(
|
||||
Scannable.fromFactory(makeTable().schema, () => [], { numRows: 3.5 }),
|
||||
).rejects.toThrow(/integer/);
|
||||
});
|
||||
});
|
||||
|
||||
describe("native handle", () => {
|
||||
test("exposes a native handle via inner", async () => {
|
||||
const scannable = await Scannable.fromTable(makeTable());
|
||||
expect(scannable.inner).toBeDefined();
|
||||
expect(typeof scannable.inner).toBe("object");
|
||||
expect(scannable.inner).not.toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
// Schema-variety construction tests. Each asserts that construction
|
||||
// succeeds against a richer Arrow schema, which transitively exercises
|
||||
// schema serialization and the Rust-side `ipc_file_to_schema` for types
|
||||
// beyond flat primitives.
|
||||
describe("schema variety", () => {
|
||||
test("accepts an empty table", async () => {
|
||||
const schema = new Schema([new Field("id", new Int32(), true)]);
|
||||
const table = makeEmptyTable(schema);
|
||||
const scannable = await Scannable.fromTable(table);
|
||||
|
||||
expect(scannable.numRows).toBe(0);
|
||||
expect(scannable.schema).toBe(table.schema);
|
||||
});
|
||||
|
||||
test("accepts nested struct and list columns", async () => {
|
||||
const table = makeArrowTable(
|
||||
[
|
||||
{ id: 1, point: { x: 0, y: 0 }, tags: ["a", "b"] },
|
||||
{ id: 2, point: { x: 1, y: 2 }, tags: ["c"] },
|
||||
],
|
||||
{ vectorColumns: {} },
|
||||
);
|
||||
const scannable = await Scannable.fromTable(table);
|
||||
|
||||
expect(scannable.schema).toBe(table.schema);
|
||||
expect(scannable.numRows).toBe(2);
|
||||
});
|
||||
|
||||
test("accepts a FixedSizeList (vector) column", async () => {
|
||||
const table = makeArrowTable(
|
||||
[
|
||||
{ id: 1, vec: [1, 2, 3] },
|
||||
{ id: 2, vec: [4, 5, 6] },
|
||||
],
|
||||
{ vectorColumns: { vec: { type: new Float16() } } },
|
||||
);
|
||||
const scannable = await Scannable.fromTable(table);
|
||||
|
||||
expect(scannable.schema).toBe(table.schema);
|
||||
expect(scannable.numRows).toBe(2);
|
||||
});
|
||||
|
||||
test("accepts a table with many columns", async () => {
|
||||
const row: Record<string, number> = {};
|
||||
for (let i = 0; i < 50; i++) row[`c${i}`] = i;
|
||||
const table = makeArrowTable([row, row], { vectorColumns: {} });
|
||||
const scannable = await Scannable.fromTable(table);
|
||||
|
||||
expect(scannable.schema.fields.length).toBe(50);
|
||||
expect(scannable.numRows).toBe(2);
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -1870,6 +1870,25 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(results.length).toBe(3);
|
||||
});
|
||||
|
||||
test("prewarmData errors on local tables", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "alpha", vector: [0.1, 0.2, 0.3] },
|
||||
{ text: "beta", vector: [0.4, 0.5, 0.6] },
|
||||
];
|
||||
const table = await db.createTable("prewarm_data_test", data);
|
||||
|
||||
// prewarmData is only supported on remote tables. We verify the call
|
||||
// is wired through napi and surfaces the expected error for both
|
||||
// arg shapes (undefined and string[]).
|
||||
await expect(table.prewarmData()).rejects.toThrow(
|
||||
"prewarm_data is currently only supported on remote tables",
|
||||
);
|
||||
await expect(table.prewarmData(["text"])).rejects.toThrow(
|
||||
"prewarm_data is currently only supported on remote tables",
|
||||
);
|
||||
});
|
||||
|
||||
test("full text index on list", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
|
||||
4810
nodejs/examples/package-lock.json
generated
4810
nodejs/examples/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -11,16 +11,17 @@
|
||||
"test": "node --experimental-vm-modules node_modules/.bin/jest --testEnvironment jest-environment-node-single-context --verbose",
|
||||
"lint": "biome check *.ts && biome format *.ts",
|
||||
"lint-ci": "biome ci .",
|
||||
"lint-fix": "biome check --write *.ts && npm run format",
|
||||
"lint-fix": "biome check --write *.ts && pnpm format",
|
||||
"format": "biome format --write *.ts"
|
||||
},
|
||||
"author": "Lance Devs",
|
||||
"license": "Apache-2.0",
|
||||
"packageManager": "pnpm@11.1.1",
|
||||
"dependencies": {
|
||||
"@huggingface/transformers": "^3.0.2",
|
||||
"@huggingface/transformers": "3.0.2",
|
||||
"@lancedb/lancedb": "file:../dist",
|
||||
"openai": "^4.29.2",
|
||||
"sharp": "^0.33.5"
|
||||
"openai": "4.29.2",
|
||||
"sharp": "0.33.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@biomejs/biome": "^1.7.3",
|
||||
|
||||
3466
nodejs/examples/pnpm-lock.yaml
generated
Normal file
3466
nodejs/examples/pnpm-lock.yaml
generated
Normal file
File diff suppressed because it is too large
Load Diff
13
nodejs/examples/pnpm-workspace.yaml
Normal file
13
nodejs/examples/pnpm-workspace.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
# Block resolution of versions less than 24h old (Shai-Hulud window).
|
||||
# This is the pnpm 11 default but pinned here so it's visible to
|
||||
# reviewers and survives a future pnpm major flipping the default.
|
||||
minimumReleaseAge: 1440
|
||||
|
||||
# Fail install if a transitive dep tries to run an unapproved script.
|
||||
strictDepBuilds: true
|
||||
|
||||
allowBuilds:
|
||||
'@biomejs/biome': true
|
||||
onnxruntime-node: true
|
||||
protobufjs: true
|
||||
sharp: true
|
||||
@@ -1291,6 +1291,18 @@ export async function fromRecordBatchToBuffer(
|
||||
return Buffer.from(await writer.toUint8Array());
|
||||
}
|
||||
|
||||
/**
 * Serialize a single record batch into a buffer using the Arrow IPC Stream
 * format.
 *
 * Every call yields a self-contained Stream message (schema + batch + EOS),
 * which makes the result suitable for incremental decode by
 * `arrow_ipc::reader::StreamReader`.
 *
 * @param batch - The record batch to serialize.
 * @returns A `Buffer` holding the bytes produced by the stream writer.
 */
export async function fromRecordBatchToStreamBuffer(
  batch: RecordBatch,
): Promise<Buffer> {
  const streamWriter = RecordBatchStreamWriter.writeAll([batch]);
  const bytes = await streamWriter.toUint8Array();
  return Buffer.from(bytes);
}
|
||||
|
||||
/**
|
||||
* Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
|
||||
*
|
||||
|
||||
@@ -16,6 +16,18 @@ import {
|
||||
} from "./arrow";
|
||||
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
||||
import { Connection as LanceDbConnection } from "./native";
|
||||
import type {
|
||||
CreateNamespaceResponse,
|
||||
DescribeNamespaceResponse,
|
||||
DropNamespaceResponse,
|
||||
ListNamespacesResponse,
|
||||
} from "./native";
|
||||
export type {
|
||||
CreateNamespaceResponse,
|
||||
DescribeNamespaceResponse,
|
||||
DropNamespaceResponse,
|
||||
ListNamespacesResponse,
|
||||
};
|
||||
import { sanitizeTable } from "./sanitize";
|
||||
import { LocalTable, Table } from "./table";
|
||||
|
||||
@@ -110,6 +122,28 @@ export interface TableNamesOptions {
|
||||
/** An optional limit to the number of results to return. */
|
||||
limit?: number;
|
||||
}
|
||||
|
||||
export interface ListNamespacesOptions {
|
||||
/** Token from a previous response for pagination. */
|
||||
pageToken?: string;
|
||||
/** An optional limit to the number of results to return. */
|
||||
limit?: number;
|
||||
}
|
||||
|
||||
export interface CreateNamespaceOptions {
|
||||
/** Creation mode. */
|
||||
mode?: "create" | "exist_ok" | "overwrite";
|
||||
/** Properties to set on the new namespace. */
|
||||
properties?: Record<string, string>;
|
||||
}
|
||||
|
||||
export interface DropNamespaceOptions {
|
||||
/** Whether to skip if the namespace doesn't exist, or fail. */
|
||||
mode?: "skip" | "fail";
|
||||
/** Refuse to drop if non-empty (restrict) or drop recursively (cascade). */
|
||||
behavior?: "restrict" | "cascade";
|
||||
}
|
||||
|
||||
/**
|
||||
* A LanceDB Connection that allows you to open tables and create new ones.
|
||||
*
|
||||
@@ -262,12 +296,81 @@ export abstract class Connection {
|
||||
*/
|
||||
abstract dropTable(name: string, namespacePath?: string[]): Promise<void>;
|
||||
|
||||
/**
 * Rename a table.
 * @param {string} oldName The current name of the table.
 * @param {string} newName The new name for the table.
 * @param {string[]} namespacePath The namespace path containing the table (defaults to root namespace).
 */
abstract renameTable(
|
||||
oldName: string,
|
||||
newName: string,
|
||||
namespacePath?: string[],
|
||||
): Promise<void>;
|
||||
|
||||
/**
|
||||
* Drop all tables in the database.
|
||||
* @param {string[]} namespacePath The namespace path to drop tables from (defaults to root namespace).
|
||||
*/
|
||||
abstract dropAllTables(namespacePath?: string[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Describe a namespace, returning its properties.
|
||||
*
|
||||
* @param {string[]} namespacePath - The namespace path to describe, in
|
||||
* parent → child order, e.g. `["analytics", "sales"]`.
|
||||
* @returns {Promise<DescribeNamespaceResponse>} The namespace's properties
|
||||
* (may be undefined if the namespace has none).
|
||||
*/
|
||||
abstract describeNamespace(
|
||||
namespacePath: string[],
|
||||
): Promise<DescribeNamespaceResponse>;
|
||||
|
||||
/**
|
||||
* List the immediate child namespaces under the given parent.
|
||||
*
|
||||
* Results may be paginated. To retrieve subsequent pages, pass the
|
||||
* `pageToken` returned by a previous call.
|
||||
*
|
||||
* @param {string[]} namespacePath - The parent namespace path. Defaults
|
||||
* to the root namespace if omitted.
|
||||
* @param {Partial<ListNamespacesOptions>} options - Pagination options
|
||||
* (`pageToken`, `limit`).
|
||||
* @returns {Promise<ListNamespacesResponse>} Child namespace names and
|
||||
* an optional token for fetching the next page.
|
||||
*/
|
||||
abstract listNamespaces(
|
||||
namespacePath?: string[],
|
||||
options?: Partial<ListNamespacesOptions>,
|
||||
): Promise<ListNamespacesResponse>;
|
||||
|
||||
/**
|
||||
* Create a new namespace at the given path.
|
||||
*
|
||||
* @param {string[]} namespacePath - The namespace path to create.
|
||||
* @param {Partial<CreateNamespaceOptions>} options - Creation `mode`
|
||||
* ("create" | "exist_ok" | "overwrite") and optional `properties`
|
||||
* to attach to the namespace.
|
||||
* @returns {Promise<CreateNamespaceResponse>} The properties of the
|
||||
* created namespace and an optional transaction id.
|
||||
*/
|
||||
abstract createNamespace(
|
||||
namespacePath: string[],
|
||||
options?: Partial<CreateNamespaceOptions>,
|
||||
): Promise<CreateNamespaceResponse>;
|
||||
|
||||
/**
|
||||
* Drop a namespace.
|
||||
*
|
||||
* Use `behavior: "cascade"` to also drop everything contained in the
|
||||
* namespace (sub-namespaces and tables). The default `"restrict"`
|
||||
* behavior refuses to drop a non-empty namespace.
|
||||
*
|
||||
* @param {string[]} namespacePath - The namespace path to drop.
|
||||
* @param {Partial<DropNamespaceOptions>} options - `mode` ("skip" | "fail"
|
||||
* for missing-namespace handling) and `behavior` ("restrict" | "cascade").
|
||||
* @returns {Promise<DropNamespaceResponse>} Any properties returned by
|
||||
* the server and an optional transaction id.
|
||||
*/
|
||||
abstract dropNamespace(
|
||||
namespacePath: string[],
|
||||
options?: Partial<DropNamespaceOptions>,
|
||||
): Promise<DropNamespaceResponse>;
|
||||
|
||||
/**
|
||||
* Clone a table from a source table.
|
||||
*
|
||||
@@ -512,9 +615,56 @@ export class LocalConnection extends Connection {
|
||||
return this.inner.dropTable(name, namespacePath ?? []);
|
||||
}
|
||||
|
||||
/**
 * Rename a table by delegating to the native connection. When no
 * `namespacePath` is given, an empty path is forwarded instead.
 */
async renameTable(
  oldName: string,
  newName: string,
  namespacePath?: string[],
): Promise<void> {
  const targetNamespace = namespacePath ?? [];
  return this.inner.renameTable(oldName, newName, targetNamespace);
}
|
||||
|
||||
/**
 * Drop every table in the given namespace via the native connection. An
 * omitted `namespacePath` is forwarded as an empty path.
 */
async dropAllTables(namespacePath?: string[]): Promise<void> {
  const targetNamespace = namespacePath ?? [];
  return this.inner.dropAllTables(targetNamespace);
}
|
||||
|
||||
/**
 * Describe a namespace by forwarding the path, unchanged, to the native
 * connection and returning its promise.
 */
describeNamespace(
  namespacePath: string[],
): Promise<DescribeNamespaceResponse> {
  const pending = this.inner.describeNamespace(namespacePath);
  return pending;
}
|
||||
|
||||
/**
 * List child namespaces through the native connection.
 *
 * The options bag is unpacked into the positional `pageToken` / `limit`
 * arguments of the native call; an omitted `namespacePath` is forwarded as
 * an empty path.
 */
listNamespaces(
  namespacePath?: string[],
  options?: Partial<ListNamespacesOptions>,
): Promise<ListNamespacesResponse> {
  const parent = namespacePath ?? [];
  const pageToken = options?.pageToken;
  const limit = options?.limit;
  return this.inner.listNamespaces(parent, pageToken, limit);
}
|
||||
|
||||
/**
 * Create a namespace through the native connection, unpacking the options
 * bag into the positional `mode` / `properties` arguments of the native call.
 */
createNamespace(
  namespacePath: string[],
  options?: Partial<CreateNamespaceOptions>,
): Promise<CreateNamespaceResponse> {
  const mode = options?.mode;
  const properties = options?.properties;
  return this.inner.createNamespace(namespacePath, mode, properties);
}
|
||||
|
||||
/**
 * Drop a namespace through the native connection, unpacking the options bag
 * into the positional `mode` / `behavior` arguments of the native call.
 */
dropNamespace(
  namespacePath: string[],
  options?: Partial<DropNamespaceOptions>,
): Promise<DropNamespaceResponse> {
  const mode = options?.mode;
  const behavior = options?.behavior;
  return this.inner.dropNamespace(namespacePath, mode, behavior);
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -8,6 +8,7 @@ import {
|
||||
} from "./connection";
|
||||
|
||||
import {
|
||||
ConnectNamespaceOptions,
|
||||
ConnectionOptions,
|
||||
Connection as LanceDbConnection,
|
||||
JsHeaderProvider as NativeJsHeaderProvider,
|
||||
@@ -22,6 +23,7 @@ export { JsHeaderProvider as NativeJsHeaderProvider } from "./native.js";
|
||||
export {
|
||||
AddColumnsSql,
|
||||
ConnectionOptions,
|
||||
ConnectNamespaceOptions,
|
||||
IndexStatistics,
|
||||
IndexConfig,
|
||||
ClientConfig,
|
||||
@@ -62,6 +64,13 @@ export {
|
||||
CreateTableOptions,
|
||||
TableNamesOptions,
|
||||
OpenTableOptions,
|
||||
ListNamespacesOptions,
|
||||
CreateNamespaceOptions,
|
||||
DropNamespaceOptions,
|
||||
ListNamespacesResponse,
|
||||
CreateNamespaceResponse,
|
||||
DropNamespaceResponse,
|
||||
DescribeNamespaceResponse,
|
||||
} from "./connection";
|
||||
|
||||
export { Session } from "./native.js";
|
||||
@@ -117,6 +126,7 @@ export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";
|
||||
|
||||
export * as embedding from "./embedding";
|
||||
export { permutationBuilder, PermutationBuilder } from "./permutation";
|
||||
export { Scannable, ScannableOptions } from "./scannable";
|
||||
export * as rerankers from "./rerankers";
|
||||
export {
|
||||
SchemaLike,
|
||||
@@ -293,3 +303,197 @@ export async function connect(
|
||||
);
|
||||
return new LocalConnection(nativeConn);
|
||||
}
|
||||
|
||||
/**
|
||||
* Configuration for the built-in directory namespace (`"dir"`).
|
||||
*
|
||||
* The directory namespace stores tables under a single root path (local
|
||||
* filesystem or object storage URI). See
|
||||
* {@link https://docs.lancedb.com/namespaces} for the documented surface;
|
||||
* less-common knobs live under {@link DirNamespaceConfig.extraProperties}.
|
||||
*/
|
||||
export interface DirNamespaceConfig {
|
||||
/** Root path or URI containing the LanceDB tables. */
|
||||
root: string;
|
||||
/**
|
||||
* Whether to maintain a namespace manifest at the root. Required for
|
||||
* child namespaces. Defaults to true on the impl side.
|
||||
*/
|
||||
manifestEnabled?: boolean;
|
||||
/**
|
||||
* Additional raw properties passed verbatim to the namespace
|
||||
* implementation (e.g. `storage.*`, `credential_vendor.*`). Typed
|
||||
* fields above take precedence on key collision.
|
||||
*/
|
||||
extraProperties?: Record<string, string>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configuration for the built-in REST namespace (`"rest"`).
|
||||
*
|
||||
* The REST namespace talks to a remote catalog server over HTTP. See
|
||||
* {@link https://docs.lancedb.com/namespaces} for the documented surface;
|
||||
* less-common knobs (TLS, metrics) live under
|
||||
* {@link RestNamespaceConfig.extraProperties}.
|
||||
*/
|
||||
export interface RestNamespaceConfig {
|
||||
/** Catalog endpoint URL. */
|
||||
uri: string;
|
||||
/**
|
||||
* HTTP headers forwarded with each request. Keys are passed through
|
||||
* as-is (e.g. `"x-api-key"`, `"Authorization"`).
|
||||
*/
|
||||
headers?: Record<string, string>;
|
||||
/**
|
||||
* Additional raw properties passed verbatim to the namespace
|
||||
* implementation (e.g. `tls.*`, `ops_metrics_enabled`, `delimiter`).
|
||||
* Typed fields above take precedence on key collision.
|
||||
*/
|
||||
extraProperties?: Record<string, string>;
|
||||
}
|
||||
|
||||
/**
 * Flatten a {@link DirNamespaceConfig} into the string-keyed property map
 * passed to the namespace implementation.
 *
 * Precedence, lowest to highest: `extraProperties`, then every other
 * top-level key of `config` (including unknown keys such as a raw
 * `manifest_enabled` supplied via the dynamic-impl path, which must flow
 * through rather than be dropped), then the typed `manifestEnabled` field,
 * which is stringified into `manifest_enabled` when it is not `undefined`.
 */
function dirConfigToProperties(
  config: DirNamespaceConfig,
): Record<string, string> {
  const { manifestEnabled, extraProperties, ...passthrough } = config;
  const lowPriority = extraProperties ?? {};
  const properties: Record<string, string> = {
    ...lowPriority,
    ...(passthrough as Record<string, string>),
  };
  if (manifestEnabled === undefined) {
    return properties;
  }
  properties.manifest_enabled = String(manifestEnabled);
  return properties;
}
|
||||
|
||||
/**
 * Flatten a {@link RestNamespaceConfig} into the string-keyed property map
 * passed to the namespace implementation.
 *
 * `extraProperties` are laid down first and are overridden by every other
 * top-level key of `config`. Each entry of `headers` is then written under
 * a `headers.<name>` key, with the header name passed through as-is.
 */
function restConfigToProperties(
  config: RestNamespaceConfig,
): Record<string, string> {
  const { headers, extraProperties, ...passthrough } = config;
  const lowPriority = extraProperties ?? {};
  const properties: Record<string, string> = {
    ...lowPriority,
    ...(passthrough as Record<string, string>),
  };
  if (!headers) {
    return properties;
  }
  Object.entries(headers).forEach(([name, value]) => {
    properties[`headers.${name}`] = value;
  });
  return properties;
}
|
||||
|
||||
/**
|
||||
* Connect to a LanceDB database through a namespace.
|
||||
*
|
||||
* Unlike {@link connect}, which routes by URI scheme (local path vs.
|
||||
* `db://` cloud), `connectNamespace` always returns a namespace-backed
|
||||
* connection. The `implName` selects the namespace implementation:
|
||||
*
|
||||
* - `"dir"` — directory namespace, configured with {@link DirNamespaceConfig}.
|
||||
* - `"rest"` — remote REST catalog, configured with {@link RestNamespaceConfig}.
|
||||
* - Any other string — full module path for a custom implementation,
|
||||
* configured with a free-form string-keyed `properties` map.
|
||||
*
|
||||
* @example Typed dir namespace
|
||||
* ```ts
|
||||
* const db = await connectNamespace("dir", { root: "/path/to/db" });
|
||||
* await db.createTable("users", [{ id: 1 }]);
|
||||
* ```
|
||||
*
|
||||
* @example Typed REST namespace with auth headers
|
||||
* ```ts
|
||||
* const db = await connectNamespace("rest", {
|
||||
* uri: "https://catalog.example.com",
|
||||
* headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
|
||||
* });
|
||||
* ```
|
||||
*
|
||||
* @example Custom implementation with raw properties
|
||||
* ```ts
|
||||
* const db = await connectNamespace("my.custom.Namespace", {
|
||||
* endpoint: "...",
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export function connectNamespace(
|
||||
implName: "dir",
|
||||
config: DirNamespaceConfig,
|
||||
options?: Partial<ConnectNamespaceOptions>,
|
||||
): Promise<Connection>;
|
||||
/**
|
||||
* Connect through the built-in REST namespace.
|
||||
*
|
||||
* Configured with {@link RestNamespaceConfig}. See the function-level
|
||||
* documentation above for the full surface, examples, and how this
|
||||
* relates to {@link connect}.
|
||||
*
|
||||
* @example
|
||||
* ```ts
|
||||
* const db = await connectNamespace("rest", {
|
||||
* uri: "https://catalog.example.com",
|
||||
* headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export function connectNamespace(
|
||||
implName: "rest",
|
||||
config: RestNamespaceConfig,
|
||||
options?: Partial<ConnectNamespaceOptions>,
|
||||
): Promise<Connection>;
|
||||
/**
|
||||
* Connect through a custom namespace implementation by full module path,
|
||||
* configured with a free-form string-keyed `properties` map. Use the
|
||||
* typed overloads above for the built-in `"dir"` and `"rest"` impls.
|
||||
*
|
||||
* See the function-level documentation above for examples and how this
|
||||
* relates to {@link connect}.
|
||||
*
|
||||
* @example
|
||||
* ```ts
|
||||
* const db = await connectNamespace("my.custom.Namespace", {
|
||||
* endpoint: "...",
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export function connectNamespace(
|
||||
implName: string,
|
||||
properties: Record<string, string>,
|
||||
options?: Partial<ConnectNamespaceOptions>,
|
||||
): Promise<Connection>;
|
||||
export async function connectNamespace(
  implName: string,
  configOrProperties:
    | DirNamespaceConfig
    | RestNamespaceConfig
    | Record<string, string>,
  options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection> {
  // The built-in impls take a typed config that is flattened into raw
  // properties; any other impl name receives the map exactly as given.
  let properties: Record<string, string>;
  if (implName === "dir") {
    properties = dirConfigToProperties(
      configOrProperties as DirNamespaceConfig,
    );
  } else if (implName === "rest") {
    properties = restConfigToProperties(
      configOrProperties as RestNamespaceConfig,
    );
  } else {
    properties = configOrProperties as Record<string, string>;
  }

  // Build a fresh options object instead of assigning into `options`: the
  // caller's object must not be modified as a side effect of connecting.
  const finalOptions = {
    ...(options ?? {}),
    storageOptions: cleanseStorageOptions(options?.storageOptions),
  } as ConnectNamespaceOptions;

  const nativeConn = await LanceDbConnection.newWithNamespace(
    implName,
    properties,
    finalOptions,
  );
  return new LocalConnection(nativeConn);
}
|
||||
|
||||
274
nodejs/lancedb/scannable.ts
Normal file
274
nodejs/lancedb/scannable.ts
Normal file
@@ -0,0 +1,274 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import {
|
||||
Table as ArrowTable,
|
||||
RecordBatch,
|
||||
RecordBatchReader,
|
||||
Schema,
|
||||
} from "apache-arrow";
|
||||
import {
|
||||
fromRecordBatchToStreamBuffer,
|
||||
fromTableToBuffer,
|
||||
makeEmptyTable,
|
||||
} from "./arrow";
|
||||
import { NapiScannable } from "./native.js";
|
||||
|
||||
/** Optional hints accepted by the `Scannable` factory methods. */
export interface ScannableOptions {
  /** Row-count hint. It is not checked against the batches actually produced. */
  numRows?: number;

  /**
   * Whether the source may be scanned more than once. `fromTable` and
   * `fromFactory` default this to `true`; `fromIterable` and
   * `fromRecordBatchReader` default it to `false`.
   */
  rescannable?: boolean;
}
|
||||
|
||||
/**
 * A data source that can be scanned as a stream of Arrow `RecordBatch`es.
 *
 * `Scannable` wraps the schema + optional row count + rescannable flag and
 * a callback that yields batches one at a time. It is passed to consumers
 * (e.g. `Table.add`, `createTable`, `mergeInsert` — follow-up work) that
 * need to pull data without materializing the full dataset in JS memory.
 *
 * Batches cross the JS↔Rust boundary as Arrow IPC Stream messages; a fresh
 * writer serializes each batch, and the Rust side decodes it with
 * `arrow_ipc::reader::StreamReader`. One batch is in flight at a time.
 */
export class Scannable {
  readonly schema: Schema;
  readonly numRows: number | null;
  readonly rescannable: boolean;

  /** @hidden */
  private readonly native: NapiScannable;

  private constructor(
    native: NapiScannable,
    schema: Schema,
    numRows: number | null,
    rescannable: boolean,
  ) {
    this.native = native;
    this.schema = schema;
    this.numRows = numRows;
    this.rescannable = rescannable;
  }

  /** @hidden Access the native handle for passing through to Rust consumers. */
  get inner(): NapiScannable {
    return this.native;
  }

  /**
   * Build a Scannable from an explicit schema and a factory that returns a
   * fresh batch iterator on each call.
   *
   * The factory is invoked once per scan. Each iterator yields
   * `RecordBatch`es matching the declared schema. Use this when you need
   * direct control over the pull loop — for example, to wrap a streaming
   * source whose batches are produced lazily.
   *
   * @param schema - The Arrow schema of the produced batches.
   * @param factory - Called at the start of each scan to produce a batch
   *   iterator. Must be idempotent when `rescannable` is true.
   * @param opts - Optional hints. `rescannable` defaults to `true`; set to
   *   `false` if calling `factory()` twice would not reproduce the same data.
   * @throws TypeError if `opts.numRows` is not an integer or is negative.
   */
  static async fromFactory(
    schema: Schema,
    factory: () =>
      | AsyncIterable<RecordBatch>
      | Iterable<RecordBatch>
      | AsyncIterator<RecordBatch>
      | Iterator<RecordBatch>,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    const numRows = opts.numRows ?? null;
    if (numRows != null && !Number.isInteger(numRows)) {
      throw new TypeError("numRows must be an integer");
    }
    // A row count cannot be negative; reject it here with a clear message
    // instead of handing a meaningless hint to the native layer.
    if (numRows != null && numRows < 0) {
      throw new TypeError("numRows must be non-negative");
    }
    const rescannable = opts.rescannable ?? true;

    let iter: AsyncIterator<RecordBatch> | Iterator<RecordBatch> | null = null;
    const getNextBatch = async (isStart: boolean): Promise<Buffer | null> => {
      // `isStart` is true on the first pull of every new scan_as_stream.
      // Drop any cached iterator so factory() is re-invoked for the next scan.
      if (isStart) {
        iter = null;
      }
      if (iter === null) {
        iter = normalizeIterator(factory());
      }
      const result = await iter.next();
      if (result.done) {
        // Exhausted: forget the iterator so a later pull starts from factory().
        iter = null;
        return null;
      }
      return fromRecordBatchToStreamBuffer(result.value);
    };

    // The schema is handed to the native side as an empty table serialized
    // to a buffer.
    const schemaBuf = await fromTableToBuffer(makeEmptyTable(schema));
    const native = new NapiScannable(
      schemaBuf,
      numRows,
      rescannable,
      getNextBatch,
    );
    return new Scannable(native, schema, numRows, rescannable);
  }

  /**
   * Build a Scannable from an in-memory Arrow `Table`. Always rescannable;
   * the table's batches are replayed on each scan.
   *
   * The table's row count is authoritative: `opts.numRows` must either be
   * omitted or equal to `table.numRows`. `opts.rescannable` of `false` is
   * rejected because in-memory Tables are always rescannable.
   *
   * @throws TypeError if `opts.numRows` disagrees with `table.numRows` or
   *   `opts.rescannable` is `false`.
   */
  static async fromTable(
    table: ArrowTable,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    if (opts.numRows != null && opts.numRows !== table.numRows) {
      throw new TypeError(
        `opts.numRows (${opts.numRows}) does not match table.numRows (${table.numRows}). ` +
          `The table's row count is authoritative; omit numRows or pass the matching value.`,
      );
    }
    if (opts.rescannable === false) {
      throw new TypeError(
        `fromTable does not accept rescannable: false. ` +
          `In-memory Arrow Tables are always rescannable; omit the option or pass true.`,
      );
    }
    return Scannable.fromFactory(table.schema, () => table.batches, {
      numRows: table.numRows,
      rescannable: true,
    });
  }

  /**
   * Build a Scannable from an iterable of `RecordBatch`es. `rescannable`
   * defaults to `false`. Pass an explicit schema so the consumer can
   * validate before any batch is pulled.
   *
   * `opts.rescannable: true` is honest for replayable iterables (Arrays,
   * Sets, or custom iterables whose `[Symbol.iterator]()` returns a fresh
   * iterator each call). It is rejected for one-shot iterables (generators,
   * async generators, or already-an-iterator inputs) because their
   * `[Symbol.iterator]()` returns the same exhausted object on the second
   * scan. For replayable sources outside this shape, use
   * `fromFactory(schema, () => createIter(), { rescannable: true })`.
   *
   * Note: when `opts.rescannable` is `true`, the constructor calls
   * `[Symbol.iterator]()` once on the input to perform the structural check.
   *
   * @throws TypeError if `opts.rescannable` is `true` for a one-shot iterable.
   */
  static async fromIterable(
    schema: Schema,
    iter: AsyncIterable<RecordBatch> | Iterable<RecordBatch>,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    if (opts.rescannable === true && isOneShotIterable(iter)) {
      throw new TypeError(
        `fromIterable: rescannable: true is not honest for one-shot iterables ` +
          `(generators, async generators, or iterators where [Symbol.iterator]() ` +
          `returns the same object). The source would be exhausted after the first scan. ` +
          `Use fromFactory(schema, () => createIter(), { rescannable: true }) for sources ` +
          `where each call mints a fresh iterator.`,
      );
    }
    return Scannable.fromFactory(schema, () => iter, {
      numRows: opts.numRows,
      rescannable: opts.rescannable ?? false,
    });
  }

  /**
   * Build a Scannable from an Arrow `RecordBatchReader`. A reader can only
   * be consumed once; `rescannable` defaults to `false`.
   *
   * The reader must already be opened (via `.open()`) so its `.schema` is
   * populated. `RecordBatchReader.from(...)` returns an unopened reader.
   *
   * `opts.rescannable: true` is rejected because `RecordBatchReader` is a
   * self-iterator (its `[Symbol.iterator]()` returns itself), and this
   * constructor does not call `reader.reset()` between scans, so a second
   * scan would always see an exhausted reader. For genuinely replayable
   * sources, use
   * `fromFactory(schema, () => openReader(), { rescannable: true })`,
   * which mints a fresh reader on each scan.
   *
   * @throws TypeError if `opts.rescannable` is `true`, or if the reader's
   *   `.schema` is not populated.
   */
  static async fromRecordBatchReader(
    reader: RecordBatchReader,
    opts: ScannableOptions = {},
  ): Promise<Scannable> {
    if (opts.rescannable === true) {
      throw new TypeError(
        `fromRecordBatchReader does not accept rescannable: true. ` +
          `RecordBatchReader is a self-iterator (its [Symbol.iterator]() ` +
          `returns itself) and would be exhausted after the first scan. ` +
          `Use fromFactory(schema, () => openReader(), { rescannable: true }) ` +
          `for sources where each call mints a fresh reader.`,
      );
    }
    // Fail early and clearly on an unopened reader instead of letting the
    // missing schema surface as an obscure error during serialization.
    if (reader.schema == null) {
      throw new TypeError(
        `fromRecordBatchReader requires an opened reader: reader.schema is not populated. ` +
          `Call reader.open() before constructing the Scannable.`,
      );
    }
    return Scannable.fromFactory(reader.schema, () => reader, {
      numRows: opts.numRows,
      rescannable: false,
    });
  }
}
|
||||
|
||||
/**
 * Turn whatever a Scannable factory returned into an object with `.next()`.
 *
 * Checks are made in a fixed order: async iterable, then sync iterable, then
 * a bare iterator. The first match wins.
 *
 * @throws TypeError if `source` is null/undefined or matches none of the
 *   three shapes.
 */
function normalizeIterator<T>(
  source: AsyncIterable<T> | Iterable<T> | AsyncIterator<T> | Iterator<T>,
): AsyncIterator<T> | Iterator<T> {
  if (source == null) {
    throw new TypeError("Scannable factory returned null/undefined");
  }

  const hasMethod = (key: PropertyKey): boolean =>
    typeof (source as unknown as Record<PropertyKey, unknown>)[key] ===
    "function";

  if (hasMethod(Symbol.asyncIterator)) {
    return (source as AsyncIterable<T>)[Symbol.asyncIterator]();
  }
  if (hasMethod(Symbol.iterator)) {
    return (source as Iterable<T>)[Symbol.iterator]();
  }
  if (hasMethod("next")) {
    // Already an iterator: hand it back untouched.
    return source as Iterator<T>;
  }
  throw new TypeError("Scannable factory returned a non-iterable value");
}
|
||||
|
||||
// Reports whether `source` is a "self-iterator": an object whose
// `[Symbol.asyncIterator]()` / `[Symbol.iterator]()` hands back the object
// itself. Generators behave this way and are therefore exhausted after one
// pass, whereas replayable iterables (Array, Set, custom) return a fresh
// iterator on each call. The probe order (async, then sync, then bare
// `.next`) mirrors `normalizeIterator` so the classification matches what
// happens at scan time.
function isOneShotIterable(
  source: AsyncIterable<unknown> | Iterable<unknown>,
): boolean {
  // null/undefined are not one-shot in any meaningful sense; the real error
  // is raised by `normalizeIterator` at scan time.
  if (source == null) {
    return false;
  }

  const probe = source as unknown as Record<PropertyKey, unknown>;
  for (const key of [Symbol.asyncIterator, Symbol.iterator]) {
    const method = probe[key];
    if (typeof method === "function") {
      // Only the first protocol found is consulted.
      return method.call(source) === source;
    }
  }

  // Has `.next` but neither iteration symbol: already an iterator, which is
  // one-shot by definition.
  return typeof probe.next === "function";
}
|
||||
@@ -285,6 +285,25 @@ export abstract class Table {
|
||||
*/
|
||||
abstract prewarmIndex(name: string): Promise<void>;
|
||||
|
||||
/**
 * Prewarm one or more columns of data in the table.
 *
 * This will load the column data into the page cache so that future queries that
 * read those columns avoid the initial cold-start latency. This call initiates
 * prewarming and returns once the request is accepted; the warming itself may
 * continue in the background. Calling it on already-prewarmed columns is a
 * no-op on the server.
 *
 * Prewarming is generally useful for columns used in filters or projections.
 * Large columns (e.g. high-dimensional vectors or binary data) may not be
 * practical to prewarm.
 *
 * This feature is currently only supported on remote tables.
 *
 * @param columns The columns to prewarm. If undefined, all columns are prewarmed.
 */
|
||||
abstract prewarmData(columns?: string[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Waits for asynchronous indexing to complete on the table.
|
||||
*
|
||||
@@ -710,6 +729,10 @@ export class LocalTable extends Table {
|
||||
await this.inner.prewarmIndex(name);
|
||||
}
|
||||
|
||||
/** Forwards `columns` unchanged to the native table handle's `prewarmData` and awaits it. */
async prewarmData(columns?: string[]): Promise<void> {
|
||||
await this.inner.prewarmData(columns);
|
||||
}
|
||||
|
||||
async waitForIndex(
|
||||
indexNames: string[],
|
||||
timeoutSeconds: number,
|
||||
|
||||
10452
nodejs/package-lock.json
generated
10452
nodejs/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -38,15 +38,15 @@
|
||||
"url": "https://github.com/lancedb/lancedb"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@aws-sdk/client-dynamodb": "^3.33.0",
|
||||
"@aws-sdk/client-kms": "^3.33.0",
|
||||
"@aws-sdk/client-s3": "^3.33.0",
|
||||
"@aws-sdk/client-dynamodb": "3.1003.0",
|
||||
"@aws-sdk/client-kms": "3.1003.0",
|
||||
"@aws-sdk/client-s3": "3.1003.0",
|
||||
"@biomejs/biome": "^1.7.3",
|
||||
"@jest/globals": "^29.7.0",
|
||||
"@napi-rs/cli": "^3.5.1",
|
||||
"@napi-rs/cli": "3.5.1",
|
||||
"@types/axios": "^0.14.0",
|
||||
"@types/jest": "^29.1.2",
|
||||
"@types/node": "^22.7.4",
|
||||
"@types/node": "22.7.4",
|
||||
"@types/tmp": "^0.2.6",
|
||||
"apache-arrow-15": "npm:apache-arrow@15.0.0",
|
||||
"apache-arrow-16": "npm:apache-arrow@16.0.0",
|
||||
@@ -57,9 +57,9 @@
|
||||
"shx": "^0.3.4",
|
||||
"tmp": "^0.2.3",
|
||||
"ts-jest": "^29.1.2",
|
||||
"typedoc": "^0.26.4",
|
||||
"typedoc-plugin-markdown": "^4.2.1",
|
||||
"typescript": "^5.5.4",
|
||||
"typedoc": "0.26.4",
|
||||
"typedoc-plugin-markdown": "4.2.1",
|
||||
"typescript": "5.5.4",
|
||||
"typescript-eslint": "^7.1.0"
|
||||
},
|
||||
"ava": {
|
||||
@@ -68,16 +68,16 @@
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
},
|
||||
"packageManager": "pnpm@11.1.1",
|
||||
"cpu": ["x64", "arm64"],
|
||||
"os": ["darwin", "linux", "win32"],
|
||||
"scripts": {
|
||||
"artifacts": "napi artifacts",
|
||||
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
|
||||
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
||||
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/ && node -e \"require('fs').writeFileSync('dist/package.json', JSON.stringify({name:'@lancedb/lancedb',type:'commonjs'}))\"",
|
||||
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
|
||||
"postbuild:release": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
||||
"build": "npm run build:debug && npm run tsc",
|
||||
"build-release": "npm run build:release && npm run tsc",
|
||||
"build": "pnpm build:debug && pnpm tsc",
|
||||
"build-release": "pnpm build:release && pnpm tsc",
|
||||
"tsc": "tsc -b",
|
||||
"posttsc": "shx cp lancedb/native.d.ts dist/native.d.ts",
|
||||
"lint-ci": "biome ci .",
|
||||
@@ -87,7 +87,7 @@
|
||||
"lint-fix": "biome check --write . && biome format --write .",
|
||||
"prepublishOnly": "napi prepublish -t npm",
|
||||
"test": "jest --verbose",
|
||||
"integration": "S3_TEST=1 npm run test",
|
||||
"integration": "S3_TEST=1 pnpm test",
|
||||
"universal": "napi universalize",
|
||||
"version": "napi version"
|
||||
},
|
||||
@@ -95,8 +95,8 @@
|
||||
"reflect-metadata": "^0.2.2"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@huggingface/transformers": "^3.0.2",
|
||||
"openai": "^4.29.2"
|
||||
"@huggingface/transformers": "3.0.2",
|
||||
"openai": "4.29.2"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"apache-arrow": ">=15.0.0 <=18.1.0"
|
||||
|
||||
7317
nodejs/pnpm-lock.yaml
generated
Normal file
7317
nodejs/pnpm-lock.yaml
generated
Normal file
File diff suppressed because it is too large
Load Diff
18
nodejs/pnpm-workspace.yaml
Normal file
18
nodejs/pnpm-workspace.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
# Flat node_modules layout. The @napi-rs/cli build step fails to locate
|
||||
# the cdylib artifact under pnpm's isolated layout; the hoisted linker
|
||||
# mirrors npm's structure and unblocks the native build.
|
||||
nodeLinker: hoisted
|
||||
|
||||
# Block resolution of versions less than 24h old (Shai-Hulud window).
|
||||
# This is the pnpm 11 default but pinned here so it's visible to
|
||||
# reviewers and survives a future pnpm major flipping the default.
|
||||
minimumReleaseAge: 1440
|
||||
|
||||
# Fail install if a transitive dep tries to run an unapproved script.
|
||||
strictDepBuilds: true
|
||||
|
||||
allowBuilds:
|
||||
'@biomejs/biome': true
|
||||
onnxruntime-node: true
|
||||
protobufjs: true
|
||||
sharp: true
|
||||
@@ -8,12 +8,16 @@ use lancedb::database::{CreateTableMode, Database};
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi_derive::*;
|
||||
|
||||
use crate::ConnectNamespaceOptions;
|
||||
use crate::ConnectionOptions;
|
||||
use crate::error::NapiErrorExt;
|
||||
use crate::header::JsHeaderProvider;
|
||||
use crate::table::Table;
|
||||
use lancedb::connection::{ConnectBuilder, Connection as LanceDBConnection};
|
||||
use lancedb::connection::{ConnectBuilder, Connection as LanceDBConnection, connect_namespace};
|
||||
|
||||
use lance_namespace::models::{
|
||||
CreateNamespaceRequest, DescribeNamespaceRequest, DropNamespaceRequest, ListNamespacesRequest,
|
||||
};
|
||||
use lancedb::ipc::{ipc_file_to_batches, ipc_file_to_schema};
|
||||
|
||||
#[napi]
|
||||
@@ -21,6 +25,29 @@ pub struct Connection {
|
||||
inner: Option<LanceDBConnection>,
|
||||
}
|
||||
|
||||
/// JS-facing result of `Connection::describe_namespace`; carries the
/// `properties` field of the underlying namespace response.
#[napi(object)]
|
||||
pub struct DescribeNamespaceResponse {
|
||||
pub properties: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
/// JS-facing result of `Connection::list_namespaces`; both fields are copied
/// from the underlying namespace response.
#[napi(object)]
|
||||
pub struct ListNamespacesResponse {
|
||||
pub namespaces: Vec<String>,
|
||||
pub page_token: Option<String>,
|
||||
}
|
||||
|
||||
/// JS-facing result of `Connection::create_namespace`; both fields are copied
/// from the underlying namespace response.
#[napi(object)]
|
||||
pub struct CreateNamespaceResponse {
|
||||
pub properties: Option<HashMap<String, String>>,
|
||||
pub transaction_id: Option<String>,
|
||||
}
|
||||
|
||||
/// JS-facing result of `Connection::drop_namespace`; both fields are copied
/// from the underlying namespace response.
#[napi(object)]
|
||||
pub struct DropNamespaceResponse {
|
||||
pub properties: Option<HashMap<String, String>>,
|
||||
// Unlike `CreateNamespaceResponse`, this is a list of ids.
pub transaction_id: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
impl Connection {
|
||||
pub(crate) fn inner_new(inner: LanceDBConnection) -> Self {
|
||||
Self { inner: Some(inner) }
|
||||
@@ -106,6 +133,39 @@ impl Connection {
|
||||
Ok(Self::inner_new(builder.execute().await.default_error()?))
|
||||
}
|
||||
|
||||
/// Create a new Connection instance backed by a namespace implementation.
|
||||
#[napi(factory)]
|
||||
pub async fn new_with_namespace(
|
||||
impl_name: String,
|
||||
properties: HashMap<String, String>,
|
||||
options: ConnectNamespaceOptions,
|
||||
) -> napi::Result<Self> {
|
||||
if impl_name.is_empty() {
|
||||
return Err(napi::Error::from_reason(
|
||||
"implName must be a non-empty string",
|
||||
));
|
||||
}
|
||||
|
||||
let mut builder = connect_namespace(&impl_name, properties);
|
||||
if let Some(interval) = options.read_consistency_interval {
|
||||
builder =
|
||||
builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));
|
||||
}
|
||||
if let Some(storage_options) = options.storage_options {
|
||||
for (key, value) in storage_options {
|
||||
builder = builder.storage_option(key, value);
|
||||
}
|
||||
}
|
||||
if let Some(namespace_client_properties) = options.namespace_client_properties {
|
||||
builder = builder.namespace_client_properties(namespace_client_properties);
|
||||
}
|
||||
if let Some(session) = options.session {
|
||||
builder = builder.session(session.inner.clone());
|
||||
}
|
||||
|
||||
Ok(Self::inner_new(builder.execute().await.default_error()?))
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn display(&self) -> napi::Result<String> {
|
||||
Ok(self.get_inner()?.to_string())
|
||||
@@ -268,9 +328,149 @@ impl Connection {
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn rename_table(
|
||||
&self,
|
||||
old_name: String,
|
||||
new_name: String,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
) -> napi::Result<()> {
|
||||
let ns = namespace_path.unwrap_or_default();
|
||||
self.get_inner()?
|
||||
.rename_table(&old_name, &new_name, &ns, &ns)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn drop_all_tables(&self, namespace_path: Option<Vec<String>>) -> napi::Result<()> {
|
||||
let ns = namespace_path.unwrap_or_default();
|
||||
self.get_inner()?.drop_all_tables(&ns).await.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
/// Describe a namespace and return its properties.
|
||||
pub async fn describe_namespace(
|
||||
&self,
|
||||
namespace_path: Vec<String>,
|
||||
) -> napi::Result<DescribeNamespaceResponse> {
|
||||
let req = DescribeNamespaceRequest {
|
||||
id: Some(namespace_path),
|
||||
..Default::default()
|
||||
};
|
||||
let resp = self
|
||||
.get_inner()?
|
||||
.describe_namespace(req)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(DescribeNamespaceResponse {
|
||||
properties: resp.properties,
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
/// List child namespaces under the given namespace path
|
||||
pub async fn list_namespaces(
|
||||
&self,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
page_token: Option<String>,
|
||||
limit: Option<u32>,
|
||||
) -> napi::Result<ListNamespacesResponse> {
|
||||
let req = ListNamespacesRequest {
|
||||
id: namespace_path,
|
||||
page_token,
|
||||
limit: limit.map(|l| l as i32),
|
||||
..Default::default()
|
||||
};
|
||||
let resp = self
|
||||
.get_inner()?
|
||||
.list_namespaces(req)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(ListNamespacesResponse {
|
||||
namespaces: resp.namespaces,
|
||||
page_token: resp.page_token,
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
/// Create a new namespace with optional properties.
|
||||
pub async fn create_namespace(
|
||||
&self,
|
||||
namespace_path: Vec<String>,
|
||||
mode: Option<String>,
|
||||
properties: Option<HashMap<String, String>>,
|
||||
) -> napi::Result<CreateNamespaceResponse> {
|
||||
let mode_str = mode
|
||||
.map(|m| match m.to_lowercase().as_str() {
|
||||
"create" => Ok("Create".to_string()),
|
||||
"exist_ok" => Ok("ExistOk".to_string()),
|
||||
"overwrite" => Ok("Overwrite".to_string()),
|
||||
_ => Err(napi::Error::from_reason(format!(
|
||||
"Invalid mode '{}': expected one of 'create', 'exist_ok', 'overwrite'",
|
||||
m
|
||||
))),
|
||||
})
|
||||
.transpose()?;
|
||||
let req = CreateNamespaceRequest {
|
||||
id: Some(namespace_path),
|
||||
mode: mode_str,
|
||||
properties,
|
||||
..Default::default()
|
||||
};
|
||||
let resp = self
|
||||
.get_inner()?
|
||||
.create_namespace(req)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(CreateNamespaceResponse {
|
||||
properties: resp.properties,
|
||||
transaction_id: resp.transaction_id,
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
/// Drop a namespace.
|
||||
pub async fn drop_namespace(
|
||||
&self,
|
||||
namespace_path: Vec<String>,
|
||||
mode: Option<String>,
|
||||
behavior: Option<String>,
|
||||
) -> napi::Result<DropNamespaceResponse> {
|
||||
let mode_str = mode
|
||||
.map(|m| match m.to_lowercase().as_str() {
|
||||
"skip" => Ok("Skip".to_string()),
|
||||
"fail" => Ok("Fail".to_string()),
|
||||
_ => Err(napi::Error::from_reason(format!(
|
||||
"Invalid mode '{}': expected one of 'skip', 'fail'",
|
||||
m
|
||||
))),
|
||||
})
|
||||
.transpose()?;
|
||||
let behavior_str = behavior
|
||||
.map(|b| match b.to_lowercase().as_str() {
|
||||
"restrict" => Ok("Restrict".to_string()),
|
||||
"cascade" => Ok("Cascade".to_string()),
|
||||
_ => Err(napi::Error::from_reason(format!(
|
||||
"Invalid behavior '{}': expected one of 'restrict', 'cascade'",
|
||||
b
|
||||
))),
|
||||
})
|
||||
.transpose()?;
|
||||
let req = DropNamespaceRequest {
|
||||
id: Some(namespace_path),
|
||||
mode: mode_str,
|
||||
behavior: behavior_str,
|
||||
..Default::default()
|
||||
};
|
||||
let resp = self
|
||||
.get_inner()?
|
||||
.drop_namespace(req)
|
||||
.await
|
||||
.default_error()?;
|
||||
Ok(DropNamespaceResponse {
|
||||
properties: resp.properties,
|
||||
transaction_id: resp.transaction_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ pub mod permutation;
|
||||
mod query;
|
||||
pub mod remote;
|
||||
mod rerankers;
|
||||
mod scannable;
|
||||
mod session;
|
||||
mod table;
|
||||
mod util;
|
||||
@@ -67,6 +68,26 @@ pub struct OpenTableOptions {
|
||||
pub storage_options: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
/// Options accepted by `Connection::new_with_namespace`. Every field is
/// optional; only the ones that are set are applied to the connection builder.
#[napi(object)]
|
||||
#[derive(Debug)]
|
||||
pub struct ConnectNamespaceOptions {
|
||||
/// The interval, in seconds, at which to check for updates to the table
|
||||
/// from other processes. If None, then consistency is not checked. For
|
||||
/// performance reasons, this is the default. For strong consistency, set
|
||||
/// this to zero seconds. Then every read will check for updates from other
|
||||
/// processes. As a compromise, you can set this to a non-zero value for
|
||||
/// eventual consistency.
|
||||
pub read_consistency_interval: Option<f64>,
|
||||
/// Configuration for object storage. The available options are described
|
||||
/// at https://docs.lancedb.com/storage/
|
||||
pub storage_options: Option<HashMap<String, String>>,
|
||||
/// Extra properties for the backing namespace client.
|
||||
pub namespace_client_properties: Option<HashMap<String, String>>,
|
||||
/// The session to use for this connection. Holds shared caches and other
|
||||
/// session-specific state.
|
||||
pub session: Option<session::Session>,
|
||||
}
|
||||
|
||||
#[napi_derive::module_init]
|
||||
fn init() {
|
||||
let env = Env::new()
|
||||
|
||||
253
nodejs/src/scannable.rs
Normal file
253
nodejs/src/scannable.rs
Normal file
@@ -0,0 +1,253 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
//! NodeJS binding for the [`lancedb::data::scannable::Scannable`] trait.
|
||||
//!
|
||||
//! The JS side supplies a `getNextBatch(isStart)` callback that returns the
|
||||
//! next Arrow `RecordBatch` encoded as a self-contained Arrow IPC Stream
|
||||
//! message (schema message + record batch message + EOS marker) wrapped in a
|
||||
//! `Buffer`, or `null` when the stream is exhausted. The Rust side parses
|
||||
//! each buffer with `arrow_ipc::reader::StreamReader`, validates every
|
||||
//! standalone batch stream against the declared schema, and yields decoded
|
||||
//! `RecordBatch`es as a [`SendableRecordBatchStream`].
|
||||
//!
|
||||
//! `isStart` is `true` on the first `getNextBatch` call of each new
|
||||
//! `scan_as_stream` and `false` thereafter. JS uses it to drop any cached
|
||||
//! iterator and re-invoke its factory at scan boundaries, so retries
|
||||
//! triggered by mid-stream failures restart at batch 0.
|
||||
|
||||
use std::io::Cursor;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_array::RecordBatch;
|
||||
use arrow_ipc::reader::StreamReader;
|
||||
use arrow_schema::SchemaRef;
|
||||
use futures::stream::once;
|
||||
use lancedb::arrow::{SendableRecordBatchStream, SimpleRecordBatchStream};
|
||||
use lancedb::data::scannable::Scannable as LanceScannable;
|
||||
use lancedb::ipc::ipc_file_to_schema;
|
||||
use lancedb::{Error, Result as LanceResult};
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi::threadsafe_function::ThreadsafeFunction;
|
||||
use napi_derive::napi;
|
||||
|
||||
/// Threadsafe handle to the JS `getNextBatch` callback. The callback takes a
|
||||
/// single boolean `isStart` (`true` on the first call of each new scan) and
|
||||
/// returns a Promise that resolves to a `Buffer` containing one IPC Stream
|
||||
/// message, or `null` at end-of-stream.
|
||||
type GetNextBatchFn = ThreadsafeFunction<bool, Promise<Option<Buffer>>, bool, Status, false>;
|
||||
|
||||
/// A Rust-side view of a JS-constructed `Scannable`.
|
||||
///
|
||||
/// Held in JS as the return value of the `Scannable` class constructor. When
|
||||
/// passed to a consumer that accepts `impl lancedb::data::scannable::Scannable`,
|
||||
/// the consumer invokes `scan_as_stream()` to pull batches through the JS
|
||||
/// callback.
|
||||
#[napi]
|
||||
pub struct NapiScannable {
|
||||
schema: SchemaRef,
|
||||
num_rows: Option<usize>,
|
||||
rescannable: bool,
|
||||
// `ThreadsafeFunction` is not `Clone`; wrap in `Arc` so the stream
|
||||
// returned by `scan_as_stream` can own a handle independent of `self`.
|
||||
get_next_batch: Arc<GetNextBatchFn>,
|
||||
// Tracks whether a scan has already started; used to enforce one-shot
|
||||
// semantics on non-rescannable sources.
|
||||
scanned: bool,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
impl NapiScannable {
|
||||
/// Construct a new `NapiScannable`.
|
||||
///
|
||||
/// - `schema_buf` — Arrow IPC File buffer carrying only the schema (no batches).
|
||||
/// - `num_rows` — optional row count hint; not validated against the stream.
|
||||
/// - `rescannable` — whether `get_next_batch` may be re-driven after the
|
||||
/// scan completes.
|
||||
/// - `get_next_batch` -- JS callback that yields the next batch as an Arrow
|
||||
/// IPC Stream message wrapped in a `Buffer`, or `null` at EOF. The
|
||||
/// `isStart` argument is `true` on the first call of each new scan;
|
||||
/// JS uses it to discard any cached iterator before pulling.
|
||||
#[napi(constructor)]
|
||||
pub fn new(
|
||||
schema_buf: Buffer,
|
||||
num_rows: Option<i64>,
|
||||
rescannable: bool,
|
||||
get_next_batch: Function<bool, Promise<Option<Buffer>>>,
|
||||
) -> napi::Result<Self> {
|
||||
let schema = ipc_file_to_schema(schema_buf.to_vec())
|
||||
.map_err(|e| napi::Error::from_reason(format!("Invalid schema buffer: {}", e)))?;
|
||||
let num_rows = num_rows
|
||||
.map(|n| {
|
||||
usize::try_from(n)
|
||||
.map_err(|_| napi::Error::from_reason("num_rows must be non-negative"))
|
||||
})
|
||||
.transpose()?;
|
||||
let get_next_batch = Arc::new(get_next_batch.build_threadsafe_function().build()?);
|
||||
Ok(Self {
|
||||
schema,
|
||||
num_rows,
|
||||
rescannable,
|
||||
get_next_batch,
|
||||
scanned: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for NapiScannable {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("NapiScannable")
|
||||
.field("schema", &self.schema)
|
||||
.field("num_rows", &self.num_rows)
|
||||
.field("rescannable", &self.rescannable)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl LanceScannable for NapiScannable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.schema.clone()
|
||||
}
|
||||
|
||||
fn scan_as_stream(&mut self) -> SendableRecordBatchStream {
|
||||
let schema = self.schema.clone();
|
||||
|
||||
// One-shot enforcement for non-rescannable sources: return a stream
|
||||
// whose first item is an error.
|
||||
if self.scanned && !self.rescannable {
|
||||
let err_stream = once(async {
|
||||
Err(Error::InvalidInput {
|
||||
message: "Scannable has already been consumed (non-rescannable source)"
|
||||
.to_string(),
|
||||
})
|
||||
});
|
||||
return Box::pin(SimpleRecordBatchStream::new(err_stream, schema));
|
||||
}
|
||||
self.scanned = true;
|
||||
|
||||
let tsfn = Arc::clone(&self.get_next_batch);
|
||||
let declared_schema = schema.clone();
|
||||
|
||||
// State threaded through the unfold. `is_first_pull` starts true so
|
||||
// the first call into JS signals a new-scan boundary; JS uses it to
|
||||
// reset any cached iterator before factory()-ing a fresh one.
|
||||
let initial = State {
|
||||
tsfn,
|
||||
batch_index: 0,
|
||||
declared_schema,
|
||||
errored: false,
|
||||
is_first_pull: true,
|
||||
};
|
||||
|
||||
let stream = futures::stream::unfold(initial, |mut state| async move {
|
||||
if state.errored {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Pull the next IPC Stream buffer from JS. `is_first_pull` is
|
||||
// consumed here and cleared so subsequent pulls continue the
|
||||
// same scan rather than restarting it.
|
||||
let is_start = state.is_first_pull;
|
||||
state.is_first_pull = false;
|
||||
let buf = match pull_next(&state.tsfn, is_start).await {
|
||||
Ok(Some(buf)) => buf,
|
||||
Ok(None) => return None,
|
||||
Err(e) => {
|
||||
state.errored = true;
|
||||
return Some((Err(e), state));
|
||||
}
|
||||
};
|
||||
|
||||
match decode_one_batch(buf.as_ref(), &state.declared_schema) {
|
||||
Ok(batch) => {
|
||||
state.batch_index += 1;
|
||||
Some((Ok(batch), state))
|
||||
}
|
||||
Err(e) => {
|
||||
let tagged = Error::Runtime {
|
||||
message: format!(
|
||||
"[scannable/rust-bridge] failure at batch index {}: {}",
|
||||
state.batch_index, e
|
||||
),
|
||||
};
|
||||
state.errored = true;
|
||||
Some((Err(tagged), state))
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Box::pin(SimpleRecordBatchStream::new(stream, schema))
|
||||
}
|
||||
|
||||
fn num_rows(&self) -> Option<usize> {
|
||||
self.num_rows
|
||||
}
|
||||
|
||||
fn rescannable(&self) -> bool {
|
||||
self.rescannable
|
||||
}
|
||||
}
|
||||
|
||||
struct State {
|
||||
tsfn: Arc<GetNextBatchFn>,
|
||||
batch_index: usize,
|
||||
declared_schema: SchemaRef,
|
||||
errored: bool,
|
||||
/// True for the very first pull of a new scan. Forwarded to JS so the
|
||||
/// callback can drop any cached iterator and call its factory fresh,
|
||||
/// which makes rescannable sources restart at batch 0 even when the
|
||||
/// previous scan ended mid-stream.
|
||||
is_first_pull: bool,
|
||||
}
|
||||
|
||||
/// Invoke the JS callback and await its Promise. `is_start` is forwarded to
|
||||
/// the JS side as the `isStart` argument so it can reset its iterator at the
|
||||
/// scan boundary. Errors on the JS side surface here as rejected promises
|
||||
/// and are tunneled back as `lancedb::Error::Runtime`.
|
||||
async fn pull_next(tsfn: &GetNextBatchFn, is_start: bool) -> LanceResult<Option<Buffer>> {
|
||||
let promise = tsfn
|
||||
.call_async(is_start)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!(
|
||||
"[scannable/js-factory] napi error status={}, reason={}",
|
||||
e.status, e.reason
|
||||
),
|
||||
})?;
|
||||
promise.await.map_err(|e| Error::Runtime {
|
||||
message: format!(
|
||||
"[scannable/js-iterator] napi error status={}, reason={}",
|
||||
e.status, e.reason
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
/// Decode one IPC Stream buffer (schema + batch + EOS) into a `RecordBatch`.
|
||||
/// Each buffer is a standalone IPC stream, so every decoded stream schema must
|
||||
/// match the one declared at construction.
|
||||
fn decode_one_batch(buf: &[u8], declared: &SchemaRef) -> LanceResult<RecordBatch> {
|
||||
let reader = StreamReader::try_new(Cursor::new(buf), None).map_err(|e| Error::Runtime {
|
||||
message: format!("failed to open IPC stream reader: {}", e),
|
||||
})?;
|
||||
|
||||
let actual = reader.schema();
|
||||
if actual.as_ref() != declared.as_ref() {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"declared schema does not match stream schema: declared={:?} actual={:?}",
|
||||
declared, actual
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let mut iter = reader;
|
||||
let batch = iter
|
||||
.next()
|
||||
.ok_or_else(|| Error::Runtime {
|
||||
message: "IPC stream contained schema but no record batch".to_string(),
|
||||
})?
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("failed to decode record batch: {}", e),
|
||||
})?;
|
||||
Ok(batch)
|
||||
}
|
||||
@@ -159,6 +159,14 @@ impl Table {
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn prewarm_data(&self, columns: Option<Vec<String>>) -> napi::Result<()> {
|
||||
self.inner_ref()?
|
||||
.prewarm_data(columns)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn wait_for_index(&self, index_names: Vec<String>, timeout_s: i64) -> Result<()> {
|
||||
let timeout = std::time::Duration::from_secs(timeout_s.try_into().unwrap());
|
||||
|
||||
@@ -19,6 +19,7 @@ arrow = { version = "58.0.0", features = ["pyarrow"] }
|
||||
async-trait = "0.1"
|
||||
bytes = "1"
|
||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||
datafusion-common.workspace = true
|
||||
lance-core.workspace = true
|
||||
lance-namespace.workspace = true
|
||||
lance-namespace-impls.workspace = true
|
||||
@@ -35,7 +36,8 @@ futures.workspace = true
|
||||
serde = "1"
|
||||
serde_json = "1"
|
||||
snafu.workspace = true
|
||||
tokio = { version = "1.40", features = ["sync"] }
|
||||
tokio = { version = "1.40", features = ["sync", "rt-multi-thread"] }
|
||||
libc = "0.2"
|
||||
|
||||
[build-dependencies]
|
||||
pyo3-build-config = { version = "0.28", features = [
|
||||
|
||||
@@ -7,7 +7,6 @@ import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import timedelta
|
||||
from typing import Dict, Optional, Union, Any, List
|
||||
import warnings
|
||||
|
||||
__version__ = importlib.metadata.version("lancedb")
|
||||
|
||||
@@ -438,13 +437,3 @@ __all__ = [
|
||||
"Table",
|
||||
"__version__",
|
||||
]
|
||||
|
||||
|
||||
def __warn_on_fork():
|
||||
warnings.warn(
|
||||
"lance is not fork-safe. If you are using multiprocessing, use spawn instead.",
|
||||
)
|
||||
|
||||
|
||||
if hasattr(os, "register_at_fork"):
|
||||
os.register_at_fork(before=__warn_on_fork) # type: ignore[attr-defined]
|
||||
|
||||
@@ -12,6 +12,7 @@ from .index import (
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from lance_namespace import (
|
||||
@@ -25,6 +26,7 @@ from .remote import ClientConfig
|
||||
|
||||
IvfHnswPq: type[HnswPq] = HnswPq
|
||||
IvfHnswSq: type[HnswSq] = HnswSq
|
||||
IvfHnswFlat: type[HnswFlat] = HnswFlat
|
||||
|
||||
class PyExpr:
|
||||
"""A type-safe DataFusion expression node (Rust-side handle)."""
|
||||
@@ -49,7 +51,7 @@ class PyExpr:
|
||||
def to_sql(self) -> str: ...
|
||||
|
||||
def expr_col(name: str) -> PyExpr: ...
|
||||
def expr_lit(value: Union[bool, int, float, str]) -> PyExpr: ...
|
||||
def expr_lit(value: Union[bool, int, float, str, bytes]) -> PyExpr: ...
|
||||
def expr_func(name: str, args: List[PyExpr]) -> PyExpr: ...
|
||||
|
||||
class Session:
|
||||
@@ -180,6 +182,7 @@ class Table:
|
||||
IvfPq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
@@ -442,7 +445,7 @@ class AsyncPermutationBuilder:
|
||||
async def execute(self) -> Table: ...
|
||||
|
||||
def async_permutation_builder(
|
||||
table: Table, dest_table_name: str
|
||||
table: Table,
|
||||
) -> AsyncPermutationBuilder: ...
|
||||
def fts_query_to_json(query: Any) -> str: ...
|
||||
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import threading
|
||||
import warnings
|
||||
|
||||
|
||||
class BackgroundEventLoop:
|
||||
@@ -13,6 +15,9 @@ class BackgroundEventLoop:
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._start()
|
||||
|
||||
def _start(self):
|
||||
self.loop = asyncio.new_event_loop()
|
||||
self.thread = threading.Thread(
|
||||
target=self.loop.run_forever,
|
||||
@@ -31,3 +36,30 @@ class BackgroundEventLoop:
|
||||
|
||||
|
||||
LOOP = BackgroundEventLoop()
|
||||
|
||||
_FORK_WARNED = False
|
||||
|
||||
|
||||
def _reset_after_fork():
    """Restart the background event loop in a forked child and warn once.

    Threads do not survive fork(), so the asyncio loop running in LOOP.thread
    is dead in the child. The singleton is re-initialized in place so that
    modules which already did ``from .background_loop import LOOP`` observe
    the new state. The Rust-side tokio runtime is reset analogously by a
    pthread_atfork hook installed in the _lancedb extension.
    """
    global _FORK_WARNED

    LOOP._start()

    # Only the first fork in a given process emits the warning.
    if _FORK_WARNED:
        return
    _FORK_WARNED = True

    message = (
        "lancedb fork support is experimental: the internal async "
        "runtime has been reset in the forked child, but a small chance "
        "of deadlock remains if other state was mid-operation at fork "
        "time. The 'forkserver' or 'spawn' multiprocessing start method "
        "is likely a safer alternative."
    )
    warnings.warn(message, RuntimeWarning, stacklevel=2)
|
||||
|
||||
|
||||
if hasattr(os, "register_at_fork"):
|
||||
os.register_at_fork(after_in_child=_reset_after_fork)
|
||||
|
||||
@@ -63,7 +63,7 @@ def _coerce(value: "ExprLike") -> "Expr":
|
||||
|
||||
|
||||
# Type alias used in annotations.
|
||||
ExprLike = Union["Expr", bool, int, float, str]
|
||||
ExprLike = Union["Expr", bool, int, float, str, bytes]
|
||||
|
||||
|
||||
class Expr:
|
||||
@@ -261,13 +261,13 @@ def col(name: str) -> Expr:
|
||||
return Expr(expr_col(name))
|
||||
|
||||
|
||||
def lit(value: Union[bool, int, float, str]) -> Expr:
|
||||
def lit(value: Union[bool, int, float, str, bytes]) -> Expr:
|
||||
"""Create a literal (constant) value expression.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
value:
|
||||
A Python ``bool``, ``int``, ``float``, or ``str``.
|
||||
A Python ``bool``, ``int``, ``float``, ``str``, or ``bytes``.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
@@ -7,6 +7,7 @@ from typing import Literal, Optional
|
||||
from ._lancedb import (
|
||||
IndexConfig,
|
||||
)
|
||||
from .types import BaseTokenizerType
|
||||
|
||||
lang_mapping = {
|
||||
"ar": "Arabic",
|
||||
@@ -111,8 +112,12 @@ class FTS:
|
||||
- "simple": Splits text by whitespace and punctuation.
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-gram tokenizer for substring-style matching.
|
||||
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
|
||||
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not the
|
||||
primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -127,10 +132,17 @@ class FTS:
|
||||
ascii_folding : bool, default True
|
||||
Whether to fold ASCII characters. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models in Lance's language model home. Set
|
||||
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
|
||||
directory under ``lance/language_models``.
|
||||
"""
|
||||
|
||||
with_position: bool = False
|
||||
base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
|
||||
base_tokenizer: BaseTokenizerType = "simple"
|
||||
language: str = "English"
|
||||
max_token_length: Optional[int] = 40
|
||||
lower_case: bool = True
|
||||
@@ -376,9 +388,98 @@ class HnswSq:
|
||||
target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass
class HnswFlat:
    """Configuration for an HNSW-FLAT index.

    HNSW-FLAT is a Hierarchical Navigable Small World index without
    quantization. Raw vectors are stored in the HNSW graph, which gives the
    highest recall among the IVF_HNSW family at the cost of more memory and
    disk space than :class:`HnswSq` or :class:`HnswPq`.

    Parameters
    ----------
    distance_type : str, default "l2"
        The distance metric used to train the index. Available values:

        "l2" - Euclidean distance. A very common metric that accounts for
        both magnitude and direction when determining the distance between
        vectors. l2 distance has a range of [0, ∞).

        "cosine" - Cosine distance, calculated from the cosine similarity
        between two vectors. Cosine similarity measures the similarity of two
        non-zero vectors of an inner product space and equals the cosine of
        the angle between them. Unlike l2, cosine distance is not affected by
        the magnitude of the vectors. Cosine distance has a range of [0, 2].

        "dot" - Dot product of two vectors. Dot distance has a range of
        (-∞, ∞). If the vectors are normalized (i.e. their l2 norm is 1),
        dot distance is equivalent to cosine distance.
    num_partitions : int, default sqrt(num_rows)
        The number of IVF partitions to create.

        For HNSW, a small number of partitions is recommended. Setting this
        to 1 works well for most tables. For very large tables, training just
        one HNSW graph will require too much memory. Each partition becomes
        its own HNSW graph, so a higher value reduces the peak memory use of
        training.
    max_iterations : int, default 50
        Max iterations to train kmeans.

        When training an IVF index, kmeans is used to calculate the
        partitions. This parameter controls how many iterations of kmeans to
        run.
    sample_rate : int, default 256
        The rate used to calculate the number of training vectors for kmeans.
    m : int, default 20
        The number of neighbors to select for each vector in the HNSW graph.

        Controls the tradeoff between search speed and accuracy: the higher
        the value, the more accurate the search but the slower it will be.
    ef_construction : int, default 300
        The number of candidates to evaluate during construction of the HNSW
        graph.

        Controls the tradeoff between build speed and accuracy: the higher
        the value, the more accurate the build but the slower it will be.
        150 to 300 is the typical range. 100 is a minimum for good quality
        search results. In most cases there is no benefit to setting this
        higher than 500. It should not be less than `ef` in the search phase.
    target_partition_size : int, default 1,048,576
        The target size of each partition.
    """

    distance_type: Literal["l2", "cosine", "dot"] = "l2"
    num_partitions: Optional[int] = None
    max_iterations: int = 50
    sample_rate: int = 256
    m: int = 20
    ef_construction: int = 300
    target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
# Backwards-compatible aliases
|
||||
IvfHnswPq = HnswPq
|
||||
IvfHnswSq = HnswSq
|
||||
IvfHnswFlat = HnswFlat
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -698,11 +799,13 @@ __all__ = [
|
||||
"IvfPq",
|
||||
"IvfHnswPq",
|
||||
"IvfHnswSq",
|
||||
"IvfHnswFlat",
|
||||
"IvfSq",
|
||||
"IvfRq",
|
||||
"IvfFlat",
|
||||
"HnswPq",
|
||||
"HnswSq",
|
||||
"HnswFlat",
|
||||
"IndexConfig",
|
||||
"FTS",
|
||||
"Bitmap",
|
||||
|
||||
@@ -6,22 +6,44 @@
|
||||
from typing import Optional
|
||||
|
||||
|
||||
_CREATE_NAMESPACE_MODES = frozenset({"create", "exist_ok", "overwrite"})
|
||||
_DROP_NAMESPACE_MODES = frozenset({"SKIP", "FAIL"})
|
||||
_DROP_NAMESPACE_BEHAVIORS = frozenset({"RESTRICT", "CASCADE"})
|
||||
|
||||
|
||||
def _normalize_create_namespace_mode(mode: Optional[str]) -> Optional[str]:
|
||||
"""Normalize create namespace mode to lowercase (API expects lowercase)."""
|
||||
if mode is None:
|
||||
return None
|
||||
return mode.lower()
|
||||
normalized = mode.lower()
|
||||
if normalized not in _CREATE_NAMESPACE_MODES:
|
||||
raise ValueError(
|
||||
f"Invalid create namespace mode {mode!r}: "
|
||||
f"expected one of 'create', 'exist_ok', 'overwrite'"
|
||||
)
|
||||
return normalized
|
||||
|
||||
|
||||
def _normalize_drop_namespace_mode(mode: Optional[str]) -> Optional[str]:
|
||||
"""Normalize drop namespace mode to uppercase (API expects uppercase)."""
|
||||
if mode is None:
|
||||
return None
|
||||
return mode.upper()
|
||||
normalized = mode.upper()
|
||||
if normalized not in _DROP_NAMESPACE_MODES:
|
||||
raise ValueError(
|
||||
f"Invalid drop namespace mode {mode!r}: expected one of 'skip', 'fail'"
|
||||
)
|
||||
return normalized
|
||||
|
||||
|
||||
def _normalize_drop_namespace_behavior(behavior: Optional[str]) -> Optional[str]:
|
||||
"""Normalize drop namespace behavior to uppercase (API expects uppercase)."""
|
||||
if behavior is None:
|
||||
return None
|
||||
return behavior.upper()
|
||||
normalized = behavior.upper()
|
||||
if normalized not in _DROP_NAMESPACE_BEHAVIORS:
|
||||
raise ValueError(
|
||||
f"Invalid drop namespace behavior {behavior!r}: "
|
||||
f"expected one of 'restrict', 'cascade'"
|
||||
)
|
||||
return normalized
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
from deprecation import deprecated
|
||||
from lancedb import AsyncConnection, DBConnection
|
||||
import pyarrow as pa
|
||||
import copy
|
||||
import json
|
||||
|
||||
from deprecation import deprecated
|
||||
import pyarrow as pa
|
||||
|
||||
from ._lancedb import async_permutation_builder, PermutationReader
|
||||
from .table import LanceTable
|
||||
from .background_loop import LOOP
|
||||
@@ -36,10 +37,7 @@ class PermutationBuilder:
|
||||
be referenced by name in the future. If names are not provided then they can only
|
||||
be referenced by their ordinal index. There is no requirement to name every split.
|
||||
|
||||
By default, the permutation will be stored in memory and will be lost when the
|
||||
program exits. To persist the permutation (for very large datasets or to share
|
||||
the permutation across multiple workers) use the [persist](#persist) method to
|
||||
create a permanent table.
|
||||
The permutation is stored in memory and will be lost when the program exits.
|
||||
"""
|
||||
|
||||
def __init__(self, table: LanceTable):
|
||||
@@ -51,15 +49,6 @@ class PermutationBuilder:
|
||||
"""
|
||||
self._async = async_permutation_builder(table)
|
||||
|
||||
def persist(
|
||||
self, database: Union[DBConnection, AsyncConnection], table_name: str
|
||||
) -> "PermutationBuilder":
|
||||
"""
|
||||
Persist the permutation to the given database.
|
||||
"""
|
||||
self._async.persist(database, table_name)
|
||||
return self
|
||||
|
||||
def split_random(
|
||||
self,
|
||||
*,
|
||||
@@ -380,20 +369,44 @@ class Permutation:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
reader: PermutationReader,
|
||||
base_table: LanceTable,
|
||||
permutation_table: Optional[LanceTable],
|
||||
split: int,
|
||||
selection: dict[str, str],
|
||||
batch_size: int,
|
||||
transform_fn: Callable[pa.RecordBatch, Any],
|
||||
offset: Optional[int] = None,
|
||||
limit: Optional[int] = None,
|
||||
connection_factory: Optional[Callable[[str], LanceTable]] = None,
|
||||
_reader: Optional[PermutationReader] = None,
|
||||
):
|
||||
"""
|
||||
Internal constructor. Use [from_tables](#from_tables) instead.
|
||||
"""
|
||||
assert reader is not None, "reader is required"
|
||||
assert base_table is not None, "base_table is required"
|
||||
assert selection is not None, "selection is required"
|
||||
self.reader = reader
|
||||
self.base_table = base_table
|
||||
self.permutation_table = permutation_table
|
||||
self.split = split
|
||||
self.selection = selection
|
||||
self.transform_fn = transform_fn
|
||||
self.batch_size = batch_size
|
||||
self.offset = offset
|
||||
self.limit = limit
|
||||
self.connection_factory = connection_factory
|
||||
if _reader is None:
|
||||
_reader = LOOP.run(self._build_reader())
|
||||
self.reader: PermutationReader = _reader
|
||||
|
||||
async def _build_reader(self) -> PermutationReader:
    """Create the reader for this permutation from its stored tables.

    The reader is built from ``base_table``, ``permutation_table`` and
    ``split``. ``offset`` is applied first and ``limit`` second, each only
    when it is not ``None``.
    """
    built = await PermutationReader.from_tables(
        self.base_table,
        self.permutation_table,
        self.split,
    )

    if self.offset is not None:
        built = await built.with_offset(self.offset)

    if self.limit is not None:
        built = await built.with_limit(self.limit)

    return built
|
||||
|
||||
def _with_selection(self, selection: dict[str, str]) -> "Permutation":
|
||||
"""
|
||||
@@ -402,21 +415,97 @@ class Permutation:
|
||||
Does not validation of the selection and it replaces it entirely. This is not
|
||||
intended for public use.
|
||||
"""
|
||||
return Permutation(self.reader, selection, self.batch_size, self.transform_fn)
|
||||
|
||||
def _with_reader(self, reader: PermutationReader) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation with the given reader
|
||||
|
||||
This is an internal method and should not be used directly.
|
||||
"""
|
||||
return Permutation(reader, self.selection, self.batch_size, self.transform_fn)
|
||||
new = copy.copy(self)
|
||||
new.selection = selection
|
||||
return new
|
||||
|
||||
def with_batch_size(self, batch_size: int) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation with the given batch size
|
||||
"""
|
||||
return Permutation(self.reader, self.selection, batch_size, self.transform_fn)
|
||||
new = copy.copy(self)
|
||||
new.batch_size = batch_size
|
||||
return new
|
||||
|
||||
def with_connection_factory(
|
||||
self, connection_factory: Callable[[str], LanceTable]
|
||||
) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation that will use ``connection_factory`` to reopen
|
||||
the base table when this permutation is unpickled in a worker process.
|
||||
|
||||
The factory is a callable that takes a single argument — the base table
|
||||
name — and returns a [LanceTable]. It must be picklable; the worker
|
||||
will pickle it via standard ``pickle`` and call it to recover the base
|
||||
table. Picklable callables in practice means top-level (module-level)
|
||||
functions, ``functools.partial`` of such functions, or instances of
|
||||
picklable classes implementing ``__call__``. Lambdas and closures over
|
||||
local variables don't pickle with the default protocol.
|
||||
|
||||
Setting a factory is necessary when the URI alone is not enough to
|
||||
re-open the connection — most importantly for LanceDB Cloud (``db://``)
|
||||
connections, where ``api_key`` and ``region`` aren't recoverable from
|
||||
the connection object after construction.
|
||||
|
||||
For local file or cloud-storage paths the factory is optional: if not
|
||||
set, ``__getstate__`` falls back to capturing
|
||||
``(uri, storage_options, namespace_path)`` and re-opening via
|
||||
``lancedb.connect(uri, storage_options=...)``.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Basic native (file-system path), parameterized via ``functools.partial``::
|
||||
|
||||
import functools, lancedb
|
||||
from lancedb.permutation import Permutation
|
||||
|
||||
def open_native_table(uri: str, table_name: str):
|
||||
return lancedb.connect(uri).open_table(table_name)
|
||||
|
||||
factory = functools.partial(open_native_table, "/data/lance_db")
|
||||
permutation = Permutation.identity(
|
||||
factory("training")
|
||||
).with_connection_factory(factory)
|
||||
|
||||
Native via :func:`lancedb.connect_namespace` (e.g. a directory- or
|
||||
REST-backed namespace client). The factory takes the
|
||||
implementation name and properties dict as partial-bound args so
|
||||
the worker can rebuild the same namespace connection::
|
||||
|
||||
def open_via_namespace(
|
||||
impl: str, properties: dict[str, str], table_name: str,
|
||||
):
|
||||
return lancedb.connect_namespace(impl, properties).open_table(
|
||||
table_name,
|
||||
)
|
||||
|
||||
factory = functools.partial(
|
||||
open_via_namespace,
|
||||
"dir",
|
||||
{"root": "/data/lance_db"},
|
||||
)
|
||||
|
||||
LanceDB Cloud, reading credentials from env vars at worker startup
|
||||
so secrets aren't pickled into the dataset::
|
||||
|
||||
import os, lancedb
|
||||
|
||||
def open_remote_table(table_name: str):
|
||||
db = lancedb.connect(
|
||||
"db://my-database",
|
||||
api_key=os.environ["LANCEDB_API_KEY"],
|
||||
region=os.environ.get("LANCEDB_REGION", "us-east-1"),
|
||||
)
|
||||
return db.open_table(table_name)
|
||||
|
||||
permutation = Permutation.identity(
|
||||
open_remote_table("training")
|
||||
).with_connection_factory(open_remote_table)
|
||||
"""
|
||||
assert connection_factory is not None, "connection_factory is required"
|
||||
new = copy.copy(self)
|
||||
new.connection_factory = connection_factory
|
||||
return new
|
||||
|
||||
@classmethod
|
||||
def identity(cls, table: LanceTable) -> "Permutation":
|
||||
@@ -489,11 +578,126 @@ class Permutation:
|
||||
schema = await reader.output_schema(None)
|
||||
initial_selection = {name: name for name in schema.names}
|
||||
return cls(
|
||||
reader, initial_selection, DEFAULT_BATCH_SIZE, Transforms.arrow2python
|
||||
base_table,
|
||||
permutation_table,
|
||||
split,
|
||||
initial_selection,
|
||||
DEFAULT_BATCH_SIZE,
|
||||
Transforms.arrow2python,
|
||||
_reader=reader,
|
||||
)
|
||||
|
||||
return LOOP.run(do_from_tables())
|
||||
|
||||
def __getstate__(self) -> dict[str, Any]:
|
||||
"""Build a picklable state dict for this permutation.
|
||||
|
||||
The base table is captured either via a user-supplied
|
||||
``connection_factory`` (see [with_connection_factory]) or, as a
|
||||
fallback, by introspecting ``(uri, storage_options, namespace_path)``
|
||||
on the connection. The permutation table — always an in-memory
|
||||
LanceDB table — is captured as a pyarrow Table (which pickles via
|
||||
Arrow IPC natively). The reader is dropped from the wire format;
|
||||
``__setstate__`` rebuilds it from the restored tables.
|
||||
"""
|
||||
permutation_data: Optional[pa.Table] = None
|
||||
if self.permutation_table is not None:
|
||||
permutation_data = self.permutation_table.to_arrow()
|
||||
|
||||
common = {
|
||||
"base_table_name": self.base_table.name,
|
||||
"permutation_data": permutation_data,
|
||||
"split": self.split,
|
||||
"selection": self.selection,
|
||||
"batch_size": self.batch_size,
|
||||
"transform_fn": self.transform_fn,
|
||||
"offset": self.offset,
|
||||
"limit": self.limit,
|
||||
"connection_factory": self.connection_factory,
|
||||
}
|
||||
|
||||
if self.connection_factory is not None:
|
||||
# The factory carries enough state to recover the base table on
|
||||
# its own; we don't need to capture the URI / storage options /
|
||||
# namespace from the existing connection.
|
||||
return common
|
||||
|
||||
# URI-introspection fallback: only viable for native (OSS) connections
|
||||
# where (uri, storage_options) is enough to reopen. Remote / cloud
|
||||
# connections don't expose recoverable api_key / region — those users
|
||||
# must call with_connection_factory().
|
||||
try:
|
||||
base_uri = self.base_table._conn.uri
|
||||
storage_options = self.base_table._conn.storage_options
|
||||
except AttributeError as e:
|
||||
raise ValueError(
|
||||
"Cannot pickle this Permutation: the base table's connection "
|
||||
"does not expose a uri/storage_options, which usually means it "
|
||||
"is a remote (LanceDB Cloud) connection. Call "
|
||||
"Permutation.with_connection_factory(...) first to provide a "
|
||||
"picklable callable that re-opens the base table from a worker "
|
||||
"process."
|
||||
) from e
|
||||
|
||||
if base_uri.startswith("memory://"):
|
||||
# In-memory base tables don't exist in any worker process by
|
||||
# default, so dump the entire base table into the pickle. This
|
||||
# can be expensive for large datasets — users with large
|
||||
# in-memory base tables should either persist them or set a
|
||||
# connection_factory.
|
||||
return {
|
||||
**common,
|
||||
"base_table_data": self.base_table.to_arrow(),
|
||||
}
|
||||
|
||||
return {
|
||||
**common,
|
||||
"base_table_uri": base_uri,
|
||||
"base_table_namespace": self.base_table._namespace_path,
|
||||
"base_table_storage_options": storage_options,
|
||||
}
|
||||
|
||||
def __setstate__(self, state: dict[str, Any]) -> None:
    """Restore a pickled Permutation, re-opening its tables in this process.

    The base table is recovered by the first applicable strategy:

    1. ``state["connection_factory"]`` is set: call it with the base table
       name and use whatever table it returns.
    2. ``state`` carries ``"base_table_data"``: the base table was inlined
       into the pickle, so recreate it in a fresh ``memory://`` database.
    3. Otherwise reconnect with the recorded URI and storage options and
       open the table by name (and namespace path, if any).

    The permutation table, when present, is always rebuilt in a fresh
    ``memory://`` database from ``state["permutation_data"]``. Finally the
    reader is rebuilt from the restored attributes.
    """
    from . import connect

    connection_factory = state["connection_factory"]
    if connection_factory is not None:
        base_table = connection_factory(state["base_table_name"])
    elif "base_table_data" in state:
        # In-memory base table inlined into the pickle; rebuild the same
        # way we rebuild the in-memory permutation table.
        mem_db = connect("memory://")
        base_table = mem_db.create_table(
            state["base_table_name"], state["base_table_data"]
        )
    else:
        base_db = connect(
            state["base_table_uri"],
            storage_options=state["base_table_storage_options"],
        )
        base_table = base_db.open_table(
            state["base_table_name"],
            # An empty namespace is normalised to None before opening.
            namespace_path=state["base_table_namespace"] or None,
        )

    permutation_table: Optional[LanceTable] = None
    if state["permutation_data"] is not None:
        mem_db = connect("memory://")
        permutation_table = mem_db.create_table(
            "permutation", state["permutation_data"]
        )

    self.base_table = base_table
    self.permutation_table = permutation_table
    self.split = state["split"]
    self.selection = state["selection"]
    self.batch_size = state["batch_size"]
    self.transform_fn = state["transform_fn"]
    self.offset = state["offset"]
    self.limit = state["limit"]
    self.connection_factory = connection_factory
    # The reader itself is never pickled; rebuild it from the restored
    # attributes.
    self.reader = LOOP.run(self._build_reader())
|
||||
|
||||
@property
|
||||
def schema(self) -> pa.Schema:
|
||||
async def do_output_schema():
|
||||
@@ -760,7 +964,9 @@ class Permutation:
|
||||
for expensive operations such as image decoding.
|
||||
"""
|
||||
assert transform is not None, "transform is required"
|
||||
return Permutation(self.reader, self.selection, self.batch_size, transform)
|
||||
new = copy.copy(self)
|
||||
new.transform_fn = transform
|
||||
return new
|
||||
|
||||
def __getitem__(self, index: int) -> Any:
|
||||
"""
|
||||
@@ -795,12 +1001,10 @@ class Permutation:
|
||||
"""
|
||||
Skip the first `skip` rows of the permutation
|
||||
"""
|
||||
|
||||
async def do_with_skip():
|
||||
reader = await self.reader.with_offset(skip)
|
||||
return self._with_reader(reader)
|
||||
|
||||
return LOOP.run(do_with_skip())
|
||||
new = copy.copy(self)
|
||||
new.offset = skip
|
||||
new.reader = LOOP.run(new._build_reader())
|
||||
return new
|
||||
|
||||
@deprecated(details="Use with_take instead")
|
||||
def take(self, limit: int) -> "Permutation":
|
||||
@@ -818,12 +1022,10 @@ class Permutation:
|
||||
"""
|
||||
Limit the permutation to `limit` rows (following any `skip`)
|
||||
"""
|
||||
|
||||
async def do_with_take():
|
||||
reader = await self.reader.with_limit(limit)
|
||||
return self._with_reader(reader)
|
||||
|
||||
return LOOP.run(do_with_take())
|
||||
new = copy.copy(self)
|
||||
new.limit = limit
|
||||
new.reader = LOOP.run(new._build_reader())
|
||||
return new
|
||||
|
||||
@deprecated(details="Use with_repeat instead")
|
||||
def repeat(self, times: int) -> "Permutation":
|
||||
|
||||
@@ -22,6 +22,7 @@ from lancedb.index import (
|
||||
FTS,
|
||||
BTree,
|
||||
Bitmap,
|
||||
HnswFlat,
|
||||
HnswSq,
|
||||
IvfFlat,
|
||||
IvfPq,
|
||||
@@ -39,6 +40,7 @@ from lancedb.table import _normalize_progress
|
||||
|
||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
||||
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
||||
from ..types import BaseTokenizerType
|
||||
|
||||
|
||||
class RemoteTable(Table):
|
||||
@@ -167,7 +169,7 @@ class RemoteTable(Table):
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
with_position: bool = False,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
base_tokenizer: BaseTokenizerType = "simple",
|
||||
language: str = "English",
|
||||
max_token_length: Optional[int] = 40,
|
||||
lower_case: bool = True,
|
||||
@@ -284,13 +286,15 @@ class RemoteTable(Table):
|
||||
)
|
||||
elif index_type == "IVF_HNSW_SQ":
|
||||
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_HNSW_FLAT":
|
||||
config = HnswFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_FLAT":
|
||||
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown vector index type: {index_type}. Valid options are"
|
||||
" 'IVF_FLAT', 'IVF_PQ', 'IVF_RQ', 'IVF_SQ',"
|
||||
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
|
||||
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ', 'IVF_HNSW_FLAT'"
|
||||
)
|
||||
|
||||
LOOP.run(
|
||||
|
||||
@@ -57,6 +57,7 @@ from .index import (
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from .merge import LanceMergeInsertBuilder
|
||||
@@ -86,6 +87,59 @@ from .util import (
|
||||
)
|
||||
from .index import lang_mapping
|
||||
|
||||
_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera")
|
||||
_MODEL_BACKED_TOKENIZER_ERRORS = (
|
||||
"unknown base tokenizer",
|
||||
"Invalid directory path:",
|
||||
"Failed to load Jieba",
|
||||
"Failed to load tokenizer config",
|
||||
"Failed to initialize default tokenizer",
|
||||
)
|
||||
|
||||
|
||||
def _add_unique_note(exception: BaseException, note: str) -> None:
|
||||
existing_notes = getattr(exception, "__notes__", ()) or ()
|
||||
message = (
|
||||
exception.args[0]
|
||||
if exception.args and isinstance(exception.args[0], str)
|
||||
else ""
|
||||
)
|
||||
if note not in existing_notes and note not in message:
|
||||
add_note(exception, note)
|
||||
|
||||
|
||||
def _is_model_backed_tokenizer(base_tokenizer: str) -> bool:
|
||||
return any(
|
||||
base_tokenizer == prefix or base_tokenizer.startswith(f"{prefix}/")
|
||||
for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES
|
||||
)
|
||||
|
||||
|
||||
def _maybe_add_fts_error_note(
|
||||
exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None
|
||||
) -> None:
|
||||
message = str(exception)
|
||||
if language is not None and "not support the requested language" in message:
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
_add_unique_note(exception, f"Supported languages: {supported_langs}")
|
||||
return
|
||||
|
||||
if not _is_model_backed_tokenizer(base_tokenizer):
|
||||
return
|
||||
|
||||
if not any(marker in message for marker in _MODEL_BACKED_TOKENIZER_ERRORS):
|
||||
return
|
||||
|
||||
_add_unique_note(
|
||||
exception,
|
||||
"Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' "
|
||||
"require tokenizer models in Lance's language model home. Set "
|
||||
"LANCE_LANGUAGE_MODEL_HOME to override the default platform data "
|
||||
"directory under 'lance/language_models'. Expected layouts include "
|
||||
"'<model-home>/jieba/default/...' and "
|
||||
"'<model-home>/lindera/ipadic/...'.",
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .db import LanceDBConnection
|
||||
@@ -958,7 +1012,10 @@ class Table(ABC):
|
||||
tokenizer_name: str, default "default"
|
||||
A compatibility alias for native tokenizer configs. Can be "raw",
|
||||
"default" or the 2 letter language code followed by "_stem". So
|
||||
for english it would be "en_stem".
|
||||
for english it would be "en_stem". For new native FTS indexes, use
|
||||
``base_tokenizer`` directly; ``tokenizer_name`` is a legacy
|
||||
compatibility alias and does not expose model-backed tokenizer names
|
||||
such as ``jieba/default`` or ``lindera/ipadic``.
|
||||
use_tantivy: bool, default False
|
||||
Deprecated legacy Tantivy parameter. Setting this to True raises an
|
||||
error.
|
||||
@@ -972,8 +1029,11 @@ class Table(ABC):
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-Gram tokenizer.
|
||||
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
|
||||
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not
|
||||
the primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -999,6 +1059,13 @@ class Table(ABC):
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
The name of the index. If not provided, a default name will be generated.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models in Lance's language model home. Set
|
||||
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
|
||||
directory under ``lance/language_models``.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -2170,7 +2237,13 @@ class LanceTable(Table):
|
||||
index_cache_size: Optional[int] = None,
|
||||
num_bits: int = 8,
|
||||
index_type: Literal[
|
||||
"IVF_FLAT", "IVF_SQ", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
|
||||
"IVF_FLAT",
|
||||
"IVF_SQ",
|
||||
"IVF_PQ",
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
] = "IVF_PQ",
|
||||
max_iterations: int = 50,
|
||||
sample_rate: int = 256,
|
||||
@@ -2257,6 +2330,16 @@ class LanceTable(Table):
|
||||
ef_construction=ef_construction,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
elif index_type == "IVF_HNSW_FLAT":
|
||||
config = HnswFlat(
|
||||
distance_type=metric,
|
||||
num_partitions=num_partitions,
|
||||
max_iterations=max_iterations,
|
||||
sample_rate=sample_rate,
|
||||
m=m,
|
||||
ef_construction=ef_construction,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown index type {index_type}")
|
||||
|
||||
@@ -2462,14 +2545,22 @@ class LanceTable(Table):
|
||||
**tokenizer_configs,
|
||||
)
|
||||
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
field_names,
|
||||
replace=replace,
|
||||
config=config,
|
||||
name=name,
|
||||
try:
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
field_names,
|
||||
replace=replace,
|
||||
config=config,
|
||||
name=name,
|
||||
)
|
||||
)
|
||||
)
|
||||
except (ValueError, RuntimeError) as e:
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
|
||||
@@ -3799,7 +3890,18 @@ class AsyncTable:
|
||||
*,
|
||||
replace: Optional[bool] = None,
|
||||
config: Optional[
|
||||
Union[IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||
Union[
|
||||
IvfFlat,
|
||||
IvfPq,
|
||||
IvfRq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
FTS,
|
||||
]
|
||||
] = None,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
name: Optional[str] = None,
|
||||
@@ -3846,6 +3948,7 @@ class AsyncTable:
|
||||
IvfRq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
@@ -3865,11 +3968,13 @@ class AsyncTable:
|
||||
name=name,
|
||||
train=train,
|
||||
)
|
||||
except ValueError as e:
|
||||
if "not support the requested language" in str(e):
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
help_msg = f"Supported languages: {supported_langs}"
|
||||
add_note(e, help_msg)
|
||||
except (ValueError, RuntimeError) as e:
|
||||
if isinstance(config, FTS):
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
raise e
|
||||
|
||||
async def drop_index(self, name: str) -> None:
|
||||
@@ -5014,6 +5119,7 @@ class IndexStatistics:
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
"BITMAP",
|
||||
|
||||
@@ -24,6 +24,7 @@ VectorIndexType = Literal[
|
||||
"IVF_PQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"IVF_RQ",
|
||||
]
|
||||
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
|
||||
@@ -31,6 +32,7 @@ IndexType = Literal[
|
||||
"IVF_PQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"IVF_SQ",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
@@ -40,4 +42,5 @@ IndexType = Literal[
|
||||
]
|
||||
|
||||
# Tokenizer literals
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BaseTokenizerType = BuiltinTokenizerType | str
|
||||
|
||||
8
python/python/tests/models/jieba/default/dict.txt
Normal file
8
python/python/tests/models/jieba/default/dict.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
我们 98740 r
|
||||
都 202780 d
|
||||
有 423765 v
|
||||
光明 1219 n
|
||||
的 318825 uj
|
||||
前途 1263 n
|
||||
前 62779 f
|
||||
途 857 n
|
||||
3
python/python/tests/models/lindera/ipadic/config.yml
Normal file
3
python/python/tests/models/lindera/ipadic/config.yml
Normal file
@@ -0,0 +1,3 @@
|
||||
segmenter:
|
||||
mode: "normal"
|
||||
dictionary: "./python/tests/models/lindera/ipadic/main"
|
||||
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
Binary file not shown.
@@ -914,6 +914,29 @@ def test_local_namespace_operations(tmp_path):
|
||||
assert db.list_namespaces().namespaces == []
|
||||
|
||||
|
||||
def test_create_namespace_invalid_mode_raises(tmp_path):
|
||||
"""Unrecognized create namespace modes raise a clear error."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
with pytest.raises(ValueError, match="Invalid create namespace mode"):
|
||||
db.create_namespace(["child"], mode="frobnicate")
|
||||
|
||||
|
||||
def test_drop_namespace_invalid_mode_raises(tmp_path):
|
||||
"""Unrecognized drop namespace modes raise a clear error."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
db.create_namespace(["child"])
|
||||
with pytest.raises(ValueError, match="Invalid drop namespace mode"):
|
||||
db.drop_namespace(["child"], mode="frobnicate")
|
||||
|
||||
|
||||
def test_drop_namespace_invalid_behavior_raises(tmp_path):
|
||||
"""Unrecognized drop namespace behaviors raise a clear error."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
db.create_namespace(["child"])
|
||||
with pytest.raises(ValueError, match="Invalid drop namespace behavior"):
|
||||
db.drop_namespace(["child"], behavior="frobnicate")
|
||||
|
||||
|
||||
def test_clone_table_latest_version(tmp_path):
|
||||
"""Test cloning a table with the latest version (default behavior)"""
|
||||
import os
|
||||
|
||||
@@ -15,7 +15,10 @@
|
||||
# limitations under the License.
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
|
||||
import lancedb as ldb
|
||||
from lancedb.db import DBConnection
|
||||
@@ -36,6 +39,8 @@ import pytest
|
||||
import pytest_asyncio
|
||||
from utils import exception_output
|
||||
|
||||
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table(tmp_path) -> ldb.table.LanceTable:
|
||||
@@ -89,6 +94,39 @@ def table(tmp_path) -> ldb.table.LanceTable:
|
||||
return table
|
||||
|
||||
|
||||
@pytest.fixture
def language_model_home(monkeypatch, tmp_path):
    """Copy the bundled test models into ``tmp_path`` and point Lance at them.

    Sets ``LANCE_LANGUAGE_MODEL_HOME`` to the copy for the duration of the
    test and returns the copy's path, so tests never modify the checked-in
    model directory.
    """
    model_home = tmp_path / "language-models"
    shutil.copytree(TEST_LANGUAGE_MODEL_HOME, model_home)
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(model_home))
    return model_home
|
||||
|
||||
|
||||
@pytest.fixture
def lindera_ipadic(language_model_home):
    """Unpack the Lindera ipadic test dictionary inside the model home.

    Extracts ``main.zip`` next to itself and rewrites ``config.yml`` so its
    ``dictionary`` entry is the absolute path of the extracted directory.
    The extracted directory is removed again on teardown.
    """
    model_path = language_model_home / "lindera" / "ipadic"
    # presumably main.zip unpacks to a top-level "main" directory — confirm
    extracted_model = model_path / "main"
    config_path = model_path / "config.yml"

    # Start from a clean slate in case a previous extraction is present.
    if extracted_model.exists():
        shutil.rmtree(extracted_model)

    with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
        zip_ref.extractall(model_path)
    config_path.write_text(
        "segmenter:\n"
        ' mode: "normal"\n'
        f' dictionary: "{extracted_model.resolve().as_posix()}"\n',
        encoding="utf-8",
    )

    try:
        yield
    finally:
        if extracted_model.exists():
            shutil.rmtree(extracted_model)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
# Use local random state to avoid affecting other tests
|
||||
@@ -684,6 +722,90 @@ def test_fts_ngram(mem_db: DBConnection):
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
|
||||
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
    """An FTS index built with ``jieba/default`` matches on a segmented word."""
    data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
    table = mem_db.create_table("test_jieba", data=data)
    table.create_fts_index(
        "text",
        base_tokenizer="jieba/default",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )

    # Only the first row contains the token "我们".
    results = table.search("我们", query_type="fts").limit(10).to_list()
    assert [row["text"] for row in results] == ["我们都有光明的前途"]
|
||||
|
||||
|
||||
def test_fts_jieba_missing_language_model_note(
    mem_db: DBConnection, monkeypatch, tmp_path
):
    """A missing Jieba model yields an error carrying the model-home hint."""
    # Point the model home at a directory that is never created.
    missing_root = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
    table = mem_db.create_table(
        "test_missing_jieba_model",
        data=pa.table({"text": ["我们都有光明的前途"]}),
    )

    with pytest.raises((ValueError, RuntimeError)) as e:
        table.create_fts_index(
            "text",
            base_tokenizer="jieba/default",
            stem=False,
            remove_stop_words=False,
            ascii_folding=False,
        )

    output = exception_output(e)
    assert "Invalid directory path:" in output
    assert "LANCE_LANGUAGE_MODEL_HOME" in output
    assert "jieba/default" in output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
    """Async counterpart: ``AsyncTable.create_index`` adds the same hint."""
    # Point the model home at a directory that is never created.
    missing_root = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
    db = await ldb.connect_async(tmp_path / "async-db")
    table = await db.create_table(
        "test_missing_jieba_model_async",
        data=pa.table({"text": ["我们都有光明的前途"]}),
    )

    with pytest.raises((ValueError, RuntimeError)) as e:
        await table.create_index(
            "text",
            config=FTS(
                base_tokenizer="jieba/default",
                stem=False,
                remove_stop_words=False,
                ascii_folding=False,
            ),
        )

    output = exception_output(e)
    assert "Invalid directory path:" in output
    assert "LANCE_LANGUAGE_MODEL_HOME" in output
    assert "jieba/default" in output
|
||||
|
||||
|
||||
def test_fts_lindera_tokenizer(
    mem_db: DBConnection, language_model_home, lindera_ipadic
):
    """An FTS index built with ``lindera/ipadic`` matches on a segmented word."""
    data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
    table = mem_db.create_table("test_lindera", data=data)
    table.create_fts_index(
        "text",
        base_tokenizer="lindera/ipadic",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )

    # Only the first row contains the token "成田".
    results = table.search("成田", query_type="fts").limit(10).to_list()
    assert [row["text"] for row in results] == ["成田国際空港"]
|
||||
|
||||
|
||||
def test_fts_query_to_json():
|
||||
"""Test that FTS query to_json() produces valid JSON strings with exact format."""
|
||||
|
||||
|
||||
@@ -16,11 +16,13 @@ from lancedb.index import (
|
||||
IvfSq,
|
||||
IvfHnswPq,
|
||||
IvfHnswSq,
|
||||
IvfHnswFlat,
|
||||
IvfRq,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from lancedb.table import IndexStatistics
|
||||
@@ -250,6 +252,21 @@ async def test_create_hnswpq_alias_index(some_table: AsyncTable):
|
||||
assert indices[0].index_type in {"HnswPq", "IvfHnswPq"}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_hnswflat_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=HnswFlat(num_partitions=10))
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_hnswflat_alias_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=IvfHnswFlat(num_partitions=5))
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type in {"HnswFlat", "IvfHnswFlat"}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_ivfsq_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=IvfSq(num_partitions=10))
|
||||
@@ -295,6 +312,7 @@ def test_index_statistics_index_type_lists_all_supported_values():
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
"BITMAP",
|
||||
|
||||
@@ -9,21 +9,6 @@ from lancedb import DBConnection, Table, connect
|
||||
from lancedb.permutation import Permutation, Permutations, permutation_builder
|
||||
|
||||
|
||||
def test_permutation_persistence(tmp_path):
|
||||
db = connect(tmp_path)
|
||||
tbl = db.create_table("test_table", pa.table({"x": range(100), "y": range(100)}))
|
||||
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl).shuffle().persist(db, "test_permutation").execute()
|
||||
)
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
re_open = db.open_table("test_permutation")
|
||||
assert re_open.count_rows() == 100
|
||||
|
||||
assert permutation_tbl.to_arrow() == re_open.to_arrow()
|
||||
|
||||
|
||||
def test_split_random_ratios(mem_db):
|
||||
"""Test random splitting with ratios."""
|
||||
tbl = mem_db.create_table(
|
||||
|
||||
@@ -6,6 +6,8 @@ import contextlib
|
||||
from datetime import timedelta
|
||||
import http.server
|
||||
import json
|
||||
import multiprocessing as mp
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from unittest.mock import MagicMock, patch
|
||||
@@ -1230,3 +1232,82 @@ def test_background_loop_cancellation(exception):
|
||||
with pytest.raises(exception):
|
||||
loop.run(None)
|
||||
mock_future.cancel.assert_called_once()
|
||||
|
||||
|
||||
def _remote_fork_child(port: int, queue) -> None:
    """Child-process entry point: list tables over a new remote connection.

    Connects to the mock server on ``localhost:port`` and puts the result of
    ``table_names()`` on ``queue`` for the parent to inspect.
    """
    # Build a fresh Connection in the child so we exercise the at-fork-child
    # tokio runtime reset rather than relying on an inherited reqwest client.
    db = lancedb.connect(
        "db://dev",
        api_key="fake",
        host_override=f"http://localhost:{port}",
        client_config={
            "retry_config": {"retries": 0},
            "timeout_config": {"connect_timeout": 2, "read_timeout": 2},
        },
    )
    queue.put(db.table_names())
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    sys.platform != "linux",
    reason=(
        "fork() is unavailable on Windows and unsafe on macOS "
        "(Apple frameworks/TLS are not fork-safe)"
    ),
)
def test_remote_connection_after_fork():
    """A freshly-built remote Connection in a forked child should not hang.

    The pyo3-async-runtimes tokio runtime would otherwise be inherited from
    the parent with dead worker threads; the at-fork-child handler in our
    runtime module rebuilds it on first use in the child.
    """

    def handler(request):
        # Minimal mock response: an empty table listing.
        request.send_response(200)
        request.send_header("Content-Type", "application/json")
        request.end_headers()
        request.wfile.write(b'{"tables": []}')

    # Port 0 lets the OS pick a free port; read the real one back.
    server = http.server.HTTPServer(("localhost", 0), make_mock_http_handler(handler))
    port = server.server_address[1]
    server_thread = threading.Thread(target=server.serve_forever)
    server_thread.start()
    try:
        # Hit the server in the parent first so the runtime + LOOP are warm
        # before fork; a fresh child must still succeed.
        parent_db = lancedb.connect(
            "db://dev",
            api_key="fake",
            host_override=f"http://localhost:{port}",
            client_config={
                "retry_config": {"retries": 0},
                "timeout_config": {"connect_timeout": 2, "read_timeout": 2},
            },
        )
        assert parent_db.table_names() == []

        ctx = mp.get_context("fork")
        queue = ctx.Queue()
        proc = ctx.Process(target=_remote_fork_child, args=(port, queue))
        proc.start()
        proc.join(timeout=15)

        if proc.is_alive():
            # Escalate terminate -> kill so a hung child cannot outlive the test.
            proc.terminate()
            proc.join(timeout=5)
            if proc.is_alive():
                proc.kill()
                proc.join()
            pytest.fail("Remote connection hung after fork")

        assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
        assert not queue.empty(), "child produced no result"
        assert queue.get() == []

        # Parent connection must still be usable after the child returned.
        assert parent_db.table_names() == []
    finally:
        server.shutdown()
        server_thread.join()
        # shutdown() only stops serve_forever(); close the listening socket
        # explicitly so it is not leaked.
        server.server_close()
|
||||
|
||||
@@ -11,7 +11,7 @@ from unittest.mock import patch
|
||||
|
||||
import lancedb
|
||||
from lancedb.dependencies import _PANDAS_AVAILABLE
|
||||
from lancedb.index import HnswPq, HnswSq, IvfPq
|
||||
from lancedb.index import HnswFlat, HnswPq, HnswSq, IvfPq
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pyarrow as pa
|
||||
@@ -917,6 +917,21 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
||||
"my_vector", replace=True, config=expected_config, name=None, train=True
|
||||
)
|
||||
|
||||
table.create_index(
|
||||
vector_column_name="my_vector",
|
||||
metric="cosine",
|
||||
index_type="IVF_HNSW_FLAT",
|
||||
sample_rate=0.1,
|
||||
m=29,
|
||||
ef_construction=10,
|
||||
)
|
||||
expected_config = HnswFlat(
|
||||
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
|
||||
)
|
||||
mock_create_index.assert_called_with(
|
||||
"my_vector", replace=True, config=expected_config, name=None, train=True
|
||||
)
|
||||
|
||||
|
||||
@patch("lancedb.table.AsyncTable.create_index")
|
||||
def test_create_index_name_and_train_parameters(
|
||||
|
||||
@@ -1,14 +1,29 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import functools
|
||||
import multiprocessing as mp
|
||||
import pickle
|
||||
import sys
|
||||
|
||||
import lancedb
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from lancedb.permutation import Permutation, Permutations, permutation_builder
|
||||
from lancedb.util import tbl_to_tensor
|
||||
from lancedb.permutation import Permutation
|
||||
|
||||
torch = pytest.importorskip("torch")
|
||||
|
||||
|
||||
def _open_native_table(uri: str, table_name: str):
    """Top-level connection factory used by the explicit-factory pickle test.

    Defined at module scope so that pickle can resolve it by name in the
    worker / unpickling process.

    Connects to ``uri`` and returns the table named ``table_name``.
    """
    return lancedb.connect(uri).open_table(table_name)
|
||||
|
||||
|
||||
def test_table_dataloader(mem_db):
|
||||
table = mem_db.create_table("test_table", pa.table({"a": range(1000)}))
|
||||
dataloader = torch.utils.data.DataLoader(
|
||||
@@ -40,3 +55,156 @@ def test_permutation_dataloader(mem_db):
|
||||
for batch in dataloader:
|
||||
assert batch.size(0) == 1
|
||||
assert batch.size(1) == 10
|
||||
|
||||
|
||||
def test_permutation_is_picklable(tmp_db):
|
||||
"""A Permutation must be picklable so it can be used with PyTorch's
|
||||
DataLoader when num_workers > 0 (which uses multiprocessing and pickles
|
||||
the dataset to pass it to worker processes)."""
|
||||
table = tmp_db.create_table("test_table", pa.table({"a": range(1000)}))
|
||||
permutation = Permutation.identity(table)
|
||||
|
||||
pickled = pickle.dumps(permutation)
|
||||
restored = pickle.loads(pickled)
|
||||
|
||||
assert len(restored) == 1000
|
||||
rows = restored.__getitems__([0, 1, 2])
|
||||
assert rows == [{"a": 0}, {"a": 1}, {"a": 2}]
|
||||
|
||||
|
||||
def test_permutation_with_memory_base_is_picklable(mem_db):
|
||||
"""An in-memory base table is inlined into the pickle as Arrow IPC bytes
|
||||
and rebuilt on the other side as an in-memory LanceTable, so the
|
||||
Permutation round-trips even though the original database can't be
|
||||
reopened across processes."""
|
||||
table = mem_db.create_table("test_table", pa.table({"a": range(50)}))
|
||||
permutation = Permutation.identity(table)
|
||||
|
||||
restored = pickle.loads(pickle.dumps(permutation))
|
||||
|
||||
assert len(restored) == 50
|
||||
assert restored.__getitems__([0, 10, 49]) == [{"a": 0}, {"a": 10}, {"a": 49}]
|
||||
|
||||
|
||||
def test_permutation_dataloader_multiprocessing(tmp_db):
|
||||
"""Using a Permutation with a PyTorch DataLoader that has num_workers > 0
|
||||
must work end-to-end. Each worker process gets a pickled copy of the
|
||||
dataset and reads batches from it."""
|
||||
table = tmp_db.create_table("test_table", pa.table({"a": range(1000)}))
|
||||
permutation = Permutation.identity(table)
|
||||
|
||||
dataloader = torch.utils.data.DataLoader(
|
||||
permutation,
|
||||
batch_size=10,
|
||||
shuffle=True,
|
||||
num_workers=2,
|
||||
multiprocessing_context="spawn",
|
||||
)
|
||||
seen = 0
|
||||
for batch in dataloader:
|
||||
assert batch["a"].size(0) == 10
|
||||
seen += batch["a"].size(0)
|
||||
assert seen == 1000
|
||||
|
||||
|
||||
def test_permutation_pickle_with_connection_factory(tmp_path):
    """When the user provides a connection_factory, pickling should round-trip
    through that factory rather than introspecting the connection URI. Useful
    for remote / cloud connections where the URI alone isn't reopenable."""
    db = lancedb.connect(tmp_path)
    db.create_table("test_table", pa.table({"a": range(50)}))

    # functools.partial of a module-level function is picklable; a lambda or
    # nested function would not be.
    factory = functools.partial(_open_native_table, str(tmp_path))
    permutation = Permutation.identity(factory("test_table")).with_connection_factory(
        factory
    )

    restored = pickle.loads(pickle.dumps(permutation))

    assert len(restored) == 50
    # The factory survives pickling and is what powered base-table reopen.
    assert restored.connection_factory is not None
    assert restored.connection_factory.func is _open_native_table
    assert restored.__getitems__([0, 1, 2]) == [{"a": 0}, {"a": 1}, {"a": 2}]
|
||||
|
||||
|
||||
def test_permutation_with_builder_is_picklable(tmp_db):
|
||||
"""A Permutation built from a non-identity permutation table must round-trip
|
||||
through pickle while preserving the row order defined by the permutation."""
|
||||
table = tmp_db.create_table("test_table", pa.table({"a": range(100)}))
|
||||
perm_tbl = (
|
||||
permutation_builder(table)
|
||||
.split_random(ratios=[0.8, 0.2], seed=42, split_names=["train", "test"])
|
||||
.shuffle(seed=42)
|
||||
.execute()
|
||||
)
|
||||
permutations = Permutations(table, perm_tbl)
|
||||
permutation = permutations["train"]
|
||||
|
||||
indices = list(range(len(permutation)))
|
||||
expected = permutation.__getitems__(indices)
|
||||
|
||||
restored = pickle.loads(pickle.dumps(permutation))
|
||||
|
||||
assert len(restored) == len(permutation)
|
||||
assert restored.__getitems__(indices) == expected
|
||||
|
||||
|
||||
def _multiworker_dataloader_target(db_uri: str, result_queue):
    """Child-process entry point: iterate a Permutation with forked workers.

    Opens ``test_table`` from ``db_uri``, wraps it in an identity Permutation,
    reads it through a two-worker fork-based DataLoader, and puts the number
    of batches seen on ``result_queue``.
    """
    import lancedb
    from lancedb.permutation import Permutation

    db = lancedb.connect(db_uri)
    table = db.open_table("test_table")
    permutation = Permutation.identity(table)

    dataloader = torch.utils.data.DataLoader(
        permutation,
        batch_size=10,
        num_workers=2,
        multiprocessing_context="fork",
    )
    count = 0
    for batch in dataloader:
        assert batch["a"].size(0) == 10
        count += 1
    result_queue.put(count)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    sys.platform != "linux",
    reason=(
        "fork() is unavailable on Windows and unsafe on macOS "
        "(Apple frameworks/TLS are not fork-safe)"
    ),
)
def test_permutation_dataloader_fork_workers(tmp_path):
    """A Permutation used by a fork-based DataLoader should not hang.

    PyTorch's DataLoader uses fork-based multiprocessing by default on Linux.
    LanceDB drives async work through a background asyncio thread that does
    not survive a fork, so any LOOP.run() in a worker blocks forever.

    The DataLoader runs inside a separately spawned child process so that a
    hang shows up as a join timeout here instead of wedging the test runner.
    """
    from queue import Empty

    import lancedb

    db_uri = str(tmp_path / "db")
    db = lancedb.connect(db_uri)
    db.create_table("test_table", pa.table({"a": list(range(1000))}))

    ctx = mp.get_context("spawn")
    queue = ctx.Queue()
    proc = ctx.Process(target=_multiworker_dataloader_target, args=(db_uri, queue))
    proc.start()
    proc.join(timeout=30)

    if proc.is_alive():
        # Escalate from terminate to kill so a wedged child cannot outlive
        # the test.
        proc.terminate()
        proc.join(timeout=5)
        if proc.is_alive():
            proc.kill()
            proc.join()
        pytest.fail("Permutation hung when iterated in a fork-based DataLoader worker")

    assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
    # Queue.empty() is documented as unreliable for multiprocessing queues,
    # so fetch the result with a bounded blocking get instead of probing.
    try:
        batch_count = queue.get(timeout=5)
    except Empty:
        pytest.fail("child produced no batches")
    assert batch_count == 100
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::error::PythonErrorExt;
|
||||
use crate::runtime::future_into_py;
|
||||
use arrow::{
|
||||
datatypes::SchemaRef,
|
||||
pyarrow::{IntoPyArrow, ToPyArrow},
|
||||
@@ -12,9 +14,6 @@ use lancedb::arrow::SendableRecordBatchStream;
|
||||
use pyo3::{
|
||||
Bound, Py, PyAny, PyRef, PyResult, Python, exceptions::PyStopAsyncIteration, pyclass, pymethods,
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::error::PythonErrorExt;
|
||||
|
||||
#[pyclass]
|
||||
pub struct RecordBatchStream {
|
||||
|
||||
@@ -7,6 +7,12 @@ use std::{
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
error::PythonErrorExt,
|
||||
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
|
||||
runtime::future_into_py,
|
||||
table::Table,
|
||||
};
|
||||
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
|
||||
use lancedb::{
|
||||
connection::Connection as LanceConnection,
|
||||
@@ -20,13 +26,6 @@ use pyo3::{
|
||||
pyclass, pyfunction, pymethods,
|
||||
types::{PyDict, PyDictMethods},
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::{
|
||||
error::PythonErrorExt,
|
||||
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
|
||||
table::Table,
|
||||
};
|
||||
|
||||
#[pyclass]
|
||||
pub struct Connection {
|
||||
@@ -396,12 +395,17 @@ impl Connection {
|
||||
future_into_py(py, async move {
|
||||
use lance_namespace::models::CreateNamespaceRequest;
|
||||
// Mode is now a string field
|
||||
let mode_str = mode.and_then(|m| match m.to_lowercase().as_str() {
|
||||
"create" => Some("Create".to_string()),
|
||||
"exist_ok" => Some("ExistOk".to_string()),
|
||||
"overwrite" => Some("Overwrite".to_string()),
|
||||
_ => None,
|
||||
});
|
||||
let mode_str = mode
|
||||
.map(|m| match m.to_lowercase().as_str() {
|
||||
"create" => Ok("Create".to_string()),
|
||||
"exist_ok" => Ok("ExistOk".to_string()),
|
||||
"overwrite" => Ok("Overwrite".to_string()),
|
||||
_ => Err(PyValueError::new_err(format!(
|
||||
"Invalid mode {:?}: expected one of 'create', 'exist_ok', 'overwrite'",
|
||||
m
|
||||
))),
|
||||
})
|
||||
.transpose()?;
|
||||
let request = CreateNamespaceRequest {
|
||||
id: Some(namespace_path),
|
||||
mode: mode_str,
|
||||
@@ -429,16 +433,26 @@ impl Connection {
|
||||
future_into_py(py, async move {
|
||||
use lance_namespace::models::DropNamespaceRequest;
|
||||
// Mode and Behavior are now string fields
|
||||
let mode_str = mode.and_then(|m| match m.to_uppercase().as_str() {
|
||||
"SKIP" => Some("Skip".to_string()),
|
||||
"FAIL" => Some("Fail".to_string()),
|
||||
_ => None,
|
||||
});
|
||||
let behavior_str = behavior.and_then(|b| match b.to_uppercase().as_str() {
|
||||
"RESTRICT" => Some("Restrict".to_string()),
|
||||
"CASCADE" => Some("Cascade".to_string()),
|
||||
_ => None,
|
||||
});
|
||||
let mode_str = mode
|
||||
.map(|m| match m.to_uppercase().as_str() {
|
||||
"SKIP" => Ok("Skip".to_string()),
|
||||
"FAIL" => Ok("Fail".to_string()),
|
||||
_ => Err(PyValueError::new_err(format!(
|
||||
"Invalid mode {:?}: expected one of 'skip', 'fail'",
|
||||
m
|
||||
))),
|
||||
})
|
||||
.transpose()?;
|
||||
let behavior_str = behavior
|
||||
.map(|b| match b.to_uppercase().as_str() {
|
||||
"RESTRICT" => Ok("Restrict".to_string()),
|
||||
"CASCADE" => Ok("Cascade".to_string()),
|
||||
_ => Err(PyValueError::new_err(format!(
|
||||
"Invalid behavior {:?}: expected one of 'restrict', 'cascade'",
|
||||
b
|
||||
))),
|
||||
})
|
||||
.transpose()?;
|
||||
let request = DropNamespaceRequest {
|
||||
id: Some(namespace_path),
|
||||
mode: mode_str,
|
||||
|
||||
@@ -8,7 +8,9 @@
|
||||
//! DataFusion [`Expr`] nodes, bypassing SQL string parsing.
|
||||
|
||||
use arrow::{datatypes::DataType, pyarrow::PyArrowType};
|
||||
use datafusion_common::ScalarValue;
|
||||
use lancedb::expr::{DfExpr, col as ldb_col, contains, expr_cast, lit as df_lit, lower, upper};
|
||||
use pyo3::types::PyBytes;
|
||||
use pyo3::{Bound, PyAny, PyResult, exceptions::PyValueError, prelude::*, pyfunction};
|
||||
|
||||
/// A type-safe DataFusion expression.
|
||||
@@ -141,7 +143,7 @@ pub fn expr_col(name: &str) -> PyExpr {
|
||||
|
||||
/// Create a literal value expression.
|
||||
///
|
||||
/// Supported Python types: `bool`, `int`, `float`, `str`.
|
||||
/// Supported Python types: `bool`, `int`, `float`, `str`, `bytes`.
|
||||
#[pyfunction]
|
||||
pub fn expr_lit(value: Bound<'_, PyAny>) -> PyResult<PyExpr> {
|
||||
// bool must be checked before int because bool is a subclass of int in Python
|
||||
@@ -157,8 +159,12 @@ pub fn expr_lit(value: Bound<'_, PyAny>) -> PyResult<PyExpr> {
|
||||
if let Ok(s) = value.extract::<String>() {
|
||||
return Ok(PyExpr(df_lit(s)));
|
||||
}
|
||||
if value.is_instance_of::<PyBytes>() {
|
||||
let bytes = value.extract::<Vec<u8>>()?;
|
||||
return Ok(PyExpr(df_lit(ScalarValue::Binary(Some(bytes)))));
|
||||
}
|
||||
Err(PyValueError::new_err(format!(
|
||||
"unsupported literal type: {}. Supported: bool, int, float, str",
|
||||
"unsupported literal type: {}. Supported: bool, int, float, str, bytes",
|
||||
value.get_type().name()?
|
||||
)))
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use lancedb::index::vector::{IvfFlatIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder};
|
||||
use lancedb::index::vector::{
|
||||
IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder,
|
||||
IvfPqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder,
|
||||
};
|
||||
use lancedb::index::{
|
||||
Index as LanceDbIndex,
|
||||
scalar::{BTreeIndexBuilder, FtsIndexBuilder},
|
||||
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
|
||||
};
|
||||
use pyo3::IntoPyObject;
|
||||
use pyo3::types::PyStringMethods;
|
||||
@@ -162,8 +164,26 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
}
|
||||
Ok(LanceDbIndex::IvfHnswSq(hnsw_sq_builder))
|
||||
}
|
||||
"HnswFlat" => {
|
||||
let params = source.extract::<IvfHnswFlatParams>()?;
|
||||
let distance_type = parse_distance_type(params.distance_type)?;
|
||||
let mut hnsw_flat_builder = IvfHnswFlatIndexBuilder::default()
|
||||
.distance_type(distance_type)
|
||||
.max_iterations(params.max_iterations)
|
||||
.sample_rate(params.sample_rate)
|
||||
.num_edges(params.m)
|
||||
.ef_construction(params.ef_construction);
|
||||
if let Some(num_partitions) = params.num_partitions {
|
||||
hnsw_flat_builder = hnsw_flat_builder.num_partitions(num_partitions);
|
||||
}
|
||||
if let Some(target_partition_size) = params.target_partition_size {
|
||||
hnsw_flat_builder =
|
||||
hnsw_flat_builder.target_partition_size(target_partition_size);
|
||||
}
|
||||
Ok(LanceDbIndex::IvfHnswFlat(hnsw_flat_builder))
|
||||
}
|
||||
not_supported => Err(PyValueError::new_err(format!(
|
||||
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, or IvfHnswSq",
|
||||
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
|
||||
not_supported
|
||||
))),
|
||||
}
|
||||
@@ -250,6 +270,17 @@ struct IvfHnswSqParams {
|
||||
target_partition_size: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
struct IvfHnswFlatParams {
|
||||
distance_type: String,
|
||||
num_partitions: Option<u32>,
|
||||
max_iterations: u32,
|
||||
sample_rate: u32,
|
||||
m: u32,
|
||||
ef_construction: u32,
|
||||
target_partition_size: Option<u32>,
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
/// A description of an index currently configured on a column
|
||||
pub struct IndexConfig {
|
||||
|
||||
@@ -28,6 +28,7 @@ pub mod index;
|
||||
pub mod namespace;
|
||||
pub mod permutation;
|
||||
pub mod query;
|
||||
pub mod runtime;
|
||||
pub mod session;
|
||||
pub mod table;
|
||||
pub mod util;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use crate::{
|
||||
arrow::RecordBatchStream, connection::Connection, error::PythonErrorExt, table::Table,
|
||||
arrow::RecordBatchStream, error::PythonErrorExt, runtime::future_into_py, table::Table,
|
||||
};
|
||||
use arrow::pyarrow::{PyArrowType, ToPyArrow};
|
||||
use lancedb::{
|
||||
@@ -21,7 +21,6 @@ use pyo3::{
|
||||
pyclass, pymethods,
|
||||
types::{PyAnyMethods, PyDict, PyDictMethods, PyType},
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
fn table_from_py<'a>(table: Bound<'a, PyAny>) -> PyResult<Bound<'a, Table>> {
|
||||
if table.hasattr("_inner")? {
|
||||
@@ -80,24 +79,6 @@ impl PyAsyncPermutationBuilder {
|
||||
|
||||
#[pymethods]
|
||||
impl PyAsyncPermutationBuilder {
|
||||
#[pyo3(signature = (database, table_name))]
|
||||
pub fn persist(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
database: Bound<'_, PyAny>,
|
||||
table_name: String,
|
||||
) -> PyResult<Self> {
|
||||
let conn = if database.hasattr("_conn")? {
|
||||
database
|
||||
.getattr("_conn")?
|
||||
.getattr("_inner")?
|
||||
.cast_into::<Connection>()?
|
||||
} else {
|
||||
database.getattr("_inner")?.cast_into::<Connection>()?
|
||||
};
|
||||
let database = conn.borrow().database()?;
|
||||
slf.modify(|builder| builder.persist(database, table_name))
|
||||
}
|
||||
|
||||
#[pyo3(signature = (*, ratios=None, counts=None, fixed=None, seed=None, split_names=None))]
|
||||
pub fn split_random(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::expr::PyExpr;
|
||||
use crate::runtime::future_into_py;
|
||||
use crate::util::parse_distance_type;
|
||||
use crate::{arrow::RecordBatchStream, util::PyLanceDB};
|
||||
use crate::{error::PythonErrorExt, index::class_name};
|
||||
use arrow::array::Array;
|
||||
use arrow::array::ArrayData;
|
||||
use arrow::array::make_array;
|
||||
@@ -36,12 +41,6 @@ use pyo3::types::{PyDict, PyString};
|
||||
use pyo3::{Borrowed, FromPyObject, exceptions::PyRuntimeError};
|
||||
use pyo3::{PyErr, pyclass};
|
||||
use pyo3::{exceptions::PyValueError, intern};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::expr::PyExpr;
|
||||
use crate::util::parse_distance_type;
|
||||
use crate::{arrow::RecordBatchStream, util::PyLanceDB};
|
||||
use crate::{error::PythonErrorExt, index::class_name};
|
||||
|
||||
impl<'a, 'py> FromPyObject<'a, 'py> for PyLanceDB<FtsQuery> {
|
||||
type Error = PyErr;
|
||||
|
||||
142
python/src/runtime.rs
Normal file
142
python/src/runtime.rs
Normal file
@@ -0,0 +1,142 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
//! Fork-safe wrapper around tokio + pyo3-async-runtimes.
|
||||
//!
|
||||
//! `pyo3_async_runtimes::tokio` keeps its multi-threaded runtime in a
|
||||
//! `OnceLock` that can never be replaced. Tokio's worker threads do not
|
||||
//! survive `fork()`, so once a child inherits a "frozen" runtime, every
|
||||
//! `future_into_py` call hangs forever.
|
||||
//!
|
||||
//! We sidestep the global by routing every future through our own
|
||||
//! [`LanceRuntime`] (a [`pyo3_async_runtimes::generic::Runtime`] impl) backed
|
||||
//! by an [`AtomicPtr`] to a tokio runtime that we own. A `pthread_atfork`
|
||||
//! child handler nulls the pointer; the next `spawn` rebuilds the runtime in
|
||||
//! the child. This mirrors the pattern used in the Lance Python bindings.
|
||||
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicBool, AtomicPtr, Ordering};
|
||||
|
||||
use pyo3::{Bound, PyAny, PyResult, Python, conversion::IntoPyObject};
|
||||
use pyo3_async_runtimes::{
|
||||
TaskLocals,
|
||||
generic::{ContextExt, JoinError, Runtime},
|
||||
};
|
||||
use tokio::{runtime, task};
|
||||
|
||||
static RUNTIME: AtomicPtr<runtime::Runtime> = AtomicPtr::new(std::ptr::null_mut());
|
||||
static RUNTIME_INSTALLING: AtomicBool = AtomicBool::new(false);
|
||||
static ATFORK_INSTALLED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
/// Builds a fresh multi-threaded tokio runtime with all drivers enabled and
/// worker threads named `lancedb-tokio-worker`.
///
/// # Panics
///
/// Panics if tokio fails to construct the runtime.
fn create_runtime() -> runtime::Runtime {
    let mut builder = runtime::Builder::new_multi_thread();
    builder.enable_all().thread_name("lancedb-tokio-worker");
    builder.build().expect("Failed to build tokio runtime")
}
|
||||
|
||||
fn get_runtime() -> &'static runtime::Runtime {
|
||||
loop {
|
||||
let ptr = RUNTIME.load(Ordering::SeqCst);
|
||||
if !ptr.is_null() {
|
||||
return unsafe { &*ptr };
|
||||
}
|
||||
if !RUNTIME_INSTALLING.fetch_or(true, Ordering::SeqCst) {
|
||||
break;
|
||||
}
|
||||
std::thread::yield_now();
|
||||
}
|
||||
if !ATFORK_INSTALLED.fetch_or(true, Ordering::SeqCst) {
|
||||
install_atfork();
|
||||
}
|
||||
let new_ptr = Box::into_raw(Box::new(create_runtime()));
|
||||
RUNTIME.store(new_ptr, Ordering::SeqCst);
|
||||
unsafe { &*new_ptr }
|
||||
}
|
||||
|
||||
/// Runs in async-signal context after `fork()` in the child. We can only
|
||||
/// touch atomics here; we deliberately leak the previous runtime because
|
||||
/// dropping a tokio `Runtime` would try to join its (now-dead) worker
|
||||
/// threads and hang.
|
||||
extern "C" fn atfork_child() {
|
||||
RUNTIME.store(std::ptr::null_mut(), Ordering::SeqCst);
|
||||
RUNTIME_INSTALLING.store(false, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
#[cfg(not(windows))]
|
||||
fn install_atfork() {
|
||||
unsafe { libc::pthread_atfork(None, None, Some(atfork_child)) };
|
||||
}
|
||||
|
||||
#[cfg(windows)]
|
||||
fn install_atfork() {}
|
||||
|
||||
/// Marker type implementing [`Runtime`] over our fork-safe runtime slot.
|
||||
pub struct LanceRuntime;
|
||||
|
||||
/// Newtype wrapper around `tokio::task::JoinError` so we can implement the
|
||||
/// foreign [`JoinError`] trait without violating orphan rules.
|
||||
pub struct LanceJoinError(task::JoinError);
|
||||
|
||||
impl JoinError for LanceJoinError {
|
||||
fn is_panic(&self) -> bool {
|
||||
self.0.is_panic()
|
||||
}
|
||||
fn into_panic(self) -> Box<dyn std::any::Any + Send + 'static> {
|
||||
self.0.into_panic()
|
||||
}
|
||||
}
|
||||
|
||||
impl Runtime for LanceRuntime {
|
||||
type JoinError = LanceJoinError;
|
||||
type JoinHandle = Pin<Box<dyn Future<Output = Result<(), Self::JoinError>> + Send>>;
|
||||
|
||||
fn spawn<F>(fut: F) -> Self::JoinHandle
|
||||
where
|
||||
F: Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
let handle = get_runtime().spawn(fut);
|
||||
Box::pin(async move { handle.await.map_err(LanceJoinError) })
|
||||
}
|
||||
|
||||
fn spawn_blocking<F>(f: F) -> Self::JoinHandle
|
||||
where
|
||||
F: FnOnce() + Send + 'static,
|
||||
{
|
||||
let handle = get_runtime().spawn_blocking(f);
|
||||
Box::pin(async move { handle.await.map_err(LanceJoinError) })
|
||||
}
|
||||
}
|
||||
|
||||
tokio::task_local! {
|
||||
static TASK_LOCALS: std::cell::OnceCell<TaskLocals>;
|
||||
}
|
||||
|
||||
impl ContextExt for LanceRuntime {
|
||||
fn scope<F, R>(locals: TaskLocals, fut: F) -> Pin<Box<dyn Future<Output = R> + Send>>
|
||||
where
|
||||
F: Future<Output = R> + Send + 'static,
|
||||
{
|
||||
let cell = std::cell::OnceCell::new();
|
||||
cell.set(locals).unwrap();
|
||||
Box::pin(TASK_LOCALS.scope(cell, fut))
|
||||
}
|
||||
|
||||
fn get_task_locals() -> Option<TaskLocals> {
|
||||
TASK_LOCALS
|
||||
.try_with(|c| c.get().cloned())
|
||||
.unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop-in replacement for `pyo3_async_runtimes::tokio::future_into_py` that
|
||||
/// uses our fork-safe runtime.
|
||||
pub fn future_into_py<F, T>(py: Python<'_>, fut: F) -> PyResult<Bound<'_, PyAny>>
|
||||
where
|
||||
F: Future<Output = PyResult<T>> + Send + 'static,
|
||||
T: for<'py> IntoPyObject<'py> + Send + 'static,
|
||||
{
|
||||
pyo3_async_runtimes::generic::future_into_py::<LanceRuntime, _, T>(py, fut)
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use crate::runtime::future_into_py;
|
||||
use crate::{
|
||||
connection::Connection,
|
||||
error::PythonErrorExt,
|
||||
@@ -24,7 +25,6 @@ use pyo3::{
|
||||
pyclass, pymethods,
|
||||
types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods},
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
mod scannable;
|
||||
|
||||
|
||||
@@ -33,6 +33,14 @@ class TestExprConstruction:
|
||||
e = lit(True)
|
||||
assert isinstance(e, Expr)
|
||||
|
||||
def test_lit_bytes(self):
|
||||
e = lit(b"\xde\xad\xbe\xef")
|
||||
assert isinstance(e, Expr)
|
||||
|
||||
def test_lit_bytes_empty(self):
|
||||
e = lit(b"")
|
||||
assert isinstance(e, Expr)
|
||||
|
||||
def test_lit_unsupported_type_raises(self):
|
||||
with pytest.raises(Exception):
|
||||
lit([1, 2, 3])
|
||||
@@ -135,6 +143,43 @@ class TestExprOperators:
|
||||
assert e.to_sql() == "(name = 'alice')"
|
||||
|
||||
|
||||
class TestExprBytesLiteral:
    """SQL rendering of ``bytes`` literals, alone and nested in expressions."""

    def test_bytes_to_sql(self):
        rendered = lit(b"\xde\xad\xbe\xef").to_sql()
        assert rendered == "X'DEADBEEF'"

    def test_empty_bytes_to_sql(self):
        rendered = lit(b"").to_sql()
        assert rendered == "X''"

    def test_bytes_repr(self):
        literal = lit(b"\x01\x02")
        assert repr(literal) == "Expr(X'0102')"

    def test_bytes_equality_expr_sql(self):
        predicate = col("data") == lit(b"\xca\xfe")
        assert predicate.to_sql() == "(data = X'CAFE')"

    def test_bytes_ne_expr_sql(self):
        predicate = col("data") != lit(b"\xff")
        assert predicate.to_sql() == "(data <> X'FF')"

    def test_bytes_compound_expr_sql(self):
        data_matches = col("data") == lit(b"\x01")
        id_above = col("id") > lit(5)
        assert (data_matches & id_above).to_sql() == "((data = X'01') AND (id > 5))"

    def test_bytes_in_function_call(self):
        # Regression test: binary literals inside scalar function calls
        # used to fail because DataFusion's unparser does not support Binary
        # scalars. Now handled via a placeholder-substitution rewrite.
        call = func("contains", col("data"), lit(b"\xff"))
        assert call.to_sql() == "contains(data, X'FF')"

    def test_bytes_in_not(self):
        negated = ~(col("data") == lit(b"\xff"))
        assert negated.to_sql() == "NOT (data = X'FF')"
|
||||
|
||||
|
||||
class TestExprStringMethods:
|
||||
def test_lower(self):
|
||||
e = col("name").lower()
|
||||
@@ -385,3 +430,44 @@ class TestColNamingIntegration:
|
||||
)
|
||||
assert "upper_name" in result.schema.names
|
||||
assert sorted(result["upper_name"].to_pylist()) == ["ALICE", "BOB", "CHARLIE"]
|
||||
|
||||
|
||||
# ── bytes / binary column integration tests ───────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture
def binary_table(tmp_path):
    """Three-row table with an integer ``id`` and a binary ``payload`` column."""
    payloads = pa.array(
        [b"\x01\x02", b"\xca\xfe", b"\xff\x00"],
        type=pa.binary(),
    )
    rows = pa.table({"id": [1, 2, 3], "payload": payloads})
    connection = lancedb.connect(str(tmp_path))
    return connection.create_table("binary_test", rows)
|
||||
|
||||
|
||||
class TestExprBytesIntegration:
    """End-to-end filters comparing a binary column against ``bytes`` literals."""

    def test_binary_equality_filter(self, binary_table):
        predicate = col("payload") == lit(b"\xca\xfe")
        result = binary_table.search().where(predicate).to_arrow()
        assert result.num_rows == 1
        assert result["id"][0].as_py() == 2

    def test_binary_ne_filter(self, binary_table):
        predicate = col("payload") != lit(b"\x01\x02")
        result = binary_table.search().where(predicate).to_arrow()
        assert result.num_rows == 2

    def test_binary_compound_filter(self, binary_table):
        payload_matches = col("payload") == lit(b"\x01\x02")
        id_matches = col("id") == lit(3)
        result = binary_table.search().where(payload_matches | id_matches).to_arrow()
        assert result.num_rows == 2
|
||||
|
||||
@@ -40,7 +40,7 @@ lance-datafusion.workspace = true
|
||||
lance-datagen = { workspace = true }
|
||||
lance-file = { workspace = true }
|
||||
lance-io = { workspace = true }
|
||||
lance-index = { workspace = true }
|
||||
lance-index = { workspace = true, features = ["tokenizer-jieba", "tokenizer-lindera"] }
|
||||
lance-table = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
lance-testing = { workspace = true }
|
||||
@@ -108,7 +108,12 @@ test-log = "0.2"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
aws = ["lance/aws", "lance-io/aws", "lance-namespace-impls/dir-aws"]
|
||||
aws = [
|
||||
"lance/aws",
|
||||
"lance-io/aws",
|
||||
"lance-namespace-impls/dir-aws",
|
||||
"object_store/aws",
|
||||
]
|
||||
oss = ["lance/oss", "lance-io/oss", "lance-namespace-impls/dir-oss"]
|
||||
gcs = ["lance/gcp", "lance-io/gcp", "lance-namespace-impls/dir-gcp"]
|
||||
azure = [
|
||||
|
||||
@@ -505,8 +505,15 @@ impl ListingDatabase {
|
||||
// Filter out the commit store query param -- it's a lancedb param
|
||||
url.query_pairs_mut().clear();
|
||||
url.query_pairs_mut().extend_pairs(filtered_querys);
|
||||
// Take a copy of the query string so we can propagate it to lance
|
||||
let query_string = url.query().map(|s| s.to_string());
|
||||
// Take a copy of the query string so we can propagate it to lance.
|
||||
// `query_pairs_mut()` leaves the URL with `Some("")` even when no
|
||||
// pairs survive (or none existed in the first place), so an empty
|
||||
// string here must be treated the same as "no query" — otherwise
|
||||
// every table URI ends up with a trailing `?`, which makes downstream
|
||||
// sub-paths (e.g. MemWAL gen paths) re-parse as path=<base table> +
|
||||
// query=<sub-path>, causing Lance to find the base table dataset
|
||||
// when looking up the sub-path.
|
||||
let query_string = url.query().filter(|q| !q.is_empty()).map(|s| s.to_string());
|
||||
// clear the query string so we can use the url as the base uri
|
||||
// use .set_query(None) instead of .set_query("") because the latter
|
||||
// will add a trailing '?' to the url
|
||||
@@ -715,7 +722,7 @@ impl ListingDatabase {
|
||||
let commit_handler = commit_handler_from_url(&uri, &Some(object_store_params)).await?;
|
||||
for name in names {
|
||||
let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
|
||||
let full_path = self.base_path.child(dir_name.clone());
|
||||
let full_path = self.base_path.clone().join(dir_name.clone());
|
||||
|
||||
commit_handler.delete(&full_path).await?;
|
||||
|
||||
@@ -2213,6 +2220,133 @@ mod tests {
|
||||
assert_eq!(uri, expected);
|
||||
}
|
||||
|
||||
/// Regression: connecting via a URL-style URI (which goes through
|
||||
/// `url::Url::parse` and the `query_pairs_mut()` path) must not
|
||||
/// append a trailing `?` to per-table URIs when the input URI has
|
||||
/// no query string.
|
||||
///
|
||||
/// Earlier, `query_pairs_mut().clear()` left the URL with
|
||||
/// `query=Some("")`, which then propagated as a trailing `?` on
|
||||
/// every table URI. Sub-path lookups against that URI (e.g. MemWAL
|
||||
/// `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`) re-parsed as
|
||||
/// `path=<base table>` + `query=/_mem_wal/...`, causing
|
||||
/// `Dataset::write` to find the base table dataset and falsely
|
||||
/// report `Dataset already exists`.
|
||||
/// Mirrors the URL-mutation step from
|
||||
/// [`ListingDatabase::connect_with_options`] so we can assert the
|
||||
/// fix without going through filesystem setup (which is awkward
|
||||
/// across platforms — see the `file://` test below).
|
||||
fn capture_query_like_connect(input_uri: &str) -> Option<String> {
|
||||
let mut url = url::Url::parse(input_uri).unwrap();
|
||||
let mut filtered_querys = Vec::new();
|
||||
for (key, value) in url.query_pairs() {
|
||||
if key == ENGINE || key == MIRRORED_STORE {
|
||||
continue;
|
||||
}
|
||||
filtered_querys.push((key.to_string(), value.to_string()));
|
||||
}
|
||||
url.query_pairs_mut().clear();
|
||||
url.query_pairs_mut().extend_pairs(filtered_querys);
|
||||
url.query().filter(|q| !q.is_empty()).map(|s| s.to_string())
|
||||
}
|
||||
|
||||
#[test]
fn test_capture_query_treats_empty_as_none() {
    let engine_only = format!("s3://bucket/prefix/?{}=mem", ENGINE);
    let engine_plus_foo = format!("s3://bucket/prefix/?{}=mem&foo=bar", ENGINE);

    // No query at all. With the bug, `query_pairs_mut()` left the
    // URL with `query=Some("")` and we used to propagate that.
    let without_query = capture_query_like_connect("s3://bucket/prefix/");
    assert_eq!(
        without_query, None,
        "empty query after mutation must be treated as no query"
    );

    // Real query is propagated.
    let with_query = capture_query_like_connect("s3://bucket/prefix/?foo=bar");
    assert_eq!(with_query, Some("foo=bar".to_string()));

    // lancedb-internal `engine=` is stripped; nothing remains, so
    // query_string is None — not Some("").
    let stripped = capture_query_like_connect(&engine_only);
    assert_eq!(stripped, None);

    // Mixed: drop `engine=`, keep the rest.
    let mixed = capture_query_like_connect(&engine_plus_foo);
    assert_eq!(mixed.as_deref(), Some("foo=bar"));
}
|
||||
|
||||
/// Regression: connecting via a URL-style URI (which goes through
|
||||
/// `url::Url::parse` and the `query_pairs_mut()` path) must not
|
||||
/// append a trailing `?` to per-table URIs when the input URI has
|
||||
/// no query string. Sub-path lookups against such a URI (e.g.
|
||||
/// MemWAL `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`) re-parse
|
||||
/// as `path=<base table>` + `query=/_mem_wal/...`, causing
|
||||
/// `Dataset::write` to find the base table dataset and falsely
|
||||
/// report `Dataset already exists`.
|
||||
///
|
||||
/// Skipped on Windows: `try_create_dir` does not understand
|
||||
/// `file:///C:/…` paths so `connect_with_options` fails before
|
||||
/// even reaching the URL-mutation logic. The pure URL-mutation
|
||||
/// invariant is covered by
|
||||
/// `test_capture_query_treats_empty_as_none` above, which runs
|
||||
/// on all platforms.
|
||||
#[cfg(not(windows))]
|
||||
#[tokio::test]
|
||||
async fn test_table_uri_url_path_has_no_trailing_question_mark() {
|
||||
let tempdir = tempdir().unwrap();
|
||||
let uri = format!("file://{}", tempdir.path().to_str().unwrap());
|
||||
|
||||
let request = ConnectRequest {
|
||||
uri: uri.clone(),
|
||||
#[cfg(feature = "remote")]
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
namespace_client_properties: Default::default(),
|
||||
manifest_enabled: false,
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
let db = ListingDatabase::connect_with_options(&request)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
db.query_string, None,
|
||||
"no input query → no captured query_string"
|
||||
);
|
||||
|
||||
let table_uri = db.table_uri("test").unwrap();
|
||||
assert!(
|
||||
!table_uri.ends_with('?'),
|
||||
"table_uri must not have a trailing `?`: {}",
|
||||
table_uri
|
||||
);
|
||||
assert_eq!(table_uri, format!("{}/test.lance", uri));
|
||||
|
||||
// A real query string should still be propagated.
|
||||
let with_query = format!("{}?foo=bar", uri);
|
||||
let request_with_query = ConnectRequest {
|
||||
uri: with_query,
|
||||
#[cfg(feature = "remote")]
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
namespace_client_properties: Default::default(),
|
||||
manifest_enabled: false,
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
let db_with_query = ListingDatabase::connect_with_options(&request_with_query)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(db_with_query.query_string.as_deref(), Some("foo=bar"));
|
||||
let table_uri = db_with_query.table_uri("test").unwrap();
|
||||
assert_eq!(table_uri, format!("{}/test.lance?foo=bar", uri));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_client() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
@@ -11,6 +11,7 @@ use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore;
|
||||
use lance_io::object_store::{ObjectStoreParams, StorageOptionsAccessor};
|
||||
use lance_namespace::{
|
||||
LanceNamespace,
|
||||
error::{ErrorCode, NamespaceError},
|
||||
models::{
|
||||
CreateNamespaceRequest, CreateNamespaceResponse, DeclareTableRequest,
|
||||
DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest,
|
||||
@@ -29,7 +30,7 @@ use crate::database::listing::{
|
||||
OPT_NEW_TABLE_V2_MANIFEST_PATHS,
|
||||
};
|
||||
use crate::error::{Error, Result};
|
||||
use crate::table::NativeTable;
|
||||
use crate::table::{NativeTable, map_namespace_lance_error};
|
||||
use lance::dataset::WriteMode;
|
||||
|
||||
use super::{
|
||||
@@ -37,6 +38,19 @@ use super::{
|
||||
Database, OpenTableRequest, TableNamesRequest,
|
||||
};
|
||||
|
||||
/// Returns true if the given `lance::Error` (anywhere in its source chain) is a
|
||||
/// `NamespaceError::TableAlreadyExists`.
|
||||
fn is_table_already_exists_namespace_error(err: &lance::Error) -> bool {
|
||||
let mut current: Option<&(dyn std::error::Error + 'static)> = Some(err);
|
||||
while let Some(e) = current {
|
||||
if let Some(ns_err) = e.downcast_ref::<NamespaceError>() {
|
||||
return ns_err.code() == ErrorCode::TableAlreadyExists;
|
||||
}
|
||||
current = e.source();
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// A database implementation that uses lance-namespace for table management
|
||||
pub struct LanceNamespaceDatabase {
|
||||
namespace: Arc<dyn LanceNamespace>,
|
||||
@@ -356,13 +370,15 @@ impl Database for LanceNamespaceDatabase {
|
||||
(loc, opts, response.managed_versioning)
|
||||
}
|
||||
Err(e)
|
||||
if matches!(request.mode, CreateTableMode::Create) && {
|
||||
let err_str = e.to_string();
|
||||
err_str.contains("already exists")
|
||||
|| err_str.contains("TableAlreadyExists")
|
||||
|| err_str.contains("table already exists")
|
||||
} =>
|
||||
if matches!(request.mode, CreateTableMode::Create)
|
||||
&& is_table_already_exists_namespace_error(&e) =>
|
||||
{
|
||||
// A declare conflict can either mean (a) the table was previously
|
||||
// *declared* but never written (in which case we should proceed and
|
||||
// create it), or (b) the table is fully realized (in which case the
|
||||
// user is creating something that already exists and we should
|
||||
// surface TableAlreadyExists). Disambiguate by describing the table
|
||||
// and checking whether it has both a version and a schema.
|
||||
let response = self
|
||||
.namespace
|
||||
.describe_table(DescribeTableRequest {
|
||||
@@ -370,11 +386,8 @@ impl Database for LanceNamespaceDatabase {
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|describe_err| Error::Runtime {
|
||||
message: format!(
|
||||
"Failed to describe existing declared table after declare conflict: {}",
|
||||
describe_err
|
||||
),
|
||||
.map_err(|describe_err| {
|
||||
map_namespace_lance_error(describe_err, &request.name)
|
||||
})?;
|
||||
|
||||
if response.version.is_some() && response.schema.is_some() {
|
||||
@@ -394,9 +407,7 @@ impl Database for LanceNamespaceDatabase {
|
||||
(loc, opts, response.managed_versioning)
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(Error::Runtime {
|
||||
message: format!("Failed to declare table: {}", e),
|
||||
});
|
||||
return Err(map_namespace_lance_error(e, &request.name));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1086,8 +1097,120 @@ mod tests {
|
||||
.execute()
|
||||
.await;
|
||||
|
||||
// Verify: Should return an error
|
||||
assert!(result.is_err());
|
||||
// Verify: Should return TableNotFound — not a generic Runtime/internal error
|
||||
// (regression test for ENT-1235: open_table on missing table previously surfaced as
|
||||
// a generic 500/Runtime error rather than TableNotFound).
|
||||
match result {
|
||||
Err(Error::TableNotFound { name, .. }) => {
|
||||
assert_eq!(name, "non_existent_table");
|
||||
}
|
||||
Err(other) => panic!("Expected TableNotFound, got: {:?}", other),
|
||||
Ok(_) => panic!("Expected open_table to fail, but it succeeded"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_open_table_not_found_at_root() {
|
||||
// Same as above, but at the root namespace (no parent namespace creation).
|
||||
// Covers the common code path used by `db.open_table("foo")` without a namespace.
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
let result = conn.open_table("missing_at_root").execute().await;
|
||||
|
||||
match result {
|
||||
Err(Error::TableNotFound { name, .. }) => {
|
||||
assert_eq!(name, "missing_at_root");
|
||||
}
|
||||
Err(other) => panic!("Expected TableNotFound, got: {:?}", other),
|
||||
Ok(_) => panic!("Expected open_table to fail, but it succeeded"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_create_table_already_exists() {
|
||||
// Regression test for ENT-1235: create_table on an existing table (in default
|
||||
// Create mode) should return TableAlreadyExists, not a generic Runtime/500 error.
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
conn.create_namespace(CreateNamespaceRequest {
|
||||
id: Some(vec!["test_ns".into()]),
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.expect("Failed to create namespace");
|
||||
|
||||
// Create the table once.
|
||||
conn.create_table("dup_table", create_test_data())
|
||||
.namespace(vec!["test_ns".into()])
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table the first time");
|
||||
|
||||
// Try to create it again with the default Create mode.
|
||||
let result = conn
|
||||
.create_table("dup_table", create_test_data())
|
||||
.namespace(vec!["test_ns".into()])
|
||||
.execute()
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(Error::TableAlreadyExists { name }) => {
|
||||
assert_eq!(name, "dup_table");
|
||||
}
|
||||
Err(other) => panic!("Expected TableAlreadyExists, got: {:?}", other),
|
||||
Ok(_) => panic!("Expected create_table to fail, but it succeeded"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_create_table_already_exists_at_root() {
|
||||
// Same as above, but at the root namespace.
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
conn.create_table("dup_root", create_test_data())
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table the first time");
|
||||
|
||||
let result = conn
|
||||
.create_table("dup_root", create_test_data())
|
||||
.execute()
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(Error::TableAlreadyExists { name }) => {
|
||||
assert_eq!(name, "dup_root");
|
||||
}
|
||||
Err(other) => panic!("Expected TableAlreadyExists, got: {:?}", other),
|
||||
Ok(_) => panic!("Expected create_table to fail, but it succeeded"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -138,4 +138,69 @@ mod tests {
|
||||
let sql = expr_to_sql_string(&expr).unwrap();
|
||||
assert!(sql.contains("price"));
|
||||
}
|
||||
|
||||
#[test]
fn test_binary_literal() {
    use datafusion_common::ScalarValue;

    // A bare binary literal is rendered as an uppercase `X'..'` hex string.
    let payload = vec![0xde, 0xad, 0xbe, 0xef];
    let literal = lit(ScalarValue::Binary(Some(payload)));
    assert_eq!(expr_to_sql_string(&literal).unwrap(), "X'DEADBEEF'");
}
|
||||
|
||||
#[test]
fn test_binary_literal_in_filter() {
    use datafusion_common::ScalarValue;

    // Binary literal on the right-hand side of an equality comparison.
    let needle = lit(ScalarValue::Binary(Some(vec![0xca, 0xfe])));
    let filter = col("data").eq(needle);
    assert_eq!(expr_to_sql_string(&filter).unwrap(), "(data = X'CAFE')");
}
|
||||
|
||||
#[test]
fn test_binary_literal_compound() {
    use datafusion_common::ScalarValue;

    // A binary comparison AND-ed with an ordinary integer comparison.
    let binary_match = col("data").eq(lit(ScalarValue::Binary(Some(vec![0x01]))));
    let id_bound = col("id").gt(lit(5i64));
    let rendered = expr_to_sql_string(&binary_match.and(id_bound)).unwrap();
    assert_eq!(rendered, "((data = X'01') AND (id > 5))");
}
|
||||
|
||||
#[test]
fn test_null_binary_literal() {
    use datafusion_common::ScalarValue;

    // A binary scalar holding `None` is rendered as SQL `NULL`.
    let null_binary = lit(ScalarValue::Binary(None));
    assert_eq!(expr_to_sql_string(&null_binary).unwrap(), "NULL");
}
|
||||
|
||||
#[test]
fn test_binary_literal_in_function_call() {
    use datafusion_common::ScalarValue;

    // Regression test for the placeholder rewrite path: a binary literal
    // passed as a scalar-function argument must be serialized correctly too.
    let argument = lit(ScalarValue::Binary(Some(vec![0xff])));
    let call = contains(col("data"), argument);
    assert_eq!(expr_to_sql_string(&call).unwrap(), "contains(data, X'FF')");
}
|
||||
|
||||
#[test]
fn test_binary_literal_in_negation() {
    use datafusion_common::ScalarValue;
    use std::ops::Not;

    // A negated equality against a binary literal.
    let equality = col("data").eq(lit(ScalarValue::Binary(Some(vec![0xab, 0xcd]))));
    let negated = equality.not();
    assert_eq!(expr_to_sql_string(&negated).unwrap(), "NOT (data = X'ABCD')");
}
|
||||
|
||||
#[test]
fn test_multiple_binary_literals() {
    use datafusion_common::ScalarValue;

    // Two distinct binary literals in one expression must each map back to
    // their own bytes.
    let first = col("a").eq(lit(ScalarValue::Binary(Some(vec![0x01]))));
    let second = col("b").eq(lit(ScalarValue::Binary(Some(vec![0x02, 0x03]))));
    let rendered = expr_to_sql_string(&first.and(second)).unwrap();
    assert_eq!(rendered, "((a = X'01') AND (b = X'0203'))");
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
|
||||
use datafusion_expr::Expr;
|
||||
use datafusion_sql::unparser::{self, dialect::Dialect};
|
||||
|
||||
@@ -28,7 +30,36 @@ impl Dialect for LanceSqlDialect {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn expr_to_sql_string(expr: &Expr) -> crate::Result<String> {
|
||||
/// Prefix for placeholder strings inserted in place of binary literals. Chosen
|
||||
/// to be extremely unlikely to occur in user data.
|
||||
const BINARY_PLACEHOLDER_PREFIX: &str = "__lancedb_binary_placeholder_";
|
||||
|
||||
/// Formats `bytes` as a SQL hexadecimal byte-string literal, e.g.
/// `[0xCA, 0xFE]` becomes `X'CAFE'`.
///
/// Every byte is written as exactly two uppercase hex digits; an empty slice
/// yields `X''`.
fn bytes_to_hex_sql(bytes: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    // Single up-front allocation: `X'` + two digits per byte + closing `'`.
    // This avoids building a temporary `String` for every byte.
    let mut literal = String::with_capacity(bytes.len() * 2 + 3);
    literal.push_str("X'");
    for &byte in bytes {
        literal.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        literal.push(char::from(HEX_DIGITS[usize::from(byte & 0x0F)]));
    }
    literal.push('\'');
    literal
}
|
||||
|
||||
/// Returns true if *expr* contains a `Binary` or `LargeBinary` scalar literal
|
||||
/// anywhere in its subtree. DataFusion's SQL unparser cannot serialize those
|
||||
/// variants, so we route such expressions through a placeholder-substitution
|
||||
/// path that emits SQL `X'...'` byte-string literals.
|
||||
fn has_binary_literal(expr: &Expr) -> bool {
|
||||
let mut found = false;
|
||||
let _ = expr.apply(&mut |e: &Expr| {
|
||||
if matches!(
|
||||
e,
|
||||
Expr::Literal(ScalarValue::Binary(_) | ScalarValue::LargeBinary(_), _)
|
||||
) {
|
||||
found = true;
|
||||
Ok(TreeNodeRecursion::Stop)
|
||||
} else {
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
}
|
||||
});
|
||||
found
|
||||
}
|
||||
|
||||
fn run_unparser(expr: &Expr) -> crate::Result<String> {
|
||||
let ast = unparser::Unparser::new(&LanceSqlDialect)
|
||||
.expr_to_sql(expr)
|
||||
.map_err(|e| crate::Error::InvalidInput {
|
||||
@@ -36,3 +67,49 @@ pub fn expr_to_sql_string(expr: &Expr) -> crate::Result<String> {
|
||||
})?;
|
||||
Ok(ast.to_string())
|
||||
}
|
||||
|
||||
pub fn expr_to_sql_string(expr: &Expr) -> crate::Result<String> {
|
||||
// Fast path: no binary literals — DataFusion's unparser handles everything.
|
||||
if !has_binary_literal(expr) {
|
||||
return run_unparser(expr);
|
||||
}
|
||||
|
||||
// Slow path: DataFusion's unparser cannot serialize `Binary`/`LargeBinary`
|
||||
// scalars, so we rewrite each one to a unique string-literal placeholder,
|
||||
// let the unparser do the rest of the work, then substitute the SQL
|
||||
// `X'...'` byte-string literal back in. This keeps the operator/function
|
||||
// serialization logic centralized in DataFusion and works for every
|
||||
// expression node type the unparser supports.
|
||||
let mut bindings: Vec<Vec<u8>> = Vec::new();
|
||||
let rewritten = expr
|
||||
.clone()
|
||||
.transform(|e: Expr| match e {
|
||||
Expr::Literal(ScalarValue::Binary(Some(bytes)), m)
|
||||
| Expr::Literal(ScalarValue::LargeBinary(Some(bytes)), m) => {
|
||||
let placeholder = format!("{}{}__", BINARY_PLACEHOLDER_PREFIX, bindings.len());
|
||||
bindings.push(bytes);
|
||||
Ok(Transformed::yes(Expr::Literal(
|
||||
ScalarValue::Utf8(Some(placeholder)),
|
||||
m,
|
||||
)))
|
||||
}
|
||||
Expr::Literal(ScalarValue::Binary(None), m)
|
||||
| Expr::Literal(ScalarValue::LargeBinary(None), m) => {
|
||||
Ok(Transformed::yes(Expr::Literal(ScalarValue::Null, m)))
|
||||
}
|
||||
other => Ok(Transformed::no(other)),
|
||||
})
|
||||
.map_err(|e| crate::Error::InvalidInput {
|
||||
message: format!("failed to rewrite expression: {}", e),
|
||||
})?
|
||||
.data;
|
||||
|
||||
let mut sql = run_unparser(&rewritten)?;
|
||||
for (i, bytes) in bindings.iter().enumerate() {
|
||||
// The unparser quotes string literals with single quotes, so the
|
||||
// placeholder appears as `'__lancedb_binary_placeholder_<i>__'`.
|
||||
let quoted = format!("'{}{}__'", BINARY_PLACEHOLDER_PREFIX, i);
|
||||
sql = sql.replace("ed, &bytes_to_hex_sql(bytes));
|
||||
}
|
||||
Ok(sql)
|
||||
}
|
||||
|
||||
@@ -13,7 +13,10 @@ use crate::{DistanceType, Error, Result, table::BaseTable};
|
||||
|
||||
use self::{
|
||||
scalar::{BTreeIndexBuilder, BitmapIndexBuilder, LabelListIndexBuilder},
|
||||
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, IvfSqIndexBuilder},
|
||||
vector::{
|
||||
IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
|
||||
IvfSqIndexBuilder,
|
||||
},
|
||||
};
|
||||
|
||||
pub mod scalar;
|
||||
@@ -67,6 +70,10 @@ pub enum Index {
|
||||
/// IVF-HNSW index with Scalar Quantization
|
||||
/// It is a variant of the HNSW algorithm that uses scalar quantization to compress the vectors.
|
||||
IvfHnswSq(IvfHnswSqIndexBuilder),
|
||||
|
||||
/// IVF-HNSW index without quantization.
|
||||
/// Stores raw vectors, providing the highest recall at the cost of more memory and disk space.
|
||||
IvfHnswFlat(IvfHnswFlatIndexBuilder),
|
||||
}
|
||||
|
||||
/// Builder for the create_index operation
|
||||
@@ -290,6 +297,8 @@ pub enum IndexType {
|
||||
IvfHnswPq,
|
||||
#[serde(alias = "IVF_HNSW_SQ")]
|
||||
IvfHnswSq,
|
||||
#[serde(alias = "IVF_HNSW_FLAT")]
|
||||
IvfHnswFlat,
|
||||
// Scalar
|
||||
#[serde(alias = "BTREE")]
|
||||
BTree,
|
||||
@@ -311,6 +320,7 @@ impl std::fmt::Display for IndexType {
|
||||
Self::IvfRq => write!(f, "IVF_RQ"),
|
||||
Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
|
||||
Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
|
||||
Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
|
||||
Self::BTree => write!(f, "BTREE"),
|
||||
Self::Bitmap => write!(f, "BITMAP"),
|
||||
Self::LabelList => write!(f, "LABEL_LIST"),
|
||||
@@ -334,6 +344,7 @@ impl std::str::FromStr for IndexType {
|
||||
"IVF_RQ" => Ok(Self::IvfRq),
|
||||
"IVF_HNSW_PQ" => Ok(Self::IvfHnswPq),
|
||||
"IVF_HNSW_SQ" => Ok(Self::IvfHnswSq),
|
||||
"IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat),
|
||||
_ => Err(Error::InvalidInput {
|
||||
message: format!("the input value {} is not a valid IndexType", value),
|
||||
}),
|
||||
|
||||
@@ -474,3 +474,46 @@ impl IvfHnswSqIndexBuilder {
|
||||
impl_ivf_params_setter!();
|
||||
impl_hnsw_params_setter!();
|
||||
}
|
||||
|
||||
/// Builder for an IVF_HNSW_FLAT index.
|
||||
///
|
||||
/// This index combines IVF partitioning with an HNSW graph per partition,
|
||||
/// storing raw (unquantized) vectors. It offers the highest recall among
|
||||
/// the IVF_HNSW family at the cost of more memory and disk space compared
|
||||
/// to [`IvfHnswSqIndexBuilder`] or [`IvfHnswPqIndexBuilder`].
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct IvfHnswFlatIndexBuilder {
|
||||
// IVF
|
||||
#[serde(rename = "metric_type")]
|
||||
pub(crate) distance_type: DistanceType,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) num_partitions: Option<u32>,
|
||||
pub(crate) sample_rate: u32,
|
||||
pub(crate) max_iterations: u32,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) target_partition_size: Option<u32>,
|
||||
|
||||
// HNSW
|
||||
pub(crate) m: u32,
|
||||
pub(crate) ef_construction: u32,
|
||||
}
|
||||
|
||||
impl Default for IvfHnswFlatIndexBuilder {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
distance_type: DistanceType::L2,
|
||||
num_partitions: None,
|
||||
sample_rate: 256,
|
||||
max_iterations: 50,
|
||||
m: 20,
|
||||
ef_construction: 300,
|
||||
target_partition_size: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IvfHnswFlatIndexBuilder {
|
||||
impl_distance_type_setter!();
|
||||
impl_ivf_params_setter!();
|
||||
impl_hnsw_params_setter!();
|
||||
}
|
||||
|
||||
@@ -5,11 +5,12 @@
|
||||
|
||||
use std::{fmt::Formatter, sync::Arc};
|
||||
|
||||
use futures::{TryFutureExt, stream::BoxStream};
|
||||
use futures::{StreamExt, TryFutureExt, stream::BoxStream};
|
||||
use lance::io::WrappingObjectStore;
|
||||
use object_store::{
|
||||
Error, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
|
||||
PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, UploadPart, path::Path,
|
||||
CopyOptions, Error, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
|
||||
ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result,
|
||||
UploadPart, path::Path,
|
||||
};
|
||||
|
||||
use async_trait::async_trait;
|
||||
@@ -93,20 +94,6 @@ impl ObjectStore for MirroringObjectStore {
|
||||
self.primary.get_opts(location, options).await
|
||||
}
|
||||
|
||||
async fn head(&self, location: &Path) -> Result<ObjectMeta> {
|
||||
self.primary.head(location).await
|
||||
}
|
||||
|
||||
async fn delete(&self, location: &Path) -> Result<()> {
|
||||
if !location.primary_only() {
|
||||
match self.secondary.delete(location).await {
|
||||
Err(Error::NotFound { .. }) | Ok(_) => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
self.primary.delete(location).await
|
||||
}
|
||||
|
||||
fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
|
||||
self.primary.list(prefix)
|
||||
}
|
||||
@@ -115,21 +102,40 @@ impl ObjectStore for MirroringObjectStore {
|
||||
self.primary.list_with_delimiter(prefix).await
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &Path, to: &Path) -> Result<()> {
|
||||
if to.primary_only() {
|
||||
self.primary.copy(from, to).await
|
||||
} else {
|
||||
self.secondary.copy(from, to).await?;
|
||||
self.primary.copy(from, to).await?;
|
||||
Ok(())
|
||||
}
|
||||
fn delete_stream(
|
||||
&self,
|
||||
locations: BoxStream<'static, Result<Path>>,
|
||||
) -> BoxStream<'static, Result<Path>> {
|
||||
let primary = self.primary.clone();
|
||||
let secondary = self.secondary.clone();
|
||||
locations
|
||||
.map(move |location| {
|
||||
let primary = primary.clone();
|
||||
let secondary = secondary.clone();
|
||||
async move {
|
||||
let location = location?;
|
||||
if !location.primary_only() {
|
||||
match secondary.delete(&location).await {
|
||||
Err(Error::NotFound { .. }) | Ok(_) => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
primary.delete(&location).await?;
|
||||
Ok(location)
|
||||
}
|
||||
})
|
||||
.buffered(10)
|
||||
.boxed()
|
||||
}
|
||||
|
||||
async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> {
|
||||
if !to.primary_only() {
|
||||
self.secondary.copy(from, to).await?;
|
||||
async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> Result<()> {
|
||||
if to.primary_only() {
|
||||
self.primary.copy_opts(from, to, options).await
|
||||
} else {
|
||||
self.secondary.copy_opts(from, to, options.clone()).await?;
|
||||
self.primary.copy_opts(from, to, options).await?;
|
||||
Ok(())
|
||||
}
|
||||
self.primary.copy_if_not_exists(from, to).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,9 +10,9 @@ use bytes::Bytes;
|
||||
use futures::stream::BoxStream;
|
||||
use lance::io::WrappingObjectStore;
|
||||
use object_store::{
|
||||
GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
|
||||
PutMultipartOptions, PutOptions, PutPayload, PutResult, Result as OSResult, UploadPart,
|
||||
path::Path,
|
||||
CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
|
||||
PutMultipartOptions, PutOptions, PutPayload, PutResult, RenameOptions, Result as OSResult,
|
||||
UploadPart, path::Path,
|
||||
};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
@@ -81,11 +81,6 @@ impl IoTrackingStore {
|
||||
#[async_trait::async_trait]
|
||||
#[deny(clippy::missing_trait_methods)]
|
||||
impl ObjectStore for IoTrackingStore {
|
||||
async fn put(&self, location: &Path, bytes: PutPayload) -> OSResult<PutResult> {
|
||||
self.record_write(bytes.content_length() as u64);
|
||||
self.target.put(location, bytes).await
|
||||
}
|
||||
|
||||
async fn put_opts(
|
||||
&self,
|
||||
location: &Path,
|
||||
@@ -96,14 +91,6 @@ impl ObjectStore for IoTrackingStore {
|
||||
self.target.put_opts(location, bytes, opts).await
|
||||
}
|
||||
|
||||
async fn put_multipart(&self, location: &Path) -> OSResult<Box<dyn MultipartUpload>> {
|
||||
let target = self.target.put_multipart(location).await?;
|
||||
Ok(Box::new(IoTrackingMultipartUpload {
|
||||
target,
|
||||
stats: self.stats.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
async fn put_multipart_opts(
|
||||
&self,
|
||||
location: &Path,
|
||||
@@ -116,15 +103,6 @@ impl ObjectStore for IoTrackingStore {
|
||||
}))
|
||||
}
|
||||
|
||||
async fn get(&self, location: &Path) -> OSResult<GetResult> {
|
||||
let result = self.target.get(location).await;
|
||||
if let Ok(result) = &result {
|
||||
let num_bytes = result.range.end - result.range.start;
|
||||
self.record_read(num_bytes);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn get_opts(&self, location: &Path, options: GetOptions) -> OSResult<GetResult> {
|
||||
let result = self.target.get_opts(location, options).await;
|
||||
if let Ok(result) = &result {
|
||||
@@ -134,14 +112,6 @@ impl ObjectStore for IoTrackingStore {
|
||||
result
|
||||
}
|
||||
|
||||
async fn get_range(&self, location: &Path, range: std::ops::Range<u64>) -> OSResult<Bytes> {
|
||||
let result = self.target.get_range(location, range).await;
|
||||
if let Ok(result) = &result {
|
||||
self.record_read(result.len() as u64);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn get_ranges(
|
||||
&self,
|
||||
location: &Path,
|
||||
@@ -154,20 +124,11 @@ impl ObjectStore for IoTrackingStore {
|
||||
result
|
||||
}
|
||||
|
||||
async fn head(&self, location: &Path) -> OSResult<ObjectMeta> {
|
||||
self.record_read(0);
|
||||
self.target.head(location).await
|
||||
}
|
||||
|
||||
async fn delete(&self, location: &Path) -> OSResult<()> {
|
||||
fn delete_stream(
|
||||
&self,
|
||||
locations: BoxStream<'static, OSResult<Path>>,
|
||||
) -> BoxStream<'static, OSResult<Path>> {
|
||||
self.record_write(0);
|
||||
self.target.delete(location).await
|
||||
}
|
||||
|
||||
fn delete_stream<'a>(
|
||||
&'a self,
|
||||
locations: BoxStream<'a, OSResult<Path>>,
|
||||
) -> BoxStream<'a, OSResult<Path>> {
|
||||
self.target.delete_stream(locations)
|
||||
}
|
||||
|
||||
@@ -190,24 +151,14 @@ impl ObjectStore for IoTrackingStore {
|
||||
self.target.list_with_delimiter(prefix).await
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.copy(from, to).await
|
||||
self.target.copy_opts(from, to, options).await
|
||||
}
|
||||
|
||||
async fn rename(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
async fn rename_opts(&self, from: &Path, to: &Path, options: RenameOptions) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.rename(from, to).await
|
||||
}
|
||||
|
||||
async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.rename_if_not_exists(from, to).await
|
||||
}
|
||||
|
||||
async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> OSResult<()> {
|
||||
self.record_write(0);
|
||||
self.target.copy_if_not_exists(from, to).await
|
||||
self.target.rename_opts(from, to, options).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1540,6 +1540,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
Index::IvfPq(p) => ("IVF_PQ", Some(to_json(p)?)),
|
||||
Index::IvfSq(p) => ("IVF_SQ", Some(to_json(p)?)),
|
||||
Index::IvfHnswSq(p) => ("IVF_HNSW_SQ", Some(to_json(p)?)),
|
||||
Index::IvfHnswFlat(p) => ("IVF_HNSW_FLAT", Some(to_json(p)?)),
|
||||
Index::IvfRq(p) => ("IVF_RQ", Some(to_json(p)?)),
|
||||
Index::BTree(p) => ("BTREE", Some(to_json(p)?)),
|
||||
Index::Bitmap(p) => ("BITMAP", Some(to_json(p)?)),
|
||||
@@ -2068,7 +2069,8 @@ mod tests {
|
||||
use serde_json::json;
|
||||
|
||||
use crate::index::vector::{
|
||||
IvfFlatIndexBuilder, IvfHnswSqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder,
|
||||
IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswSqIndexBuilder, IvfRqIndexBuilder,
|
||||
IvfSqIndexBuilder,
|
||||
};
|
||||
use crate::remote::JSON_CONTENT_TYPE;
|
||||
use crate::remote::db::DEFAULT_SERVER_VERSION;
|
||||
@@ -3321,6 +3323,35 @@ mod tests {
|
||||
.ef_construction(500),
|
||||
),
|
||||
),
|
||||
(
|
||||
"IVF_HNSW_FLAT",
|
||||
json!({
|
||||
"metric_type": "l2",
|
||||
"sample_rate": 256,
|
||||
"max_iterations": 50,
|
||||
"m": 20,
|
||||
"ef_construction": 300,
|
||||
}),
|
||||
Index::IvfHnswFlat(Default::default()),
|
||||
),
|
||||
(
|
||||
"IVF_HNSW_FLAT",
|
||||
json!({
|
||||
"metric_type": "cosine",
|
||||
"num_partitions": 64,
|
||||
"sample_rate": 256,
|
||||
"max_iterations": 50,
|
||||
"m": 40,
|
||||
"ef_construction": 500,
|
||||
}),
|
||||
Index::IvfHnswFlat(
|
||||
IvfHnswFlatIndexBuilder::default()
|
||||
.distance_type(DistanceType::Cosine)
|
||||
.num_partitions(64)
|
||||
.num_edges(40)
|
||||
.ef_construction(500),
|
||||
),
|
||||
),
|
||||
(
|
||||
"IVF_SQ",
|
||||
json!({
|
||||
|
||||
@@ -36,6 +36,7 @@ pub use query::AnyQuery;
|
||||
|
||||
use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore;
|
||||
use lance_namespace::LanceNamespace;
|
||||
use lance_namespace::error::NamespaceError;
|
||||
use lance_namespace::models::DescribeTableRequest;
|
||||
use lance_table::format::Manifest;
|
||||
use lance_table::io::commit::CommitHandler;
|
||||
@@ -94,6 +95,53 @@ pub use schema_evolution::{AddColumnsResult, AlterColumnsResult, DropColumnsResu
|
||||
use serde_with::skip_serializing_none;
|
||||
pub use update::{UpdateBuilder, UpdateResult};
|
||||
|
||||
/// Walk a boxed error chain to find the innermost `NamespaceError`.
|
||||
///
|
||||
/// Callers like `DatasetBuilder::from_namespace` re-wrap their inner namespace error
|
||||
/// inside a fresh `lance::Error::Namespace`, so a single downcast at the top level
|
||||
/// won't find it. This walks `.source()` to unwrap arbitrarily nested layers.
|
||||
fn find_namespace_error<'a>(
|
||||
err: &'a (dyn std::error::Error + 'static),
|
||||
) -> Option<&'a NamespaceError> {
|
||||
let mut current: Option<&(dyn std::error::Error + 'static)> = Some(err);
|
||||
while let Some(e) = current {
|
||||
if let Some(ns_err) = e.downcast_ref::<NamespaceError>() {
|
||||
return Some(ns_err);
|
||||
}
|
||||
current = e.source();
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Map a `lance::Error` coming from a `lance-namespace` call into a `lancedb::Error`,
|
||||
/// preserving the fine-grained namespace error code (e.g. `TableNotFound`,
|
||||
/// `TableAlreadyExists`). Errors that aren't recognized namespace error variants fall
|
||||
/// through to a generic runtime error rather than `TableNotFound`/`TableAlreadyExists`.
|
||||
pub(crate) fn map_namespace_lance_error(err: lance::Error, table_name: &str) -> Error {
|
||||
if let Some(code) = find_namespace_error(&err).map(NamespaceError::code) {
|
||||
match code {
|
||||
lance_namespace::error::ErrorCode::TableNotFound => {
|
||||
return Error::TableNotFound {
|
||||
name: table_name.to_string(),
|
||||
source: Box::new(err),
|
||||
};
|
||||
}
|
||||
lance_namespace::error::ErrorCode::TableAlreadyExists => {
|
||||
return Error::TableAlreadyExists {
|
||||
name: table_name.to_string(),
|
||||
};
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
match err {
|
||||
lance::Error::Namespace { source, .. } => Error::Runtime {
|
||||
message: format!("Namespace error: {}", source),
|
||||
},
|
||||
other => other.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Defines the type of column
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum ColumnKind {
|
||||
@@ -1494,12 +1542,7 @@ impl NativeTable {
|
||||
// and storage options from the namespace
|
||||
let builder = DatasetBuilder::from_namespace(namespace_client.clone(), table_id)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
lance::Error::Namespace { source, .. } => Error::Runtime {
|
||||
message: format!("Failed to get table info from namespace: {:?}", source),
|
||||
},
|
||||
e => e.into(),
|
||||
})?;
|
||||
.map_err(|e| map_namespace_lance_error(e, name))?;
|
||||
|
||||
let dataset = builder
|
||||
.with_read_params(params)
|
||||
@@ -2033,6 +2076,24 @@ impl NativeTable {
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
}
|
||||
Index::IvfHnswFlat(index) => {
|
||||
Self::validate_index_type(field, "IVF HNSW FLAT", supported_vector_data_type)?;
|
||||
let ivf_params = Self::build_ivf_params(
|
||||
index.num_partitions,
|
||||
index.target_partition_size,
|
||||
index.sample_rate,
|
||||
index.max_iterations,
|
||||
);
|
||||
let hnsw_params = HnswBuildParams::default()
|
||||
.num_edges(index.m as usize)
|
||||
.ef_construction(index.ef_construction as usize);
|
||||
let lance_idx_params = VectorIndexParams::ivf_hnsw(
|
||||
index.distance_type.into(),
|
||||
ivf_params,
|
||||
hnsw_params,
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2058,7 +2119,8 @@ impl NativeTable {
|
||||
| Index::IvfPq(_)
|
||||
| Index::IvfRq(_)
|
||||
| Index::IvfHnswPq(_)
|
||||
| Index::IvfHnswSq(_) => IndexType::Vector,
|
||||
| Index::IvfHnswSq(_)
|
||||
| Index::IvfHnswFlat(_) => IndexType::Vector,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2329,13 +2391,28 @@ impl BaseTable for NativeTable {
|
||||
message: "Multi-column (composite) indices are not yet supported".to_string(),
|
||||
});
|
||||
}
|
||||
let schema = self.schema().await?;
|
||||
let dataset = self.dataset.get().await?;
|
||||
let Some(field_path) = dataset.schema().resolve_case_insensitive(&opts.columns[0]) else {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
"Unable to get field named {:?}. Valid fields: {:?}",
|
||||
opts.columns[0],
|
||||
dataset.schema().field_paths()
|
||||
),
|
||||
});
|
||||
};
|
||||
let field = (*field_path.last().expect("resolved field path is non-empty")).clone();
|
||||
let names = field_path
|
||||
.iter()
|
||||
.map(|f| f.name.as_str())
|
||||
.collect::<Vec<_>>();
|
||||
let column = lance_core::datatypes::format_field_path(&names);
|
||||
drop(dataset);
|
||||
|
||||
let field = schema.field_with_name(&opts.columns[0])?;
|
||||
|
||||
let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?;
|
||||
let index_type = self.get_index_type_for_field(field, &opts.index);
|
||||
let columns = [field.name().as_str()];
|
||||
let field = Field::from(&field);
|
||||
let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?;
|
||||
let index_type = self.get_index_type_for_field(&field, &opts.index);
|
||||
let columns = [column.as_str()];
|
||||
self.dataset.ensure_mutable()?;
|
||||
let mut dataset = (*self.dataset.get().await?).clone();
|
||||
let mut builder = dataset
|
||||
@@ -2481,11 +2558,11 @@ impl BaseTable for NativeTable {
|
||||
|
||||
let mut columns = Vec::with_capacity(idx.fields.len());
|
||||
for field_id in &idx.fields {
|
||||
let Some(field) = dataset.schema().field_by_id(*field_id) else {
|
||||
let Ok(field_path) = dataset.schema().field_path(*field_id) else {
|
||||
log::warn!("The index {} ({}) referenced a field with id {} which does not exist in the schema", idx.name, idx.uuid, field_id);
|
||||
return None;
|
||||
};
|
||||
columns.push(field.name.clone());
|
||||
columns.push(field_path);
|
||||
}
|
||||
|
||||
let name = idx.name.clone();
|
||||
@@ -2699,7 +2776,7 @@ mod tests {
|
||||
|
||||
use arrow_array::{
|
||||
Array, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, RecordBatch,
|
||||
RecordBatchIterator, RecordBatchReader, StringArray,
|
||||
RecordBatchIterator, RecordBatchReader, StringArray, StructArray,
|
||||
builder::{ListBuilder, StringBuilder},
|
||||
};
|
||||
use arrow_array::{BinaryArray, LargeBinaryArray};
|
||||
@@ -3176,6 +3253,56 @@ mod tests {
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_index_ivf_hnsw_flat() {
|
||||
use arrow_array::RecordBatch;
|
||||
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
|
||||
use rand;
|
||||
use std::iter::repeat_with;
|
||||
|
||||
use crate::index::vector::IvfHnswFlatIndexBuilder;
|
||||
use arrow_array::Float32Array;
|
||||
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let conn = connect(uri).execute().await.unwrap();
|
||||
|
||||
let dimension = 16;
|
||||
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
|
||||
"embeddings",
|
||||
DataType::FixedSizeList(
|
||||
Arc::new(Field::new("item", DataType::Float32, true)),
|
||||
dimension,
|
||||
),
|
||||
false,
|
||||
)]));
|
||||
|
||||
let float_arr = Float32Array::from(
|
||||
repeat_with(rand::random::<f32>)
|
||||
.take(512 * dimension as usize)
|
||||
.collect::<Vec<f32>>(),
|
||||
);
|
||||
|
||||
let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
|
||||
let batch = RecordBatch::try_new(schema.clone(), vec![vectors.clone()]).unwrap();
|
||||
|
||||
let table = conn.create_table("test", batch).execute().await.unwrap();
|
||||
|
||||
let index = IvfHnswFlatIndexBuilder::default();
|
||||
table
|
||||
.create_index(&["embeddings"], Index::IvfHnswFlat(index))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let index_configs = table.list_indices().await.unwrap();
|
||||
assert_eq!(index_configs.len(), 1);
|
||||
let index = index_configs.into_iter().next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::IvfHnswFlat);
|
||||
assert_eq!(index.columns, vec!["embeddings".to_string()]);
|
||||
assert_eq!(table.count_rows(None).await.unwrap(), 512);
|
||||
}
|
||||
|
||||
fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
|
||||
let list_type = DataType::FixedSizeList(
|
||||
Arc::new(Field::new("item", values.data_type().clone(), true)),
|
||||
@@ -3255,6 +3382,57 @@ mod tests {
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_scalar_index_on_nested_field() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
|
||||
let metadata_fields = vec![Arc::new(Field::new("user_id", DataType::Int32, false))];
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new(
|
||||
"metadata",
|
||||
DataType::Struct(metadata_fields.clone().into()),
|
||||
false,
|
||||
),
|
||||
]));
|
||||
let metadata = StructArray::new(
|
||||
metadata_fields.into(),
|
||||
vec![Arc::new(Int32Array::from_iter_values(0..10))],
|
||||
None,
|
||||
);
|
||||
let batch = RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(Int32Array::from_iter_values(0..10)),
|
||||
Arc::new(metadata),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
let conn = ConnectBuilder::new(uri).execute().await.unwrap();
|
||||
let table = conn
|
||||
.create_table("my_table", batch)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
table
|
||||
.create_index(
|
||||
&["metadata.user_id"],
|
||||
Index::BTree(BTreeIndexBuilder::default()),
|
||||
)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let index_configs = table.list_indices().await.unwrap();
|
||||
assert_eq!(index_configs.len(), 1);
|
||||
assert_eq!(
|
||||
index_configs[0].columns,
|
||||
vec!["metadata.user_id".to_string()]
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_bitmap_index() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
|
||||
Reference in New Issue
Block a user