Compare commits

..

14 Commits

Author SHA1 Message Date
Brendan Clement
32c77879c9 feat(nodejs): surface skip_auto_cleanup on add and merge insert 2026-05-14 12:59:21 -07:00
Brendan Clement
9330a9b851 feat(nodejs): expose connectNamespace for namespace-backed connections (#3383)
### Summary 

Adds a `connectNamespace(implName, properties, options?)` function to the
NodeJS SDK. Closes #3380.

### Testing
- pnpm test
- Ran smoke test

```
import { connectNamespace } from "lancedb"
import { tmpdir } from "os";
import { mkdtempSync } from "fs";
import { join } from "path";

const dir = mkdtempSync(join(tmpdir(), "lancedb-connect-namespace-smoke-"));
console.log(`Using temp dir: ${dir}\n`);

// 1. Happy path: connect via the "dir" namespace impl, create + list a table.
console.log('Connecting via connectNamespace("dir", { root })...');
const db = await connectNamespace("dir", { root: dir });
console.log("  ✓ connected:", db.display());

console.log("Creating a table and listing it...");
await db.createTable("users", [
  { id: 1, name: "alice" },
  { id: 2, name: "bob" },
]);
console.log("  ✓ tableNames ->", await db.tableNames());

const table = await db.openTable("users");
console.log("  ✓ users.countRows ->", await table.countRows());

// 2. Storage options pass-through.
console.log("\nReconnecting with storageOptions (plumbing check)...");
const dbWithOpts = await connectNamespace(
  "dir",
  { root: dir },
  { storageOptions: { newTableDataStorageVersion: "stable" } },
);
console.log("  ✓ connected with storageOptions:", dbWithOpts.display());
await dbWithOpts.close();

// 3. Empty implName -> clear error.
console.log("\nCalling connectNamespace('', {}) (expect error)...");
try {
  await connectNamespace("", {});
  console.error("  UNEXPECTED: empty implName did not throw");
} catch (err) {
  console.log(`  ✓ Got expected error: ${err.message.split("\n")[0]}`);
}

// 4. Unknown impl -> error.
console.log("\nCalling connectNamespace('not-a-real-impl', {}) (expect error)...");
try {
  await connectNamespace("not-a-real-impl", {});
  console.error("  UNEXPECTED: unknown impl did not throw");
} catch (err) {
  console.log(`  ✓ Got expected error: ${err.message.split("\n")[0]}`);
}

// 5. Create a table inside a child namespace, then reconnect with a fresh
//    connectNamespace call and confirm the table is reachable via that
//    namespace path. (The dir+manifest impl keeps the namespace hierarchy in
//    a root manifest, so "scoping" happens via namespacePath args, not by
//    pointing root at a subdir.)
console.log("\nCreating a table inside a child namespace...");
const dir2 = mkdtempSync(join(tmpdir(), "lancedb-connect-namespace-smoke-"));
const writer = await connectNamespace("dir", {
  root: dir2,
  manifest_enabled: "true",
});
await writer.createNamespace(["analytics"]);
await writer.createTable(
  "orders",
  [
    { id: 1, total: 10 },
    { id: 2, total: 20 },
  ],
  ["analytics"],
);
console.log(
  "  ✓ writer sees tables under [analytics] ->",
  await writer.tableNames(["analytics"]),
);
await writer.close();

console.log("Reconnecting and reading the table via its namespace path...");
const reader = await connectNamespace("dir", {
  root: dir2,
  manifest_enabled: "true",
});
console.log(
  "  ✓ reader tableNames(['analytics']) ->",
  await reader.tableNames(["analytics"]),
);
const orders = await reader.openTable("orders", ["analytics"]);
console.log("  ✓ orders.countRows via reader ->", await orders.countRows());
await reader.close();

await db.close();
console.log("\nAll checks passed.");
```

```
Using temp dir: /var/folders/bj/hn6jv9c50y301d1nx0y8xmn00000gn/T/lancedb-connect-namespace-smoke-WByF1P

Connecting via connectNamespace("dir", { root })...
  ✓ connected: LanceNamespaceDatabase
Creating a table and listing it...
  ✓ tableNames -> [ 'users' ]
  ✓ users.countRows -> 2

Reconnecting with storageOptions (plumbing check)...
  ✓ connected with storageOptions: LanceNamespaceDatabase

Calling connectNamespace('', {}) (expect error)...
  ✓ Got expected error: implName must be a non-empty string

Calling connectNamespace('not-a-real-impl', {}) (expect error)...
  ✓ Got expected error: Invalid input, Failed to connect to namespace: Namespace { source: Unsupported { message: "Implementation 'not-a-real-impl' is not available. Supported: dir, rest" }, location: Location { file: "/Users/brendan/.cargo/git/checkouts/lance-8ddea23c38163eda/f693245/rust/lance-namespace-impls/src/connect.rs", line: 216, column: 14 } }

Creating a table inside a child namespace...
  ✓ writer sees tables under [analytics] -> [ 'orders' ]
Reconnecting and reading the table via its namespace path...
  ✓ reader tableNames(['analytics']) -> [ 'orders' ]
  ✓ orders.countRows via reader -> 2

All checks passed.
```

### Docs
- regenerated docs
2026-05-13 16:16:56 -07:00
Brendan Clement
02de07576e feat(nodejs): add namespace management methods on Connection (#3371)
### Summary

Closes #3363 

Adds the four namespace management methods to the NodeJS `Connection`,
bringing parity with the Rust core and Python bindings:

- `listNamespaces(parent?, options?)`
- `createNamespace(namespacePath, options?)`
- `dropNamespace(namespacePath, options?)`
- `describeNamespace(namespacePath)`

### Test plan
- npm test
- Ran a smoke test script

```typescript
import { connect } from '<lancePath>'
import { tmpdir } from "os";
import { mkdtempSync } from "fs";
import { join } from "path";

const dir = mkdtempSync(join(tmpdir(), "lancedb-smoke-"));
console.log(`Using temp dir: ${dir}\n`);

const db = await connect(dir, {
  namespaceClientProperties: { manifest_enabled: "true" },
});

console.log("Creating namespaces...");
await db.createNamespace(["analytics"]);
await db.createNamespace(["analytics", "sales"], {
  properties: { owner: "brendan", purpose: "smoke-test" },
});
await db.createNamespace(["marketing"]);

const root = await db.listNamespaces();
console.log("Root namespaces:", root.namespaces);

const children = await db.listNamespaces(["analytics"]);
console.log("Children of 'analytics':", children.namespaces);

const descWithProps = await db.describeNamespace(["analytics", "sales"]);
console.log("Describe analytics/sales (with properties):", descWithProps);

const descNoProps = await db.describeNamespace(["analytics"]);
console.log("Describe analytics (no properties):", descNoProps);

console.log("Describing a non-existent namespace (expect error)...");
try {
  await db.describeNamespace(["does-not-exist"]);
  console.error("  UNEXPECTED: describe succeeded for non-existent namespace");
} catch (err) {
  console.log(`  ✓ Got expected error: ${err.message.split("\n")[0]}`);
}

await db.dropNamespace(["marketing"]);
const afterDrop = await db.listNamespaces();
console.log("Root after dropping marketing:", afterDrop.namespaces);

await db.close();
console.log("\nAll operations completed successfully.");
```

```
Using temp dir: /var/folders/bj/hn6jv9c50y301d1nx0y8xmn00000gn/T/lancedb-smoke-MUC5NI

Creating namespaces...
Root namespaces: [ 'analytics', 'marketing' ]
Children of 'analytics': [ 'sales' ]
Describe analytics/sales (with properties): { properties: { purpose: 'smoke-test', owner: 'brendan' } }
Describe analytics (no properties): {}
Describing a non-existent namespace (expect error)...
  ✓ Got expected error: lance error: Namespace error: Namespace not found: does-not-exist, rust/lance-namespace-impls/src/dir/manifest.rs:2495:14  Caused by: Namespace error: Namespace not found: does-not-exist, rust/lance-namespace-impls/src/dir/manifest.rs:2495:14    Caused by: Namespace not found: does-not-exist
Root after dropping marketing: [ 'analytics' ]

All operations completed successfully.
```

### Documentation
- regenerated docs
2026-05-13 11:49:27 -07:00
Will Jones
81617fd3d9 ci(nodejs): switch from npm to pnpm 11 (#3373)
## Summary

Switch the nodejs bindings and examples package from npm to pnpm 11 to
pick up its stronger supply-chain defaults:

- `minimumReleaseAge` defaults to 1 day, so newly-published (potentially
compromised) versions aren't resolved into installs for at least 24h.
- Install lifecycle scripts (`preinstall`/`install`/`postinstall`) are
no longer run for arbitrary transitive deps; only an explicit allowlist
may run them, and unapproved scripts cause install to fail
(`strictDepBuilds: true`).
- Audit uses GHSA IDs and `--fix=update` to add patched versions to
`minimumReleaseAgeExclude`.

This is the same class of protection that would have blunted the recent
TanStack/`@uipath`/etc. compromise discussed in the [Aikido
write-up](https://www.aikido.dev/blog/mini-shai-hulud-is-back-tanstack-compromised).

## Changes

- Replace `nodejs/package-lock.json` and
`nodejs/examples/package-lock.json` with `pnpm-lock.yaml`.
- Pin pnpm via `packageManager: pnpm@11.1.1` in both `package.json`s.
- Add `pnpm-workspace.yaml` with the four build-script packages we
actually need: `@biomejs/biome`, `onnxruntime-node`, `protobufjs`,
`sharp`. Everything else is blocked from running install scripts.
- Update package.json scripts (`npm run X` → `pnpm X`).
- Update workflows: `.github/workflows/nodejs.yml`,
`.github/workflows/npm-publish.yml`, and
`.github/workflows/codex-fix-ci.yml` — install pnpm via
`pnpm/action-setup@v4` and switch `setup-node` caches to
`pnpm-lock.yaml`.
- Refresh `nodejs/AGENTS.md`, `nodejs/CLAUDE.md`, and
`nodejs/CONTRIBUTING.md`.

`docs/package-lock.json` is **not** touched — out of scope for this PR.

## Test plan

- [ ] `Lint` job (lint Rust/TS + examples lint) passes on CI.
- [ ] `Linux (NodeJS 18/20)` build+test passes, including the examples
test step.
- [ ] `macos` build+test passes.
- [ ] `NPM Publish` workflow's PR dry-run completes (build matrix + test
matrix + dry `npm publish`).
- [ ] No new install-script approvals are required at install time.

## Follow-ups

- `update_package_lock_run_nodejs.yml` references a composite action
path that doesn't exist
(`./.github/workflows/update_package_lock_nodejs`); it was already
broken pre-PR. We may want to either delete this workflow or rewrite it
for pnpm in a follow-up.
- Consider migrating `docs/` to pnpm in a separate PR.

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 11:27:38 -07:00
Brendan Clement
011fdd5c94 feat(nodejs): add prewarmData method on Table (#3374)
### Summary
- Closes #3362 
- Adds `prewarmData(columns?: string[])` to the Node bindings, mirroring
the Rust and Python implementations

### Testing
- [x] `npm run build` (regenerates the napi `.node` module + TS
declarations)
- [x] `npm run lint`
- [x] `npm test`
- [ ] live test against remote table - just waiting for my dev stack to
get created

### Documentation
- updated docs
2026-05-12 15:29:48 -07:00
Shengan Zhang
650f173236 feat(python): add IVF_HNSW_FLAT vector index support (#3366)
## Summary

Wire up `IVF_HNSW_FLAT` in the Rust core and Python SDK. The index was
documented at https://docs.lancedb.com/indexing/vector-index but
`lancedb.Table.create_index(index_type="IVF_HNSW_FLAT")` raised
`ValueError: Unknown index type IVF_HNSW_FLAT` — the underlying
`pylance` already accepted it, only the LanceDB wrapper was missing the
wiring.

**Rust core (`rust/lancedb`):**
- Add `Index::IvfHnswFlat` / `IndexType::IvfHnswFlat` variants and the
`IvfHnswFlatIndexBuilder` (modelled on `IvfHnswSqIndexBuilder`).
- Build Lance params via the existing `VectorIndexParams::ivf_hnsw(...)`
helper, keeping symmetry with the other `IVF_HNSW_*` variants.
- Forward the variant in `RemoteTable::create_index` and add two
parametrised tests (default + customised config) for the JSON
serialisation.
- New `NativeTable` integration test
(`test_create_index_ivf_hnsw_flat`).

**Python binding (`python/`):**
- New `HnswFlat` dataclass + backwards-compat `IvfHnswFlat` alias.
- PyO3 `extract_index_params` recognises the `HnswFlat` config.
- `LanceTable.create_index(index_type="IVF_HNSW_FLAT", …)` and the sync
`RemoteTable.create_index` both dispatch to the new config.
- `IndexStatistics.index_type` `Literal` and `_lancedb.pyi` stubs cover
the new type so `pyright`/`make check` stays clean.
- Async integration tests (`HnswFlat` + `IvfHnswFlat` alias) and a sync
dispatcher test, mirroring the existing `IVF_HNSW_SQ` coverage.
- Existing `test_index_statistics_index_type_lists_all_supported_values`
updated to include `IVF_HNSW_FLAT`.

A matching Node.js / TypeScript binding is in a follow-up PR.

Closes #3331

## Test plan

- [ ] `cargo check --quiet --features remote --tests --examples`
- [ ] `cargo test --quiet --features remote -p lancedb` (covers the
new `test_create_index_ivf_hnsw_flat` and the two new parametrised
`RemoteTable::create_index` cases)
- [ ] `cargo fmt --all` / `cargo clippy --quiet --features remote
--tests --examples`
- [ ] `cd python && make develop && make check && make test` (covers
the two new async tests, the alias test, the dispatcher test, and the
updated `test_index_statistics_index_type_lists_all_supported_values`
assertion)
2026-05-11 15:08:32 -07:00
Xuanwo
9b21c136c6 feat(python): support model-backed native FTS tokenizers (#3289)
This wires Lance's existing `jieba/*` and `lindera/*` native FTS
tokenizers through the Python SDK instead of leaving them behind
disabled features and narrow public typing. It also documents the
`LANCE_LANGUAGE_MODEL_HOME` model layout and adds Python coverage for
successful CJK indexing plus missing-model error guidance.

Closes #2168.
2026-05-08 23:53:14 +08:00
Heng Ge
694aa48e19 fix(database): drop spurious trailing ? from listing-database URIs (#3357)
## Summary

`url::Url::query_pairs_mut()` leaves the URL with `query=Some("")` after
`.clear()` even when the input had no query string. The listing-database
connect path then captured that empty query into
`ListingDatabase::query_string`, and `table_uri()` blindly appended
`?<query>` to every per-table URI — producing URIs like
`s3://bucket/prefix/foo.lance?`.

The trailing `?` is benign for normal table operations, but it breaks
any caller that constructs a sub-path from the table URI. In particular,
MemWAL flushes write to `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`,
which `url::Url::parse` then re-parses as `path=<base table>` +
`query=/_mem_wal/...`. `Dataset::write` resolves the base table dataset,
finds it already exists, and fails with `Dataset already exists:
…_gen_1` on the very first MemTable flush (observed deterministically
against S3 across all merge_insert LSM modes; tracked in
[lance-format/lance#6713](https://github.com/lance-format/lance/pull/6715)).

## Fix

Treat `Some("")` query the same as no query when capturing
`query_string`. A real `?foo=bar` query is still propagated unchanged.

Adds a regression test covering both the empty-query and non-empty-query
paths.

## Verification

- `url::Url::parse("s3://bucket/prefix/").query()` → `None`, but after
`query_pairs_mut().clear()` → `Some("")`. Confirmed in a standalone
repro.
- Without this fix, every `table_uri()` for an `s3://`-style connection
ends with `?`, breaking MemWAL and any future sub-path consumer in the
same way.
- New unit test `test_table_uri_url_path_has_no_trailing_question_mark`
exercises both code paths.
2026-05-07 23:29:29 -07:00
LanceDB Robot
455ba5abbf chore: update lance dependency to v7.0.0-beta.7 (#3356)
## Summary
- Update Lance Rust workspace dependencies to `7.0.0-beta.7` using
`ci/set_lance_version.py`.
- Update the Java `lance-core` Maven property to `7.0.0-beta.7`.
- Refresh `Cargo.lock` for the new Lance tag:
https://github.com/lance-format/lance/releases/tag/v7.0.0-beta.7

## Verification
- `cargo clippy --workspace --tests --all-features -- -D warnings`
- `cargo fmt --all`
2026-05-07 16:04:38 -07:00
Octopus
5338aeb006 ci: avoid passing GPG passphrase on command line in Java publish workflow (#3313)
Fixes #3299

## Problem

Two security issues exist in `.github/workflows/java-publish.yml`:

1. **`gpg-passphrase` input is misused**: `actions/setup-java`'s
`gpg-passphrase` input expects the **name** of an environment variable
(default: `GPG_PASSPHRASE`), not the secret value itself. The previous
value `${{ secrets.GPG_PASSPHRASE }}` was setting the env var name to
the actual secret, which is incorrect.

2. **Passphrase visible on the command line**: `-Dgpg.passphrase=${{
secrets.GPG_PASSPHRASE }}` passes the GPG passphrase as a Maven system
property argument, making it visible in process listings and potentially
echoed in debug logs — a supply-chain security risk for release
workflows.

## Solution

- Fix `gpg-passphrase: MAVEN_GPG_PASSPHRASE` — use the correct env var
name so `actions/setup-java` generates a proper Maven `settings.xml`
entry that reads from `MAVEN_GPG_PASSPHRASE`.
- Remove `-Dgpg.passphrase=...` from the Maven CLI invocation.
- Add `MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}` to the
`env:` block of the Publish step, so the passphrase is available as an
environment variable rather than a CLI argument.

## Testing

The Java publish workflow only runs on tag pushes, so this cannot be
exercised in a PR build. The logic change is straightforward:
`actions/setup-java` is documented to write a `settings.xml` that reads
`<gpg.passphrase>` from the named env var, and `maven-gpg-plugin` picks
it up from there without any CLI argument.

Co-authored-by: octo-patch <octo-patch@github.com>
2026-05-07 08:45:27 -07:00
LanceDB Robot
47a34f5cca chore: update lance dependency to v7.0.0-beta.4 (#3348)
## Summary
- Update Lance Rust dependencies to `v7.0.0-beta.4` using
`ci/set_lance_version.py`.
- Update the Java `lance-core` dependency property to `7.0.0-beta.4`.
- Align LanceDB with dependency updates required by Lance 7, including
`object_store` 0.13 API compatibility.

Triggering tag:
https://github.com/lance-format/lance/releases/tag/v7.0.0-beta.4

## Verification
- `cargo clippy --workspace --tests --all-features -- -D warnings`
- `cargo fmt --all`
2026-05-05 18:36:39 -07:00
Weston Pace
a17c241e86 feat(python): make Permutation fork-safe for PyTorch DataLoader workers (#3339)
## Summary

PyTorch's `DataLoader` uses fork-based multiprocessing by default on
Linux, but threads do not survive `fork()`. LanceDB's Python bindings
drive async work through two threaded layers, both of which become inert
in a forked child:

- `BackgroundEventLoop` runs an asyncio loop on a Python
`threading.Thread`.
- `pyo3-async-runtimes::tokio` holds a global multi-threaded tokio
runtime whose worker threads also die on fork — and its runtime lives in
a `OnceLock` that cannot be replaced after first use.

As a result, any `Permutation` (or other async API) used inside a
fork-based `DataLoader` worker hangs indefinitely. This PR makes both
layers fork-safe so `Permutation` works as a `torch.utils.data.Dataset`
with `num_workers > 0`.

## Approach

### Rust — new `python/src/runtime.rs`

Mirrors the pattern used in [Lance's Python
bindings](456198cd6f/python/src/lib.rs (L139)),
adapted for the async-bridge use case.

- `LanceRuntime` implements `pyo3_async_runtimes::generic::Runtime +
ContextExt`, backed by an `AtomicPtr<tokio::runtime::Runtime>` we own
(sidestepping `pyo3-async-runtimes`'s frozen `OnceLock` global).
- A `pthread_atfork(after_in_child)` handler nulls the pointer; the next
`spawn` rebuilds the runtime in the child. The previous runtime is
intentionally **leaked** — calling `Drop` would try to join now-dead
worker threads and hang.
- `runtime::future_into_py` is a drop-in for
`pyo3_async_runtimes::tokio::future_into_py`. All ~80 call sites in
`arrow.rs` / `connection.rs` / `permutation.rs` / `query.rs` /
`table.rs` are updated to route through it.
- `python/Cargo.toml` adds `libc = "0.2"` and the tokio
`rt-multi-thread` feature.

### Python — `lancedb/background_loop.py`

- Refactors `BackgroundEventLoop.__init__` to a reusable `_start()`
method.
- An `os.register_at_fork(after_in_child=…)` hook calls `LOOP._start()`
to give the singleton a fresh asyncio loop and thread **in place**. This
matters because the rest of the codebase imports `LOOP` via `from
.background_loop import LOOP` — rebinding the module attribute would
leave those references holding the dead loop.

### Python — `lancedb/__init__.py`

Removes the `__warn_on_fork` pre-fork warning (and the now-unused
`import warnings`). Fork is supported.

## Test plan

- [x] New `test_permutation_dataloader_fork_workers` in
`python/tests/test_torch.py`: runs a `Permutation` through
`torch.utils.data.DataLoader(num_workers=2,
multiprocessing_context="fork")` inside a spawn-isolated child with a
30s hang detector. **Pre-fix**: timed out at 36s. **Post-fix**: passes
in ~3.6s.
- [x] New `test_remote_connection_after_fork` in
`python/tests/test_remote_db.py`: forks a child that creates a fresh
`lancedb.connect(...)` against a mock HTTP server and calls
`table_names()`; passes in <1s, validates the runtime reset is
sufficient for fresh remote clients.
- [x] All 62 tests in `test_torch.py` + `test_permutation.py` pass.
- [x] All 35 tests in `test_remote_db.py` pass.
- [x] `test_table.py` (87) + `test_db.py` + `test_query.py` (157, minus
one unrelated `sentence_transformers` import skip) — 244 passing.
- [x] `cargo clippy -p lancedb-python --tests` clean.
- [x] `cargo fmt`, `ruff check`, `ruff format` all clean.

## Known limitation (follow-up)

This PR makes a **freshly-built** `lancedb.connect(...)` work in a
forked child. An **inherited** `Connection` from the parent still
carries an inherited `reqwest::Client` whose hyper connection pool
references socket FDs and TCP/TLS state shared with the parent — using
it from the child after fork is unsafe (especially with HTTP/1.1
keep-alive). The recommended pattern for fork-based `DataLoader` workers
that hit a remote DB is to construct a new connection inside the worker.
Auto-clearing inherited HTTP client pools on fork would require tracking
live `Connection` instances in `lancedb` core and is left for a
follow-up PR.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 13:44:10 -07:00
Weston Pace
1fc23e5473 fix(python): make Permutation picklable for PyTorch multiprocessing (#3335)
## Summary

When PyTorch is used with multiprocessing in `spawn` mode, the
`Permutation` needs to be pickled. It could not be pickled because
`Table` and `Connection` are not serializable. This PR adds pickle
support to Permutation without adding general pickle support to `Table`
or `Connection`. To add general support we probably need to start by
adding serialization in the namespace client.

In the meantime, this PR enables pickling by adding special cases for:

 * In-memory tables (just serialize as Arrow IPC)
 * Native tables (serialize the URI)

If a user is not using one of the above cases (e.g. using a remote
connection) then they will need to provide a connection factory that can
be pickled.

## Breaking change

`PermutationBuilder.persist(...)` is removed from the Python bindings;
the permutation table is now always in-memory. The underlying Rust
`PermutationBuilder::persist` API is untouched and can be re-exposed
later if needed. It probably won't make sense to do that until we have a
way to serialize `Table` and `Connection`.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 21:37:58 -07:00
qingfeng-occ
87b831bcae fix(node): remove redundant postbuild:release script to fix build failure (#3285)
The `build:release` command already outputs the `*.node` files directly
to the `dist/` directory via the `--output-dir dist` flag.

Therefore, the `postbuild:release` script, which attempts to copy
`*.node` files from the `lancedb/` source directory, fails with a "no
such file or directory" error because the source files do not exist
there.

This commit removes the redundant `postbuild:release` script to resolve
the build failure.

fix #3284

Signed-off-by: qingfeng-occ <qing.feng@zte.com.cn>
2026-05-04 09:37:18 -07:00
84 changed files with 15719 additions and 15851 deletions

View File

@@ -45,7 +45,9 @@ jobs:
- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: 20
# pnpm 11 (used by the nodejs install step below) requires
# Node >= 22.13; use 24 since 22 hits EOL in October.
node-version: 24
- name: Install Codex CLI
run: npm install -g @openai/codex
@@ -79,10 +81,14 @@ jobs:
java-version: '11'
cache: maven
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Install Node.js dependencies for TypeScript bindings
run: |
cd nodejs
npm ci
pnpm install --frozen-lockfile
- name: Configure git user
run: |
@@ -137,7 +143,7 @@ jobs:
- For Rust test failures: Run the specific test with "cargo test -p <crate> <test_name>"
- For Python test failures: Build with "cd python && maturin develop" then run "pytest <specific_test_file>::<test_name>"
- For Java test failures: Run "cd java && mvn test -Dtest=<TestClass>#<testMethod>"
- For TypeScript test failures: Run "cd nodejs && npm run build && npm test -- --testNamePattern='<test_name>'"
- For TypeScript test failures: Run "cd nodejs && pnpm build && pnpm test -- --testNamePattern='<test_name>'"
- Do NOT run the full test suite - only run the tests that were failing
7. If the additional guidelines are provided, follow them as well.

View File

@@ -43,7 +43,7 @@ jobs:
server-username: SONATYPE_USER
server-password: SONATYPE_TOKEN
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
gpg-passphrase: MAVEN_GPG_PASSPHRASE
- name: Set git config
run: |
git config --global user.email "dev+gha@lancedb.com"
@@ -58,10 +58,11 @@ jobs:
echo "use-agent" >> ~/.gnupg/gpg.conf
echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
export GPG_TTY=$(tty)
./mvnw --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -pl lancedb-core -am -P deploy-to-ossrh
./mvnw --batch-mode -DskipTests -DpushChanges=false deploy -pl lancedb-core -am -P deploy-to-ossrh
env:
SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
report-failure:
name: Report Workflow Failure

View File

@@ -42,11 +42,17 @@ jobs:
with:
fetch-depth: 0
lfs: true
- uses: pnpm/action-setup@v4
with:
version: 11.1.1
- uses: actions/setup-node@v4
with:
node-version: 20
cache: 'npm'
cache-dependency-path: nodejs/package-lock.json
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October. The library itself still supports Node >= 18
# (see test matrix below).
node-version: 24
cache: 'pnpm'
cache-dependency-path: nodejs/pnpm-lock.yaml
- uses: actions-rust-lang/setup-rust-toolchain@v1
with:
components: rustfmt, clippy
@@ -61,11 +67,13 @@ jobs:
run: cargo clippy --profile ci --all --all-features -- -D warnings
- name: Lint Typescript
run: |
npm ci
npm run lint-ci
pnpm install --frozen-lockfile
pnpm lint-ci
- name: Lint examples
working-directory: nodejs/examples
run: npm ci && npm run lint-ci
# The `@lancedb/lancedb` dep points at file:../dist; pnpm errors if
# that dir is missing, so create an empty one for lint-only runs.
run: mkdir -p ../dist && pnpm install --frozen-lockfile && pnpm lint-ci
linux:
name: Linux (NodeJS ${{ matrix.node-version }})
timeout-minutes: 30
@@ -82,14 +90,18 @@ jobs:
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v4
name: Setup Node.js 20 for build
- uses: pnpm/action-setup@v4
with:
# @napi-rs/cli v3 requires Node >= 20.12 (via @inquirer/prompts@8).
# Build always on Node 20; tests run on the matrix version below.
node-version: 20
cache: 'npm'
cache-dependency-path: nodejs/package-lock.json
version: 11.1.1
- uses: actions/setup-node@v4
name: Setup Node.js 24 for build
with:
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October. Build/install runs on Node 24; tests run on the
# matrix version below using direct jest invocation.
node-version: 24
cache: 'pnpm'
cache-dependency-path: nodejs/pnpm-lock.yaml
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
@@ -97,45 +109,52 @@ jobs:
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci --include=optional
npm run build:debug -- --profile ci
pnpm install --frozen-lockfile
# No `--` separator: pnpm forwards it literally, which would
# make napi-rs treat `--profile ci` as a cargo passthrough arg.
pnpm build:debug --profile ci
pnpm tsc
- name: Setup examples
working-directory: nodejs/examples
run: pnpm install --frozen-lockfile
- name: Check docs
run: |
# We run this as part of the job because the binary needs to be built
# first to export the types of the native code.
set -e
# `pnpm docs` would invoke pnpm's built-in `docs` command, not
# the script — use `pnpm run docs`.
pnpm run docs
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
echo "Docs need to be updated"
echo "Run 'pnpm run docs', fix any warnings, and commit the changes."
exit 1
fi
- uses: actions/setup-node@v4
name: Setup Node.js ${{ matrix.node-version }} for test
with:
node-version: ${{ matrix.node-version }}
- name: Compile TypeScript
run: npm run tsc
- name: Setup localstack
working-directory: .
run: docker compose up --detach --wait
- name: Test
env:
S3_TEST: "1"
run: npm run test
- name: Setup examples
working-directory: nodejs/examples
run: npm ci
# Newer @smithy/core uses dynamic ESM imports.
NODE_OPTIONS: "--experimental-vm-modules"
# Invoke jest directly because pnpm 11 itself requires Node 22+
# while the matrix tests on older Node versions.
run: npx jest --verbose
- name: Test examples
working-directory: ./
env:
OPENAI_API_KEY: test
OPENAI_BASE_URL: http://0.0.0.0:8000
NODE_OPTIONS: "--experimental-vm-modules"
run: |
python ci/mock_openai.py &
cd nodejs/examples
npm test
- name: Check docs
run: |
# We run this as part of the job because the binary needs to be built
# first to export the types of the native code.
set -e
npm ci
npm run docs
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
echo "Docs need to be updated"
echo "Run 'npm run docs', fix any warnings, and commit the changes."
exit 1
fi
npx jest --testEnvironment jest-environment-node-single-context --verbose
macos:
timeout-minutes: 30
runs-on: "macos-14"
@@ -148,20 +167,28 @@ jobs:
with:
fetch-depth: 0
lfs: true
- uses: pnpm/action-setup@v4
with:
version: 11.1.1
- uses: actions/setup-node@v4
with:
node-version: 20
cache: 'npm'
cache-dependency-path: nodejs/package-lock.json
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October.
node-version: 24
cache: 'pnpm'
cache-dependency-path: nodejs/pnpm-lock.yaml
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
brew install protobuf
- name: Build
run: |
npm ci --include=optional
npm run build:debug -- --profile ci
npm run tsc
pnpm install --frozen-lockfile
# No `--` separator: pnpm forwards it literally, which would
# make napi-rs treat `--profile ci` as a cargo passthrough arg.
pnpm build:debug --profile ci
pnpm tsc
- name: Test
run: |
npm run test
pnpm test

View File

@@ -171,13 +171,18 @@ jobs:
working-directory: nodejs
steps:
- uses: actions/checkout@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Setup node
uses: actions/setup-node@v4
if: ${{ !matrix.settings.docker }}
with:
node-version: 20
cache: npm
cache-dependency-path: nodejs/package-lock.json
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October.
node-version: 24
cache: pnpm
cache-dependency-path: nodejs/pnpm-lock.yaml
- name: Install
uses: dtolnay/rust-toolchain@stable
if: ${{ !matrix.settings.docker }}
@@ -195,7 +200,7 @@ jobs:
target/
key: nodejs-${{ matrix.settings.target }}-cargo-${{ matrix.settings.host }}
- name: Install dependencies
run: npm ci
run: pnpm install --frozen-lockfile
- name: Install Zig
uses: mlugg/setup-zig@v2
if: ${{ contains(matrix.settings.target, 'musl') }}
@@ -248,7 +253,7 @@ jobs:
# one to do the upload.
- name: Make generic artifacts
if: ${{ matrix.settings.target == 'aarch64-apple-darwin' }}
run: npm run tsc
run: pnpm tsc
- name: Upload Generic Artifacts
if: ${{ matrix.settings.target == 'aarch64-apple-darwin' }}
uses: actions/upload-artifact@v4
@@ -283,14 +288,24 @@ jobs:
working-directory: nodejs
steps:
- uses: actions/checkout@v4
- name: Setup node
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Setup Node.js 24 for install
uses: actions/setup-node@v4
with:
# pnpm 11 requires Node >= 22.13; use 24 since 22 hits EOL
# in October.
node-version: 24
cache: pnpm
cache-dependency-path: nodejs/pnpm-lock.yaml
- name: Install dependencies
run: pnpm install --frozen-lockfile
- name: Setup Node.js ${{ matrix.node }} for test
uses: actions/setup-node@v4
with:
node-version: ${{ matrix.node }}
cache: npm
cache-dependency-path: nodejs/package-lock.json
- name: Install dependencies
run: npm ci
- name: Download artifacts
uses: actions/download-artifact@v4
with:
@@ -311,7 +326,9 @@ jobs:
- name: Move built files
run: cp dist/native.d.ts dist/native.js dist/*.node lancedb/
- name: Test bindings
run: npm test
# Invoke jest directly because pnpm 11 itself requires Node 22+
# while the matrix tests on older Node versions.
run: npx jest --verbose
publish:
name: Publish
runs-on: ubuntu-latest
@@ -323,15 +340,19 @@ jobs:
- test-lancedb
steps:
- uses: actions/checkout@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 11.1.1
- name: Setup node
uses: actions/setup-node@v4
with:
node-version: 24
cache: npm
cache-dependency-path: nodejs/package-lock.json
cache: pnpm
cache-dependency-path: nodejs/pnpm-lock.yaml
registry-url: "https://registry.npmjs.org"
- name: Install dependencies
run: npm ci
run: pnpm install --frozen-lockfile
- uses: actions/download-artifact@v4
with:
name: nodejs-dist
@@ -351,7 +372,7 @@ jobs:
- name: Display structure of downloaded files
run: find dist && find nodejs-artifacts
- name: Move artifacts
run: npx napi artifacts -d nodejs-artifacts
run: pnpm exec napi artifacts -d nodejs-artifacts
- name: List packages
run: find npm
- name: Publish

1888
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -13,20 +13,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=7.0.0-beta.2", default-features = false, "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-beta.2", default-features = false, "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-beta.2", default-features = false, "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-beta.2", "tag" = "v7.0.0-beta.2", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=7.0.0-beta.7", default-features = false, "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-beta.7", default-features = false, "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-beta.7", default-features = false, "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false }
@@ -54,7 +54,7 @@ half = { "version" = "2.7.1", default-features = false, features = [
futures = "0"
log = "0.4"
moka = { version = "0.12", features = ["future"] }
object_store = "0.12.0"
object_store = "0.13.2"
pin-project = "1.0.7"
rand = "0.9"
snafu = "0.8"

View File

@@ -51,6 +51,18 @@ ignore = [
# https://rustsec.org/advisories/RUSTSEC-2024-0436
{ id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" },
# encoding: unmaintained. Reached through lindera-dictionary, which is
# required by the native Lindera tokenizer path. Lindera has not migrated
# off this crate yet.
# https://rustsec.org/advisories/RUSTSEC-2021-0153
{ id = "RUSTSEC-2021-0153", reason = "transitive via lindera-dictionary for native Lindera tokenizer" },
# fast-float: unsound and unmaintained. Reached only through polars-arrow
# from the optional Polars integration; replacement requires a Polars
# dependency upgrade.
# https://rustsec.org/advisories/RUSTSEC-2024-0379
{ id = "RUSTSEC-2024-0379", reason = "transitive via polars-arrow; waiting on Polars migration" },
# tantivy: segfault on malformed input due to missing bounds check.
# Pulled in via lance for full-text search. We only feed tantivy
# documents we construct ourselves, not attacker-controlled bytes.
@@ -68,11 +80,17 @@ ignore = [
# https://rustsec.org/advisories/RUSTSEC-2025-0119
{ id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" },
# rustls-pemfile: unmaintained. Reached from two separate chains:
# rustls-native-certs 0.6 (via hyper-rustls 0.24) and object_store 0.12.
# Both upstream dependencies need to move before we can drop it.
# https://rustsec.org/advisories/RUSTSEC-2025-0134
{ id = "RUSTSEC-2025-0134", reason = "transitive via rustls-native-certs/object_store; waiting on upstream migration" },
# bincode: unmaintained. Reached through lindera and lindera-dictionary,
# which are required by the native Lindera tokenizer path. Lindera has not
# migrated to another serialization format yet.
# https://rustsec.org/advisories/RUSTSEC-2025-0141
{ id = "RUSTSEC-2025-0141", reason = "transitive via lindera/lindera-dictionary for native Lindera tokenizer" },
# lru: soundness issue in IterMut. Reached only through aws-sdk-s3 in
# LanceDB's dev-dependency graph; LanceDB does not use that iterator
# directly. Clearing this requires the AWS SDK chain to update lru.
# https://rustsec.org/advisories/RUSTSEC-2026-0002
{ id = "RUSTSEC-2026-0002", reason = "transitive via aws-sdk-s3 dev-dependency; waiting on AWS SDK lru upgrade" },
# rustls-webpki 0.101.7 (old major line): name-constraint checks for
# URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain
@@ -89,6 +107,12 @@ ignore = [
# we actively use is upgraded to 0.103.13 which contains the fix.
# https://rustsec.org/advisories/RUSTSEC-2026-0104
{ id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
# rand 0.8.5: soundness issue only when ThreadRng reseeds inside a custom
# logger. Reached through several transitive chains. LanceDB does not use
# rand from a custom logger; upgrade once all pinned chains accept 0.8.6+.
# https://rustsec.org/advisories/RUSTSEC-2026-0097
{ id = "RUSTSEC-2026-0097", reason = "transitive rand 0.8.5; LanceDB does not call ThreadRng from custom logging" },
]
# ---------------------------------------------------------------------------

View File

@@ -12,20 +12,22 @@ Typescript.
* `src/`: Rust bindings source code
* `lancedb/`: Typescript package source code
* `__test__/`: Unit tests
* `examples/`: An npm package with the examples shown in the documentation
* `examples/`: A pnpm package with the examples shown in the documentation
## Development environment
To set up your development environment, you will need to install the following:
1. Node.js 14 or later
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
1. Node.js 22 or later (required by pnpm 11)
2. [pnpm](https://pnpm.io/installation) 11 or later (or run via `corepack enable`,
which uses the `packageManager` field in `package.json`)
3. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
4. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
Initial setup:
```shell
npm install
pnpm install
```
### Commit Hooks
@@ -39,38 +41,38 @@ pre-commit install
## Development
Most common development commands can be run using the npm scripts.
Most common development commands can be run using the pnpm scripts.
Build the package
```shell
npm install
npm run build
pnpm install
pnpm build
```
Lint:
```shell
npm run lint
pnpm lint
```
Format and fix lints:
```shell
npm run lint-fix
pnpm lint-fix
```
Run tests:
```shell
npm test
pnpm test
```
To run a single test:
```shell
# Single file: table.test.ts
npm test -- table.test.ts
pnpm test table.test.ts
# Single test: 'merge insert' in table.test.ts
npm test -- table.test.ts --testNamePattern=merge\ insert
pnpm test table.test.ts --testNamePattern=merge\ insert
```

View File

@@ -148,6 +148,33 @@ Creates a new empty Table
***
### createNamespace()
```ts
abstract createNamespace(namespacePath, options?): Promise<CreateNamespaceResponse>
```
Create a new namespace at the given path.
#### Parameters
* **namespacePath**: `string`[]
The namespace path to create.
* **options?**: `Partial`&lt;[`CreateNamespaceOptions`](../interfaces/CreateNamespaceOptions.md)&gt;
Creation `mode`
("create" | "exist_ok" | "overwrite") and optional `properties`
to attach to the namespace.
#### Returns
`Promise`&lt;[`CreateNamespaceResponse`](../interfaces/CreateNamespaceResponse.md)&gt;
The properties of the
created namespace and an optional transaction id.
***
### createTable()
#### createTable(options, namespacePath)
@@ -230,6 +257,29 @@ Creates a new Table and initialize it with new data.
***
### describeNamespace()
```ts
abstract describeNamespace(namespacePath): Promise<DescribeNamespaceResponse>
```
Describe a namespace, returning its properties.
#### Parameters
* **namespacePath**: `string`[]
The namespace path to describe, in
parent → child order, e.g. `["analytics", "sales"]`.
#### Returns
`Promise`&lt;[`DescribeNamespaceResponse`](../interfaces/DescribeNamespaceResponse.md)&gt;
The namespace's properties
(may be undefined if the namespace has none).
***
### display()
```ts
@@ -263,6 +313,36 @@ Drop all tables in the database.
***
### dropNamespace()
```ts
abstract dropNamespace(namespacePath, options?): Promise<DropNamespaceResponse>
```
Drop a namespace.
Use `behavior: "cascade"` to also drop everything contained in the
namespace (sub-namespaces and tables). The default `"restrict"`
behavior refuses to drop a non-empty namespace.
#### Parameters
* **namespacePath**: `string`[]
The namespace path to drop.
* **options?**: `Partial`&lt;[`DropNamespaceOptions`](../interfaces/DropNamespaceOptions.md)&gt;
`mode` ("skip" | "fail"
for missing-namespace handling) and `behavior` ("restrict" | "cascade").
#### Returns
`Promise`&lt;[`DropNamespaceResponse`](../interfaces/DropNamespaceResponse.md)&gt;
Any properties returned by
the server and an optional transaction id.
***
### dropTable()
```ts
@@ -299,6 +379,36 @@ Return true if the connection has not been closed
***
### listNamespaces()
```ts
abstract listNamespaces(namespacePath?, options?): Promise<ListNamespacesResponse>
```
List the immediate child namespaces under the given parent.
Results may be paginated. To retrieve subsequent pages, pass the
`pageToken` returned by a previous call.
#### Parameters
* **namespacePath?**: `string`[]
The parent namespace path. Defaults
to the root namespace if omitted.
* **options?**: `Partial`&lt;[`ListNamespacesOptions`](../interfaces/ListNamespacesOptions.md)&gt;
Pagination options
(`pageToken`, `limit`).
#### Returns
`Promise`&lt;[`ListNamespacesResponse`](../interfaces/ListNamespacesResponse.md)&gt;
Child namespace names and
an optional token for fetching the next page.
***
### openTable()
```ts

View File

@@ -501,6 +501,34 @@ Modeled after ``VACUUM`` in PostgreSQL.
***
### prewarmData()
```ts
abstract prewarmData(columns?): Promise<void>
```
Prewarm one or more columns of data in the table.
#### Parameters
* **columns?**: `string`[]
The columns to prewarm. If undefined, all columns are prewarmed.
This will load the column data into the page cache so that future queries that
read those columns avoid the initial cold-start latency. This call initiates
prewarming and returns once the request is accepted; the warming itself may
continue in the background. Calling it on already-prewarmed columns is a
no-op on the server.
Prewarming is generally useful for columns used in filters or projections.
Large columns (e.g. high-dimensional vectors or binary data) may not be
practical to prewarm.
This feature is currently only supported on remote tables.
#### Returns
`Promise`&lt;`void`&gt;
***
### prewarmIndex()
```ts

View File

@@ -0,0 +1,131 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / connectNamespace
# Function: connectNamespace()
## connectNamespace(implName, config, options)
```ts
function connectNamespace(
implName,
config,
options?): Promise<Connection>
```
Connect to a LanceDB database through a namespace.
Unlike [connect](connect.md), which routes by URI scheme (local path vs.
`db://` cloud), `connectNamespace` always returns a namespace-backed
connection. The `implName` selects the namespace implementation:
- `"dir"` — directory namespace, configured with [DirNamespaceConfig](../interfaces/DirNamespaceConfig.md).
- `"rest"` — remote REST catalog, configured with [RestNamespaceConfig](../interfaces/RestNamespaceConfig.md).
- Any other string — full module path for a custom implementation,
configured with a free-form string-keyed `properties` map.
### Parameters
* **implName**: `"dir"`
* **config**: [`DirNamespaceConfig`](../interfaces/DirNamespaceConfig.md)
* **options?**: `Partial`&lt;[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)&gt;
### Returns
`Promise`&lt;[`Connection`](../classes/Connection.md)&gt;
### Examples
```ts
const db = await connectNamespace("dir", { root: "/path/to/db" });
await db.createTable("users", [{ id: 1 }]);
```
```ts
const db = await connectNamespace("rest", {
uri: "https://catalog.example.com",
headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
});
```
```ts
const db = await connectNamespace("my.custom.Namespace", {
endpoint: "...",
});
```
## connectNamespace(implName, config, options)
```ts
function connectNamespace(
implName,
config,
options?): Promise<Connection>
```
Connect through the built-in REST namespace.
Configured with [RestNamespaceConfig](../interfaces/RestNamespaceConfig.md). See the function-level
documentation above for the full surface, examples, and how this
relates to [connect](connect.md).
### Parameters
* **implName**: `"rest"`
* **config**: [`RestNamespaceConfig`](../interfaces/RestNamespaceConfig.md)
* **options?**: `Partial`&lt;[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)&gt;
### Returns
`Promise`&lt;[`Connection`](../classes/Connection.md)&gt;
### Example
```ts
const db = await connectNamespace("rest", {
uri: "https://catalog.example.com",
headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
});
```
## connectNamespace(implName, properties, options)
```ts
function connectNamespace(
implName,
properties,
options?): Promise<Connection>
```
Connect through a custom namespace implementation by full module path,
configured with a free-form string-keyed `properties` map. Use the
typed overloads above for the built-in `"dir"` and `"rest"` impls.
See the function-level documentation above for examples and how this
relates to [connect](connect.md).
### Parameters
* **implName**: `string`
* **properties**: `Record`&lt;`string`, `string`&gt;
* **options?**: `Partial`&lt;[`ConnectNamespaceOptions`](../interfaces/ConnectNamespaceOptions.md)&gt;
### Returns
`Promise`&lt;[`Connection`](../classes/Connection.md)&gt;
### Example
```ts
const db = await connectNamespace("my.custom.Namespace", {
endpoint: "...",
});
```

View File

@@ -51,10 +51,17 @@
- [ClientConfig](interfaces/ClientConfig.md)
- [ColumnAlteration](interfaces/ColumnAlteration.md)
- [CompactionStats](interfaces/CompactionStats.md)
- [ConnectNamespaceOptions](interfaces/ConnectNamespaceOptions.md)
- [ConnectionOptions](interfaces/ConnectionOptions.md)
- [CreateNamespaceOptions](interfaces/CreateNamespaceOptions.md)
- [CreateNamespaceResponse](interfaces/CreateNamespaceResponse.md)
- [CreateTableOptions](interfaces/CreateTableOptions.md)
- [DeleteResult](interfaces/DeleteResult.md)
- [DescribeNamespaceResponse](interfaces/DescribeNamespaceResponse.md)
- [DirNamespaceConfig](interfaces/DirNamespaceConfig.md)
- [DropColumnsResult](interfaces/DropColumnsResult.md)
- [DropNamespaceOptions](interfaces/DropNamespaceOptions.md)
- [DropNamespaceResponse](interfaces/DropNamespaceResponse.md)
- [ExecutableQuery](interfaces/ExecutableQuery.md)
- [FragmentStatistics](interfaces/FragmentStatistics.md)
- [FragmentSummaryStats](interfaces/FragmentSummaryStats.md)
@@ -69,12 +76,15 @@
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
- [IvfPqOptions](interfaces/IvfPqOptions.md)
- [IvfRqOptions](interfaces/IvfRqOptions.md)
- [ListNamespacesOptions](interfaces/ListNamespacesOptions.md)
- [ListNamespacesResponse](interfaces/ListNamespacesResponse.md)
- [MergeResult](interfaces/MergeResult.md)
- [OpenTableOptions](interfaces/OpenTableOptions.md)
- [OptimizeOptions](interfaces/OptimizeOptions.md)
- [OptimizeStats](interfaces/OptimizeStats.md)
- [QueryExecutionOptions](interfaces/QueryExecutionOptions.md)
- [RemovalStats](interfaces/RemovalStats.md)
- [RestNamespaceConfig](interfaces/RestNamespaceConfig.md)
- [RetryConfig](interfaces/RetryConfig.md)
- [ShuffleOptions](interfaces/ShuffleOptions.md)
- [SplitCalculatedOptions](interfaces/SplitCalculatedOptions.md)
@@ -107,6 +117,7 @@
- [RecordBatchIterator](functions/RecordBatchIterator.md)
- [connect](functions/connect.md)
- [connectNamespace](functions/connectNamespace.md)
- [makeArrowTable](functions/makeArrowTable.md)
- [packBits](functions/packBits.md)
- [permutationBuilder](functions/permutationBuilder.md)

View File

@@ -0,0 +1,54 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ConnectNamespaceOptions
# Interface: ConnectNamespaceOptions
## Properties
### namespaceClientProperties?
```ts
optional namespaceClientProperties: Record<string, string>;
```
Extra properties for the backing namespace client.
***
### readConsistencyInterval?
```ts
optional readConsistencyInterval: number;
```
The interval, in seconds, at which to check for updates to the table
from other processes. If undefined, then consistency is not checked. For
performance reasons, this is the default. For strong consistency, set
this to zero seconds. Then every read will check for updates from other
processes. As a compromise, you can set this to a non-zero value for
eventual consistency.
***
### session?
```ts
optional session: Session;
```
The session to use for this connection. Holds shared caches and other
session-specific state.
***
### storageOptions?
```ts
optional storageOptions: Record<string, string>;
```
Configuration for object storage. The available options are described
at https://docs.lancedb.com/storage/

View File

@@ -0,0 +1,27 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / CreateNamespaceOptions
# Interface: CreateNamespaceOptions
## Properties
### mode?
```ts
optional mode: "overwrite" | "create" | "exist_ok";
```
Creation mode.
***
### properties?
```ts
optional properties: Record<string, string>;
```
Properties to set on the new namespace.

View File

@@ -0,0 +1,23 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / CreateNamespaceResponse
# Interface: CreateNamespaceResponse
## Properties
### properties?
```ts
optional properties: Record<string, string>;
```
***
### transactionId?
```ts
optional transactionId: string;
```

View File

@@ -0,0 +1,15 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DescribeNamespaceResponse
# Interface: DescribeNamespaceResponse
## Properties
### properties?
```ts
optional properties: Record<string, string>;
```

View File

@@ -0,0 +1,47 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DirNamespaceConfig
# Interface: DirNamespaceConfig
Configuration for the built-in directory namespace (`"dir"`).
The directory namespace stores tables under a single root path (local
filesystem or object storage URI). See
[https://docs.lancedb.com/namespaces](https://docs.lancedb.com/namespaces) for the documented surface;
less-common knobs live under [DirNamespaceConfig.extraProperties](DirNamespaceConfig.md#extraproperties).
## Properties
### extraProperties?
```ts
optional extraProperties: Record<string, string>;
```
Additional raw properties passed verbatim to the namespace
implementation (e.g. `storage.*`, `credential_vendor.*`). The typed
fields of this interface take precedence on key collision.
***
### manifestEnabled?
```ts
optional manifestEnabled: boolean;
```
Whether to maintain a namespace manifest at the root. Required for
child namespaces. Defaults to true on the impl side.
***
### root
```ts
root: string;
```
Root path or URI containing the LanceDB tables.

View File

@@ -0,0 +1,27 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DropNamespaceOptions
# Interface: DropNamespaceOptions
## Properties
### behavior?
```ts
optional behavior: "restrict" | "cascade";
```
Refuse to drop if non-empty (restrict) or drop recursively (cascade).
***
### mode?
```ts
optional mode: "fail" | "skip";
```
Whether to skip if the namespace doesn't exist, or fail.

View File

@@ -0,0 +1,23 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / DropNamespaceResponse
# Interface: DropNamespaceResponse
## Properties
### properties?
```ts
optional properties: Record<string, string>;
```
***
### transactionId?
```ts
optional transactionId: string[];
```

View File

@@ -0,0 +1,27 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ListNamespacesOptions
# Interface: ListNamespacesOptions
## Properties
### limit?
```ts
optional limit: number;
```
An optional limit to the number of results to return.
***
### pageToken?
```ts
optional pageToken: string;
```
Token from a previous response for pagination.

View File

@@ -0,0 +1,23 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / ListNamespacesResponse
# Interface: ListNamespacesResponse
## Properties
### namespaces
```ts
namespaces: string[];
```
***
### pageToken?
```ts
optional pageToken: string;
```

View File

@@ -0,0 +1,47 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / RestNamespaceConfig
# Interface: RestNamespaceConfig
Configuration for the built-in REST namespace (`"rest"`).
The REST namespace talks to a remote catalog server over HTTP. See
[https://docs.lancedb.com/namespaces](https://docs.lancedb.com/namespaces) for the documented surface;
less-common knobs (TLS, metrics) live under
[RestNamespaceConfig.extraProperties](RestNamespaceConfig.md#extraproperties).
## Properties
### extraProperties?
```ts
optional extraProperties: Record<string, string>;
```
Additional raw properties passed verbatim to the namespace
implementation (e.g. `tls.*`, `ops_metrics_enabled`, `delimiter`).
The typed fields of this interface take precedence on key collision.
***
### headers?
```ts
optional headers: Record<string, string>;
```
HTTP headers forwarded with each request. Keys are passed through
as-is (e.g. `"x-api-key"`, `"Authorization"`).
***
### uri
```ts
uri: string;
```
Catalog endpoint URL.

View File

@@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and
## Full text search
::: lancedb.fts.create_index
Use [lancedb.table.Table.create_fts_index][] for the synchronous API or
[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the
asynchronous API.
::: lancedb.fts.populate_index
::: lancedb.fts.search_index
::: lancedb.index.FTS
## Utilities

View File

@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>7.0.0-beta.2</lance-core.version>
<lance-core.version>7.0.0-beta.7</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -3,11 +3,11 @@ The core Rust library is in the `../rust/lancedb` directory, the rust binding
code is in the `src/` directory and the typescript bindings are in
the `lancedb/` directory.
Whenever you change the Rust code, you will need to recompile: `npm run build`.
Whenever you change the Rust code, you will need to recompile: `pnpm build`.
Common commands:
* Build: `npm run build`
* Lint: `npm run lint`
* Fix lints: `npm run lint-fix`
* Test: `npm test`
* Run single test file: `npm test __test__/arrow.test.ts`
* Build: `pnpm build`
* Lint: `pnpm lint`
* Fix lints: `pnpm lint-fix`
* Test: `pnpm test`
* Run single test file: `pnpm test __test__/arrow.test.ts`

View File

@@ -12,20 +12,22 @@ Typescript.
* `src/`: Rust bindings source code
* `lancedb/`: Typescript package source code
* `__test__/`: Unit tests
* `examples/`: An npm package with the examples shown in the documentation
* `examples/`: A pnpm package with the examples shown in the documentation
## Development environment
To set up your development environment, you will need to install the following:
1. Node.js 14 or later
2. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
3. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
1. Node.js 22 or later (required by pnpm 11)
2. [pnpm](https://pnpm.io/installation) 11 or later (or run via `corepack enable`,
which uses the `packageManager` field in `package.json`)
3. Rust's package manager, Cargo. Use [rustup](https://rustup.rs/) to install.
4. [protoc](https://grpc.io/docs/protoc-installation/) (Protocol Buffers compiler)
Initial setup:
```shell
npm install
pnpm install
```
### Commit Hooks
@@ -39,38 +41,38 @@ pre-commit install
## Development
Most common development commands can be run using the npm scripts.
Most common development commands can be run using the pnpm scripts.
Build the package
```shell
npm install
npm run build
pnpm install
pnpm build
```
Lint:
```shell
npm run lint
pnpm lint
```
Format and fix lints:
```shell
npm run lint-fix
pnpm lint-fix
```
Run tests:
```shell
npm test
pnpm test
```
To run a single test:
```shell
# Single file: table.test.ts
npm test -- table.test.ts
pnpm test table.test.ts
# Single test: 'merge insert' in table.test.ts
npm test -- table.test.ts --testNamePattern=merge\ insert
pnpm test table.test.ts --testNamePattern=merge\ insert
```

View File

@@ -22,6 +22,7 @@ arrow-schema.workspace = true
env_logger.workspace = true
futures.workspace = true
lancedb = { path = "../rust/lancedb", default-features = false }
lance-namespace.workspace = true
napi = { version = "3.8.3", default-features = false, features = [
"napi9",
"async"

View File

@@ -4,7 +4,7 @@
import { readdirSync } from "fs";
import { Field, Float64, Schema } from "apache-arrow";
import * as tmp from "tmp";
import { Connection, Table, connect } from "../lancedb";
import { Connection, Table, connect, connectNamespace } from "../lancedb";
import { LocalTable } from "../lancedb/table";
describe("when connecting", () => {
@@ -306,3 +306,186 @@ describe("clone table functionality", () => {
).rejects.toThrow("Deep clone is not yet implemented");
});
});
// Exercises the namespace methods on Connection (createNamespace,
// describeNamespace, listNamespaces, dropNamespace) against a connection
// rooted at a temporary directory, including the closed-connection and
// invalid-option error paths.
describe("namespaces", () => {
let tmpDir: tmp.DirResult;
let db: Connection;
// A new temporary directory and connection are created before every test,
// so namespaces created in one case are not visible to the next.
beforeEach(async () => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
// The local DirectoryNamespace backend only supports child namespaces
// when manifest mode is enabled (see lance-namespace-impls/src/dir.rs).
db = await connect(tmpDir.name, {
// biome-ignore lint/style/useNamingConvention: opaque backend property key, must match Rust
namespaceClientProperties: { manifest_enabled: "true" },
});
});
// Dispose of the temporary directory created in beforeEach.
afterEach(() => tmpDir.removeCallback());
// Only checks that describeNamespace resolves to a defined value; the
// returned properties themselves are not asserted.
it("should create and describe a namespace", async () => {
await db.createNamespace(["myns"]);
const desc = await db.describeNamespace(["myns"]);
expect(desc).toBeDefined();
});
// Uses arrayContaining, so the assertion does not depend on ordering or on
// the backend reporting exactly these two names.
it("should list namespaces created at the root", async () => {
await db.createNamespace(["alpha"]);
await db.createNamespace(["beta"]);
const list = await db.listNamespaces();
expect(list.namespaces).toEqual(expect.arrayContaining(["alpha", "beta"]));
});
// The parent is created before the child; listing under ["parent"] is
// expected to return the child's own name ("child"), not its full path.
it("should list child namespaces under a parent", async () => {
await db.createNamespace(["parent"]);
await db.createNamespace(["parent", "child"]);
const list = await db.listNamespaces(["parent"]);
expect(list.namespaces).toContain("child");
});
// Drop is called without options, so this covers the default mode/behavior.
it("should drop a namespace", async () => {
await db.createNamespace(["ephemeral"]);
await db.dropNamespace(["ephemeral"]);
const list = await db.listNamespaces();
expect(list.namespaces).not.toContain("ephemeral");
});
// After close(), each of the four namespace methods must reject with the
// same "Connection is closed" message.
it("should raise an error on any namespace op after close", async () => {
await db.close();
await expect(db.describeNamespace(["foo"])).rejects.toThrow(
"Connection is closed",
);
await expect(db.listNamespaces()).rejects.toThrow("Connection is closed");
await expect(db.createNamespace(["foo"])).rejects.toThrow(
"Connection is closed",
);
await expect(db.dropNamespace(["foo"])).rejects.toThrow(
"Connection is closed",
);
});
// The message is matched case-insensitively against "not found" rather than
// pinned to an exact string.
it("should raise an understandable error when describing a non-existent namespace", async () => {
await expect(db.describeNamespace(["does-not-exist"])).rejects.toThrow(
/not found/i,
);
});
// No options are passed to either call, so this covers the default creation
// mode; the error message is not pinned.
it("should raise an error when creating a namespace that already exists", async () => {
await db.createNamespace(["dup"]);
await expect(db.createNamespace(["dup"])).rejects.toThrow();
});
// The three tests below cast an invalid string to `any` to get past the
// TypeScript types and check that the value is rejected at runtime with a
// message naming the offending option.
it("should reject an unrecognized createNamespace mode with a clear error", async () => {
await expect(
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
db.createNamespace(["x"], { mode: "frobnicate" as any }),
).rejects.toThrow(/Invalid mode 'frobnicate'/);
});
// Namespace "x" is created first — presumably so the rejection can only be
// attributed to option validation, not to a missing namespace.
it("should reject an unrecognized dropNamespace mode with a clear error", async () => {
await db.createNamespace(["x"]);
await expect(
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
db.dropNamespace(["x"], { mode: "frobnicate" as any }),
).rejects.toThrow(/Invalid mode 'frobnicate'/);
});
it("should reject an unrecognized dropNamespace behavior with a clear error", async () => {
await db.createNamespace(["x"]);
await expect(
// biome-ignore lint/suspicious/noExplicitAny: deliberately bypass TS to test runtime validation
db.dropNamespace(["x"], { behavior: "frobnicate" as any }),
).rejects.toThrow(/Invalid behavior 'frobnicate'/);
});
});
describe("connectNamespace", () => {
let tmpDir: tmp.DirResult;
beforeEach(() => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
});
afterEach(() => tmpDir.removeCallback());
it("connects via the dir implementation and supports table ops", async () => {
const db = await connectNamespace("dir", { root: tmpDir.name });
await db.createTable("users", [{ id: 1 }, { id: 2 }]);
await expect(db.tableNames()).resolves.toContain("users");
});
it("throws a clear error when implName is empty", async () => {
await expect(connectNamespace("", {})).rejects.toThrow(
"implName must be a non-empty string",
);
});
it("throws when the namespace implementation is unknown", async () => {
await expect(connectNamespace("not-a-real-impl", {})).rejects.toThrow();
});
it("passes storage options through to the namespace", async () => {
const db = await connectNamespace(
"dir",
{ root: tmpDir.name },
{ storageOptions: { newTableDataStorageVersion: "stable" } },
);
await db.createTable("plumbing", [{ id: 1 }]);
await expect(db.tableNames()).resolves.toContain("plumbing");
});
it("supports child namespaces when manifestEnabled is true on the dir config", async () => {
const writer = await connectNamespace("dir", {
root: tmpDir.name,
manifestEnabled: true,
});
await writer.createNamespace(["analytics"]);
await writer.createTable("orders", [{ id: 1 }, { id: 2 }], ["analytics"]);
await writer.close();
const reader = await connectNamespace("dir", {
root: tmpDir.name,
manifestEnabled: true,
});
await expect(reader.tableNames(["analytics"])).resolves.toContain("orders");
const orders = await reader.openTable("orders", ["analytics"]);
await expect(orders.countRows()).resolves.toBe(2);
});
it("merges extraProperties into the dir config and is overridden by typed fields", async () => {
// Two observable assertions:
// - Typed `root` overrides extraProperties.root: createTable would fail
// under the bogus path if the override didn't happen.
// - extraProperties.manifest_enabled="false" is honored end-to-end. Child
// namespaces require manifest mode (default true), so explicitly
// disabling it via extraProperties must make createNamespace reject. If
// extraProperties pass-through were silently broken, the default would
// let createNamespace succeed.
const db = await connectNamespace("dir", {
root: tmpDir.name,
extraProperties: {
root: "/should/be/overridden",
// biome-ignore lint/style/useNamingConvention: backend property key
manifest_enabled: "false",
},
});
await db.createTable("base", [{ id: 1 }]);
await expect(db.tableNames()).resolves.toContain("base");
await expect(db.createNamespace(["analytics"])).rejects.toThrow();
});
it("flows unknown top-level keys through when implName is dynamic (no silent drop)", async () => {
// Routes via the third overload because `impl` is `string`, not the
// literal `"dir"`. The dispatcher still notices the runtime value is
// "dir", but unknown keys like `manifest_enabled` must not be silently
// dropped during the conversion.
//
// Asserting a *negative* outcome (manifest disabled -> createNamespace
// rejects) is required for observability, since the backend default for
// `manifest_enabled` is true.
const impl: string = "dir";
const db = await connectNamespace(impl, {
root: tmpDir.name,
// biome-ignore lint/style/useNamingConvention: backend property key
manifest_enabled: "false",
});
await expect(db.createNamespace(["mixed"])).rejects.toThrow();
});
});

View File

@@ -115,6 +115,12 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
await expect(table.countRows()).resolves.toBe(1);
});
it("should accept skipAutoCleanup on add()", async () => {
await table.add([{ id: 1 }], { skipAutoCleanup: true });
await table.add([{ id: 2 }], { skipAutoCleanup: true });
await expect(table.countRows()).resolves.toBe(2);
});
it("should let me close the table", async () => {
expect(table.isOpen()).toBe(true);
table.close();
@@ -1870,6 +1876,25 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(results.length).toBe(3);
});
test("prewarmData errors on local tables", async () => {
const db = await connect(tmpDir.name);
const data = [
{ text: "alpha", vector: [0.1, 0.2, 0.3] },
{ text: "beta", vector: [0.4, 0.5, 0.6] },
];
const table = await db.createTable("prewarm_data_test", data);
// prewarmData is only supported on remote tables. We verify the call
// is wired through napi and surfaces the expected error for both
// arg shapes (undefined and string[]).
await expect(table.prewarmData()).rejects.toThrow(
"prewarm_data is currently only supported on remote tables",
);
await expect(table.prewarmData(["text"])).rejects.toThrow(
"prewarm_data is currently only supported on remote tables",
);
});
test("full text index on list", async () => {
const db = await connect(tmpDir.name);
const data = [

File diff suppressed because it is too large Load Diff

View File

@@ -11,16 +11,17 @@
"test": "node --experimental-vm-modules node_modules/.bin/jest --testEnvironment jest-environment-node-single-context --verbose",
"lint": "biome check *.ts && biome format *.ts",
"lint-ci": "biome ci .",
"lint-fix": "biome check --write *.ts && npm run format",
"lint-fix": "biome check --write *.ts && pnpm format",
"format": "biome format --write *.ts"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"packageManager": "pnpm@11.1.1",
"dependencies": {
"@huggingface/transformers": "^3.0.2",
"@huggingface/transformers": "3.0.2",
"@lancedb/lancedb": "file:../dist",
"openai": "^4.29.2",
"sharp": "^0.33.5"
"openai": "4.29.2",
"sharp": "0.33.5"
},
"devDependencies": {
"@biomejs/biome": "^1.7.3",

3466
nodejs/examples/pnpm-lock.yaml generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
# Block resolution of versions less than 24h old (Shai-Hulud window).
# This is the pnpm 11 default but pinned here so it's visible to
# reviewers and survives a future pnpm major flipping the default.
minimumReleaseAge: 1440
# Fail install if a transitive dep tries to run an unapproved script.
strictDepBuilds: true
allowBuilds:
'@biomejs/biome': true
onnxruntime-node: true
protobufjs: true
sharp: true

View File

@@ -16,6 +16,18 @@ import {
} from "./arrow";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { Connection as LanceDbConnection } from "./native";
import type {
CreateNamespaceResponse,
DescribeNamespaceResponse,
DropNamespaceResponse,
ListNamespacesResponse,
} from "./native";
export type {
CreateNamespaceResponse,
DescribeNamespaceResponse,
DropNamespaceResponse,
ListNamespacesResponse,
};
import { sanitizeTable } from "./sanitize";
import { LocalTable, Table } from "./table";
@@ -110,6 +122,28 @@ export interface TableNamesOptions {
/** An optional limit to the number of results to return. */
limit?: number;
}
/**
 * Pagination options accepted by `Connection.listNamespaces`.
 *
 * Both fields are optional and are forwarded as-is to the native binding.
 */
export interface ListNamespacesOptions {
/**
 * Token returned by a previous `listNamespaces` call, used to fetch the
 * next page of results.
 */
pageToken?: string;
/**
 * An optional limit to the number of results to return. The native
 * binding receives this as an unsigned 32-bit integer.
 */
limit?: number;
}
/**
 * Options accepted by `Connection.createNamespace`.
 */
export interface CreateNamespaceOptions {
/**
 * Creation mode. The native binding matches the value case-insensitively
 * and rejects anything other than these three with an
 * `Invalid mode '...'` error.
 *
 * NOTE(review): the exact semantics of each mode are defined by the
 * namespace backend; presumably `"exist_ok"` tolerates an existing
 * namespace and `"overwrite"` replaces it - confirm against the backend.
 */
mode?: "create" | "exist_ok" | "overwrite";
/** Properties to set on the new namespace. */
properties?: Record<string, string>;
}
/**
 * Options accepted by `Connection.dropNamespace`.
 *
 * The native binding matches both values case-insensitively and rejects
 * unrecognized ones with an `Invalid mode '...'` or
 * `Invalid behavior '...'` error.
 */
export interface DropNamespaceOptions {
/** Whether to skip if the namespace doesn't exist, or fail. */
mode?: "skip" | "fail";
/** Refuse to drop if non-empty (restrict) or drop recursively (cascade). */
behavior?: "restrict" | "cascade";
}
/**
* A LanceDB Connection that allows you to open tables and create new ones.
*
@@ -268,6 +302,69 @@ export abstract class Connection {
*/
abstract dropAllTables(namespacePath?: string[]): Promise<void>;
/**
* Describe a namespace, returning its properties.
*
* @param {string[]} namespacePath - The namespace path to describe, in
* parent → child order, e.g. `["analytics", "sales"]`.
* @returns {Promise<DescribeNamespaceResponse>} The namespace's properties
* (may be undefined if the namespace has none).
*/
abstract describeNamespace(
namespacePath: string[],
): Promise<DescribeNamespaceResponse>;
/**
* List the immediate child namespaces under the given parent.
*
* Results may be paginated. To retrieve subsequent pages, pass the
* `pageToken` returned by a previous call.
*
* @param {string[]} namespacePath - The parent namespace path. Defaults
* to the root namespace if omitted.
* @param {Partial<ListNamespacesOptions>} options - Pagination options
* (`pageToken`, `limit`).
* @returns {Promise<ListNamespacesResponse>} Child namespace names and
* an optional token for fetching the next page.
*/
abstract listNamespaces(
namespacePath?: string[],
options?: Partial<ListNamespacesOptions>,
): Promise<ListNamespacesResponse>;
/**
* Create a new namespace at the given path.
*
* @param {string[]} namespacePath - The namespace path to create.
* @param {Partial<CreateNamespaceOptions>} options - Creation `mode`
* ("create" | "exist_ok" | "overwrite") and optional `properties`
* to attach to the namespace.
* @returns {Promise<CreateNamespaceResponse>} The properties of the
* created namespace and an optional transaction id.
*/
abstract createNamespace(
namespacePath: string[],
options?: Partial<CreateNamespaceOptions>,
): Promise<CreateNamespaceResponse>;
/**
* Drop a namespace.
*
* Use `behavior: "cascade"` to also drop everything contained in the
* namespace (sub-namespaces and tables). The default `"restrict"`
* behavior refuses to drop a non-empty namespace.
*
* @param {string[]} namespacePath - The namespace path to drop.
* @param {Partial<DropNamespaceOptions>} options - `mode` ("skip" | "fail"
* for missing-namespace handling) and `behavior` ("restrict" | "cascade").
* @returns {Promise<DropNamespaceResponse>} Any properties returned by
* the server and an optional transaction id.
*/
abstract dropNamespace(
namespacePath: string[],
options?: Partial<DropNamespaceOptions>,
): Promise<DropNamespaceResponse>;
/**
* Clone a table from a source table.
*
@@ -515,6 +612,45 @@ export class LocalConnection extends Connection {
async dropAllTables(namespacePath?: string[]): Promise<void> {
return this.inner.dropAllTables(namespacePath ?? []);
}
// Delegates straight to the native connection; the path is passed unchanged.
describeNamespace(
  namespacePath: string[],
): Promise<DescribeNamespaceResponse> {
  const described = this.inner.describeNamespace(namespacePath);
  return described;
}
// Lists child namespaces through the native connection. A missing path is
// sent as an empty array; the pagination fields are unpacked from `options`
// because the native call takes them as positional arguments.
listNamespaces(
  namespacePath?: string[],
  options?: Partial<ListNamespacesOptions>,
): Promise<ListNamespacesResponse> {
  const parentPath = namespacePath ?? [];
  const pageToken = options?.pageToken;
  const limit = options?.limit;
  return this.inner.listNamespaces(parentPath, pageToken, limit);
}
// Creates a namespace through the native connection. `mode` and
// `properties` are unpacked from `options` because the native call takes
// them as positional arguments; either may be undefined.
createNamespace(
  namespacePath: string[],
  options?: Partial<CreateNamespaceOptions>,
): Promise<CreateNamespaceResponse> {
  const mode = options?.mode;
  const properties = options?.properties;
  return this.inner.createNamespace(namespacePath, mode, properties);
}
// Drops a namespace through the native connection. `mode` and `behavior`
// are unpacked from `options` because the native call takes them as
// positional arguments; either may be undefined.
dropNamespace(
  namespacePath: string[],
  options?: Partial<DropNamespaceOptions>,
): Promise<DropNamespaceResponse> {
  const mode = options?.mode;
  const behavior = options?.behavior;
  return this.inner.dropNamespace(namespacePath, mode, behavior);
}
}
/**

View File

@@ -8,6 +8,7 @@ import {
} from "./connection";
import {
ConnectNamespaceOptions,
ConnectionOptions,
Connection as LanceDbConnection,
JsHeaderProvider as NativeJsHeaderProvider,
@@ -22,6 +23,7 @@ export { JsHeaderProvider as NativeJsHeaderProvider } from "./native.js";
export {
AddColumnsSql,
ConnectionOptions,
ConnectNamespaceOptions,
IndexStatistics,
IndexConfig,
ClientConfig,
@@ -62,6 +64,13 @@ export {
CreateTableOptions,
TableNamesOptions,
OpenTableOptions,
ListNamespacesOptions,
CreateNamespaceOptions,
DropNamespaceOptions,
ListNamespacesResponse,
CreateNamespaceResponse,
DropNamespaceResponse,
DescribeNamespaceResponse,
} from "./connection";
export { Session } from "./native.js";
@@ -293,3 +302,197 @@ export async function connect(
);
return new LocalConnection(nativeConn);
}
/**
* Configuration for the built-in directory namespace (`"dir"`).
*
* The directory namespace stores tables under a single root path (local
* filesystem or object storage URI). See
* {@link https://docs.lancedb.com/namespaces} for the documented surface;
* less-common knobs live under {@link DirNamespaceConfig.extraProperties}.
*/
export interface DirNamespaceConfig {
/** Root path or URI containing the LanceDB tables. */
root: string;
/**
* Whether to maintain a namespace manifest at the root. Required for
* child namespaces. Defaults to true on the impl side.
*/
manifestEnabled?: boolean;
/**
* Additional raw properties passed verbatim to the namespace
* implementation (e.g. `storage.*`, `credential_vendor.*`). Typed
* fields above take precedence on key collision.
*/
extraProperties?: Record<string, string>;
}
/**
* Configuration for the built-in REST namespace (`"rest"`).
*
* The REST namespace talks to a remote catalog server over HTTP. See
* {@link https://docs.lancedb.com/namespaces} for the documented surface;
* less-common knobs (TLS, metrics) live under
* {@link RestNamespaceConfig.extraProperties}.
*/
export interface RestNamespaceConfig {
/** Catalog endpoint URL. */
uri: string;
/**
* HTTP headers forwarded with each request. Keys are passed through
* as-is (e.g. `"x-api-key"`, `"Authorization"`).
*/
headers?: Record<string, string>;
/**
* Additional raw properties passed verbatim to the namespace
* implementation (e.g. `tls.*`, `ops_metrics_enabled`, `delimiter`).
* Typed fields above take precedence on key collision.
*/
extraProperties?: Record<string, string>;
}
/**
 * Flatten a directory-namespace config into the string-keyed property map
 * handed to the namespace implementation.
 *
 * Precedence, lowest to highest: `extraProperties`, then every other
 * top-level key of `config` (so unknown raw keys such as `manifest_enabled`
 * pass through instead of being dropped), then the typed `manifestEnabled`
 * flag rendered as `manifest_enabled`.
 */
function dirConfigToProperties(
  config: DirNamespaceConfig,
): Record<string, string> {
  const { manifestEnabled, extraProperties, ...passthrough } = config;

  // Later spreads win, so top-level keys override extraProperties.
  const flattened: Record<string, string> = {
    ...extraProperties,
    ...(passthrough as Record<string, string>),
  };

  if (manifestEnabled === undefined) {
    return flattened;
  }

  flattened.manifest_enabled = String(manifestEnabled);
  return flattened;
}
/**
 * Flatten a REST-namespace config into the string-keyed property map handed
 * to the namespace implementation.
 *
 * Precedence, lowest to highest: `extraProperties`, then every other
 * top-level key of `config`, then each entry of `headers` written under a
 * `headers.<name>` key.
 */
function restConfigToProperties(
  config: RestNamespaceConfig,
): Record<string, string> {
  const { headers, extraProperties, ...passthrough } = config;

  // Later spreads win, so top-level keys override extraProperties.
  const flattened: Record<string, string> = {
    ...extraProperties,
    ...(passthrough as Record<string, string>),
  };

  if (!headers) {
    return flattened;
  }

  for (const [headerName, headerValue] of Object.entries(headers)) {
    flattened[`headers.${headerName}`] = headerValue;
  }
  return flattened;
}
/**
* Connect to a LanceDB database through a namespace.
*
* Unlike {@link connect}, which routes by URI scheme (local path vs.
* `db://` cloud), `connectNamespace` always returns a namespace-backed
* connection. The `implName` selects the namespace implementation:
*
* - `"dir"` — directory namespace, configured with {@link DirNamespaceConfig}.
* - `"rest"` — remote REST catalog, configured with {@link RestNamespaceConfig}.
* - Any other string — full module path for a custom implementation,
* configured with a free-form string-keyed `properties` map.
*
* @example Typed dir namespace
* ```ts
* const db = await connectNamespace("dir", { root: "/path/to/db" });
* await db.createTable("users", [{ id: 1 }]);
* ```
*
* @example Typed REST namespace with auth headers
* ```ts
* const db = await connectNamespace("rest", {
* uri: "https://catalog.example.com",
* headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
* });
* ```
*
* @example Custom implementation with raw properties
* ```ts
* const db = await connectNamespace("my.custom.Namespace", {
* endpoint: "...",
* });
* ```
*/
export function connectNamespace(
implName: "dir",
config: DirNamespaceConfig,
options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection>;
/**
* Connect through the built-in REST namespace.
*
* Configured with {@link RestNamespaceConfig}. See the function-level
* documentation above for the full surface, examples, and how this
* relates to {@link connect}.
*
* @example
* ```ts
* const db = await connectNamespace("rest", {
* uri: "https://catalog.example.com",
* headers: { "x-api-key": process.env.CATALOG_KEY ?? "" },
* });
* ```
*/
export function connectNamespace(
implName: "rest",
config: RestNamespaceConfig,
options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection>;
/**
* Connect through a custom namespace implementation by full module path,
* configured with a free-form string-keyed `properties` map. Use the
* typed overloads above for the built-in `"dir"` and `"rest"` impls.
*
* See the function-level documentation above for examples and how this
* relates to {@link connect}.
*
* @example
* ```ts
* const db = await connectNamespace("my.custom.Namespace", {
* endpoint: "...",
* });
* ```
*/
export function connectNamespace(
implName: string,
properties: Record<string, string>,
options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection>;
// Implementation behind the `connectNamespace` overloads.
//
// The typed `"dir"` and `"rest"` configs are flattened into the string map
// the native layer takes; for any other `implName` the properties are passed
// through untouched. Storage options are cleansed before the native
// connection is opened, and the result is wrapped in a `LocalConnection`.
export async function connectNamespace(
  implName: string,
  configOrProperties:
    | DirNamespaceConfig
    | RestNamespaceConfig
    | Record<string, string>,
  options?: Partial<ConnectNamespaceOptions>,
): Promise<Connection> {
  let properties: Record<string, string>;
  if (implName === "dir") {
    properties = dirConfigToProperties(
      configOrProperties as DirNamespaceConfig,
    );
  } else if (implName === "rest") {
    properties = restConfigToProperties(
      configOrProperties as RestNamespaceConfig,
    );
  } else {
    properties = configOrProperties as Record<string, string>;
  }

  // Build a fresh options object instead of writing the cleansed storage
  // options back onto the caller's object: the caller may reuse, share or
  // freeze what they passed in.
  const finalOptions: ConnectNamespaceOptions = {
    ...options,
    storageOptions: cleanseStorageOptions(options?.storageOptions),
  };

  const nativeConn = await LanceDbConnection.newWithNamespace(
    implName,
    properties,
    finalOptions,
  );
  return new LocalConnection(nativeConn);
}

View File

@@ -87,6 +87,23 @@ export class MergeInsertBuilder {
this.#schema,
);
}
/**
 * Control whether this merge insert's commit runs the automatic cleanup of
 * old dataset versions. The flag is forwarded to
 * `MergeInsertBuilder::skip_auto_cleanup` in lance-core.
 *
 * Skipping is useful for high-frequency writers that prefer to manage
 * version cleanup themselves, or writers without delete permissions on the
 * underlying storage.
 *
 * @param skip - If true, the auto-cleanup step is skipped at commit time.
 * @returns A new builder carrying the setting; this builder is left as-is.
 */
skipAutoCleanup(skip: boolean): MergeInsertBuilder {
  const nativeBuilder = this.#native.skipAutoCleanup(skip);
  return new MergeInsertBuilder(nativeBuilder, this.#schema);
}
/**
* Executes the merge insert operation
*

View File

@@ -56,6 +56,18 @@ export interface AddDataOptions {
* If "overwrite" then the new data will replace the existing data in the table.
*/
mode: "append" | "overwrite";
/**
* If true, skip the automatic cleanup of old dataset versions that would
* otherwise run as part of this write's commit. Forwards to
* `WriteParams.skip_auto_cleanup` in lance-core.
*
* Useful for high-frequency writers that prefer to manage version cleanup
* themselves (for example, via a separate periodic optimize job), or for
* writers that don't have delete permissions on the underlying storage.
*
* Defaults to false.
*/
skipAutoCleanup?: boolean;
}
export interface UpdateOptions {
@@ -285,6 +297,25 @@ export abstract class Table {
*/
abstract prewarmIndex(name: string): Promise<void>;
/**
* Prewarm one or more columns of data in the table.
*
* @param columns The columns to prewarm. If undefined, all columns are prewarmed.
*
* This will load the column data into the page cache so that future queries that
* read those columns avoid the initial cold-start latency. This call initiates
* prewarming and returns once the request is accepted; the warming itself may
* continue in the background. Calling it on already-prewarmed columns is a
* no-op on the server.
*
* Prewarming is generally useful for columns used in filters or projections.
* Large columns (e.g. high-dimensional vectors or binary data) may not be
* practical to prewarm.
*
* This feature is currently only supported on remote tables.
*/
abstract prewarmData(columns?: string[]): Promise<void>;
/**
* Waits for asynchronous indexing to complete on the table.
*
@@ -617,7 +648,7 @@ export class LocalTable extends Table {
const schema = await this.schema();
const buffer = await fromDataToBuffer(data, undefined, schema);
return await this.inner.add(buffer, mode);
return await this.inner.add(buffer, mode, options?.skipAutoCleanup);
}
async update(
@@ -710,6 +741,10 @@ export class LocalTable extends Table {
await this.inner.prewarmIndex(name);
}
// Forwards the request to the native table; `columns` may be undefined.
async prewarmData(columns?: string[]): Promise<void> {
  const nativeTable = this.inner;
  await nativeTable.prewarmData(columns);
}
async waitForIndex(
indexNames: string[],
timeoutSeconds: number,

10452
nodejs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -38,15 +38,15 @@
"url": "https://github.com/lancedb/lancedb"
},
"devDependencies": {
"@aws-sdk/client-dynamodb": "^3.33.0",
"@aws-sdk/client-kms": "^3.33.0",
"@aws-sdk/client-s3": "^3.33.0",
"@aws-sdk/client-dynamodb": "3.1003.0",
"@aws-sdk/client-kms": "3.1003.0",
"@aws-sdk/client-s3": "3.1003.0",
"@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0",
"@napi-rs/cli": "^3.5.1",
"@napi-rs/cli": "3.5.1",
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2",
"@types/node": "^22.7.4",
"@types/node": "22.7.4",
"@types/tmp": "^0.2.6",
"apache-arrow-15": "npm:apache-arrow@15.0.0",
"apache-arrow-16": "npm:apache-arrow@16.0.0",
@@ -57,9 +57,9 @@
"shx": "^0.3.4",
"tmp": "^0.2.3",
"ts-jest": "^29.1.2",
"typedoc": "^0.26.4",
"typedoc-plugin-markdown": "^4.2.1",
"typescript": "^5.5.4",
"typedoc": "0.26.4",
"typedoc-plugin-markdown": "4.2.1",
"typescript": "5.5.4",
"typescript-eslint": "^7.1.0"
},
"ava": {
@@ -68,16 +68,16 @@
"engines": {
"node": ">= 18"
},
"packageManager": "pnpm@11.1.1",
"cpu": ["x64", "arm64"],
"os": ["darwin", "linux", "win32"],
"scripts": {
"artifacts": "napi artifacts",
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/ && node -e \"require('fs').writeFileSync('dist/package.json', JSON.stringify({name:'@lancedb/lancedb',type:'commonjs'}))\"",
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
"postbuild:release": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
"build": "npm run build:debug && npm run tsc",
"build-release": "npm run build:release && npm run tsc",
"build": "pnpm build:debug && pnpm tsc",
"build-release": "pnpm build:release && pnpm tsc",
"tsc": "tsc -b",
"posttsc": "shx cp lancedb/native.d.ts dist/native.d.ts",
"lint-ci": "biome ci .",
@@ -87,7 +87,7 @@
"lint-fix": "biome check --write . && biome format --write .",
"prepublishOnly": "napi prepublish -t npm",
"test": "jest --verbose",
"integration": "S3_TEST=1 npm run test",
"integration": "S3_TEST=1 pnpm test",
"universal": "napi universalize",
"version": "napi version"
},
@@ -95,8 +95,8 @@
"reflect-metadata": "^0.2.2"
},
"optionalDependencies": {
"@huggingface/transformers": "^3.0.2",
"openai": "^4.29.2"
"@huggingface/transformers": "3.0.2",
"openai": "4.29.2"
},
"peerDependencies": {
"apache-arrow": ">=15.0.0 <=18.1.0"

7317
nodejs/pnpm-lock.yaml generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,18 @@
# Flat node_modules layout. The @napi-rs/cli build step fails to locate
# the cdylib artifact under pnpm's isolated layout; the hoisted linker
# mirrors npm's structure and unblocks the native build.
nodeLinker: hoisted
# Block resolution of versions less than 24h old (Shai-Hulud window).
# This is the pnpm 11 default but pinned here so it's visible to
# reviewers and survives a future pnpm major flipping the default.
minimumReleaseAge: 1440
# Fail install if a transitive dep tries to run an unapproved script.
strictDepBuilds: true
allowBuilds:
'@biomejs/biome': true
onnxruntime-node: true
protobufjs: true
sharp: true

View File

@@ -8,12 +8,16 @@ use lancedb::database::{CreateTableMode, Database};
use napi::bindgen_prelude::*;
use napi_derive::*;
use crate::ConnectNamespaceOptions;
use crate::ConnectionOptions;
use crate::error::NapiErrorExt;
use crate::header::JsHeaderProvider;
use crate::table::Table;
use lancedb::connection::{ConnectBuilder, Connection as LanceDBConnection};
use lancedb::connection::{ConnectBuilder, Connection as LanceDBConnection, connect_namespace};
use lance_namespace::models::{
CreateNamespaceRequest, DescribeNamespaceRequest, DropNamespaceRequest, ListNamespacesRequest,
};
use lancedb::ipc::{ipc_file_to_batches, ipc_file_to_schema};
#[napi]
@@ -21,6 +25,29 @@ pub struct Connection {
inner: Option<LanceDBConnection>,
}
#[napi(object)]
/// Result of `Connection::describe_namespace`, exposed to JavaScript as a
/// plain object.
pub struct DescribeNamespaceResponse {
/// Properties copied from the underlying describe response, if it carried any.
pub properties: Option<HashMap<String, String>>,
}
#[napi(object)]
/// Result of `Connection::list_namespaces`, exposed to JavaScript as a plain
/// object.
pub struct ListNamespacesResponse {
/// Namespace names copied from the underlying list response.
pub namespaces: Vec<String>,
/// Page token copied from the underlying list response, if it carried one.
pub page_token: Option<String>,
}
#[napi(object)]
/// Result of `Connection::create_namespace`, exposed to JavaScript as a plain
/// object.
pub struct CreateNamespaceResponse {
/// Properties copied from the underlying create response, if it carried any.
pub properties: Option<HashMap<String, String>>,
/// Transaction id copied from the underlying create response, if present.
pub transaction_id: Option<String>,
}
#[napi(object)]
/// Result of `Connection::drop_namespace`, exposed to JavaScript as a plain
/// object.
pub struct DropNamespaceResponse {
/// Properties copied from the underlying drop response, if it carried any.
pub properties: Option<HashMap<String, String>>,
/// Transaction ids copied from the underlying drop response, if present.
// NOTE(review): this is a list while `CreateNamespaceResponse::transaction_id`
// is a single string; presumably it mirrors the upstream response type - confirm.
pub transaction_id: Option<Vec<String>>,
}
impl Connection {
pub(crate) fn inner_new(inner: LanceDBConnection) -> Self {
Self { inner: Some(inner) }
@@ -106,6 +133,39 @@ impl Connection {
Ok(Self::inner_new(builder.execute().await.default_error()?))
}
/// Create a new Connection instance backed by a namespace implementation.
#[napi(factory)]
pub async fn new_with_namespace(
impl_name: String,
properties: HashMap<String, String>,
options: ConnectNamespaceOptions,
) -> napi::Result<Self> {
if impl_name.is_empty() {
return Err(napi::Error::from_reason(
"implName must be a non-empty string",
));
}
let mut builder = connect_namespace(&impl_name, properties);
if let Some(interval) = options.read_consistency_interval {
builder =
builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));
}
if let Some(storage_options) = options.storage_options {
for (key, value) in storage_options {
builder = builder.storage_option(key, value);
}
}
if let Some(namespace_client_properties) = options.namespace_client_properties {
builder = builder.namespace_client_properties(namespace_client_properties);
}
if let Some(session) = options.session {
builder = builder.session(session.inner.clone());
}
Ok(Self::inner_new(builder.execute().await.default_error()?))
}
#[napi]
pub fn display(&self) -> napi::Result<String> {
Ok(self.get_inner()?.to_string())
@@ -273,4 +333,130 @@ impl Connection {
let ns = namespace_path.unwrap_or_default();
self.get_inner()?.drop_all_tables(&ns).await.default_error()
}
#[napi(catch_unwind)]
/// Describe a namespace and return its properties.
///
/// Fails if the connection has no inner handle or if the underlying
/// describe call returns an error.
pub async fn describe_namespace(
    &self,
    namespace_path: Vec<String>,
) -> napi::Result<DescribeNamespaceResponse> {
    let request = DescribeNamespaceRequest {
        id: Some(namespace_path),
        ..Default::default()
    };
    self.get_inner()?
        .describe_namespace(request)
        .await
        .default_error()
        .map(|described| DescribeNamespaceResponse {
            properties: described.properties,
        })
}
#[napi(catch_unwind)]
/// List child namespaces under the given namespace path.
///
/// `namespace_path` and `page_token` are forwarded unchanged to the
/// underlying request. `limit` arrives as a `u32` but the request stores an
/// `i32`, so values above `i32::MAX` are clamped to `i32::MAX` rather than
/// wrapping into a negative limit.
///
/// Fails if the connection has no inner handle or if the underlying list
/// call returns an error.
pub async fn list_namespaces(
    &self,
    namespace_path: Option<Vec<String>>,
    page_token: Option<String>,
    limit: Option<u32>,
) -> napi::Result<ListNamespacesResponse> {
    let req = ListNamespacesRequest {
        id: namespace_path,
        page_token,
        // A plain `as i32` cast would turn e.g. 3_000_000_000 into a
        // negative number; saturate instead.
        limit: limit.map(|l| i32::try_from(l).unwrap_or(i32::MAX)),
        ..Default::default()
    };
    let resp = self
        .get_inner()?
        .list_namespaces(req)
        .await
        .default_error()?;
    Ok(ListNamespacesResponse {
        namespaces: resp.namespaces,
        page_token: resp.page_token,
    })
}
#[napi(catch_unwind)]
/// Create a new namespace with optional properties.
pub async fn create_namespace(
&self,
namespace_path: Vec<String>,
mode: Option<String>,
properties: Option<HashMap<String, String>>,
) -> napi::Result<CreateNamespaceResponse> {
let mode_str = mode
.map(|m| match m.to_lowercase().as_str() {
"create" => Ok("Create".to_string()),
"exist_ok" => Ok("ExistOk".to_string()),
"overwrite" => Ok("Overwrite".to_string()),
_ => Err(napi::Error::from_reason(format!(
"Invalid mode '{}': expected one of 'create', 'exist_ok', 'overwrite'",
m
))),
})
.transpose()?;
let req = CreateNamespaceRequest {
id: Some(namespace_path),
mode: mode_str,
properties,
..Default::default()
};
let resp = self
.get_inner()?
.create_namespace(req)
.await
.default_error()?;
Ok(CreateNamespaceResponse {
properties: resp.properties,
transaction_id: resp.transaction_id,
})
}
#[napi(catch_unwind)]
/// Drop a namespace.
pub async fn drop_namespace(
&self,
namespace_path: Vec<String>,
mode: Option<String>,
behavior: Option<String>,
) -> napi::Result<DropNamespaceResponse> {
let mode_str = mode
.map(|m| match m.to_lowercase().as_str() {
"skip" => Ok("Skip".to_string()),
"fail" => Ok("Fail".to_string()),
_ => Err(napi::Error::from_reason(format!(
"Invalid mode '{}': expected one of 'skip', 'fail'",
m
))),
})
.transpose()?;
let behavior_str = behavior
.map(|b| match b.to_lowercase().as_str() {
"restrict" => Ok("Restrict".to_string()),
"cascade" => Ok("Cascade".to_string()),
_ => Err(napi::Error::from_reason(format!(
"Invalid behavior '{}': expected one of 'restrict', 'cascade'",
b
))),
})
.transpose()?;
let req = DropNamespaceRequest {
id: Some(namespace_path),
mode: mode_str,
behavior: behavior_str,
..Default::default()
};
let resp = self
.get_inner()?
.drop_namespace(req)
.await
.default_error()?;
Ok(DropNamespaceResponse {
properties: resp.properties,
transaction_id: resp.transaction_id,
})
}
}

View File

@@ -67,6 +67,26 @@ pub struct OpenTableOptions {
pub storage_options: Option<HashMap<String, String>>,
}
#[napi(object)]
#[derive(Debug)]
/// Optional settings for a namespace-backed connection.
///
/// Every field is optional.
pub struct ConnectNamespaceOptions {
    /// The interval, in seconds, at which to check for updates to the table
    /// from other processes. If None, then consistency is not checked. For
    /// performance reasons, this is the default. For strong consistency, set
    /// this to zero seconds. Then every read will check for updates from other
    /// processes. As a compromise, you can set this to a non-zero value for
    /// eventual consistency.
    pub read_consistency_interval: Option<f64>,
    /// Configuration for object storage. The available options are described
    /// at https://docs.lancedb.com/storage/
    pub storage_options: Option<HashMap<String, String>>,
    /// Extra properties for the backing namespace client.
    pub namespace_client_properties: Option<HashMap<String, String>>,
    /// The session to use for this connection. Holds shared caches and other
    /// session-specific state.
    pub session: Option<session::Session>,
}
#[napi_derive::module_init]
fn init() {
let env = Env::new()

View File

@@ -50,6 +50,13 @@ impl NativeMergeInsertBuilder {
this
}
#[napi]
/// Returns a copy of this builder with `skip_auto_cleanup` set to `skip`.
///
/// The receiver is left untouched; the setting is applied to the clone's
/// inner builder.
pub fn skip_auto_cleanup(&self, skip: bool) -> Self {
    let mut updated = self.clone();
    updated.inner.skip_auto_cleanup(skip);
    updated
}
#[napi(catch_unwind)]
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
let data = ipc_file_to_batches(buf.to_vec())

View File

@@ -6,7 +6,7 @@ use std::collections::HashMap;
use lancedb::ipc::{ipc_file_to_batches, ipc_file_to_schema};
use lancedb::table::{
AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration, NewColumnTransform,
OptimizeAction, OptimizeOptions, Table as LanceDbTable,
OptimizeAction, OptimizeOptions, Table as LanceDbTable, WriteOptions,
};
use napi::bindgen_prelude::*;
use napi_derive::napi;
@@ -68,7 +68,12 @@ impl Table {
}
#[napi(catch_unwind)]
pub async fn add(&self, buf: Buffer, mode: String) -> napi::Result<AddResult> {
pub async fn add(
&self,
buf: Buffer,
mode: String,
skip_auto_cleanup: Option<bool>,
) -> napi::Result<AddResult> {
let batches = ipc_file_to_batches(buf.to_vec())
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
let batches = batches
@@ -92,6 +97,13 @@ impl Table {
return Err(napi::Error::from_reason(format!("Invalid mode: {}", mode)));
};
if skip_auto_cleanup.unwrap_or(false) {
op = op.write_options(WriteOptions {
skip_auto_cleanup: true,
..Default::default()
});
}
let res = op.execute().await.default_error()?;
Ok(res.into())
}
@@ -159,6 +171,14 @@ impl Table {
.default_error()
}
#[napi(catch_unwind)]
/// Forwards `columns` to the underlying table's `prewarm_data` and awaits
/// its completion, converting any failure into a napi error.
pub async fn prewarm_data(&self, columns: Option<Vec<String>>) -> napi::Result<()> {
    let table = self.inner_ref()?;
    table.prewarm_data(columns).await.default_error()
}
#[napi(catch_unwind)]
pub async fn wait_for_index(&self, index_names: Vec<String>, timeout_s: i64) -> Result<()> {
let timeout = std::time::Duration::from_secs(timeout_s.try_into().unwrap());

View File

@@ -35,7 +35,8 @@ futures.workspace = true
serde = "1"
serde_json = "1"
snafu.workspace = true
tokio = { version = "1.40", features = ["sync"] }
tokio = { version = "1.40", features = ["sync", "rt-multi-thread"] }
libc = "0.2"
[build-dependencies]
pyo3-build-config = { version = "0.28", features = [

View File

@@ -7,7 +7,6 @@ import os
from concurrent.futures import ThreadPoolExecutor
from datetime import timedelta
from typing import Dict, Optional, Union, Any, List
import warnings
__version__ = importlib.metadata.version("lancedb")
@@ -438,13 +437,3 @@ __all__ = [
"Table",
"__version__",
]
def __warn_on_fork():
warnings.warn(
"lance is not fork-safe. If you are using multiprocessing, use spawn instead.",
)
if hasattr(os, "register_at_fork"):
os.register_at_fork(before=__warn_on_fork) # type: ignore[attr-defined]

View File

@@ -12,6 +12,7 @@ from .index import (
LabelList,
HnswPq,
HnswSq,
HnswFlat,
FTS,
)
from lance_namespace import (
@@ -25,6 +26,7 @@ from .remote import ClientConfig
IvfHnswPq: type[HnswPq] = HnswPq
IvfHnswSq: type[HnswSq] = HnswSq
IvfHnswFlat: type[HnswFlat] = HnswFlat
class PyExpr:
"""A type-safe DataFusion expression node (Rust-side handle)."""
@@ -180,6 +182,7 @@ class Table:
IvfPq,
HnswPq,
HnswSq,
HnswFlat,
BTree,
Bitmap,
LabelList,
@@ -442,7 +445,7 @@ class AsyncPermutationBuilder:
async def execute(self) -> Table: ...
def async_permutation_builder(
table: Table, dest_table_name: str
table: Table,
) -> AsyncPermutationBuilder: ...
def fts_query_to_json(query: Any) -> str: ...

View File

@@ -2,7 +2,9 @@
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import asyncio
import os
import threading
import warnings
class BackgroundEventLoop:
@@ -13,6 +15,9 @@ class BackgroundEventLoop:
"""
def __init__(self):
self._start()
def _start(self):
self.loop = asyncio.new_event_loop()
self.thread = threading.Thread(
target=self.loop.run_forever,
@@ -31,3 +36,30 @@ class BackgroundEventLoop:
LOOP = BackgroundEventLoop()
_FORK_WARNED = False
def _reset_after_fork():
    """Restart the background event loop and warn once about fork support."""
    global _FORK_WARNED

    # Threads do not survive fork(), so the asyncio loop in LOOP.thread is
    # dead in the child. Re-initialize the singleton in place so existing
    # `from .background_loop import LOOP` references in other modules see
    # the new state. The Rust-side tokio runtime is reset analogously by a
    # pthread_atfork hook installed in the _lancedb extension.
    LOOP._start()

    # Only the first call emits the warning.
    if _FORK_WARNED:
        return
    _FORK_WARNED = True
    warnings.warn(
        "lancedb fork support is experimental: the internal async "
        "runtime has been reset in the forked child, but a small chance "
        "of deadlock remains if other state was mid-operation at fork "
        "time. The 'forkserver' or 'spawn' multiprocessing start method "
        "is likely a safer alternative.",
        RuntimeWarning,
        stacklevel=2,
    )
if hasattr(os, "register_at_fork"):
os.register_at_fork(after_in_child=_reset_after_fork)

View File

@@ -7,6 +7,7 @@ from typing import Literal, Optional
from ._lancedb import (
IndexConfig,
)
from .types import BaseTokenizerType
lang_mapping = {
"ar": "Arabic",
@@ -111,8 +112,12 @@ class FTS:
- "simple": Splits text by whitespace and punctuation.
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
- "ngram": N-gram tokenizer for substring-style matching.
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
language : str, default "English"
The language to use for tokenization.
The language to use for stemming and stop-word removal. This is not the
primary way to enable CJK tokenization.
max_token_length : int, default 40
The maximum token length to index. Tokens longer than this length will be
ignored.
@@ -127,10 +132,17 @@ class FTS:
ascii_folding : bool, default True
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
Notes
-----
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
require tokenizer models in Lance's language model home. Set
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
directory under ``lance/language_models``.
"""
with_position: bool = False
base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
base_tokenizer: BaseTokenizerType = "simple"
language: str = "English"
max_token_length: Optional[int] = 40
lower_case: bool = True
@@ -376,9 +388,98 @@ class HnswSq:
target_partition_size: Optional[int] = None
@dataclass
class HnswFlat:
    """Describe a HNSW-FLAT index configuration.

    HNSW-FLAT stands for Hierarchical Navigable Small World without quantization.
    It stores raw vectors in the HNSW graph, providing the highest recall among
    the IVF_HNSW family at the cost of more memory and disk space compared to
    :class:`HnswSq` or :class:`HnswPq`.

    Parameters
    ----------
    distance_type: str, default "l2"
        The distance metric used to train the index.

        The following distance types are available:

        "l2" - Euclidean distance. This is a very common distance metric that
        accounts for both magnitude and direction when determining the distance
        between vectors. l2 distance has a range of [0, ∞).

        "cosine" - Cosine distance. Cosine distance is a distance metric
        calculated from the cosine similarity between two vectors. Cosine
        similarity is a measure of similarity between two non-zero vectors of an
        inner product space. It is defined to equal the cosine of the angle
        between them. Unlike l2, the cosine distance is not affected by the
        magnitude of the vectors. Cosine distance has a range of [0, 2].

        "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
        distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
        l2 norm is 1), then dot distance is equivalent to the cosine distance.
    num_partitions, default sqrt(num_rows)
        The number of IVF partitions to create.

        For HNSW, we recommend a small number of partitions. Setting this to 1
        works well for most tables. For very large tables, training just one HNSW
        graph will require too much memory. Each partition becomes its own HNSW
        graph, so setting this value higher reduces the peak memory use of
        training.
    max_iterations, default 50
        Max iterations to train kmeans.

        When training an IVF index we use kmeans to calculate the partitions.
        This parameter controls how many iterations of kmeans to run.
    sample_rate, default 256
        The rate used to calculate the number of training vectors for kmeans.
    m, default 20
        The number of neighbors to select for each vector in the HNSW graph.

        This value controls the tradeoff between search speed and accuracy.
        The higher the value the more accurate the search but the slower it
        will be.
    ef_construction, default 300
        The number of candidates to evaluate during the construction of the HNSW
        graph.

        This value controls the tradeoff between build speed and accuracy.
        The higher the value the more accurate the build but the slower it will
        be. 150 to 300 is the typical range. 100 is a minimum for good quality
        search results. In most cases, there is no benefit to setting this higher
        than 500. This value should be set to a value that is not less than `ef`
        in the search phase.
    target_partition_size, default is 1,048,576
        The target size of each partition.
    """

    distance_type: Literal["l2", "cosine", "dot"] = "l2"
    num_partitions: Optional[int] = None
    max_iterations: int = 50
    sample_rate: int = 256
    m: int = 20
    ef_construction: int = 300
    target_partition_size: Optional[int] = None
# Backwards-compatible aliases
IvfHnswPq = HnswPq
IvfHnswSq = HnswSq
IvfHnswFlat = HnswFlat
@dataclass
@@ -698,11 +799,13 @@ __all__ = [
"IvfPq",
"IvfHnswPq",
"IvfHnswSq",
"IvfHnswFlat",
"IvfSq",
"IvfRq",
"IvfFlat",
"HnswPq",
"HnswSq",
"HnswFlat",
"IndexConfig",
"FTS",
"Bitmap",

View File

@@ -1,11 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from deprecation import deprecated
from lancedb import AsyncConnection, DBConnection
import pyarrow as pa
import copy
import json
from deprecation import deprecated
import pyarrow as pa
from ._lancedb import async_permutation_builder, PermutationReader
from .table import LanceTable
from .background_loop import LOOP
@@ -36,10 +37,7 @@ class PermutationBuilder:
be referenced by name in the future. If names are not provided then they can only
be referenced by their ordinal index. There is no requirement to name every split.
By default, the permutation will be stored in memory and will be lost when the
program exits. To persist the permutation (for very large datasets or to share
the permutation across multiple workers) use the [persist](#persist) method to
create a permanent table.
The permutation is stored in memory and will be lost when the program exits.
"""
def __init__(self, table: LanceTable):
@@ -51,15 +49,6 @@ class PermutationBuilder:
"""
self._async = async_permutation_builder(table)
def persist(
self, database: Union[DBConnection, AsyncConnection], table_name: str
) -> "PermutationBuilder":
"""
Persist the permutation to the given database.
"""
self._async.persist(database, table_name)
return self
def split_random(
self,
*,
@@ -380,20 +369,44 @@ class Permutation:
def __init__(
self,
reader: PermutationReader,
base_table: LanceTable,
permutation_table: Optional[LanceTable],
split: int,
selection: dict[str, str],
batch_size: int,
transform_fn: Callable[pa.RecordBatch, Any],
offset: Optional[int] = None,
limit: Optional[int] = None,
connection_factory: Optional[Callable[[str], LanceTable]] = None,
_reader: Optional[PermutationReader] = None,
):
"""
Internal constructor. Use [from_tables](#from_tables) instead.
"""
assert reader is not None, "reader is required"
assert base_table is not None, "base_table is required"
assert selection is not None, "selection is required"
self.reader = reader
self.base_table = base_table
self.permutation_table = permutation_table
self.split = split
self.selection = selection
self.transform_fn = transform_fn
self.batch_size = batch_size
self.offset = offset
self.limit = limit
self.connection_factory = connection_factory
if _reader is None:
_reader = LOOP.run(self._build_reader())
self.reader: PermutationReader = _reader
async def _build_reader(self) -> PermutationReader:
    """Construct the reader for this permutation from its stored settings.

    Opens a reader over ``base_table`` / ``permutation_table`` for ``split``,
    then applies ``offset`` followed by ``limit`` when each is set.
    """
    built = await PermutationReader.from_tables(
        self.base_table, self.permutation_table, self.split
    )
    if self.offset is not None:
        built = await built.with_offset(self.offset)
    if self.limit is not None:
        built = await built.with_limit(self.limit)
    return built
def _with_selection(self, selection: dict[str, str]) -> "Permutation":
"""
@@ -402,21 +415,97 @@ class Permutation:
Does not validation of the selection and it replaces it entirely. This is not
intended for public use.
"""
return Permutation(self.reader, selection, self.batch_size, self.transform_fn)
def _with_reader(self, reader: PermutationReader) -> "Permutation":
"""
Creates a new permutation with the given reader
This is an internal method and should not be used directly.
"""
return Permutation(reader, self.selection, self.batch_size, self.transform_fn)
new = copy.copy(self)
new.selection = selection
return new
def with_batch_size(self, batch_size: int) -> "Permutation":
"""
Creates a new permutation with the given batch size
"""
return Permutation(self.reader, self.selection, batch_size, self.transform_fn)
new = copy.copy(self)
new.batch_size = batch_size
return new
def with_connection_factory(
    self, connection_factory: Callable[[str], LanceTable]
) -> "Permutation":
    """
    Creates a new permutation that will use ``connection_factory`` to reopen
    the base table when this permutation is unpickled in a worker process.

    The factory is a callable that takes a single argument — the base table
    name — and returns a [LanceTable]. It must be picklable; the worker
    will pickle it via standard ``pickle`` and call it to recover the base
    table. Picklable callables in practice means top-level (module-level)
    functions, ``functools.partial`` of such functions, or instances of
    picklable classes implementing ``__call__``. Lambdas and closures over
    local variables don't pickle with the default protocol.

    Setting a factory is necessary when the URI alone is not enough to
    re-open the connection — most importantly for LanceDB Cloud (``db://``)
    connections, where ``api_key`` and ``region`` aren't recoverable from
    the connection object after construction.

    For local file or cloud-storage paths the factory is optional: if not
    set, ``__getstate__`` falls back to capturing
    ``(uri, storage_options, namespace_path)`` and re-opening via
    ``lancedb.connect(uri, storage_options=...)``.

    Examples
    --------
    Basic native (file-system path), parameterized via ``functools.partial``::

        import functools, lancedb
        from lancedb.permutation import Permutation

        def open_native_table(uri: str, table_name: str):
            return lancedb.connect(uri).open_table(table_name)

        factory = functools.partial(open_native_table, "/data/lance_db")
        permutation = Permutation.identity(
            factory("training")
        ).with_connection_factory(factory)

    Native via :func:`lancedb.connect_namespace` (e.g. a directory- or
    REST-backed namespace client). The factory takes the
    implementation name and properties dict as partial-bound args so
    the worker can rebuild the same namespace connection::

        def open_via_namespace(
            impl: str, properties: dict[str, str], table_name: str,
        ):
            return lancedb.connect_namespace(impl, properties).open_table(
                table_name,
            )

        factory = functools.partial(
            open_via_namespace,
            "dir",
            {"root": "/data/lance_db"},
        )

    LanceDB Cloud, reading credentials from env vars at worker startup
    so secrets aren't pickled into the dataset::

        import os, lancedb

        def open_remote_table(table_name: str):
            db = lancedb.connect(
                "db://my-database",
                api_key=os.environ["LANCEDB_API_KEY"],
                region=os.environ.get("LANCEDB_REGION", "us-east-1"),
            )
            return db.open_table(table_name)

        permutation = Permutation.identity(
            open_remote_table("training")
        ).with_connection_factory(open_remote_table)
    """
    assert connection_factory is not None, "connection_factory is required"
    # Shallow copy: the clone shares every other attribute with this instance.
    clone = copy.copy(self)
    clone.connection_factory = connection_factory
    return clone
@classmethod
def identity(cls, table: LanceTable) -> "Permutation":
@@ -489,11 +578,126 @@ class Permutation:
schema = await reader.output_schema(None)
initial_selection = {name: name for name in schema.names}
return cls(
reader, initial_selection, DEFAULT_BATCH_SIZE, Transforms.arrow2python
base_table,
permutation_table,
split,
initial_selection,
DEFAULT_BATCH_SIZE,
Transforms.arrow2python,
_reader=reader,
)
return LOOP.run(do_from_tables())
def __getstate__(self) -> dict[str, Any]:
    """Return a picklable dict describing this permutation.

    The reader is never included. The permutation table, when present, is
    stored as the result of its ``to_arrow()`` call. How the base table is
    described depends on the connection:

    - with a ``connection_factory`` set, only the shared fields are stored;
    - for a ``memory://`` base connection, the base table's ``to_arrow()``
      data is embedded under ``base_table_data``;
    - otherwise the connection's uri, storage options and the table's
      namespace path are stored.

    Raises
    ------
    ValueError
        If no factory is set and the base table's connection lacks ``uri``
        or ``storage_options``.
    """
    permutation_data: Optional[pa.Table] = (
        None
        if self.permutation_table is None
        else self.permutation_table.to_arrow()
    )

    state: dict[str, Any] = {
        "base_table_name": self.base_table.name,
        "permutation_data": permutation_data,
        "split": self.split,
        "selection": self.selection,
        "batch_size": self.batch_size,
        "transform_fn": self.transform_fn,
        "offset": self.offset,
        "limit": self.limit,
        "connection_factory": self.connection_factory,
    }

    # A factory carries enough state to recover the base table on its own,
    # so nothing about the existing connection needs to be captured.
    if self.connection_factory is not None:
        return state

    # URI-introspection fallback: only viable for native (OSS) connections
    # where (uri, storage_options) is enough to reopen. Remote / cloud
    # connections don't expose recoverable api_key / region — those users
    # must call with_connection_factory().
    try:
        base_uri = self.base_table._conn.uri
        storage_options = self.base_table._conn.storage_options
    except AttributeError as e:
        raise ValueError(
            "Cannot pickle this Permutation: the base table's connection "
            "does not expose a uri/storage_options, which usually means it "
            "is a remote (LanceDB Cloud) connection. Call "
            "Permutation.with_connection_factory(...) first to provide a "
            "picklable callable that re-opens the base table from a worker "
            "process."
        ) from e

    if base_uri.startswith("memory://"):
        # In-memory base tables don't exist in any worker process by
        # default, so the entire base table is dumped into the pickle. This
        # can be expensive for large datasets — users with large in-memory
        # base tables should either persist them or set a
        # connection_factory.
        state["base_table_data"] = self.base_table.to_arrow()
        return state

    state["base_table_uri"] = base_uri
    state["base_table_namespace"] = self.base_table._namespace_path
    state["base_table_storage_options"] = storage_options
    return state
def __setstate__(self, state: dict[str, Any]) -> None:
    """Rebuild this permutation from a dict produced by ``__getstate__``.

    The base table is recovered through the stored ``connection_factory``
    when there is one, otherwise from inlined ``base_table_data`` (loaded
    into a fresh ``memory://`` database), otherwise by reconnecting to the
    stored uri. The permutation table, if any, is loaded into a fresh
    ``memory://`` database under the name ``"permutation"``. The reader is
    rebuilt last, once every other attribute is in place.
    """
    from . import connect

    factory = state["connection_factory"]
    if factory is not None:
        restored_base = factory(state["base_table_name"])
    elif "base_table_data" in state:
        # In-memory base table inlined into the pickle; rebuild it the same
        # way the in-memory permutation table is rebuilt below.
        base_memory_db = connect("memory://")
        restored_base = base_memory_db.create_table(
            state["base_table_name"], state["base_table_data"]
        )
    else:
        reopened_db = connect(
            state["base_table_uri"],
            storage_options=state["base_table_storage_options"],
        )
        restored_base = reopened_db.open_table(
            state["base_table_name"],
            namespace_path=state["base_table_namespace"] or None,
        )

    restored_permutation: Optional[LanceTable] = None
    if state["permutation_data"] is not None:
        permutation_memory_db = connect("memory://")
        restored_permutation = permutation_memory_db.create_table(
            "permutation", state["permutation_data"]
        )

    self.base_table = restored_base
    self.permutation_table = restored_permutation
    # These fields are copied from the state dict unchanged.
    for field in (
        "split",
        "selection",
        "batch_size",
        "transform_fn",
        "offset",
        "limit",
    ):
        setattr(self, field, state[field])
    self.connection_factory = factory
    self.reader = LOOP.run(self._build_reader())
@property
def schema(self) -> pa.Schema:
async def do_output_schema():
@@ -760,7 +964,9 @@ class Permutation:
for expensive operations such as image decoding.
"""
assert transform is not None, "transform is required"
return Permutation(self.reader, self.selection, self.batch_size, transform)
new = copy.copy(self)
new.transform_fn = transform
return new
def __getitem__(self, index: int) -> Any:
"""
@@ -795,12 +1001,10 @@ class Permutation:
"""
Skip the first `skip` rows of the permutation
"""
async def do_with_skip():
reader = await self.reader.with_offset(skip)
return self._with_reader(reader)
return LOOP.run(do_with_skip())
new = copy.copy(self)
new.offset = skip
new.reader = LOOP.run(new._build_reader())
return new
@deprecated(details="Use with_take instead")
def take(self, limit: int) -> "Permutation":
@@ -818,12 +1022,10 @@ class Permutation:
"""
Limit the permutation to `limit` rows (following any `skip`)
"""
async def do_with_take():
reader = await self.reader.with_limit(limit)
return self._with_reader(reader)
return LOOP.run(do_with_take())
new = copy.copy(self)
new.limit = limit
new.reader = LOOP.run(new._build_reader())
return new
@deprecated(details="Use with_repeat instead")
def repeat(self, times: int) -> "Permutation":

View File

@@ -22,6 +22,7 @@ from lancedb.index import (
FTS,
BTree,
Bitmap,
HnswFlat,
HnswSq,
IvfFlat,
IvfPq,
@@ -39,6 +40,7 @@ from lancedb.table import _normalize_progress
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
from ..types import BaseTokenizerType
class RemoteTable(Table):
@@ -167,7 +169,7 @@ class RemoteTable(Table):
wait_timeout: Optional[timedelta] = None,
with_position: bool = False,
# tokenizer configs:
base_tokenizer: str = "simple",
base_tokenizer: BaseTokenizerType = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
@@ -284,13 +286,15 @@ class RemoteTable(Table):
)
elif index_type == "IVF_HNSW_SQ":
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
elif index_type == "IVF_HNSW_FLAT":
config = HnswFlat(distance_type=metric, num_partitions=num_partitions)
elif index_type == "IVF_FLAT":
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
else:
raise ValueError(
f"Unknown vector index type: {index_type}. Valid options are"
" 'IVF_FLAT', 'IVF_PQ', 'IVF_RQ', 'IVF_SQ',"
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ', 'IVF_HNSW_FLAT'"
)
LOOP.run(

View File

@@ -57,6 +57,7 @@ from .index import (
LabelList,
HnswPq,
HnswSq,
HnswFlat,
FTS,
)
from .merge import LanceMergeInsertBuilder
@@ -86,6 +87,59 @@ from .util import (
)
from .index import lang_mapping
# Tokenizer name prefixes treated as model-backed. A tokenizer name matches
# when it equals a prefix or starts with "<prefix>/".
_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera")
# Substrings looked for in an error message to decide whether the
# language-model-home hint should be attached to the exception.
_MODEL_BACKED_TOKENIZER_ERRORS = (
    "unknown base tokenizer",
    "Invalid directory path:",
    "Failed to load Jieba",
    "Failed to load tokenizer config",
    "Failed to initialize default tokenizer",
)
def _add_unique_note(exception: BaseException, note: str) -> None:
    """Attach ``note`` to ``exception`` unless it is already present.

    The note counts as present when it is one of the exception's
    ``__notes__`` or is a substring of the first positional argument (only
    when that argument is a string).
    """
    attached = getattr(exception, "__notes__", ()) or ()
    if note in attached:
        return

    first_arg = exception.args[0] if exception.args else None
    message = first_arg if isinstance(first_arg, str) else ""
    if note in message:
        return

    add_note(exception, note)
def _is_model_backed_tokenizer(base_tokenizer: str) -> bool:
    """Return True if the name is a model-backed prefix or ``<prefix>/...``."""
    for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES:
        if base_tokenizer == prefix:
            return True
        if base_tokenizer.startswith(f"{prefix}/"):
            return True
    return False
def _maybe_add_fts_error_note(
    exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None
) -> None:
    """Attach a help note to an FTS index error when one applies.

    If ``language`` is given and the message reports an unsupported
    language, the supported languages are listed and nothing else is added.
    Otherwise, when ``base_tokenizer`` is model-backed and the message
    contains one of the known model-loading error markers, a hint about
    ``LANCE_LANGUAGE_MODEL_HOME`` is added. In every other case the
    exception is left untouched.
    """
    text = str(exception)

    language_rejected = (
        language is not None and "not support the requested language" in text
    )
    if language_rejected:
        supported_langs = ", ".join(lang_mapping.values())
        _add_unique_note(exception, f"Supported languages: {supported_langs}")
        return

    model_load_failed = _is_model_backed_tokenizer(base_tokenizer) and any(
        marker in text for marker in _MODEL_BACKED_TOKENIZER_ERRORS
    )
    if model_load_failed:
        _add_unique_note(
            exception,
            "Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' "
            "require tokenizer models in Lance's language model home. Set "
            "LANCE_LANGUAGE_MODEL_HOME to override the default platform data "
            "directory under 'lance/language_models'. Expected layouts include "
            "'<model-home>/jieba/default/...' and "
            "'<model-home>/lindera/ipadic/...'.",
        )
if TYPE_CHECKING:
from .db import LanceDBConnection
@@ -958,7 +1012,10 @@ class Table(ABC):
tokenizer_name: str, default "default"
A compatibility alias for native tokenizer configs. Can be "raw",
"default" or the 2 letter language code followed by "_stem". So
for english it would be "en_stem".
for english it would be "en_stem". For new native FTS indexes, use
``base_tokenizer`` directly; ``tokenizer_name`` is a legacy
compatibility alias and does not expose model-backed tokenizer names
such as ``jieba/default`` or ``lindera/ipadic``.
use_tantivy: bool, default False
Deprecated legacy Tantivy parameter. Setting this to True raises an
error.
@@ -972,8 +1029,11 @@ class Table(ABC):
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
- "ngram": N-Gram tokenizer.
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
language : str, default "English"
The language to use for tokenization.
The language to use for stemming and stop-word removal. This is not
the primary way to enable CJK tokenization.
max_token_length : int, default 40
The maximum token length to index. Tokens longer than this length will be
ignored.
@@ -999,6 +1059,13 @@ class Table(ABC):
The timeout to wait if indexing is asynchronous.
name: str, optional
The name of the index. If not provided, a default name will be generated.
Notes
-----
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
require tokenizer models in Lance's language model home. Set
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
directory under ``lance/language_models``.
"""
raise NotImplementedError
@@ -2170,7 +2237,13 @@ class LanceTable(Table):
index_cache_size: Optional[int] = None,
num_bits: int = 8,
index_type: Literal[
"IVF_FLAT", "IVF_SQ", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
"IVF_FLAT",
"IVF_SQ",
"IVF_PQ",
"IVF_RQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
] = "IVF_PQ",
max_iterations: int = 50,
sample_rate: int = 256,
@@ -2257,6 +2330,16 @@ class LanceTable(Table):
ef_construction=ef_construction,
target_partition_size=target_partition_size,
)
elif index_type == "IVF_HNSW_FLAT":
config = HnswFlat(
distance_type=metric,
num_partitions=num_partitions,
max_iterations=max_iterations,
sample_rate=sample_rate,
m=m,
ef_construction=ef_construction,
target_partition_size=target_partition_size,
)
else:
raise ValueError(f"Unknown index type {index_type}")
@@ -2462,14 +2545,22 @@ class LanceTable(Table):
**tokenizer_configs,
)
LOOP.run(
self._table.create_index(
field_names,
replace=replace,
config=config,
name=name,
try:
LOOP.run(
self._table.create_index(
field_names,
replace=replace,
config=config,
name=name,
)
)
)
except (ValueError, RuntimeError) as e:
_maybe_add_fts_error_note(
e,
base_tokenizer=config.base_tokenizer,
language=config.language,
)
raise e
@staticmethod
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
@@ -3799,7 +3890,18 @@ class AsyncTable:
*,
replace: Optional[bool] = None,
config: Optional[
Union[IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
Union[
IvfFlat,
IvfPq,
IvfRq,
HnswPq,
HnswSq,
HnswFlat,
BTree,
Bitmap,
LabelList,
FTS,
]
] = None,
wait_timeout: Optional[timedelta] = None,
name: Optional[str] = None,
@@ -3846,6 +3948,7 @@ class AsyncTable:
IvfRq,
HnswPq,
HnswSq,
HnswFlat,
BTree,
Bitmap,
LabelList,
@@ -3865,11 +3968,13 @@ class AsyncTable:
name=name,
train=train,
)
except ValueError as e:
if "not support the requested language" in str(e):
supported_langs = ", ".join(lang_mapping.values())
help_msg = f"Supported languages: {supported_langs}"
add_note(e, help_msg)
except (ValueError, RuntimeError) as e:
if isinstance(config, FTS):
_maybe_add_fts_error_note(
e,
base_tokenizer=config.base_tokenizer,
language=config.language,
)
raise e
async def drop_index(self, name: str) -> None:
@@ -5014,6 +5119,7 @@ class IndexStatistics:
"IVF_RQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
"FTS",
"BTREE",
"BITMAP",

View File

@@ -24,6 +24,7 @@ VectorIndexType = Literal[
"IVF_PQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
"IVF_RQ",
]
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
@@ -31,6 +32,7 @@ IndexType = Literal[
"IVF_PQ",
"IVF_HNSW_PQ",
"IVF_HNSW_SQ",
"IVF_HNSW_FLAT",
"IVF_SQ",
"FTS",
"BTREE",
@@ -40,4 +42,5 @@ IndexType = Literal[
]
# Tokenizer literals
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
BaseTokenizerType = BuiltinTokenizerType | str

View File

@@ -0,0 +1,8 @@
我们 98740 r
都 202780 d
有 423765 v
光明 1219 n
的 318825 uj
前途 1263 n
前 62779 f
途 857 n

View File

@@ -0,0 +1,4 @@
segmenter:
mode: "normal"
dictionary:
path: "./python/tests/models/lindera/ipadic/main"

Binary file not shown.

View File

@@ -15,7 +15,10 @@
# limitations under the License.
import os
import random
import shutil
from unittest import mock
from pathlib import Path
import zipfile
import lancedb as ldb
from lancedb.db import DBConnection
@@ -36,6 +39,8 @@ import pytest
import pytest_asyncio
from utils import exception_output
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
@@ -89,6 +94,40 @@ def table(tmp_path) -> ldb.table.LanceTable:
return table
@pytest.fixture
def language_model_home(monkeypatch, tmp_path):
    """Copy the bundled test models into a temp dir and point
    ``LANCE_LANGUAGE_MODEL_HOME`` at the copy; returns the copy's path."""
    destination = tmp_path / "language-models"
    shutil.copytree(TEST_LANGUAGE_MODEL_HOME, destination)
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(destination))
    return destination
@pytest.fixture
def lindera_ipadic(language_model_home):
    """Extract the lindera ipadic dictionary for the duration of a test.

    Unpacks ``main.zip`` inside the temporary model home, rewrites
    ``config.yml`` so its dictionary path points at the extracted copy, and
    removes the extracted directory again on teardown.
    """
    model_path = language_model_home / "lindera" / "ipadic"
    extracted_model = model_path / "main"
    config_path = model_path / "config.yml"
    # Start from a clean slate in case a previous extraction was left behind.
    if extracted_model.exists():
        shutil.rmtree(extracted_model)
    with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
        zip_ref.extractall(model_path)
    # Point the config at the absolute path of the freshly extracted copy.
    config_path.write_text(
        "segmenter:\n"
        ' mode: "normal"\n'
        " dictionary:\n"
        f' path: "{extracted_model.resolve().as_posix()}"\n',
        encoding="utf-8",
    )
    try:
        yield
    finally:
        # Always clean up the extracted dictionary, even if the test failed.
        if extracted_model.exists():
            shutil.rmtree(extracted_model)
@pytest_asyncio.fixture
async def async_table(tmp_path) -> ldb.table.AsyncTable:
# Use local random state to avoid affecting other tests
@@ -684,6 +723,90 @@ def test_fts_ngram(mem_db: DBConnection):
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
    """An FTS index built with the jieba tokenizer matches only the row
    containing the queried word."""
    documents = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
    tbl = mem_db.create_table("test_jieba", data=documents)
    tbl.create_fts_index(
        "text",
        base_tokenizer="jieba/default",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )

    hits = tbl.search("我们", query_type="fts").limit(10).to_list()
    matched_texts = [hit["text"] for hit in hits]
    assert matched_texts == ["我们都有光明的前途"]
def test_fts_jieba_missing_language_model_note(
    mem_db: DBConnection, monkeypatch, tmp_path
):
    """Creating a jieba FTS index with a non-existent model home must fail
    with an error that names the bad path, the environment variable and the
    requested tokenizer."""
    absent_home = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(absent_home))

    corpus = pa.table({"text": ["我们都有光明的前途"]})
    table = mem_db.create_table("test_missing_jieba_model", data=corpus)

    tokenizer_options = {
        "base_tokenizer": "jieba/default",
        "stem": False,
        "remove_stop_words": False,
        "ascii_folding": False,
    }
    with pytest.raises((ValueError, RuntimeError)) as raised:
        table.create_fts_index("text", **tokenizer_options)

    message = exception_output(raised)
    for fragment in (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    ):
        assert fragment in message
@pytest.mark.asyncio
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
    """Async counterpart: ``create_index`` with a jieba ``FTS`` config and a
    non-existent model home must raise an error naming the bad path, the
    environment variable and the requested tokenizer."""
    absent_home = tmp_path / "missing-language-models"
    monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(absent_home))

    connection = await ldb.connect_async(tmp_path / "async-db")
    corpus = pa.table({"text": ["我们都有光明的前途"]})
    table = await connection.create_table(
        "test_missing_jieba_model_async", data=corpus
    )

    with pytest.raises((ValueError, RuntimeError)) as raised:
        # The config is built inside the block, as in the sync variant's call.
        jieba_config = FTS(
            base_tokenizer="jieba/default",
            stem=False,
            remove_stop_words=False,
            ascii_folding=False,
        )
        await table.create_index("text", config=jieba_config)

    message = exception_output(raised)
    for fragment in (
        "Invalid directory path:",
        "LANCE_LANGUAGE_MODEL_HOME",
        "jieba/default",
    ):
        assert fragment in message
def test_fts_lindera_tokenizer(
    mem_db: DBConnection, language_model_home, lindera_ipadic
):
    """An FTS index built with the ``lindera/ipadic`` tokenizer matches only
    the document that contains the queried word."""
    airports = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
    lindera_table = mem_db.create_table("test_lindera", data=airports)
    lindera_table.create_fts_index(
        "text",
        base_tokenizer="lindera/ipadic",
        stem=False,
        remove_stop_words=False,
        ascii_folding=False,
    )

    hits = lindera_table.search("成田", query_type="fts").limit(10).to_list()
    matched_texts = [hit["text"] for hit in hits]
    assert matched_texts == ["成田国際空港"]
def test_fts_query_to_json():
"""Test that FTS query to_json() produces valid JSON strings with exact format."""

View File

@@ -16,11 +16,13 @@ from lancedb.index import (
IvfSq,
IvfHnswPq,
IvfHnswSq,
IvfHnswFlat,
IvfRq,
Bitmap,
LabelList,
HnswPq,
HnswSq,
HnswFlat,
FTS,
)
from lancedb.table import IndexStatistics
@@ -250,6 +252,21 @@ async def test_create_hnswpq_alias_index(some_table: AsyncTable):
assert indices[0].index_type in {"HnswPq", "IvfHnswPq"}
@pytest.mark.asyncio
async def test_create_hnswflat_index(some_table: AsyncTable):
    """Creating an index from an ``HnswFlat`` config yields exactly one index."""
    hnsw_flat_config = HnswFlat(num_partitions=10)
    await some_table.create_index("vector", config=hnsw_flat_config)

    existing = await some_table.list_indices()
    assert len(existing) == 1
@pytest.mark.asyncio
async def test_create_hnswflat_alias_index(some_table: AsyncTable):
    """Creating an index from an ``IvfHnswFlat`` config yields one index whose
    reported type is either ``HnswFlat`` or ``IvfHnswFlat``."""
    alias_config = IvfHnswFlat(num_partitions=5)
    await some_table.create_index("vector", config=alias_config)

    existing = await some_table.list_indices()
    assert len(existing) == 1
    accepted_type_names = {"HnswFlat", "IvfHnswFlat"}
    assert existing[0].index_type in accepted_type_names
@pytest.mark.asyncio
async def test_create_ivfsq_index(some_table: AsyncTable):
await some_table.create_index("vector", config=IvfSq(num_partitions=10))
@@ -295,6 +312,7 @@ def test_index_statistics_index_type_lists_all_supported_values():
"IVF_RQ",
"IVF_HNSW_SQ",
"IVF_HNSW_PQ",
"IVF_HNSW_FLAT",
"FTS",
"BTREE",
"BITMAP",

View File

@@ -9,21 +9,6 @@ from lancedb import DBConnection, Table, connect
from lancedb.permutation import Permutation, Permutations, permutation_builder
def test_permutation_persistence(tmp_path):
db = connect(tmp_path)
tbl = db.create_table("test_table", pa.table({"x": range(100), "y": range(100)}))
permutation_tbl = (
permutation_builder(tbl).shuffle().persist(db, "test_permutation").execute()
)
assert permutation_tbl.count_rows() == 100
re_open = db.open_table("test_permutation")
assert re_open.count_rows() == 100
assert permutation_tbl.to_arrow() == re_open.to_arrow()
def test_split_random_ratios(mem_db):
"""Test random splitting with ratios."""
tbl = mem_db.create_table(

View File

@@ -6,6 +6,8 @@ import contextlib
from datetime import timedelta
import http.server
import json
import multiprocessing as mp
import sys
import threading
import time
from unittest.mock import MagicMock, patch
@@ -1230,3 +1232,82 @@ def test_background_loop_cancellation(exception):
with pytest.raises(exception):
loop.run(None)
mock_future.cancel.assert_called_once()
def _remote_fork_child(port: int, queue) -> None:
    """Child-process entry point: connect to the mock server on ``port`` and
    put the result of ``table_names()`` on ``queue``."""
    # Build a fresh Connection in the child so we exercise the at-fork-child
    # tokio runtime reset rather than relying on an inherited reqwest client.
    fail_fast_config = {
        "retry_config": {"retries": 0},
        "timeout_config": {"connect_timeout": 2, "read_timeout": 2},
    }
    child_db = lancedb.connect(
        "db://dev",
        api_key="fake",
        host_override=f"http://localhost:{port}",
        client_config=fail_fast_config,
    )
    table_names = child_db.table_names()
    queue.put(table_names)
# Only runs on Linux: the test needs the "fork" multiprocessing start method.
@pytest.mark.skipif(
sys.platform != "linux",
reason=(
"fork() is unavailable on Windows and unsafe on macOS "
"(Apple frameworks/TLS are not fork-safe)"
),
)
def test_remote_connection_after_fork():
"""A freshly-built remote Connection in a forked child should not hang.
The pyo3-async-runtimes tokio runtime would otherwise be inherited from
the parent with dead worker threads; the at-fork-child handler in our
runtime module rebuilds it on first use in the child.
"""
# Mock endpoint: every request is answered with 200 and an empty table list.
def handler(request):
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b'{"tables": []}')
# Binding to port 0 lets the OS choose a free port; it is read back below.
server = http.server.HTTPServer(("localhost", 0), make_mock_http_handler(handler))
port = server.server_address[1]
server_thread = threading.Thread(target=server.serve_forever)
server_thread.start()
try:
# Hit the server in the parent first so the runtime + LOOP are warm
# before fork; a fresh child must still succeed.
parent_db = lancedb.connect(
"db://dev",
api_key="fake",
host_override=f"http://localhost:{port}",
client_config={
"retry_config": {"retries": 0},
"timeout_config": {"connect_timeout": 2, "read_timeout": 2},
},
)
assert parent_db.table_names() == []
# Fork a child that builds its own connection and reports table_names()
# back through the queue.
ctx = mp.get_context("fork")
queue = ctx.Queue()
proc = ctx.Process(target=_remote_fork_child, args=(port, queue))
proc.start()
proc.join(timeout=15)
# Still alive after the timeout means the child hung: escalate from
# terminate() to kill() so no stray process survives, then fail the test.
if proc.is_alive():
proc.terminate()
proc.join(timeout=5)
if proc.is_alive():
proc.kill()
proc.join()
pytest.fail("Remote connection hung after fork")
assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
assert not queue.empty(), "child produced no result"
assert queue.get() == []
# Parent connection must still be usable after the child returned.
assert parent_db.table_names() == []
finally:
# Always stop the mock server and wait for its thread, pass or fail.
server.shutdown()
server_thread.join()

View File

@@ -11,7 +11,7 @@ from unittest.mock import patch
import lancedb
from lancedb.dependencies import _PANDAS_AVAILABLE
from lancedb.index import HnswPq, HnswSq, IvfPq
from lancedb.index import HnswFlat, HnswPq, HnswSq, IvfPq
import numpy as np
import polars as pl
import pyarrow as pa
@@ -917,6 +917,21 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
"my_vector", replace=True, config=expected_config, name=None, train=True
)
table.create_index(
vector_column_name="my_vector",
metric="cosine",
index_type="IVF_HNSW_FLAT",
sample_rate=0.1,
m=29,
ef_construction=10,
)
expected_config = HnswFlat(
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
)
mock_create_index.assert_called_with(
"my_vector", replace=True, config=expected_config, name=None, train=True
)
@patch("lancedb.table.AsyncTable.create_index")
def test_create_index_name_and_train_parameters(

View File

@@ -1,14 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import functools
import multiprocessing as mp
import pickle
import sys
import lancedb
import pyarrow as pa
import pytest
from lancedb.permutation import Permutation, Permutations, permutation_builder
from lancedb.util import tbl_to_tensor
from lancedb.permutation import Permutation
torch = pytest.importorskip("torch")
def _open_native_table(uri: str, table_name: str):
    """Connect to ``uri`` and open ``table_name``.

    Serves as the connection factory in the explicit-factory pickle test. It
    lives at module scope so pickle can look it up by name in the worker /
    unpickling process.
    """
    connection = lancedb.connect(uri)
    return connection.open_table(table_name)
def test_table_dataloader(mem_db):
table = mem_db.create_table("test_table", pa.table({"a": range(1000)}))
dataloader = torch.utils.data.DataLoader(
@@ -40,3 +55,156 @@ def test_permutation_dataloader(mem_db):
for batch in dataloader:
assert batch.size(0) == 1
assert batch.size(1) == 10
def test_permutation_is_picklable(tmp_db):
    """A Permutation has to survive a pickle round-trip: PyTorch's DataLoader
    with num_workers > 0 uses multiprocessing and pickles the dataset to hand
    it to the worker processes."""
    source_table = tmp_db.create_table("test_table", pa.table({"a": range(1000)}))
    original = Permutation.identity(source_table)

    payload = pickle.dumps(original)
    roundtripped = pickle.loads(payload)

    assert len(roundtripped) == 1000
    first_rows = roundtripped.__getitems__([0, 1, 2])
    assert first_rows == [{"a": 0}, {"a": 1}, {"a": 2}]
def test_permutation_with_memory_base_is_picklable(mem_db):
    """A Permutation over an in-memory base table must round-trip through
    pickle. The base table is inlined into the pickle as Arrow IPC bytes and
    rebuilt as an in-memory LanceTable on the other side, because the
    original database cannot be reopened across processes."""
    source_table = mem_db.create_table("test_table", pa.table({"a": range(50)}))
    original = Permutation.identity(source_table)

    payload = pickle.dumps(original)
    roundtripped = pickle.loads(payload)

    assert len(roundtripped) == 50
    sampled_rows = roundtripped.__getitems__([0, 10, 49])
    assert sampled_rows == [{"a": 0}, {"a": 10}, {"a": 49}]
def test_permutation_dataloader_multiprocessing(tmp_db):
    """End-to-end check of a Permutation inside a PyTorch DataLoader with
    num_workers > 0: every worker receives a pickled copy of the dataset and
    reads batches from it."""
    source_table = tmp_db.create_table("test_table", pa.table({"a": range(1000)}))
    dataset = Permutation.identity(source_table)

    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=10,
        shuffle=True,
        num_workers=2,
        multiprocessing_context="spawn",
    )

    rows_seen = 0
    for batch in loader:
        batch_rows = batch["a"].size(0)
        assert batch_rows == 10
        rows_seen += batch_rows

    assert rows_seen == 1000
def test_permutation_pickle_with_connection_factory(tmp_path):
    """With a user-supplied connection_factory, pickling must round-trip via
    that factory instead of introspecting the connection URI. This matters for
    remote / cloud connections where the URI alone isn't reopenable."""
    lancedb.connect(tmp_path).create_table("test_table", pa.table({"a": range(50)}))

    open_table = functools.partial(_open_native_table, str(tmp_path))
    base_table = open_table("test_table")
    original = Permutation.identity(base_table).with_connection_factory(open_table)

    payload = pickle.dumps(original)
    roundtripped = pickle.loads(payload)

    assert len(roundtripped) == 50
    # The factory survives pickling and is what powered base-table reopen.
    restored_factory = roundtripped.connection_factory
    assert restored_factory is not None
    assert restored_factory.func is _open_native_table
    first_rows = roundtripped.__getitems__([0, 1, 2])
    assert first_rows == [{"a": 0}, {"a": 1}, {"a": 2}]
def test_permutation_with_builder_is_picklable(tmp_db):
    """A Permutation backed by a non-identity permutation table must keep the
    row order that table defines after a pickle round-trip."""
    source_table = tmp_db.create_table("test_table", pa.table({"a": range(100)}))

    builder = permutation_builder(source_table)
    builder = builder.split_random(
        ratios=[0.8, 0.2], seed=42, split_names=["train", "test"]
    )
    permutation_table = builder.shuffle(seed=42).execute()

    train_split = Permutations(source_table, permutation_table)["train"]
    every_index = list(range(len(train_split)))
    rows_before = train_split.__getitems__(every_index)

    roundtripped = pickle.loads(pickle.dumps(train_split))

    assert len(roundtripped) == len(train_split)
    assert roundtripped.__getitems__(every_index) == rows_before
def _multiworker_dataloader_target(db_uri: str, result_queue):
    """Process entry point: iterate ``test_table`` through a fork-based
    DataLoader and put the number of batches read on ``result_queue``."""
    import lancedb
    from lancedb.permutation import Permutation

    source_table = lancedb.connect(db_uri).open_table("test_table")
    dataset = Permutation.identity(source_table)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=10,
        num_workers=2,
        multiprocessing_context="fork",
    )

    batches_read = 0
    for batches_read, batch in enumerate(loader, start=1):
        assert batch["a"].size(0) == 10

    result_queue.put(batches_read)
# Only runs on Linux: the DataLoader in the helper process uses "fork" workers.
@pytest.mark.skipif(
sys.platform != "linux",
reason=(
"fork() is unavailable on Windows and unsafe on macOS "
"(Apple frameworks/TLS are not fork-safe)"
),
)
def test_permutation_dataloader_fork_workers(tmp_path):
"""A Permutation used by a fork-based DataLoader should not hang.
PyTorch's DataLoader uses fork-based multiprocessing by default on Linux.
LanceDB drives async work through a background asyncio thread that does
not survive a fork, so any LOOP.run() in a worker blocks forever.
"""
import lancedb
db_uri = str(tmp_path / "db")
db = lancedb.connect(db_uri)
db.create_table("test_table", pa.table({"a": list(range(1000))}))
# The helper process itself is started with "spawn"; the fork under test
# happens inside it, where _multiworker_dataloader_target builds a DataLoader
# with multiprocessing_context="fork".
# NOTE(review): presumably "spawn" is used so the helper starts from a clean
# interpreter -- confirm.
ctx = mp.get_context("spawn")
queue = ctx.Queue()
proc = ctx.Process(target=_multiworker_dataloader_target, args=(db_uri, queue))
proc.start()
proc.join(timeout=30)
# Still alive after the timeout means the helper hung: escalate from
# terminate() to kill() so no stray process survives, then fail the test.
if proc.is_alive():
proc.terminate()
proc.join(timeout=5)
if proc.is_alive():
proc.kill()
proc.join()
pytest.fail("Permutation hung when iterated in a fork-based DataLoader worker")
assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
assert not queue.empty(), "child produced no batches"
# 1000 rows read in batches of 10 -> 100 batches reported by the helper.
assert queue.get() == 100

View File

@@ -3,6 +3,8 @@
use std::sync::Arc;
use crate::error::PythonErrorExt;
use crate::runtime::future_into_py;
use arrow::{
datatypes::SchemaRef,
pyarrow::{IntoPyArrow, ToPyArrow},
@@ -12,9 +14,6 @@ use lancedb::arrow::SendableRecordBatchStream;
use pyo3::{
Bound, Py, PyAny, PyRef, PyResult, Python, exceptions::PyStopAsyncIteration, pyclass, pymethods,
};
use pyo3_async_runtimes::tokio::future_into_py;
use crate::error::PythonErrorExt;
#[pyclass]
pub struct RecordBatchStream {

View File

@@ -7,6 +7,12 @@ use std::{
time::Duration,
};
use crate::{
error::PythonErrorExt,
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
runtime::future_into_py,
table::Table,
};
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
use lancedb::{
connection::Connection as LanceConnection,
@@ -20,13 +26,6 @@ use pyo3::{
pyclass, pyfunction, pymethods,
types::{PyDict, PyDictMethods},
};
use pyo3_async_runtimes::tokio::future_into_py;
use crate::{
error::PythonErrorExt,
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
table::Table,
};
#[pyclass]
pub struct Connection {

View File

@@ -1,11 +1,13 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use lancedb::index::vector::{IvfFlatIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder};
use lancedb::index::vector::{
IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder,
IvfPqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder,
};
use lancedb::index::{
Index as LanceDbIndex,
scalar::{BTreeIndexBuilder, FtsIndexBuilder},
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
};
use pyo3::IntoPyObject;
use pyo3::types::PyStringMethods;
@@ -162,8 +164,26 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
}
Ok(LanceDbIndex::IvfHnswSq(hnsw_sq_builder))
}
"HnswFlat" => {
let params = source.extract::<IvfHnswFlatParams>()?;
let distance_type = parse_distance_type(params.distance_type)?;
let mut hnsw_flat_builder = IvfHnswFlatIndexBuilder::default()
.distance_type(distance_type)
.max_iterations(params.max_iterations)
.sample_rate(params.sample_rate)
.num_edges(params.m)
.ef_construction(params.ef_construction);
if let Some(num_partitions) = params.num_partitions {
hnsw_flat_builder = hnsw_flat_builder.num_partitions(num_partitions);
}
if let Some(target_partition_size) = params.target_partition_size {
hnsw_flat_builder =
hnsw_flat_builder.target_partition_size(target_partition_size);
}
Ok(LanceDbIndex::IvfHnswFlat(hnsw_flat_builder))
}
not_supported => Err(PyValueError::new_err(format!(
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, or IvfHnswSq",
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
not_supported
))),
}
@@ -250,6 +270,17 @@ struct IvfHnswSqParams {
target_partition_size: Option<u32>,
}
/// Parameters extracted from the Python-side config object when the index
/// class name is "HnswFlat"; each field is forwarded to
/// `IvfHnswFlatIndexBuilder` by `extract_index_params`.
#[derive(FromPyObject)]
struct IvfHnswFlatParams {
// Parsed with `parse_distance_type` before being handed to the builder.
distance_type: String,
// Optional: only applied to the builder when present.
num_partitions: Option<u32>,
max_iterations: u32,
sample_rate: u32,
// Passed to the builder's `num_edges` setter.
m: u32,
ef_construction: u32,
// Optional: only applied to the builder when present.
target_partition_size: Option<u32>,
}
#[pyclass(get_all)]
/// A description of an index currently configured on a column
pub struct IndexConfig {

View File

@@ -28,6 +28,7 @@ pub mod index;
pub mod namespace;
pub mod permutation;
pub mod query;
pub mod runtime;
pub mod session;
pub mod table;
pub mod util;

View File

@@ -4,7 +4,7 @@
use std::sync::{Arc, Mutex};
use crate::{
arrow::RecordBatchStream, connection::Connection, error::PythonErrorExt, table::Table,
arrow::RecordBatchStream, error::PythonErrorExt, runtime::future_into_py, table::Table,
};
use arrow::pyarrow::{PyArrowType, ToPyArrow};
use lancedb::{
@@ -21,7 +21,6 @@ use pyo3::{
pyclass, pymethods,
types::{PyAnyMethods, PyDict, PyDictMethods, PyType},
};
use pyo3_async_runtimes::tokio::future_into_py;
fn table_from_py<'a>(table: Bound<'a, PyAny>) -> PyResult<Bound<'a, Table>> {
if table.hasattr("_inner")? {
@@ -80,24 +79,6 @@ impl PyAsyncPermutationBuilder {
#[pymethods]
impl PyAsyncPermutationBuilder {
#[pyo3(signature = (database, table_name))]
pub fn persist(
slf: PyRefMut<'_, Self>,
database: Bound<'_, PyAny>,
table_name: String,
) -> PyResult<Self> {
let conn = if database.hasattr("_conn")? {
database
.getattr("_conn")?
.getattr("_inner")?
.cast_into::<Connection>()?
} else {
database.getattr("_inner")?.cast_into::<Connection>()?
};
let database = conn.borrow().database()?;
slf.modify(|builder| builder.persist(database, table_name))
}
#[pyo3(signature = (*, ratios=None, counts=None, fixed=None, seed=None, split_names=None))]
pub fn split_random(
slf: PyRefMut<'_, Self>,

View File

@@ -4,6 +4,11 @@
use std::sync::Arc;
use std::time::Duration;
use crate::expr::PyExpr;
use crate::runtime::future_into_py;
use crate::util::parse_distance_type;
use crate::{arrow::RecordBatchStream, util::PyLanceDB};
use crate::{error::PythonErrorExt, index::class_name};
use arrow::array::Array;
use arrow::array::ArrayData;
use arrow::array::make_array;
@@ -36,12 +41,6 @@ use pyo3::types::{PyDict, PyString};
use pyo3::{Borrowed, FromPyObject, exceptions::PyRuntimeError};
use pyo3::{PyErr, pyclass};
use pyo3::{exceptions::PyValueError, intern};
use pyo3_async_runtimes::tokio::future_into_py;
use crate::expr::PyExpr;
use crate::util::parse_distance_type;
use crate::{arrow::RecordBatchStream, util::PyLanceDB};
use crate::{error::PythonErrorExt, index::class_name};
impl<'a, 'py> FromPyObject<'a, 'py> for PyLanceDB<FtsQuery> {
type Error = PyErr;

142
python/src/runtime.rs Normal file
View File

@@ -0,0 +1,142 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Fork-safe wrapper around tokio + pyo3-async-runtimes.
//!
//! `pyo3_async_runtimes::tokio` keeps its multi-threaded runtime in a
//! `OnceLock` that can never be replaced. Tokio's worker threads do not
//! survive `fork()`, so once a child inherits a "frozen" runtime, every
//! `future_into_py` call hangs forever.
//!
//! We sidestep the global by routing every future through our own
//! [`LanceRuntime`] (a [`pyo3_async_runtimes::generic::Runtime`] impl) backed
//! by an [`AtomicPtr`] to a tokio runtime that we own. A `pthread_atfork`
//! child handler nulls the pointer; the next `spawn` rebuilds the runtime in
//! the child. This mirrors the pattern used in the Lance Python bindings.
use std::future::Future;
use std::pin::Pin;
use std::sync::atomic::{AtomicBool, AtomicPtr, Ordering};
use pyo3::{Bound, PyAny, PyResult, Python, conversion::IntoPyObject};
use pyo3_async_runtimes::{
TaskLocals,
generic::{ContextExt, JoinError, Runtime},
};
use tokio::{runtime, task};
static RUNTIME: AtomicPtr<runtime::Runtime> = AtomicPtr::new(std::ptr::null_mut());
static RUNTIME_INSTALLING: AtomicBool = AtomicBool::new(false);
static ATFORK_INSTALLED: AtomicBool = AtomicBool::new(false);
/// Builds the multi-threaded tokio runtime used for all LanceDB futures.
///
/// All tokio drivers are enabled and worker threads are named
/// `lancedb-tokio-worker`.
///
/// # Panics
///
/// Panics if tokio fails to build the runtime.
fn create_runtime() -> runtime::Runtime {
    let mut builder = runtime::Builder::new_multi_thread();
    builder.enable_all();
    builder.thread_name("lancedb-tokio-worker");
    builder.build().expect("Failed to build tokio runtime")
}
/// Returns the process-wide tokio runtime, building it on first use.
///
/// The runtime lives behind the `RUNTIME` pointer. Exactly one caller wins
/// the `RUNTIME_INSTALLING` flag and builds the runtime; concurrent callers
/// yield until the pointer becomes non-null. After a `fork()`, the child
/// handler clears both `RUNTIME` and `RUNTIME_INSTALLING`, so the first call
/// in the child builds a fresh runtime.
///
/// # Panics
///
/// Panics if `create_runtime` fails. In that case the install claim is
/// released again, so a later call can retry instead of spinning forever on
/// a flag that nobody will ever clear.
fn get_runtime() -> &'static runtime::Runtime {
    /// Releases `RUNTIME_INSTALLING` if installation unwinds before the
    /// runtime pointer has been published.
    struct InstallClaim {
        published: bool,
    }

    impl Drop for InstallClaim {
        fn drop(&mut self) {
            if !self.published {
                RUNTIME_INSTALLING.store(false, Ordering::SeqCst);
            }
        }
    }

    loop {
        let ptr = RUNTIME.load(Ordering::SeqCst);
        if !ptr.is_null() {
            // SAFETY: a non-null pointer was produced by `Box::into_raw`
            // below and is never freed (the fork handler only forgets it),
            // so it stays valid for the rest of the process.
            return unsafe { &*ptr };
        }
        // `fetch_or` returns the previous value: `false` means this caller
        // just claimed the right to install the runtime.
        if !RUNTIME_INSTALLING.fetch_or(true, Ordering::SeqCst) {
            break;
        }
        std::thread::yield_now();
    }

    // From here on this caller owns the install claim; give it back if
    // anything below panics.
    let mut claim = InstallClaim { published: false };

    // Register the fork handler once per process (the flag survives forks).
    if !ATFORK_INSTALLED.fetch_or(true, Ordering::SeqCst) {
        install_atfork();
    }

    let new_ptr = Box::into_raw(Box::new(create_runtime()));
    RUNTIME.store(new_ptr, Ordering::SeqCst);
    // On success the flag intentionally stays set, as before; waiters leave
    // the loop through the non-null pointer.
    claim.published = true;

    // SAFETY: `new_ptr` comes from `Box::into_raw` right above and is
    // intentionally leaked, so the reference is valid for `'static`.
    unsafe { &*new_ptr }
}
/// Runs in async-signal context after `fork()` in the child. We can only
/// touch atomics here; we deliberately leak the previous runtime because
/// dropping a tokio `Runtime` would try to join its (now-dead) worker
/// threads and hang.
extern "C" fn atfork_child() {
// Forget the inherited runtime so `get_runtime` builds a new one.
RUNTIME.store(std::ptr::null_mut(), Ordering::SeqCst);
// Clear any install claim inherited from the parent so the child's first
// caller can claim it.
RUNTIME_INSTALLING.store(false, Ordering::SeqCst);
}
/// Registers `atfork_child` as the child-side fork handler; no prepare or
/// parent handlers are installed. The return value of `pthread_atfork` is
/// ignored.
#[cfg(not(windows))]
fn install_atfork() {
// SAFETY: `atfork_child` is an `extern "C"` function that only stores to
// atomics.
unsafe { libc::pthread_atfork(None, None, Some(atfork_child)) };
}
/// No-op on Windows builds; nothing is registered.
#[cfg(windows)]
fn install_atfork() {}
/// Marker type implementing [`Runtime`] over our fork-safe runtime slot.
///
/// It carries no data; it is only used as the type parameter that routes
/// `pyo3_async_runtimes::generic` calls through `get_runtime()`.
pub struct LanceRuntime;
/// Newtype wrapper around `tokio::task::JoinError` so we can implement the
/// foreign [`JoinError`] trait without violating orphan rules.
pub struct LanceJoinError(task::JoinError);
// Both methods delegate straight to the wrapped tokio error.
impl JoinError for LanceJoinError {
// True when the wrapped tokio error reports a panic.
fn is_panic(&self) -> bool {
self.0.is_panic()
}
// Consumes the error and returns the wrapped error's panic payload.
fn into_panic(self) -> Box<dyn std::any::Any + Send + 'static> {
self.0.into_panic()
}
}
// Runtime implementation that submits work to the runtime returned by
// `get_runtime()` instead of pyo3-async-runtimes' own global.
impl Runtime for LanceRuntime {
type JoinError = LanceJoinError;
// Boxed future so tokio's join error can be mapped into `LanceJoinError`.
type JoinHandle = Pin<Box<dyn Future<Output = Result<(), Self::JoinError>> + Send>>;
// Spawns `fut` on the runtime; the returned handle resolves when it ends.
fn spawn<F>(fut: F) -> Self::JoinHandle
where
F: Future<Output = ()> + Send + 'static,
{
let handle = get_runtime().spawn(fut);
Box::pin(async move { handle.await.map_err(LanceJoinError) })
}
// Same as `spawn`, but hands the closure to tokio's `spawn_blocking`.
fn spawn_blocking<F>(f: F) -> Self::JoinHandle
where
F: FnOnce() + Send + 'static,
{
let handle = get_runtime().spawn_blocking(f);
Box::pin(async move { handle.await.map_err(LanceJoinError) })
}
}
// Task-local slot holding the `TaskLocals` for the currently running task.
tokio::task_local! {
static TASK_LOCALS: std::cell::OnceCell<TaskLocals>;
}
// Stores and retrieves `TaskLocals` through the `TASK_LOCALS` task-local.
impl ContextExt for LanceRuntime {
// Runs `fut` with `locals` installed in `TASK_LOCALS` for its duration.
fn scope<F, R>(locals: TaskLocals, fut: F) -> Pin<Box<dyn Future<Output = R> + Send>>
where
F: Future<Output = R> + Send + 'static,
{
let cell = std::cell::OnceCell::new();
// Cannot fail: the cell was created on the previous line and is empty.
cell.set(locals).unwrap();
Box::pin(TASK_LOCALS.scope(cell, fut))
}
// Returns a clone of the current task's locals, or `None` when called
// outside a `scope` (the `try_with` error falls back to the default).
fn get_task_locals() -> Option<TaskLocals> {
TASK_LOCALS
.try_with(|c| c.get().cloned())
.unwrap_or_default()
}
}
/// Drop-in replacement for `pyo3_async_runtimes::tokio::future_into_py` that
/// uses our fork-safe runtime.
///
/// Delegates to `pyo3_async_runtimes::generic::future_into_py` with
/// `LanceRuntime` as the runtime type, and returns whatever Python object
/// that call produces for `fut`.
pub fn future_into_py<F, T>(py: Python<'_>, fut: F) -> PyResult<Bound<'_, PyAny>>
where
F: Future<Output = PyResult<T>> + Send + 'static,
T: for<'py> IntoPyObject<'py> + Send + 'static,
{
pyo3_async_runtimes::generic::future_into_py::<LanceRuntime, _, T>(py, fut)
}

View File

@@ -2,6 +2,7 @@
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use std::{collections::HashMap, sync::Arc};
use crate::runtime::future_into_py;
use crate::{
connection::Connection,
error::PythonErrorExt,
@@ -24,7 +25,6 @@ use pyo3::{
pyclass, pymethods,
types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods},
};
use pyo3_async_runtimes::tokio::future_into_py;
mod scannable;

View File

@@ -40,7 +40,7 @@ lance-datafusion.workspace = true
lance-datagen = { workspace = true }
lance-file = { workspace = true }
lance-io = { workspace = true }
lance-index = { workspace = true }
lance-index = { workspace = true, features = ["tokenizer-jieba", "tokenizer-lindera"] }
lance-table = { workspace = true }
lance-linalg = { workspace = true }
lance-testing = { workspace = true }
@@ -108,7 +108,12 @@ test-log = "0.2"
[features]
default = []
aws = ["lance/aws", "lance-io/aws", "lance-namespace-impls/dir-aws"]
aws = [
"lance/aws",
"lance-io/aws",
"lance-namespace-impls/dir-aws",
"object_store/aws",
]
oss = ["lance/oss", "lance-io/oss", "lance-namespace-impls/dir-oss"]
gcs = ["lance/gcp", "lance-io/gcp", "lance-namespace-impls/dir-gcp"]
azure = [

View File

@@ -505,8 +505,15 @@ impl ListingDatabase {
// Filter out the commit store query param -- it's a lancedb param
url.query_pairs_mut().clear();
url.query_pairs_mut().extend_pairs(filtered_querys);
// Take a copy of the query string so we can propagate it to lance
let query_string = url.query().map(|s| s.to_string());
// Take a copy of the query string so we can propagate it to lance.
// `query_pairs_mut()` leaves the URL with `Some("")` even when no
// pairs survive (or none existed in the first place), so an empty
// string here must be treated the same as "no query" — otherwise
// every table URI ends up with a trailing `?`, which makes downstream
// sub-paths (e.g. MemWAL gen paths) re-parse as path=<base table> +
// query=<sub-path>, causing Lance to find the base table dataset
// when looking up the sub-path.
let query_string = url.query().filter(|q| !q.is_empty()).map(|s| s.to_string());
// clear the query string so we can use the url as the base uri
// use .set_query(None) instead of .set_query("") because the latter
// will add a trailing '?' to the url
@@ -715,7 +722,7 @@ impl ListingDatabase {
let commit_handler = commit_handler_from_url(&uri, &Some(object_store_params)).await?;
for name in names {
let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
let full_path = self.base_path.child(dir_name.clone());
let full_path = self.base_path.clone().join(dir_name.clone());
commit_handler.delete(&full_path).await?;
@@ -842,6 +849,10 @@ impl ListingDatabase {
write_params.mode = WriteMode::Overwrite;
}
if request.write_options.skip_auto_cleanup {
write_params.skip_auto_cleanup = true;
}
write_params.session = Some(self.session.clone());
write_params
@@ -2027,6 +2038,7 @@ mod tests {
}),
..Default::default()
}),
..Default::default()
};
let table = db
@@ -2100,6 +2112,7 @@ mod tests {
}),
..Default::default()
}),
..Default::default()
};
let table = db
@@ -2213,6 +2226,133 @@ mod tests {
assert_eq!(uri, expected);
}
/// Test helper reproducing the URL-mutation step of
/// [`ListingDatabase::connect_with_options`], so the query-capture rule can
/// be asserted without any filesystem setup (which is awkward across
/// platforms — see the `file://` test below).
///
/// Background: a URL-style URI goes through `url::Url::parse` and the
/// `query_pairs_mut()` path. `query_pairs_mut().clear()` used to leave the
/// URL with `query=Some("")`, which then propagated as a trailing `?` on
/// every table URI. Sub-path lookups against that URI (e.g. MemWAL
/// `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`) re-parsed as
/// `path=<base table>` + `query=/_mem_wal/...`, causing `Dataset::write` to
/// find the base table dataset and falsely report `Dataset already exists`.
///
/// Returns the query string that would be captured for `input_uri`: the
/// pairs left after dropping `ENGINE` and `MIRRORED_STORE`, or `None` when
/// nothing is left.
fn capture_query_like_connect(input_uri: &str) -> Option<String> {
    let mut parsed = url::Url::parse(input_uri).unwrap();

    // Keep every pair except the lancedb-internal ones.
    let surviving_pairs: Vec<(String, String)> = parsed
        .query_pairs()
        .filter_map(|(key, value)| {
            if key == ENGINE || key == MIRRORED_STORE {
                None
            } else {
                Some((key.to_string(), value.to_string()))
            }
        })
        .collect();

    parsed.query_pairs_mut().clear();
    parsed.query_pairs_mut().extend_pairs(surviving_pairs);

    // An empty query string counts as "no query".
    match parsed.query() {
        Some(query) if !query.is_empty() => Some(query.to_string()),
        _ => None,
    }
}
/// Checks the four query shapes handled by `capture_query_like_connect`:
/// no query, a real query, an internal-only query, and a mix of both.
#[test]
fn test_capture_query_treats_empty_as_none() {
    // No query at all. With the bug, `query_pairs_mut()` left the
    // URL with `query=Some("")` and we used to propagate that.
    let without_query = capture_query_like_connect("s3://bucket/prefix/");
    assert_eq!(
        without_query, None,
        "empty query after mutation must be treated as no query"
    );

    // Real query is propagated.
    let with_real_query = capture_query_like_connect("s3://bucket/prefix/?foo=bar");
    assert_eq!(with_real_query, Some("foo=bar".to_string()));

    // lancedb-internal `engine=` is stripped; nothing remains, so
    // query_string is None — not Some("").
    let engine_only_uri = format!("s3://bucket/prefix/?{}=mem", ENGINE);
    assert_eq!(capture_query_like_connect(&engine_only_uri), None);

    // Mixed: drop `engine=`, keep the rest.
    let mixed_uri = format!("s3://bucket/prefix/?{}=mem&foo=bar", ENGINE);
    let captured = capture_query_like_connect(&mixed_uri);
    assert_eq!(captured.as_deref(), Some("foo=bar"));
}
/// Regression: connecting via a URL-style URI (which goes through
/// `url::Url::parse` and the `query_pairs_mut()` path) must not
/// append a trailing `?` to per-table URIs when the input URI has
/// no query string. Sub-path lookups against such a URI (e.g.
/// MemWAL `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`) re-parse
/// as `path=<base table>` + `query=/_mem_wal/...`, causing
/// `Dataset::write` to find the base table dataset and falsely
/// report `Dataset already exists`.
///
/// Skipped on Windows: `try_create_dir` does not understand
/// `file:///C:/…` paths so `connect_with_options` fails before
/// even reaching the URL-mutation logic. The pure URL-mutation
/// invariant is covered by
/// `test_capture_query_treats_empty_as_none` above, which runs
/// on all platforms.
#[cfg(not(windows))]
#[tokio::test]
async fn test_table_uri_url_path_has_no_trailing_question_mark() {
let tempdir = tempdir().unwrap();
// `file://` prefix forces the URL-parsing connect path.
let uri = format!("file://{}", tempdir.path().to_str().unwrap());
// Case 1: no query string on the input URI.
let request = ConnectRequest {
uri: uri.clone(),
#[cfg(feature = "remote")]
client_config: Default::default(),
options: Default::default(),
namespace_client_properties: Default::default(),
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
let db = ListingDatabase::connect_with_options(&request)
.await
.unwrap();
assert_eq!(
db.query_string, None,
"no input query → no captured query_string"
);
let table_uri = db.table_uri("test").unwrap();
assert!(
!table_uri.ends_with('?'),
"table_uri must not have a trailing `?`: {}",
table_uri
);
// The table URI is exactly `<uri>/test.lance` with nothing appended.
assert_eq!(table_uri, format!("{}/test.lance", uri));
// A real query string should still be propagated.
let with_query = format!("{}?foo=bar", uri);
let request_with_query = ConnectRequest {
uri: with_query,
#[cfg(feature = "remote")]
client_config: Default::default(),
options: Default::default(),
namespace_client_properties: Default::default(),
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
let db_with_query = ListingDatabase::connect_with_options(&request_with_query)
.await
.unwrap();
assert_eq!(db_with_query.query_string.as_deref(), Some("foo=bar"));
let table_uri = db_with_query.table_uri("test").unwrap();
// The query is re-attached after the table path, not before it.
assert_eq!(table_uri, format!("{}/test.lance?foo=bar", uri));
}
#[tokio::test]
async fn test_namespace_client() {
let (_tempdir, db) = setup_database().await;

View File

@@ -414,6 +414,10 @@ impl Database for LanceNamespaceDatabase {
params.mode = WriteMode::Overwrite;
}
if request.write_options.skip_auto_cleanup {
params.skip_auto_cleanup = true;
}
// Set up storage options if provided
if let Some(storage_opts) = initial_storage_options {
let store_params = params

View File

@@ -13,7 +13,10 @@ use crate::{DistanceType, Error, Result, table::BaseTable};
use self::{
scalar::{BTreeIndexBuilder, BitmapIndexBuilder, LabelListIndexBuilder},
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, IvfSqIndexBuilder},
vector::{
IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
IvfSqIndexBuilder,
},
};
pub mod scalar;
@@ -67,6 +70,10 @@ pub enum Index {
/// IVF-HNSW index with Scalar Quantization
/// It is a variant of the HNSW algorithm that uses scalar quantization to compress the vectors.
IvfHnswSq(IvfHnswSqIndexBuilder),
/// IVF-HNSW index without quantization.
/// Stores raw vectors, providing the highest recall at the cost of more memory and disk space.
IvfHnswFlat(IvfHnswFlatIndexBuilder),
}
/// Builder for the create_index operation
@@ -290,6 +297,8 @@ pub enum IndexType {
IvfHnswPq,
#[serde(alias = "IVF_HNSW_SQ")]
IvfHnswSq,
#[serde(alias = "IVF_HNSW_FLAT")]
IvfHnswFlat,
// Scalar
#[serde(alias = "BTREE")]
BTree,
@@ -311,6 +320,7 @@ impl std::fmt::Display for IndexType {
Self::IvfRq => write!(f, "IVF_RQ"),
Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
Self::BTree => write!(f, "BTREE"),
Self::Bitmap => write!(f, "BITMAP"),
Self::LabelList => write!(f, "LABEL_LIST"),
@@ -334,6 +344,7 @@ impl std::str::FromStr for IndexType {
"IVF_RQ" => Ok(Self::IvfRq),
"IVF_HNSW_PQ" => Ok(Self::IvfHnswPq),
"IVF_HNSW_SQ" => Ok(Self::IvfHnswSq),
"IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat),
_ => Err(Error::InvalidInput {
message: format!("the input value {} is not a valid IndexType", value),
}),

View File

@@ -474,3 +474,46 @@ impl IvfHnswSqIndexBuilder {
impl_ivf_params_setter!();
impl_hnsw_params_setter!();
}
/// Builder for an IVF_HNSW_FLAT index.
///
/// This index combines IVF partitioning with an HNSW graph per partition,
/// storing raw (unquantized) vectors. It offers the highest recall among
/// the IVF_HNSW family at the cost of more memory and disk space compared
/// to [`IvfHnswSqIndexBuilder`] or [`IvfHnswPqIndexBuilder`].
///
/// The struct derives `Serialize`; the serde attributes below control the
/// field names and which optional fields appear in the serialized form.
#[derive(Debug, Clone, Serialize)]
pub struct IvfHnswFlatIndexBuilder {
// IVF
// Distance metric; serialized under the key `metric_type` rather than
// `distance_type`. Defaults to `DistanceType::L2`.
#[serde(rename = "metric_type")]
pub(crate) distance_type: DistanceType,
// Optional partition count; left out of the serialized output while `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_partitions: Option<u32>,
// IVF training sample rate (default 256).
pub(crate) sample_rate: u32,
// IVF training iteration cap (default 50).
pub(crate) max_iterations: u32,
// Optional target partition size; left out of the serialized output while `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) target_partition_size: Option<u32>,
// HNSW
// Handed to `HnswBuildParams::num_edges` when the native index is built (default 20).
pub(crate) m: u32,
// Handed to `HnswBuildParams::ef_construction` when the native index is built (default 300).
pub(crate) ef_construction: u32,
}
impl Default for IvfHnswFlatIndexBuilder {
    /// Produces a builder using L2 distance, no explicit partition count or
    /// target partition size, and the stock IVF / HNSW tuning values.
    fn default() -> Self {
        Self {
            // IVF parameters, listed in struct declaration order.
            distance_type: DistanceType::L2,
            num_partitions: None,
            sample_rate: 256,
            max_iterations: 50,
            target_partition_size: None,
            // HNSW parameters.
            m: 20,
            ef_construction: 300,
        }
    }
}
// Builder-style setters for `IvfHnswFlatIndexBuilder` are produced by the
// shared macros invoked below. Callers in this crate chain
// `.distance_type(..)`, `.num_partitions(..)`, `.num_edges(..)` and
// `.ef_construction(..)` on this builder.
// NOTE(review): the macro bodies are defined elsewhere; confirm the exact set
// of methods each one expands to against those definitions.
impl IvfHnswFlatIndexBuilder {
// Distance metric setter(s).
impl_distance_type_setter!();
// IVF parameter setter(s).
impl_ivf_params_setter!();
// HNSW parameter setter(s).
impl_hnsw_params_setter!();
}

View File

@@ -5,11 +5,12 @@
use std::{fmt::Formatter, sync::Arc};
use futures::{TryFutureExt, stream::BoxStream};
use futures::{StreamExt, TryFutureExt, stream::BoxStream};
use lance::io::WrappingObjectStore;
use object_store::{
Error, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, UploadPart, path::Path,
CopyOptions, Error, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result,
UploadPart, path::Path,
};
use async_trait::async_trait;
@@ -93,20 +94,6 @@ impl ObjectStore for MirroringObjectStore {
self.primary.get_opts(location, options).await
}
async fn head(&self, location: &Path) -> Result<ObjectMeta> {
self.primary.head(location).await
}
async fn delete(&self, location: &Path) -> Result<()> {
if !location.primary_only() {
match self.secondary.delete(location).await {
Err(Error::NotFound { .. }) | Ok(_) => {}
Err(e) => return Err(e),
}
}
self.primary.delete(location).await
}
fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
self.primary.list(prefix)
}
@@ -115,21 +102,40 @@ impl ObjectStore for MirroringObjectStore {
self.primary.list_with_delimiter(prefix).await
}
async fn copy(&self, from: &Path, to: &Path) -> Result<()> {
if to.primary_only() {
self.primary.copy(from, to).await
} else {
self.secondary.copy(from, to).await?;
self.primary.copy(from, to).await?;
Ok(())
}
fn delete_stream(
&self,
locations: BoxStream<'static, Result<Path>>,
) -> BoxStream<'static, Result<Path>> {
let primary = self.primary.clone();
let secondary = self.secondary.clone();
locations
.map(move |location| {
let primary = primary.clone();
let secondary = secondary.clone();
async move {
let location = location?;
if !location.primary_only() {
match secondary.delete(&location).await {
Err(Error::NotFound { .. }) | Ok(_) => {}
Err(e) => return Err(e),
}
}
primary.delete(&location).await?;
Ok(location)
}
})
.buffered(10)
.boxed()
}
async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> {
if !to.primary_only() {
self.secondary.copy(from, to).await?;
async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> Result<()> {
if to.primary_only() {
self.primary.copy_opts(from, to, options).await
} else {
self.secondary.copy_opts(from, to, options.clone()).await?;
self.primary.copy_opts(from, to, options).await?;
Ok(())
}
self.primary.copy_if_not_exists(from, to).await
}
}
@@ -228,6 +234,7 @@ mod test {
.create_table("test", data)
.write_options(WriteOptions {
lance_write_params: Some(param),
..Default::default()
})
.execute()
.await;

View File

@@ -10,9 +10,9 @@ use bytes::Bytes;
use futures::stream::BoxStream;
use lance::io::WrappingObjectStore;
use object_store::{
GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
PutMultipartOptions, PutOptions, PutPayload, PutResult, Result as OSResult, UploadPart,
path::Path,
CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
PutMultipartOptions, PutOptions, PutPayload, PutResult, RenameOptions, Result as OSResult,
UploadPart, path::Path,
};
#[derive(Debug, Default)]
@@ -81,11 +81,6 @@ impl IoTrackingStore {
#[async_trait::async_trait]
#[deny(clippy::missing_trait_methods)]
impl ObjectStore for IoTrackingStore {
async fn put(&self, location: &Path, bytes: PutPayload) -> OSResult<PutResult> {
self.record_write(bytes.content_length() as u64);
self.target.put(location, bytes).await
}
async fn put_opts(
&self,
location: &Path,
@@ -96,14 +91,6 @@ impl ObjectStore for IoTrackingStore {
self.target.put_opts(location, bytes, opts).await
}
async fn put_multipart(&self, location: &Path) -> OSResult<Box<dyn MultipartUpload>> {
let target = self.target.put_multipart(location).await?;
Ok(Box::new(IoTrackingMultipartUpload {
target,
stats: self.stats.clone(),
}))
}
async fn put_multipart_opts(
&self,
location: &Path,
@@ -116,15 +103,6 @@ impl ObjectStore for IoTrackingStore {
}))
}
async fn get(&self, location: &Path) -> OSResult<GetResult> {
let result = self.target.get(location).await;
if let Ok(result) = &result {
let num_bytes = result.range.end - result.range.start;
self.record_read(num_bytes);
}
result
}
async fn get_opts(&self, location: &Path, options: GetOptions) -> OSResult<GetResult> {
let result = self.target.get_opts(location, options).await;
if let Ok(result) = &result {
@@ -134,14 +112,6 @@ impl ObjectStore for IoTrackingStore {
result
}
async fn get_range(&self, location: &Path, range: std::ops::Range<u64>) -> OSResult<Bytes> {
let result = self.target.get_range(location, range).await;
if let Ok(result) = &result {
self.record_read(result.len() as u64);
}
result
}
async fn get_ranges(
&self,
location: &Path,
@@ -154,20 +124,11 @@ impl ObjectStore for IoTrackingStore {
result
}
async fn head(&self, location: &Path) -> OSResult<ObjectMeta> {
self.record_read(0);
self.target.head(location).await
}
async fn delete(&self, location: &Path) -> OSResult<()> {
fn delete_stream(
&self,
locations: BoxStream<'static, OSResult<Path>>,
) -> BoxStream<'static, OSResult<Path>> {
self.record_write(0);
self.target.delete(location).await
}
fn delete_stream<'a>(
&'a self,
locations: BoxStream<'a, OSResult<Path>>,
) -> BoxStream<'a, OSResult<Path>> {
self.target.delete_stream(locations)
}
@@ -190,24 +151,14 @@ impl ObjectStore for IoTrackingStore {
self.target.list_with_delimiter(prefix).await
}
async fn copy(&self, from: &Path, to: &Path) -> OSResult<()> {
async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> OSResult<()> {
self.record_write(0);
self.target.copy(from, to).await
self.target.copy_opts(from, to, options).await
}
async fn rename(&self, from: &Path, to: &Path) -> OSResult<()> {
async fn rename_opts(&self, from: &Path, to: &Path, options: RenameOptions) -> OSResult<()> {
self.record_write(0);
self.target.rename(from, to).await
}
async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> OSResult<()> {
self.record_write(0);
self.target.rename_if_not_exists(from, to).await
}
async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> OSResult<()> {
self.record_write(0);
self.target.copy_if_not_exists(from, to).await
self.target.rename_opts(from, to, options).await
}
}

View File

@@ -1540,6 +1540,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
Index::IvfPq(p) => ("IVF_PQ", Some(to_json(p)?)),
Index::IvfSq(p) => ("IVF_SQ", Some(to_json(p)?)),
Index::IvfHnswSq(p) => ("IVF_HNSW_SQ", Some(to_json(p)?)),
Index::IvfHnswFlat(p) => ("IVF_HNSW_FLAT", Some(to_json(p)?)),
Index::IvfRq(p) => ("IVF_RQ", Some(to_json(p)?)),
Index::BTree(p) => ("BTREE", Some(to_json(p)?)),
Index::Bitmap(p) => ("BITMAP", Some(to_json(p)?)),
@@ -2068,7 +2069,8 @@ mod tests {
use serde_json::json;
use crate::index::vector::{
IvfFlatIndexBuilder, IvfHnswSqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder,
IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswSqIndexBuilder, IvfRqIndexBuilder,
IvfSqIndexBuilder,
};
use crate::remote::JSON_CONTENT_TYPE;
use crate::remote::db::DEFAULT_SERVER_VERSION;
@@ -3321,6 +3323,35 @@ mod tests {
.ef_construction(500),
),
),
(
"IVF_HNSW_FLAT",
json!({
"metric_type": "l2",
"sample_rate": 256,
"max_iterations": 50,
"m": 20,
"ef_construction": 300,
}),
Index::IvfHnswFlat(Default::default()),
),
(
"IVF_HNSW_FLAT",
json!({
"metric_type": "cosine",
"num_partitions": 64,
"sample_rate": 256,
"max_iterations": 50,
"m": 40,
"ef_construction": 500,
}),
Index::IvfHnswFlat(
IvfHnswFlatIndexBuilder::default()
.distance_type(DistanceType::Cosine)
.num_partitions(64)
.num_edges(40)
.ef_construction(500),
),
),
(
"IVF_SQ",
json!({

View File

@@ -189,6 +189,18 @@ pub struct WriteOptions {
// Coming soon: https://github.com/lancedb/lancedb/issues/992
// /// What behavior to take if the data contains invalid vectors
// pub on_bad_vectors: BadVectorHandling,
/// If true, skip the automatic cleanup of old dataset versions that would
/// otherwise run during the commit. This forwards to
/// [`WriteParams::skip_auto_cleanup`] in lance-core.
///
/// Useful for high-frequency writers that want to manage version cleanup
/// themselves (e.g. via a periodic optimize job), or for writers that
/// lack delete permissions on the underlying storage.
///
/// The effective setting is the logical OR of this field and
/// `lance_write_params.skip_auto_cleanup`: cleanup is skipped when either one
/// is `true`. Leaving this field `false` never clears a `true` value already
/// set in `lance_write_params`.
pub skip_auto_cleanup: bool,
/// Advanced parameters that can be used to customize table creation
///
/// Overlapping `OpenTableBuilder` options (e.g. [AddDataBuilder::mode]) will take
@@ -2033,6 +2045,24 @@ impl NativeTable {
);
Ok(Box::new(lance_idx_params))
}
Index::IvfHnswFlat(index) => {
Self::validate_index_type(field, "IVF HNSW FLAT", supported_vector_data_type)?;
let ivf_params = Self::build_ivf_params(
index.num_partitions,
index.target_partition_size,
index.sample_rate,
index.max_iterations,
);
let hnsw_params = HnswBuildParams::default()
.num_edges(index.m as usize)
.ef_construction(index.ef_construction as usize);
let lance_idx_params = VectorIndexParams::ivf_hnsw(
index.distance_type.into(),
ivf_params,
hnsw_params,
);
Ok(Box::new(lance_idx_params))
}
}
}
@@ -2058,7 +2088,8 @@ impl NativeTable {
| Index::IvfPq(_)
| Index::IvfRq(_)
| Index::IvfHnswPq(_)
| Index::IvfHnswSq(_) => IndexType::Vector,
| Index::IvfHnswSq(_)
| Index::IvfHnswFlat(_) => IndexType::Vector,
}
}
@@ -2264,7 +2295,8 @@ impl BaseTable for NativeTable {
let output = add.into_plan(&table_schema, &table_def)?;
let lance_params = output
let skip_auto_cleanup = output.write_options.skip_auto_cleanup;
let mut lance_params = output
.write_options
.lance_write_params
.unwrap_or(WriteParams {
@@ -2274,6 +2306,9 @@ impl BaseTable for NativeTable {
},
..Default::default()
});
if skip_auto_cleanup {
lance_params.skip_auto_cleanup = true;
}
// Repartition for write parallelism if beneficial.
let plan = if num_partitions > 1 {
@@ -3176,6 +3211,56 @@ mod tests {
assert_eq!(stats.num_unindexed_rows, 0);
}
/// Builds an IVF_HNSW_FLAT index with default parameters over a table of
/// 512 random 16-dimensional float vectors and checks that the index is
/// listed with the expected type and column, and that no rows were lost.
#[tokio::test]
async fn test_create_index_ivf_hnsw_flat() {
    use crate::index::vector::IvfHnswFlatIndexBuilder;
    use arrow_array::Float32Array;
    use arrow_array::RecordBatch;
    use arrow_schema::{DataType, Field, Schema as ArrowSchema};
    use rand;
    use std::iter::repeat_with;

    const NUM_ROWS: usize = 512;
    let dimension = 16;

    // Random vector column: NUM_ROWS fixed-size lists of `dimension` floats.
    let values: Vec<f32> = repeat_with(rand::random::<f32>)
        .take(NUM_ROWS * dimension as usize)
        .collect();
    let vectors = Arc::new(create_fixed_size_list(Float32Array::from(values), dimension).unwrap());

    let item_field = Arc::new(Field::new("item", DataType::Float32, true));
    let embeddings_field = Field::new(
        "embeddings",
        DataType::FixedSizeList(item_field, dimension),
        false,
    );
    let schema = Arc::new(ArrowSchema::new(vec![embeddings_field]));
    let batch = RecordBatch::try_new(schema, vec![vectors]).unwrap();

    // Fresh on-disk database holding a single table with that batch.
    let tmp_dir = tempdir().unwrap();
    let uri = tmp_dir.path().to_str().unwrap();
    let conn = connect(uri).execute().await.unwrap();
    let table = conn.create_table("test", batch).execute().await.unwrap();

    let builder = IvfHnswFlatIndexBuilder::default();
    table
        .create_index(&["embeddings"], Index::IvfHnswFlat(builder))
        .execute()
        .await
        .unwrap();

    // Exactly one index, of the new type, on the vector column.
    let index_configs = table.list_indices().await.unwrap();
    assert_eq!(index_configs.len(), 1);
    let created = index_configs.into_iter().next().unwrap();
    assert_eq!(created.index_type, crate::index::IndexType::IvfHnswFlat);
    assert_eq!(created.columns, vec!["embeddings".to_string()]);

    assert_eq!(table.count_rows(None).await.unwrap(), 512);
}
fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
let list_type = DataType::FixedSizeList(
Arc::new(Field::new("item", values.data_type().clone(), true)),

View File

@@ -441,6 +441,7 @@ mod tests {
.add(new_batch.clone())
.write_options(WriteOptions {
lance_write_params: Some(param),
..Default::default()
})
.mode(AddDataMode::Append)
.execute()
@@ -761,4 +762,56 @@ mod tests {
table2.add(struct_batch).execute().await.unwrap();
assert_eq!(table2.count_rows(None).await.unwrap(), 2);
}
#[tokio::test]
async fn test_add_skip_auto_cleanup() {
    // Checks that `WriteOptions::skip_auto_cleanup` reaches the commit path:
    // flagged writes must leave old versions alone, and the first unflagged
    // write must then prune them.
    let tmp_dir = tempfile::tempdir().unwrap();
    let uri = tmp_dir.path().to_str().unwrap();
    let conn = connect(uri).execute().await.unwrap();

    let seed_rows = record_batch!(("id", Int64, [1, 2, 3])).unwrap();
    let table = conn.create_table("t", seed_rows).execute().await.unwrap();

    // Make cleanup as eager as possible: run it on every commit and treat
    // every earlier version as old enough to remove (`older_than = 0s`).
    let eager_cleanup = vec![
        ("lance.auto_cleanup.interval".to_string(), "1".to_string()),
        (
            "lance.auto_cleanup.older_than".to_string(),
            "0s".to_string(),
        ),
    ];
    table
        .as_native()
        .unwrap()
        .update_config(eager_cleanup)
        .await
        .unwrap();

    // Three appends with the skip flag set; these should accumulate versions.
    for offset in 0..3 {
        let flagged_rows = record_batch!(("id", Int64, [10 + offset])).unwrap();
        let skip_cleanup = WriteOptions {
            skip_auto_cleanup: true,
            ..Default::default()
        };
        table
            .add(flagged_rows)
            .write_options(skip_cleanup)
            .execute()
            .await
            .unwrap();
    }
    let versions_before = table.list_versions().await.unwrap().len();

    // One append with default options; the cleanup hook is expected to fire.
    let unflagged_rows = record_batch!(("id", Int64, [42])).unwrap();
    table.add(unflagged_rows).execute().await.unwrap();
    let versions_after = table.list_versions().await.unwrap().len();

    assert!(
        versions_after < versions_before,
        "auto-cleanup should have removed old versions once the skip flag was off \
         (before={versions_before}, after={versions_after})"
    );
}
}

View File

@@ -219,6 +219,7 @@ impl ExecutionPlan for InsertExec {
&& let Some(merged_txn) = merge_transactions(transactions)
{
let new_dataset = CommitBuilder::new(dataset.clone())
.with_skip_auto_cleanup(write_params.skip_auto_cleanup)
.execute(merged_txn)
.await?;
ds_wrapper.update(new_dataset);

View File

@@ -528,6 +528,7 @@ mod tests {
}),
..Default::default()
}),
..Default::default()
})
.execute()
.await
@@ -589,6 +590,7 @@ mod tests {
}),
..Default::default()
}),
..Default::default()
})
.execute()
.await

View File

@@ -55,6 +55,7 @@ pub struct MergeInsertBuilder {
pub(crate) when_not_matched_by_source_delete_filt: Option<String>,
pub(crate) timeout: Option<Duration>,
pub(crate) use_index: bool,
pub(crate) skip_auto_cleanup: bool,
}
impl MergeInsertBuilder {
@@ -69,6 +70,7 @@ impl MergeInsertBuilder {
when_not_matched_by_source_delete_filt: None,
timeout: None,
use_index: true,
skip_auto_cleanup: false,
}
}
@@ -148,6 +150,17 @@ impl MergeInsertBuilder {
self
}
/// Skip the automatic cleanup of old dataset versions that would otherwise
/// run during the merge insert commit.
///
/// The builder starts with this flag set to `false`. When it is `true` at
/// execution time, the setting is forwarded to
/// [`lance::dataset::MergeInsertBuilder::skip_auto_cleanup`] in lance-core;
/// when it is `false`, the underlying lance builder is not touched and keeps
/// its own default.
///
/// Useful for high-frequency writers that want to manage version cleanup
/// themselves, or writers without delete permissions.
///
/// Returns `&mut Self` so the call can be chained with other builder methods.
pub fn skip_auto_cleanup(&mut self, skip: bool) -> &mut Self {
// Only records the preference on this builder.
self.skip_auto_cleanup = skip;
self
}
/// Executes the merge insert operation
///
/// Returns version and statistics about the merge operation including the number of rows
@@ -191,6 +204,9 @@ pub(crate) async fn execute_merge_insert(
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
}
builder.use_index(params.use_index);
if params.skip_auto_cleanup {
builder.skip_auto_cleanup(true);
}
let future = if let Some(timeout) = params.timeout {
let future = builder