Compare commits

..

1 Commits

Author SHA1 Message Date
Lance Release
78cdb12732 Bump version: 0.28.0-beta.3 → 0.28.0-beta.4 2026-04-12 03:57:49 +00:00
82 changed files with 1717 additions and 2581 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.28.0-beta.10"
current_version = "0.28.0-beta.4"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -18,6 +18,6 @@ body:
label: Link
description: >
Provide a link to the existing documentation, if applicable.
placeholder: ex. https://docs.lancedb.com/tables/...
placeholder: ex. https://lancedb.com/docs/tables/...
validations:
required: false

View File

@@ -1,18 +0,0 @@
version: 2
# Scope: the root Cargo workspace, which produces the Rust binaries we
# ship to users (the Node.js and Python native extensions). The
# `rust/lancedb` library crate shares the same lockfile; its consumers
# pick their own dependency versions, but bumping transitive deps here
# keeps the binaries we ship current.
updates:
- package-ecosystem: cargo
directory: /
schedule:
interval: weekly
open-pull-requests-limit: 10
groups:
rust-minor-patch:
update-types:
- minor
- patch

View File

@@ -8,9 +8,6 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions:
contents: read
jobs:
labeler:
permissions:

View File

@@ -19,9 +19,6 @@ on:
paths:
- .github/workflows/java-publish.yml
permissions:
contents: read
jobs:
publish:
name: Build and Publish

View File

@@ -24,9 +24,6 @@ on:
- java/**
- .github/workflows/java.yml
permissions:
contents: read
jobs:
build-java:
runs-on: ubuntu-24.04

View File

@@ -10,10 +10,6 @@ on:
- nodejs/**
- java/**
- .github/workflows/license-header-check.yml
permissions:
contents: read
jobs:
check-licenses:
runs-on: ubuntu-latest

View File

@@ -15,9 +15,6 @@ on:
- .github/workflows/nodejs.yml
- docker-compose.yml
permissions:
contents: read
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

View File

@@ -14,16 +14,10 @@ on:
env:
PIP_EXTRA_INDEX_URL: "https://pypi.fury.io/lance-format/ https://pypi.fury.io/lancedb/"
permissions:
contents: read
jobs:
linux:
name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
timeout-minutes: 60
permissions:
id-token: write
contents: read
strategy:
matrix:
config:
@@ -63,12 +57,10 @@ jobs:
- uses: ./.github/workflows/upload_wheel
if: startsWith(github.ref, 'refs/tags/python-v')
with:
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
fury_token: ${{ secrets.FURY_TOKEN }}
mac:
timeout-minutes: 90
permissions:
id-token: write
contents: read
runs-on: ${{ matrix.config.runner }}
strategy:
matrix:
@@ -93,12 +85,10 @@ jobs:
- uses: ./.github/workflows/upload_wheel
if: startsWith(github.ref, 'refs/tags/python-v')
with:
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
fury_token: ${{ secrets.FURY_TOKEN }}
windows:
timeout-minutes: 60
permissions:
id-token: write
contents: read
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
@@ -117,6 +107,7 @@ jobs:
- uses: ./.github/workflows/upload_wheel
if: startsWith(github.ref, 'refs/tags/python-v')
with:
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
fury_token: ${{ secrets.FURY_TOKEN }}
gh-release:
if: startsWith(github.ref, 'refs/tags/python-v')

View File

@@ -17,9 +17,6 @@ on:
- .github/workflows/build_windows_wheel/**
- .github/workflows/run_tests/**
permissions:
contents: read
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
@@ -111,6 +108,7 @@ jobs:
- name: Install
run: |
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests,dev,embeddings]
pip install tantivy
pip install mlx
- name: Doctest
run: pytest --doctest-modules python/lancedb
@@ -229,5 +227,6 @@ jobs:
pip install "pydantic<2"
pip install pyarrow==16
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
pip install tantivy
- name: Run tests
run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/tests

View File

@@ -9,15 +9,9 @@ on:
- Cargo.toml
- Cargo.lock
- rust-toolchain.toml
- deny.toml
- rust/**
- nodejs/Cargo.toml
- python/Cargo.toml
- .github/workflows/rust.yml
permissions:
contents: read
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
@@ -59,17 +53,6 @@ jobs:
- name: Run clippy (without remote feature)
run: cargo clippy --profile ci --workspace --tests -- -D warnings
deny:
# Supply-chain checks: advisories, licenses, banned crates, and source
# restrictions. Configuration lives in `deny.toml` at the workspace root.
timeout-minutes: 10
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- uses: EmbarkStudios/cargo-deny-action@v2
with:
command: check advisories bans licenses sources
build-no-lock:
runs-on: ubuntu-24.04
timeout-minutes: 30

View File

@@ -3,9 +3,6 @@ name: Update package-lock.json
on:
workflow_dispatch:
permissions:
contents: read
jobs:
publish:
runs-on: ubuntu-latest

View File

@@ -3,9 +3,6 @@ name: Update NodeJs package-lock.json
on:
workflow_dispatch:
permissions:
contents: read
jobs:
publish:
runs-on: ubuntu-latest

View File

@@ -2,6 +2,9 @@ name: upload-wheel
description: "Upload wheels to Pypi"
inputs:
pypi_token:
required: true
description: "release token for the repo"
fury_token:
required: true
description: "release token for the fury repo"
@@ -9,6 +12,12 @@ inputs:
runs:
using: "composite"
steps:
- name: Install dependencies
shell: bash
run: |
python -m pip install --upgrade pip
pip install twine
python3 -m pip install --upgrade pkginfo
- name: Choose repo
shell: bash
id: choose_repo
@@ -18,17 +27,19 @@ runs:
else
echo "repo=pypi" >> $GITHUB_OUTPUT
fi
- name: Publish to Fury
if: steps.choose_repo.outputs.repo == 'fury'
- name: Publish to PyPI
shell: bash
env:
FURY_TOKEN: ${{ inputs.fury_token }}
PYPI_TOKEN: ${{ inputs.pypi_token }}
run: |
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
echo "Uploading $WHEEL to Fury"
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
- name: Publish to PyPI
if: steps.choose_repo.outputs.repo == 'pypi'
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: target/wheels/
if [[ ${{ steps.choose_repo.outputs.repo }} == fury ]]; then
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
echo "Uploading $WHEEL to Fury"
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
else
twine upload --repository ${{ steps.choose_repo.outputs.repo }} \
--username __token__ \
--password $PYPI_TOKEN \
target/wheels/lancedb-*.whl
fi

498
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,7 @@
[workspace]
members = ["rust/lancedb", "nodejs", "python"]
# Python package needs to be built by maturin.
exclude = ["python"]
resolver = "2"
[workspace.package]
@@ -13,20 +15,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=6.0.0-beta.4", default-features = false, "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=6.0.0-beta.4", default-features = false, "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=6.0.0-beta.4", default-features = false, "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=6.0.0-beta.4", "tag" = "v6.0.0-beta.4", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=5.1.0-beta.3", default-features = false, "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=5.1.0-beta.3", default-features = false, "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=5.1.0-beta.3", default-features = false, "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=5.1.0-beta.3", "tag" = "v5.1.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "57.2", optional = false }

View File

@@ -15,7 +15,7 @@
# **The Multimodal AI Lakehouse**
[**How to Install** ](#how-to-install) ✦ [**Detailed Documentation**](https://docs.lancedb.com) ✦ [**Tutorials and Recipes**](https://github.com/lancedb/vectordb-recipes/tree/main) ✦ [**Contributors**](#contributors)
[**How to Install** ](#how-to-install) ✦ [**Detailed Documentation**](https://lancedb.com/docs) ✦ [**Tutorials and Recipes**](https://github.com/lancedb/vectordb-recipes/tree/main) ✦ [**Contributors**](#contributors)
**The ultimate multimodal data platform for AI/ML applications.**
@@ -57,7 +57,7 @@ LanceDB is a central location where developers can build, train and analyze thei
## **How to Install**:
Follow the [Quickstart](https://docs.lancedb.com/quickstart) doc to set up LanceDB locally.
Follow the [Quickstart](https://lancedb.com/docs/quickstart/) doc to set up LanceDB locally.
**API & SDK:** We also support Python, Typescript and Rust SDKs

172
deny.toml
View File

@@ -1,172 +0,0 @@
# cargo-deny configuration for LanceDB.
#
# Run locally with `cargo deny check`. See
# https://embarkstudios.github.io/cargo-deny/ for the full reference.
# The set of target triples we care about. cargo-deny will only consider
# dependencies that are used on at least one of these targets. Keeping this
# explicit avoids noise from platform-specific crates (e.g. wasm, android,
# ios) that we never actually ship.
[graph]
targets = [
"x86_64-unknown-linux-gnu",
"aarch64-unknown-linux-gnu",
"x86_64-apple-darwin",
"aarch64-apple-darwin",
"x86_64-pc-windows-msvc",
"aarch64-pc-windows-msvc",
]
all-features = true
[output]
feature-depth = 1
# ---------------------------------------------------------------------------
# Advisories: security vulnerabilities and yanked crates.
# ---------------------------------------------------------------------------
[advisories]
version = 2
# Fail the check if any crate in the lockfile has been yanked from crates.io.
# Yanked crates are a signal the author retracted the release (often due to
# bugs or security issues) and should not be depended on.
yanked = "deny"
# Advisory IDs we have explicitly reviewed and chosen to accept. Every
# entry must include a rationale and, where possible, an upstream issue
# pointing to a fix. Revisit this list whenever dependencies are updated.
ignore = [
# rsa: Marvin Attack timing side-channel in PKCS#1 v1.5 decryption.
# Reached only through opendal → reqsign → rsa. We do not use RSA
# decryption in LanceDB ourselves; this is dormant in the signing path.
# No fixed release exists upstream as of this writing.
# https://rustsec.org/advisories/RUSTSEC-2023-0071
{ id = "RUSTSEC-2023-0071", reason = "rsa crate via opendal/reqsign; no fixed upstream release" },
# instant: unmaintained. Pulled in via backoff → instant. Upstream
# recommends switching to `web-time`; fix has to come from backoff.
# https://rustsec.org/advisories/RUSTSEC-2024-0384
{ id = "RUSTSEC-2024-0384", reason = "transitive via backoff; waiting on backoff replacement" },
# paste: unmaintained (author archived the repo). Used transitively by
# datafusion and the arrow ecosystem; widespread, no drop-in replacement.
# https://rustsec.org/advisories/RUSTSEC-2024-0436
{ id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" },
# tantivy: segfault on malformed input due to missing bounds check.
# Pulled in via lance for full-text search. We only feed tantivy
# documents we construct ourselves, not attacker-controlled bytes.
# Tracked for a lance dependency bump.
# https://rustsec.org/advisories/RUSTSEC-2025-0003
{ id = "RUSTSEC-2025-0003", reason = "tantivy via lance; inputs are internally produced, not user-supplied bytes" },
# backoff: unmaintained. Reached only via async-openai. Replacement
# requires async-openai to migrate (or us to drop async-openai).
# https://rustsec.org/advisories/RUSTSEC-2025-0012
{ id = "RUSTSEC-2025-0012", reason = "transitive via async-openai; waiting on upstream migration" },
# number_prefix: unmaintained. Transitive via indicatif → hf-hub.
# No security impact, just maintenance status.
# https://rustsec.org/advisories/RUSTSEC-2025-0119
{ id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" },
# rustls-pemfile: unmaintained. Reached from two separate chains:
# rustls-native-certs 0.6 (via hyper-rustls 0.24) and object_store 0.12.
# Both upstream dependencies need to move before we can drop it.
# https://rustsec.org/advisories/RUSTSEC-2025-0134
{ id = "RUSTSEC-2025-0134", reason = "transitive via rustls-native-certs/object_store; waiting on upstream migration" },
# rustls-webpki 0.101.7 (old major line): name-constraint checks for
# URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain
# from aws-smithy-http-client. The 0.103 line we actively use is patched.
# Clearing the 0.101 copy requires the aws-sdk chain to migrate off
# rustls 0.21.
# https://rustsec.org/advisories/RUSTSEC-2026-0098
# https://rustsec.org/advisories/RUSTSEC-2026-0099
{ id = "RUSTSEC-2026-0098", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
{ id = "RUSTSEC-2026-0099", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
# rustls-webpki 0.101.7: reachable panic in CRL parsing. Same legacy
# rustls 0.21 chain from aws-smithy-http-client as above. The 0.103 line
# we actively use is upgraded to 0.103.13 which contains the fix.
# https://rustsec.org/advisories/RUSTSEC-2026-0104
{ id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
]
# ---------------------------------------------------------------------------
# Licenses: only allow licenses we've reviewed as compatible with Apache-2.0.
# ---------------------------------------------------------------------------
[licenses]
version = 2
# SPDX identifiers for licenses that are compatible with our Apache-2.0
# distribution. Additions require legal review.
allow = [
"Apache-2.0",
"Apache-2.0 WITH LLVM-exception",
"MIT",
"BSD-2-Clause",
"BSD-3-Clause",
"ISC",
"Unicode-3.0",
"Unicode-DFS-2016",
"Zlib",
"CC0-1.0",
"MPL-2.0",
"BSL-1.0",
"OpenSSL",
# 0BSD ("BSD Zero Clause") is effectively public domain — no attribution
# required. Pulled in by `mock_instant`.
"0BSD",
# bzip2-1.0.6 is the permissive upstream bzip2 license (BSD-like). Pulled
# in by `libbz2-rs-sys`, the pure-Rust bzip2 implementation.
"bzip2-1.0.6",
# CDLA-Permissive-2.0 is a permissive data license used by `webpki-roots`
# for the Mozilla CA root bundle. Data-only, distribution-compatible.
"CDLA-Permissive-2.0",
]
confidence-threshold = 0.8
# Crates whose license cannot be determined from Cargo metadata but whose
# license we've manually confirmed from upstream. Keep this list minimal.
[[licenses.clarify]]
# polars-arrow-format omits the `license` field in its Cargo.toml, but the
# upstream repo (pola-rs/polars-arrow-format) is dual-licensed Apache-2.0 OR
# MIT. See https://github.com/pola-rs/polars-arrow-format/blob/main/LICENSE
crate = "polars-arrow-format"
expression = "Apache-2.0 OR MIT"
license-files = []
# ---------------------------------------------------------------------------
# Bans: disallow specific crates and flag dependency hygiene issues.
# ---------------------------------------------------------------------------
[bans]
# Warn (not deny) on duplicate versions of the same crate. In a large
# workspace like this one, duplicates are common and often unavoidable
# transitively. We surface them to discourage growth, but don't fail CI.
multiple-versions = "warn"
# Wildcard version requirements (`foo = "*"`) are a footgun — they let any
# future release in without review. Ban them outright.
wildcards = "deny"
# Internal workspace crates reference each other via `path = "..."`, which
# cargo-deny sees as a wildcard version. That's fine for private workspace
# members (not published to crates.io), so allow it specifically for paths.
allow-wildcard-paths = true
# Features that, if enabled, should cause the check to fail.
deny = []
# Crates to skip when checking for duplicate versions.
skip = []
# Similar to `skip`, but also skips the entire transitive subtree.
skip-tree = []
# ---------------------------------------------------------------------------
# Sources: restrict where crates can come from.
# ---------------------------------------------------------------------------
[sources]
# Deny any registry other than the ones explicitly listed below.
unknown-registry = "deny"
# Deny any git dependency whose host isn't in the allow-list below. This
# prevents accidental pulls from arbitrary forks.
unknown-git = "deny"
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
# Lance is developed in a sibling repo and pulled as a git dependency until
# releases are cut to crates.io. Allow that specific host.
allow-git = [
"https://github.com/lance-format/lance",
]

View File

@@ -24,4 +24,4 @@ RUN python --version && \
rustc --version && \
protoc --version
RUN pip install --no-cache-dir lancedb
RUN pip install --no-cache-dir tantivy lancedb

View File

@@ -1,6 +1,6 @@
# LanceDB Documentation
LanceDB docs are available at [docs.lancedb.com](https://docs.lancedb.com).
LanceDB docs are available at [lancedb.com/docs](https://lancedb.com/docs).
The SDK docs are built and deployed automatically by [Github Actions](../.github/workflows/docs.yml)
whenever a commit is pushed to the `main` branch. So it is possible for the docs to show

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId>
<version>0.28.0-beta.10</version>
<version>0.28.0-beta.4</version>
</dependency>
```

View File

@@ -34,7 +34,7 @@ const results = await table.vectorSearch([0.1, 0.3]).limit(20).toArray();
console.log(results);
```
The [quickstart](https://docs.lancedb.com/quickstart/) contains more complete examples.
The [quickstart](https://lancedb.com/docs/quickstart/basic-usage/) contains more complete examples.
## Development

View File

@@ -41,29 +41,6 @@ for testing purposes.
***
### manifestEnabled?
```ts
optional manifestEnabled: boolean;
```
(For LanceDB OSS only): use directory namespace manifests as the source
of truth for table metadata. Existing directory-listed root tables are
migrated into the manifest on access.
***
### namespaceClientProperties?
```ts
optional namespaceClientProperties: Record<string, string>;
```
(For LanceDB OSS only): extra properties for the backing namespace
client used by manifest-enabled native connections.
***
### readConsistencyInterval?
```ts
@@ -112,4 +89,4 @@ optional storageOptions: Record<string, string>;
(For LanceDB OSS only): configuration for object storage.
The available options are described at https://docs.lancedb.com/storage/
The available options are described at https://lancedb.com/docs/storage/

View File

@@ -97,4 +97,4 @@ Configuration for object storage.
Options already set on the connection will be inherited by the table,
but can be overridden here.
The available options are described at https://docs.lancedb.com/storage/
The available options are described at https://lancedb.com/docs/storage/

View File

@@ -42,4 +42,4 @@ Configuration for object storage.
Options already set on the connection will be inherited by the table,
but can be overridden here.
The available options are described at https://docs.lancedb.com/storage/
The available options are described at https://lancedb.com/docs/storage/

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.28.0-beta.10</version>
<version>0.28.0-beta.4</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.28.0-beta.10</version>
<version>0.28.0-beta.4</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>6.0.0-beta.4</lance-core.version>
<lance-core.version>5.1.0-beta.3</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -1,8 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.28.0-beta.10"
publish = false
version = "0.28.0-beta.4"
license.workspace = true
description.workspace = true
repository.workspace = true
@@ -32,8 +31,8 @@ lzma-sys = { version = "0.1", features = ["static"] }
log.workspace = true
# Pin to resolve build failures; update periodically for security patches.
aws-lc-sys = "=0.40.0"
aws-lc-rs = "=1.16.3"
aws-lc-sys = "=0.38.0"
aws-lc-rs = "=1.16.1"
[build-dependencies]
napi-build = "2.3.1"

View File

@@ -30,7 +30,7 @@ const results = await table.vectorSearch([0.1, 0.3]).limit(20).toArray();
console.log(results);
```
The [quickstart](https://docs.lancedb.com/quickstart/) contains more complete examples.
The [quickstart](https://lancedb.com/docs/quickstart/basic-usage/) contains more complete examples.
## Development

View File

@@ -1,8 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import { spawn } from "node:child_process";
import * as path from "node:path";
import { RecordBatch } from "apache-arrow";
import * as tmp from "tmp";
import { Connection, Index, Table, connect, makeArrowTable } from "../lancedb";
@@ -78,91 +76,4 @@ describe("rerankers", function () {
expect(result).toHaveLength(2);
});
it("does not keep process alive after rerank query", async function () {
const script = `
import * as lancedb from "./dist/index.js";
import * as os from "node:os";
import * as path from "node:path";
import * as fs from "node:fs/promises";
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "lancedb-rerank-exit-"));
const db = await lancedb.connect(dir);
const table = await db.createTable("test", [{ text: "hello", vector: [1, 2, 3] }], {
mode: "overwrite",
});
await table.createIndex("text", { config: lancedb.Index.fts() });
await table.waitForIndex(["text_idx"], 30);
const reranker = await lancedb.rerankers.RRFReranker.create();
await table
.query()
.nearestTo([1, 2, 3])
.fullTextSearch("hello")
.rerank(reranker)
.toArray();
table.close();
db.close();
`;
await new Promise<void>((resolve, reject) => {
const child = spawn(
process.execPath,
["--input-type=module", "-e", script],
{
cwd: path.resolve(__dirname, ".."),
stdio: ["ignore", "pipe", "pipe"],
},
);
let stdout = "";
let stderr = "";
child.stdout.on("data", (chunk) => {
stdout += chunk.toString();
});
child.stderr.on("data", (chunk) => {
stderr += chunk.toString();
});
const timeout = setTimeout(() => {
child.kill();
reject(
new Error(
`child process did not exit in time\nstdout:\n${stdout}\nstderr:\n${stderr}`,
),
);
}, 20_000);
child.on("error", (err) => {
clearTimeout(timeout);
reject(err);
});
child.on("exit", (code, signal) => {
clearTimeout(timeout);
if (signal !== null) {
reject(
new Error(
`child process exited with signal ${signal}\nstdout:\n${stdout}\nstderr:\n${stderr}`,
),
);
return;
}
if (code !== 0) {
reject(
new Error(
`child process exited with code ${code}\nstdout:\n${stdout}\nstderr:\n${stderr}`,
),
);
return;
}
resolve();
});
});
});
});

View File

@@ -42,7 +42,7 @@ export interface CreateTableOptions {
* Options already set on the connection will be inherited by the table,
* but can be overridden here.
*
* The available options are described at https://docs.lancedb.com/storage/
* The available options are described at https://lancedb.com/docs/storage/
*/
storageOptions?: Record<string, string>;
@@ -78,7 +78,7 @@ export interface OpenTableOptions {
* Options already set on the connection will be inherited by the table,
* but can be overridden here.
*
* The available options are described at https://docs.lancedb.com/storage/
* The available options are described at https://lancedb.com/docs/storage/
*/
storageOptions?: Record<string, string>;
/**

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.3",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.3",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.28.0-beta.10",
"version": "0.28.0-beta.4",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -67,12 +67,6 @@ impl Connection {
builder = builder.storage_option(key, value);
}
}
if let Some(manifest_enabled) = options.manifest_enabled {
builder = builder.manifest_enabled(manifest_enabled);
}
if let Some(namespace_client_properties) = options.namespace_client_properties {
builder = builder.namespace_client_properties(namespace_client_properties);
}
// Create client config, optionally with header provider
let client_config = options.client_config.unwrap_or_default();

View File

@@ -35,15 +35,8 @@ pub struct ConnectionOptions {
pub read_consistency_interval: Option<f64>,
/// (For LanceDB OSS only): configuration for object storage.
///
/// The available options are described at https://docs.lancedb.com/storage/
/// The available options are described at https://lancedb.com/docs/storage/
pub storage_options: Option<HashMap<String, String>>,
/// (For LanceDB OSS only): use directory namespace manifests as the source
/// of truth for table metadata. Existing directory-listed root tables are
/// migrated into the manifest on access.
pub manifest_enabled: Option<bool>,
/// (For LanceDB OSS only): extra properties for the backing namespace
/// client used by manifest-enabled native connections.
pub namespace_client_properties: Option<HashMap<String, String>>,
/// (For LanceDB OSS only): the session to use for this connection. Holds
/// shared caches and other session-specific state.
pub session: Option<session::Session>,

View File

@@ -18,7 +18,6 @@ type RerankHybridFn = ThreadsafeFunction<
RerankHybridCallbackArgs,
Status,
false,
true,
>;
/// Reranker implementation that "wraps" a NodeJS Reranker implementation.
@@ -33,10 +32,7 @@ impl Reranker {
pub fn new(
rerank_hybrid: Function<RerankHybridCallbackArgs, Promise<Buffer>>,
) -> napi::Result<Self> {
let rerank_hybrid = rerank_hybrid
.build_threadsafe_function()
.weak::<true>()
.build()?;
let rerank_hybrid = rerank_hybrid.build_threadsafe_function().build()?;
Ok(Self { rerank_hybrid })
}
}

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.31.0-beta.10"
current_version = "0.31.0-beta.4"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,7 +1,6 @@
[package]
name = "lancedb-python"
version = "0.31.0-beta.10"
publish = false
version = "0.31.0-beta.4"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -183,6 +183,7 @@
| stack-data | 0.6.3 | MIT License | http://github.com/alexmojaki/stack_data |
| sympy | 1.14.0 | BSD License | https://sympy.org |
| tabulate | 0.9.0 | MIT License | https://github.com/astanin/python-tabulate |
| tantivy | 0.25.1 | UNKNOWN | UNKNOWN |
| threadpoolctl | 3.6.0 | BSD License | https://github.com/joblib/threadpoolctl |
| timm | 1.0.24 | Apache Software License | https://github.com/huggingface/pytorch-image-models |
| tinycss2 | 1.4.0 | BSD License | https://www.courtbouillon.org/tinycss2 |

View File

@@ -57,6 +57,7 @@ tests = [
"duckdb>=0.9.0",
"pytz>=2023.3",
"polars>=0.19, <=1.3.0",
"tantivy>=0.20.0",
"pyarrow-stubs>=16.0",
"pylance>=5.0.0b5",
"requests>=2.31.0",

View File

@@ -73,7 +73,6 @@ def connect(
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
manifest_enabled: bool = False,
namespace_client_impl: Optional[str] = None,
namespace_client_properties: Optional[Dict[str, str]] = None,
namespace_client_pushdown_operations: Optional[List[str]] = None,
@@ -111,11 +110,7 @@ def connect(
default configuration is used.
storage_options: dict, optional
Additional options for the storage backend. See available options at
<https://docs.lancedb.com/storage/>
manifest_enabled : bool, default False
When true for local/native connections, use directory namespace
manifests as the source of truth for table metadata. Existing
directory-listed root tables are migrated into the manifest on access.
<https://lancedb.com/docs/storage/>
session: Session, optional
(For LanceDB OSS only)
A session to use for this connection. Sessions allow you to configure
@@ -163,11 +158,11 @@ def connect(
conn : DBConnection
A connection to a LanceDB database.
"""
if namespace_client_impl is not None:
if namespace_client_properties is None:
if namespace_client_impl is not None or namespace_client_properties is not None:
if namespace_client_impl is None or namespace_client_properties is None:
raise ValueError(
"namespace_client_properties must be provided when "
"namespace_client_impl is set"
"Both namespace_client_impl and "
"namespace_client_properties must be provided"
)
if kwargs:
raise ValueError(f"Unknown keyword arguments: {kwargs}")
@@ -180,12 +175,6 @@ def connect(
namespace_client_pushdown_operations=namespace_client_pushdown_operations,
)
if namespace_client_properties is not None and not manifest_enabled:
raise ValueError(
"namespace_client_impl must be provided when using "
"namespace_client_properties unless manifest_enabled=True"
)
if namespace_client_pushdown_operations is not None:
raise ValueError(
"namespace_client_pushdown_operations is only valid when "
@@ -223,92 +212,9 @@ def connect(
read_consistency_interval=read_consistency_interval,
storage_options=storage_options,
session=session,
manifest_enabled=manifest_enabled,
namespace_client_properties=namespace_client_properties,
)
WORKER_PROPERTY_PREFIX = "_lancedb_worker_"
def _apply_worker_overrides(props: dict[str, str]) -> dict[str, str]:
"""Apply worker property overrides.
Any key starting with ``_lancedb_worker_`` is extracted, the prefix
is stripped, and the resulting key-value pair is put back into the
map (overriding the existing value if present). The original
prefixed key is removed.
"""
worker_keys = [k for k in props if k.startswith(WORKER_PROPERTY_PREFIX)]
if not worker_keys:
return props
result = dict(props)
for key in worker_keys:
value = result.pop(key)
real_key = key[len(WORKER_PROPERTY_PREFIX) :]
result[real_key] = value
return result
def deserialize_conn(
data: str,
*,
for_worker: bool = False,
) -> DBConnection:
"""Reconstruct a DBConnection from a serialized string.
The string must have been produced by
:meth:`DBConnection.serialize`.
Parameters
----------
data : str
String produced by ``serialize()``.
for_worker : bool, default False
When ``True``, any namespace client property whose key starts
with ``_lancedb_worker_`` has that prefix stripped and the
value overrides the corresponding property. For example,
``_lancedb_worker_uri`` replaces ``uri``.
Returns
-------
DBConnection
A new connection matching the serialized state.
"""
import json
parsed = json.loads(data)
connection_type = parsed.get("connection_type")
rci_secs = parsed.get("read_consistency_interval_seconds")
rci = timedelta(seconds=rci_secs) if rci_secs is not None else None
storage_options = parsed.get("storage_options")
if connection_type == "namespace":
props = dict(parsed.get("namespace_client_properties") or {})
if for_worker:
props = _apply_worker_overrides(props)
return connect_namespace(
namespace_client_impl=parsed["namespace_client_impl"],
namespace_client_properties=props,
read_consistency_interval=rci,
storage_options=storage_options,
namespace_client_pushdown_operations=parsed.get(
"namespace_client_pushdown_operations"
),
)
elif connection_type == "local":
return LanceDBConnection(
parsed["uri"],
read_consistency_interval=rci,
storage_options=storage_options,
manifest_enabled=parsed.get("manifest_enabled", False),
namespace_client_properties=parsed.get("namespace_client_properties"),
)
else:
raise ValueError(f"Unknown connection_type: {connection_type}")
async def connect_async(
uri: URI,
*,
@@ -319,8 +225,6 @@ async def connect_async(
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
manifest_enabled: bool = False,
namespace_client_properties: Optional[Dict[str, str]] = None,
) -> AsyncConnection:
"""Connect to a LanceDB database.
@@ -353,20 +257,13 @@ async def connect_async(
default configuration is used.
storage_options: dict, optional
Additional options for the storage backend. See available options at
<https://docs.lancedb.com/storage/>
<https://lancedb.com/docs/storage/>
session: Session, optional
(For LanceDB OSS only)
A session to use for this connection. Sessions allow you to configure
cache sizes for index and metadata caches, which can significantly
impact memory use and performance. They can also be re-used across
multiple connections to share the same cache state.
manifest_enabled : bool, default False
When true for local/native connections, use directory namespace
manifests as the source of truth for table metadata. Existing
directory-listed root tables are migrated into the manifest on access.
namespace_client_properties : dict, optional
Additional directory namespace client properties to use with
``manifest_enabled=True``.
Examples
--------
@@ -409,8 +306,6 @@ async def connect_async(
client_config,
storage_options,
session,
manifest_enabled,
namespace_client_properties,
)
)

View File

@@ -242,8 +242,6 @@ async def connect(
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
storage_options: Optional[Dict[str, str]],
session: Optional[Session],
manifest_enabled: bool = False,
namespace_client_properties: Optional[Dict[str, str]] = None,
) -> Connection: ...
class RecordBatchStream:

View File

@@ -96,7 +96,7 @@ def data_to_reader(
f"Unknown data type {type(data)}. "
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
"pyarrow Table/RecordBatch, or Pydantic models. "
"See https://docs.lancedb.com/tables/ for examples."
"See https://lancedb.com/docs/tables/ for examples."
)

View File

@@ -282,7 +282,7 @@ class DBConnection(EnforceOverrides):
Additional options for the storage backend. Options already set on the
connection will be inherited by the table, but can be overridden here.
See available options at
<https://docs.lancedb.com/storage/>
<https://lancedb.com/docs/storage/>
To enable stable row IDs (row IDs remain stable after compaction,
update, delete, and merges), set `new_table_enable_stable_row_ids`
@@ -433,7 +433,7 @@ class DBConnection(EnforceOverrides):
Additional options for the storage backend. Options already set on the
connection will be inherited by the table, but can be overridden here.
See available options at
<https://docs.lancedb.com/storage/>
<https://lancedb.com/docs/storage/>
Returns
-------
@@ -529,19 +529,6 @@ class DBConnection(EnforceOverrides):
"namespace_client is not supported for this connection type"
)
def serialize(self) -> str:
"""Serialize this connection for reconstruction.
The returned string can be passed to :func:`lancedb.deserialize_conn`
to recreate an equivalent connection, e.g. in a remote worker.
Returns
-------
str
Serialized representation of this connection.
"""
raise NotImplementedError("serialize is not supported for this connection type")
class LanceDBConnection(DBConnection):
"""
@@ -590,16 +577,10 @@ class LanceDBConnection(DBConnection):
read_consistency_interval: Optional[timedelta] = None,
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
manifest_enabled: bool = False,
namespace_client_properties: Optional[Dict[str, str]] = None,
_inner: Optional[LanceDbConnection] = None,
):
self.storage_options = storage_options
self._manifest_enabled = manifest_enabled
self._namespace_client_properties = namespace_client_properties
if _inner is not None:
self._conn = _inner
self._cached_namespace_client = None
return
if not isinstance(uri, Path):
@@ -638,8 +619,6 @@ class LanceDBConnection(DBConnection):
None,
storage_options,
session,
manifest_enabled,
namespace_client_properties,
)
# TODO: It would be nice if we didn't store self.storage_options but it is
@@ -647,8 +626,8 @@ class LanceDBConnection(DBConnection):
# work because some paths like LanceDBConnection.from_inner will lose the
# storage_options. Also, this class really shouldn't be holding any state
# beyond _conn.
self.storage_options = storage_options
self._conn = AsyncConnection(LOOP.run(do_connect()))
self._cached_namespace_client: Optional[LanceNamespace] = None
@property
def read_consistency_interval(self) -> Optional[timedelta]:
@@ -673,24 +652,6 @@ class LanceDBConnection(DBConnection):
val += ")"
return val
@override
def serialize(self) -> str:
import json
rci = self.read_consistency_interval
return json.dumps(
{
"connection_type": "local",
"uri": self.uri,
"storage_options": self.storage_options,
"manifest_enabled": self._manifest_enabled,
"namespace_client_properties": self._namespace_client_properties,
"read_consistency_interval_seconds": (
rci.total_seconds() if rci else None
),
}
)
async def _async_get_table_names(self, start_after: Optional[str], limit: int):
conn = AsyncConnection(await lancedb_connect(self.uri))
return await conn.table_names(start_after=start_after, limit=limit)
@@ -726,10 +687,10 @@ class LanceDBConnection(DBConnection):
"""
if namespace_path is None:
namespace_path = []
return self._namespace_conn().list_namespaces(
namespace_path=namespace_path,
page_token=page_token,
limit=limit,
return LOOP.run(
self._conn.list_namespaces(
namespace_path=namespace_path, page_token=page_token, limit=limit
)
)
@override
@@ -739,10 +700,27 @@ class LanceDBConnection(DBConnection):
mode: Optional[str] = None,
properties: Optional[Dict[str, str]] = None,
) -> CreateNamespaceResponse:
return self._namespace_conn().create_namespace(
namespace_path=namespace_path,
mode=mode,
properties=properties,
"""Create a new namespace.
Parameters
----------
namespace_path: List[str]
The namespace identifier to create.
mode: str, optional
Creation mode - "create" (fail if exists), "exist_ok" (skip if exists),
or "overwrite" (replace if exists). Case insensitive.
properties: Dict[str, str], optional
Properties to set on the namespace.
Returns
-------
CreateNamespaceResponse
Response containing the properties of the created namespace.
"""
return LOOP.run(
self._conn.create_namespace(
namespace_path=namespace_path, mode=mode, properties=properties
)
)
@override
@@ -752,19 +730,46 @@ class LanceDBConnection(DBConnection):
mode: Optional[str] = None,
behavior: Optional[str] = None,
) -> DropNamespaceResponse:
return self._namespace_conn().drop_namespace(
namespace_path=namespace_path,
mode=mode,
behavior=behavior,
"""Drop a namespace.
Parameters
----------
namespace_path: List[str]
The namespace identifier to drop.
mode: str, optional
Whether to skip if not exists ("SKIP") or fail ("FAIL"). Case insensitive.
behavior: str, optional
Whether to restrict drop if not empty ("RESTRICT") or cascade ("CASCADE").
Case insensitive.
Returns
-------
DropNamespaceResponse
Response containing properties and transaction_id if applicable.
"""
return LOOP.run(
self._conn.drop_namespace(
namespace_path=namespace_path, mode=mode, behavior=behavior
)
)
@override
def describe_namespace(
self, namespace_path: List[str]
) -> DescribeNamespaceResponse:
return self._namespace_conn().describe_namespace(
namespace_path=namespace_path,
)
"""Describe a namespace.
Parameters
----------
namespace_path: List[str]
The namespace identifier to describe.
Returns
-------
DescribeNamespaceResponse
Response containing the namespace properties.
"""
return LOOP.run(self._conn.describe_namespace(namespace_path=namespace_path))
@override
def list_tables(
@@ -793,12 +798,6 @@ class LanceDBConnection(DBConnection):
"""
if namespace_path is None:
namespace_path = []
if namespace_path:
return self._namespace_conn().list_tables(
namespace_path=namespace_path,
page_token=page_token,
limit=limit,
)
return LOOP.run(
self._conn.list_tables(
namespace_path=namespace_path, page_token=page_token, limit=limit
@@ -887,22 +886,6 @@ class LanceDBConnection(DBConnection):
raise ValueError("mode must be either 'create' or 'overwrite'")
validate_table_name(name)
if namespace_path:
return self._namespace_conn().create_table(
name,
data=data,
schema=schema,
mode=mode,
exist_ok=exist_ok,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
embedding_functions=embedding_functions,
namespace_path=namespace_path,
storage_options=storage_options,
data_storage_version=data_storage_version,
enable_v2_manifest_paths=enable_v2_manifest_paths,
)
tbl = LanceTable.create(
self,
name,
@@ -918,19 +901,6 @@ class LanceDBConnection(DBConnection):
)
return tbl
def _namespace_conn(self) -> DBConnection:
"""Return a LanceNamespaceDBConnection backed by this connection's
directory namespace. Used to delegate child-namespace operations."""
from lancedb.namespace import LanceNamespaceDBConnection
return LanceNamespaceDBConnection(
self.namespace_client(),
read_consistency_interval=self.read_consistency_interval,
storage_options=self.storage_options,
namespace_client_impl=None,
namespace_client_properties=None,
)
@override
def open_table(
self,
@@ -947,8 +917,7 @@ class LanceDBConnection(DBConnection):
name: str
The name of the table.
namespace_path: List[str], optional
The namespace to open the table from. When non-empty, the
table is resolved through the directory namespace client.
The namespace to open the table from.
Returns
-------
@@ -967,14 +936,6 @@ class LanceDBConnection(DBConnection):
stacklevel=2,
)
if namespace_path:
return self._namespace_conn().open_table(
name,
namespace_path=namespace_path,
storage_options=storage_options,
index_cache_size=index_cache_size,
)
return LanceTable.open(
self,
name,
@@ -1059,9 +1020,6 @@ class LanceDBConnection(DBConnection):
"""
if namespace_path is None:
namespace_path = []
if namespace_path:
self._namespace_conn().drop_table(name, namespace_path=namespace_path)
return
LOOP.run(
self._conn.drop_table(
name, namespace_path=namespace_path, ignore_missing=ignore_missing
@@ -1113,17 +1071,14 @@ class LanceDBConnection(DBConnection):
"""Get the equivalent namespace client for this connection.
Returns a DirectoryNamespace pointing to the same root with the
same storage options. The result is cached for the lifetime of
this connection.
same storage options.
Returns
-------
LanceNamespace
The namespace client for this connection.
"""
if self._cached_namespace_client is None:
self._cached_namespace_client = LOOP.run(self._conn.namespace_client())
return self._cached_namespace_client
return LOOP.run(self._conn.namespace_client())
@deprecation.deprecated(
deprecated_in="0.15.1",
@@ -1398,7 +1353,6 @@ class AsyncConnection(object):
namespace_path: Optional[List[str]] = None,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
location: Optional[str] = None,
namespace_client: Optional[Any] = None,
) -> AsyncTable:
"""Create an [AsyncTable][lancedb.table.AsyncTable] in the database.
@@ -1443,7 +1397,7 @@ class AsyncConnection(object):
Additional options for the storage backend. Options already set on the
connection will be inherited by the table, but can be overridden here.
See available options at
<https://docs.lancedb.com/storage/>
<https://lancedb.com/docs/storage/>
To enable stable row IDs (row IDs remain stable after compaction,
update, delete, and merges), set `new_table_enable_stable_row_ids`
@@ -1596,7 +1550,6 @@ class AsyncConnection(object):
namespace_path=namespace_path,
storage_options=storage_options,
location=location,
namespace_client=namespace_client,
)
else:
data = data_to_reader(data, schema)
@@ -1607,7 +1560,6 @@ class AsyncConnection(object):
namespace_path=namespace_path,
storage_options=storage_options,
location=location,
namespace_client=namespace_client,
)
return AsyncTable(new_table)
@@ -1636,7 +1588,7 @@ class AsyncConnection(object):
Additional options for the storage backend. Options already set on the
connection will be inherited by the table, but can be overridden here.
See available options at
<https://docs.lancedb.com/storage/>
<https://lancedb.com/docs/storage/>
index_cache_size: int, default 256
**Deprecated**: Use session-level cache configuration instead.
Create a Session with custom cache sizes and pass it to lancedb.connect().

View File

@@ -0,0 +1,201 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
"""Full text search index using tantivy-py"""
import os
from typing import List, Tuple, Optional
import pyarrow as pa
try:
import tantivy
except ImportError:
raise ImportError(
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
)
from .table import LanceTable
def create_index(
index_path: str,
text_fields: List[str],
ordering_fields: Optional[List[str]] = None,
tokenizer_name: str = "default",
) -> tantivy.Index:
"""
Create a new Index (not populated)
Parameters
----------
index_path : str
Path to the index directory
text_fields : List[str]
List of text fields to index
ordering_fields: List[str]
List of unsigned type fields to order by at search time
tokenizer_name : str, default "default"
The tokenizer to use
Returns
-------
index : tantivy.Index
The index object (not yet populated)
"""
if ordering_fields is None:
ordering_fields = []
# Declaring our schema.
schema_builder = tantivy.SchemaBuilder()
# special field that we'll populate with row_id
schema_builder.add_integer_field("doc_id", stored=True)
# data fields
for name in text_fields:
schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
if ordering_fields:
for name in ordering_fields:
schema_builder.add_unsigned_field(name, fast=True)
schema = schema_builder.build()
os.makedirs(index_path, exist_ok=True)
index = tantivy.Index(schema, path=index_path)
return index
def populate_index(
index: tantivy.Index,
table: LanceTable,
fields: List[str],
writer_heap_size: Optional[int] = None,
ordering_fields: Optional[List[str]] = None,
) -> int:
"""
Populate an index with data from a LanceTable
Parameters
----------
index : tantivy.Index
The index object
table : LanceTable
The table to index
fields : List[str]
List of fields to index
writer_heap_size : int
The writer heap size in bytes, defaults to 1GB
Returns
-------
int
The number of rows indexed
"""
if ordering_fields is None:
ordering_fields = []
writer_heap_size = writer_heap_size or 1024 * 1024 * 1024
# first check the fields exist and are string or large string type
nested = []
for name in fields:
try:
f = table.schema.field(name) # raises KeyError if not found
except KeyError:
f = resolve_path(table.schema, name)
nested.append(name)
if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
raise TypeError(f"Field {name} is not a string type")
# create a tantivy writer
writer = index.writer(heap_size=writer_heap_size)
# write data into index
dataset = table.to_lance()
row_id = 0
max_nested_level = 0
if len(nested) > 0:
max_nested_level = max([len(name.split(".")) for name in nested])
for b in dataset.to_batches(columns=fields + ordering_fields):
if max_nested_level > 0:
b = pa.Table.from_batches([b])
for _ in range(max_nested_level - 1):
b = b.flatten()
for i in range(b.num_rows):
doc = tantivy.Document()
for name in fields:
value = b[name][i].as_py()
if value is not None:
doc.add_text(name, value)
for name in ordering_fields:
value = b[name][i].as_py()
if value is not None:
doc.add_unsigned(name, value)
if not doc.is_empty:
doc.add_integer("doc_id", row_id)
writer.add_document(doc)
row_id += 1
# commit changes
writer.commit()
return row_id
def resolve_path(schema, field_name: str) -> pa.Field:
"""
Resolve a nested field path to a list of field names
Parameters
----------
field_name : str
The field name to resolve
Returns
-------
List[str]
The resolved path
"""
path = field_name.split(".")
field = schema.field(path.pop(0))
for segment in path:
if pa.types.is_struct(field.type):
field = field.type.field(segment)
else:
raise KeyError(f"field {field_name} not found in schema {schema}")
return field
def search_index(
index: tantivy.Index, query: str, limit: int = 10, ordering_field=None
) -> Tuple[Tuple[int], Tuple[float]]:
"""
Search an index for a query
Parameters
----------
index : tantivy.Index
The index object
query : str
The query string
limit : int
The maximum number of results to return
Returns
-------
ids_and_score: list[tuple[int], tuple[float]]
A tuple of two tuples, the first containing the document ids
and the second containing the scores
"""
searcher = index.searcher()
query = index.parse_query(query)
# get top results
if ordering_field:
results = searcher.search(query, limit, order_by_field=ordering_field)
else:
results = searcher.search(query, limit)
if results.count == 0:
return tuple(), tuple()
return tuple(
zip(
*[
(searcher.doc(doc_address)["doc_id"][0], score)
for score, doc_address in results.hits
]
)
)

View File

@@ -10,6 +10,7 @@ through a namespace abstraction.
from __future__ import annotations
import asyncio
import sys
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
@@ -24,24 +25,7 @@ if TYPE_CHECKING:
from datetime import timedelta
import pyarrow as pa
from lance_namespace_urllib3_client.models.json_arrow_data_type import JsonArrowDataType
from lance_namespace_urllib3_client.models.json_arrow_field import JsonArrowField
from lance_namespace_urllib3_client.models.json_arrow_schema import JsonArrowSchema
from lance_namespace_urllib3_client.models.query_table_request import QueryTableRequest
from lance_namespace_urllib3_client.models.query_table_request_columns import (
QueryTableRequestColumns,
)
from lance_namespace_urllib3_client.models.query_table_request_full_text_query import (
QueryTableRequestFullTextQuery,
)
from lance_namespace_urllib3_client.models.query_table_request_vector import (
QueryTableRequestVector,
)
from lance_namespace_urllib3_client.models.string_fts_query import StringFtsQuery
from lance_namespace.errors import TableNotFoundError
from lancedb._lancedb import connect_namespace_client as _connect_namespace_client
from lancedb.background_loop import LOOP
from lancedb.db import AsyncConnection, DBConnection
from lancedb.db import DBConnection, LanceDBConnection
from lancedb.namespace_utils import (
_normalize_create_namespace_mode,
_normalize_drop_namespace_mode,
@@ -56,11 +40,14 @@ from lance_namespace import (
ListNamespacesResponse,
ListTablesResponse,
ListTablesRequest,
DescribeTableRequest,
DescribeNamespaceRequest,
DropTableRequest,
ListNamespacesRequest,
CreateNamespaceRequest,
DropNamespaceRequest,
DeclareTableRequest,
CreateTableRequest,
)
from lancedb.table import AsyncTable, LanceTable, Table
from lancedb.util import validate_table_name
@@ -69,6 +56,21 @@ from lancedb.pydantic import LanceModel
from lancedb.embeddings import EmbeddingFunctionConfig
from ._lancedb import Session
from lance_namespace_urllib3_client.models.json_arrow_schema import JsonArrowSchema
from lance_namespace_urllib3_client.models.json_arrow_field import JsonArrowField
from lance_namespace_urllib3_client.models.json_arrow_data_type import JsonArrowDataType
from lance_namespace_urllib3_client.models.query_table_request import QueryTableRequest
from lance_namespace_urllib3_client.models.query_table_request_vector import (
QueryTableRequestVector,
)
from lance_namespace_urllib3_client.models.query_table_request_columns import (
QueryTableRequestColumns,
)
from lance_namespace_urllib3_client.models.query_table_request_full_text_query import (
QueryTableRequestFullTextQuery,
)
from lance_namespace_urllib3_client.models.string_fts_query import StringFtsQuery
def _query_to_namespace_request(
table_id: List[str],
@@ -379,8 +381,6 @@ class LanceNamespaceDBConnection(DBConnection):
storage_options: Optional[Dict[str, str]] = None,
session: Optional[Session] = None,
namespace_client_pushdown_operations: Optional[List[str]] = None,
namespace_client_impl: Optional[str] = None,
namespace_client_properties: Optional[Dict[str, str]] = None,
):
"""
Initialize a namespace-based LanceDB connection.
@@ -406,60 +406,12 @@ class LanceNamespaceDBConnection(DBConnection):
namespace.create_table() instead of using declare_table + local write.
Default is None (no pushdown, all operations run locally).
namespace_client_impl : Optional[str]
The namespace implementation name used to create this connection.
Stored for serialization purposes.
namespace_client_properties : Optional[Dict[str, str]]
The namespace properties used to create this connection.
Stored for serialization purposes.
"""
self._namespace_client = namespace_client
self.read_consistency_interval = read_consistency_interval
self.storage_options = storage_options or {}
self.session = session
self._namespace_client_pushdown_operations = set(
namespace_client_pushdown_operations or []
)
self._namespace_client_impl = namespace_client_impl
self._namespace_client_properties = namespace_client_properties
self._inner = AsyncConnection(
_connect_namespace_client(
namespace_client,
read_consistency_interval=(
read_consistency_interval.total_seconds()
if read_consistency_interval is not None
else None
),
storage_options=self.storage_options or None,
session=session,
namespace_client_pushdown_operations=(
list(self._namespace_client_pushdown_operations)
),
namespace_client_impl=namespace_client_impl,
namespace_client_properties=namespace_client_properties,
)
)
@override
def serialize(self) -> str:
import json
return json.dumps(
{
"connection_type": "namespace",
"namespace_client_impl": self._namespace_client_impl,
"namespace_client_properties": self._namespace_client_properties,
"namespace_client_pushdown_operations": sorted(
self._namespace_client_pushdown_operations
),
"storage_options": self.storage_options or None,
"read_consistency_interval_seconds": (
self.read_consistency_interval.total_seconds()
if self.read_consistency_interval
else None
),
}
)
self._pushdown_operations = set(namespace_client_pushdown_operations or [])
@override
def table_names(
@@ -512,10 +464,13 @@ class LanceNamespaceDBConnection(DBConnection):
if mode.lower() not in ["create", "overwrite"]:
raise ValueError("mode must be either 'create' or 'overwrite'")
validate_table_name(name)
async_table = LOOP.run(
self._inner.create_table(
name,
data,
table_id = namespace_path + [name]
if "CreateTable" in self._pushdown_operations:
return self._create_table_server_side(
name=name,
data=data,
schema=schema,
mode=mode,
exist_ok=exist_ok,
@@ -525,15 +480,127 @@ class LanceNamespaceDBConnection(DBConnection):
namespace_path=namespace_path,
storage_options=storage_options,
)
# Local create path: declare_table + local write
# Step 1: Get the table location and storage options from namespace
# In overwrite mode, if table exists, use describe_table to get
# existing location. Otherwise, call create_empty_table to reserve
# a new location
location = None
namespace_storage_options = None
if mode.lower() == "overwrite":
# Try to describe the table first to see if it exists
try:
describe_request = DescribeTableRequest(id=table_id)
describe_response = self._namespace_client.describe_table(
describe_request
)
location = describe_response.location
namespace_storage_options = describe_response.storage_options
except Exception:
# Table doesn't exist, will create a new one below
pass
if location is None:
# Table doesn't exist or mode is "create", reserve a new location
declare_request = DeclareTableRequest(
id=table_id,
location=None,
properties=self.storage_options if self.storage_options else None,
)
declare_response = self._namespace_client.declare_table(declare_request)
if not declare_response.location:
raise ValueError(
"Table location is missing from declare_table response"
)
location = declare_response.location
namespace_storage_options = declare_response.storage_options
# Merge storage options: self.storage_options < user options < namespace options
merged_storage_options = dict(self.storage_options)
if storage_options:
merged_storage_options.update(storage_options)
if namespace_storage_options:
merged_storage_options.update(namespace_storage_options)
# Step 2: Create table using LanceTable.create with the location
# We need a temporary connection for the LanceTable.create method
temp_conn = LanceDBConnection(
location, # Use the actual location as the connection URI
read_consistency_interval=self.read_consistency_interval,
storage_options=merged_storage_options,
session=self.session,
)
return LanceTable(
self,
# Note: storage_options_provider is auto-created in Rust from namespace_client
tbl = LanceTable.create(
temp_conn,
name,
data,
schema,
mode=mode,
exist_ok=exist_ok,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
embedding_functions=embedding_functions,
namespace_path=namespace_path,
storage_options=merged_storage_options,
location=location,
namespace_client=self._namespace_client,
pushdown_operations=self._pushdown_operations,
)
return tbl
def _create_table_server_side(
self,
name: str,
data: Optional[DATA],
schema: Optional[Union[pa.Schema, LanceModel]],
mode: str,
exist_ok: bool,
on_bad_vectors: str,
fill_value: float,
embedding_functions: Optional[List[EmbeddingFunctionConfig]],
namespace_path: Optional[List[str]],
storage_options: Optional[Dict[str, str]],
) -> Table:
"""Create a table using server-side namespace.create_table()."""
if namespace_path is None:
namespace_path = []
table_id = namespace_path + [name]
arrow_ipc_bytes = _data_to_arrow_ipc(
data=data,
schema=schema,
embedding_functions=embedding_functions,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
request = CreateTableRequest(
id=table_id,
mode=_normalize_create_table_mode(mode),
properties=self.storage_options if self.storage_options else None,
)
try:
self._namespace_client.create_table(request, arrow_ipc_bytes)
except Exception as e:
if exist_ok and "already exists" in str(e).lower():
return self.open_table(
name,
namespace_path=namespace_path,
storage_options=storage_options,
)
raise
return self.open_table(
name,
namespace_path=namespace_path,
namespace_client=self._namespace_client,
pushdown_operations=self._namespace_client_pushdown_operations,
_async=async_table,
storage_options=storage_options,
)
@override
@@ -547,28 +614,30 @@ class LanceNamespaceDBConnection(DBConnection):
) -> Table:
if namespace_path is None:
namespace_path = []
try:
async_table = LOOP.run(
self._inner.open_table(
name,
namespace_path=namespace_path,
storage_options=storage_options,
index_cache_size=index_cache_size,
)
)
except RuntimeError as e:
if "Table not found" in str(e):
table_id = namespace_path + [name]
raise TableNotFoundError(f"Table not found: {'$'.join(table_id)}")
raise
table_id = namespace_path + [name]
request = DescribeTableRequest(id=table_id)
response = self._namespace_client.describe_table(request)
return LanceTable(
self,
# Merge storage options: self.storage_options < user options < namespace options
merged_storage_options = dict(self.storage_options)
if storage_options:
merged_storage_options.update(storage_options)
if response.storage_options:
merged_storage_options.update(response.storage_options)
# Pass managed_versioning to avoid redundant describe_table call in Rust.
# Convert None to False since we already have the answer from describe_table.
managed_versioning = response.managed_versioning is True
# Note: storage_options_provider is auto-created in Rust from namespace_client
return self._lance_table_from_uri(
name,
response.location,
namespace_path=namespace_path,
storage_options=merged_storage_options,
index_cache_size=index_cache_size,
namespace_client=self._namespace_client,
pushdown_operations=self._namespace_client_pushdown_operations,
_async=async_table,
managed_versioning=managed_versioning,
)
@override
@@ -792,34 +861,33 @@ class LanceNamespaceDBConnection(DBConnection):
namespace_client: Optional[Any] = None,
managed_versioning: Optional[bool] = None,
) -> LanceTable:
# Open a table directly from the namespace-resolved physical location.
#
# Open the table through the Rust namespace-backed connection. The Rust
# layer keeps the logical namespace path and namespace client intact.
# Open a table directly from a URI using the location parameter
# Note: storage_options should already be merged by the caller
# Note: storage_options_provider is auto-created in Rust from namespace_client
if namespace_path is None:
namespace_path = []
async_table = LOOP.run(
self._inner.open_table(
name,
namespace_path=namespace_path,
storage_options=storage_options,
index_cache_size=index_cache_size,
location=None,
namespace_client=namespace_client,
managed_versioning=managed_versioning,
)
temp_conn = LanceDBConnection(
table_uri, # Use the table location as the connection URI
read_consistency_interval=self.read_consistency_interval,
storage_options=storage_options if storage_options is not None else {},
session=self.session,
)
return LanceTable(
self,
# Open the table using the temporary connection with the location parameter
# Pass namespace_client to enable managed versioning support and auto-create
# storage options provider
# Pass managed_versioning to avoid redundant describe_table call
# Pass pushdown_operations if configured on this connection
return LanceTable.open(
temp_conn,
name,
namespace_path=namespace_path,
storage_options=storage_options,
index_cache_size=index_cache_size,
location=table_uri,
namespace_client=namespace_client,
managed_versioning=managed_versioning,
pushdown_operations=self._namespace_client_pushdown_operations,
_async=async_table,
pushdown_operations=self._pushdown_operations,
)
@override
@@ -883,26 +951,7 @@ class AsyncLanceNamespaceDBConnection:
self.read_consistency_interval = read_consistency_interval
self.storage_options = storage_options or {}
self.session = session
self._namespace_client_pushdown_operations = set(
namespace_client_pushdown_operations or []
)
self._inner = AsyncConnection(
_connect_namespace_client(
namespace_client,
read_consistency_interval=(
read_consistency_interval.total_seconds()
if read_consistency_interval is not None
else None
),
storage_options=self.storage_options or None,
session=session,
namespace_client_pushdown_operations=(
list(self._namespace_client_pushdown_operations)
),
namespace_client_impl=None,
namespace_client_properties=None,
)
)
self._pushdown_operations = set(namespace_client_pushdown_operations or [])
async def table_names(
self,
@@ -954,16 +1003,145 @@ class AsyncLanceNamespaceDBConnection:
if mode.lower() not in ["create", "overwrite"]:
raise ValueError("mode must be either 'create' or 'overwrite'")
validate_table_name(name)
return await self._inner.create_table(
table_id = namespace_path + [name]
if "CreateTable" in self._pushdown_operations:
return await self._create_table_server_side(
name=name,
data=data,
schema=schema,
mode=mode,
exist_ok=exist_ok,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
embedding_functions=embedding_functions,
namespace_path=namespace_path,
storage_options=storage_options,
)
# Local create path: declare_table + local write
# Step 1: Get the table location and storage options from namespace
location = None
namespace_storage_options = None
if mode.lower() == "overwrite":
# Try to describe the table first to see if it exists
try:
describe_request = DescribeTableRequest(id=table_id)
describe_response = self._namespace_client.describe_table(
describe_request
)
location = describe_response.location
namespace_storage_options = describe_response.storage_options
except Exception:
# Table doesn't exist, will create a new one below
pass
if location is None:
# Table doesn't exist or mode is "create", reserve a new location
declare_request = DeclareTableRequest(
id=table_id,
location=None,
properties=self.storage_options if self.storage_options else None,
)
declare_response = self._namespace_client.declare_table(declare_request)
if not declare_response.location:
raise ValueError(
"Table location is missing from declare_table response"
)
location = declare_response.location
namespace_storage_options = declare_response.storage_options
# Merge storage options: self.storage_options < user options < namespace options
merged_storage_options = dict(self.storage_options)
if storage_options:
merged_storage_options.update(storage_options)
if namespace_storage_options:
merged_storage_options.update(namespace_storage_options)
# Step 2: Create table using LanceTable.create with the location
# Run the sync operation in a thread
def _create_table():
temp_conn = LanceDBConnection(
location,
read_consistency_interval=self.read_consistency_interval,
storage_options=merged_storage_options,
session=self.session,
)
# storage_options_provider is auto-created in Rust from namespace_client
return LanceTable.create(
temp_conn,
name,
data,
schema,
mode=mode,
exist_ok=exist_ok,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
embedding_functions=embedding_functions,
namespace_path=namespace_path,
storage_options=merged_storage_options,
location=location,
namespace_client=self._namespace_client,
pushdown_operations=self._pushdown_operations,
)
lance_table = await asyncio.to_thread(_create_table)
# Get the underlying async table from LanceTable
return lance_table._table
async def _create_table_server_side(
self,
name: str,
data: Optional[DATA],
schema: Optional[Union[pa.Schema, LanceModel]],
mode: str,
exist_ok: bool,
on_bad_vectors: str,
fill_value: float,
embedding_functions: Optional[List[EmbeddingFunctionConfig]],
namespace_path: Optional[List[str]],
storage_options: Optional[Dict[str, str]],
) -> AsyncTable:
"""Create a table using server-side namespace.create_table()."""
if namespace_path is None:
namespace_path = []
table_id = namespace_path + [name]
def _prepare_and_create():
arrow_ipc_bytes = _data_to_arrow_ipc(
data=data,
schema=schema,
embedding_functions=embedding_functions,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
request = CreateTableRequest(
id=table_id,
mode=_normalize_create_table_mode(mode),
properties=self.storage_options if self.storage_options else None,
)
self._namespace_client.create_table(request, arrow_ipc_bytes)
try:
await asyncio.to_thread(_prepare_and_create)
except Exception as e:
if exist_ok and "already exists" in str(e).lower():
return await self.open_table(
name,
namespace_path=namespace_path,
storage_options=storage_options,
)
raise
return await self.open_table(
name,
data,
schema=schema,
mode=mode,
exist_ok=exist_ok,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
namespace_path=namespace_path,
embedding_functions=embedding_functions,
storage_options=storage_options,
)
@@ -978,18 +1156,45 @@ class AsyncLanceNamespaceDBConnection:
"""Open an existing table from the namespace."""
if namespace_path is None:
namespace_path = []
try:
return await self._inner.open_table(
table_id = namespace_path + [name]
request = DescribeTableRequest(id=table_id)
response = self._namespace_client.describe_table(request)
# Merge storage options: self.storage_options < user options < namespace options
merged_storage_options = dict(self.storage_options)
if storage_options:
merged_storage_options.update(storage_options)
if response.storage_options:
merged_storage_options.update(response.storage_options)
# Capture managed_versioning from describe response.
# Convert None to False since we already have the answer from describe_table.
managed_versioning = response.managed_versioning is True
# Open table in a thread
# Note: storage_options_provider is auto-created in Rust from namespace_client
def _open_table():
temp_conn = LanceDBConnection(
response.location,
read_consistency_interval=self.read_consistency_interval,
storage_options=merged_storage_options,
session=self.session,
)
return LanceTable.open(
temp_conn,
name,
namespace_path=namespace_path,
storage_options=storage_options,
storage_options=merged_storage_options,
index_cache_size=index_cache_size,
location=response.location,
namespace_client=self._namespace_client,
managed_versioning=managed_versioning,
pushdown_operations=self._pushdown_operations,
)
except RuntimeError as e:
if "Table not found" in str(e):
table_id = namespace_path + [name]
raise TableNotFoundError(f"Table not found: {'$'.join(table_id)}")
raise
lance_table = await asyncio.to_thread(_open_table)
return lance_table._table
async def drop_table(self, name: str, namespace_path: Optional[List[str]] = None):
"""Drop a table from the namespace."""
@@ -1267,8 +1472,6 @@ def connect_namespace(
storage_options=storage_options,
session=session,
namespace_client_pushdown_operations=namespace_client_pushdown_operations,
namespace_client_impl=namespace_client_impl,
namespace_client_properties=namespace_client_properties,
)

View File

@@ -425,35 +425,6 @@ class Permutation:
"""
return Permutation.from_tables(table, None, None)
@classmethod
def from_table(cls, table: LanceTable) -> "_PermutationFromTable":
"""
Create a permutation directly from a base table, with HuggingFace /
PyTorch-style chaining for ``shuffle``, ``filter``, and ``split_*``.
This is a convenience wrapper that hides the two-step
``permutation_builder(table).shuffle().execute()`` /
``Permutation.from_tables(table, perm_tbl)`` dance. The returned object
accumulates builder operations and only materializes the underlying
permutation table on first read (any access of an attribute that is
not a builder operation), so chained calls do not pay an extra
``execute()`` for each step.
After the first read all ``Permutation`` methods (``select_columns``,
``with_format``, ``map``, ``__iter__``, ``fetch``, ``num_rows``, ...)
are forwarded transparently.
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("memory:///")
>>> tbl = db.create_table("tbl", data=[{"x": x} for x in range(100)])
>>> perm = Permutation.from_table(tbl).shuffle(seed=42)
>>> perm.num_rows
100
"""
return _PermutationFromTable(table)
@classmethod
def from_tables(
cls,
@@ -871,85 +842,3 @@ class Permutation:
Repeat the permutation `times` times
"""
raise Exception("with_repeat is not yet implemented")
class _PermutationFromTable:
"""
Result of [Permutation.from_table](#from_table).
Records pending builder operations (``shuffle``, ``filter``, ``split_*``)
and lazily executes them on first read. After materialization all
Permutation reads / transforms (``select_columns``, ``with_format``,
``map``, ``__iter__``, ``fetch``, ``num_rows``, ...) are forwarded to the
underlying [Permutation].
"""
__slots__ = ("_base_table", "_pending_ops", "_materialized")
def __init__(
self,
base_table: LanceTable,
_pending_ops: Optional[list[tuple[str, tuple, dict]]] = None,
):
self._base_table = base_table
self._pending_ops: list[tuple[str, tuple, dict]] = (
list(_pending_ops) if _pending_ops is not None else []
)
self._materialized: Optional[Permutation] = None
def _with_op(
self, name: str, args: tuple, kwargs: dict
) -> "_PermutationFromTable":
return _PermutationFromTable(
self._base_table, _pending_ops=self._pending_ops + [(name, args, kwargs)]
)
def shuffle(self, *args, **kwargs) -> "_PermutationFromTable":
return self._with_op("shuffle", args, kwargs)
def filter(self, *args, **kwargs) -> "_PermutationFromTable":
return self._with_op("filter", args, kwargs)
def split_random(self, *args, **kwargs) -> "_PermutationFromTable":
return self._with_op("split_random", args, kwargs)
def split_sequential(self, *args, **kwargs) -> "_PermutationFromTable":
return self._with_op("split_sequential", args, kwargs)
def split_hash(self, *args, **kwargs) -> "_PermutationFromTable":
return self._with_op("split_hash", args, kwargs)
def split_calculated(self, *args, **kwargs) -> "_PermutationFromTable":
return self._with_op("split_calculated", args, kwargs)
def _materialize(self) -> Permutation:
if self._materialized is None:
if self._pending_ops:
builder = permutation_builder(self._base_table)
for name, args, kwargs in self._pending_ops:
builder = getattr(builder, name)(*args, **kwargs)
perm_tbl = builder.execute()
self._materialized = Permutation.from_tables(
self._base_table, perm_tbl
)
else:
self._materialized = Permutation.identity(self._base_table)
return self._materialized
def __getattr__(self, name: str) -> Any:
# Avoid recursion on dunder/private state.
if name.startswith("_"):
raise AttributeError(name)
return getattr(self._materialize(), name)
def __iter__(self):
return iter(self._materialize())
def __len__(self) -> int:
return len(self._materialize())
def __getitem__(self, index):
return self._materialize()[index]
def __getitems__(self, indices):
return self._materialize().__getitems__(indices)

View File

@@ -25,6 +25,7 @@ import deprecation
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.fs as pa_fs
import pydantic
from lancedb.pydantic import PYDANTIC_VERSION
@@ -1525,7 +1526,9 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
return self._table._output_schema(self.to_query_object())
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
self._table._ensure_no_legacy_fts_index()
path, fs, exist = self._table._get_fts_index_path()
if exist:
return self.tantivy_to_arrow()
query = self._query
if self._phrase_query:
@@ -1549,6 +1552,90 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
):
raise NotImplementedError("to_batches on an FTS query")
def tantivy_to_arrow(self) -> pa.Table:
try:
import tantivy
except ImportError:
raise ImportError(
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
)
from .fts import search_index
# get the index path
path, fs, exist = self._table._get_fts_index_path()
# check if the index exist
if not exist:
raise FileNotFoundError(
"Fts index does not exist. "
"Please first call table.create_fts_index(['<field_names>']) to "
"create the fts index."
)
# Check that we are on local filesystem
if not isinstance(fs, pa_fs.LocalFileSystem):
raise NotImplementedError(
"Tantivy-based full text search "
"is only supported on the local filesystem"
)
# open the index
index = tantivy.Index.open(path)
# get the scores and doc ids
query = self._query
if self._phrase_query:
query = query.replace('"', "'")
query = f'"{query}"'
limit = self._limit if self._limit is not None else 10
row_ids, scores = search_index(
index, query, limit, ordering_field=self.ordering_field_name
)
if len(row_ids) == 0:
empty_schema = pa.schema([pa.field("_score", pa.float32())])
return pa.Table.from_batches([], schema=empty_schema)
scores = pa.array(scores)
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
output_tbl = output_tbl.append_column("_score", scores)
# this needs to match vector search results which are uint64
row_ids = pa.array(row_ids, type=pa.uint64())
if self._where is not None:
tmp_name = "__lancedb__duckdb__indexer__"
output_tbl = output_tbl.append_column(
tmp_name, pa.array(range(len(output_tbl)))
)
try:
# TODO would be great to have Substrait generate pyarrow compute
# expressions or conversely have pyarrow support SQL expressions
# using Substrait
import duckdb
indexer = duckdb.sql(
f"SELECT {tmp_name} FROM output_tbl WHERE {self._where}"
).to_arrow_table()[tmp_name]
output_tbl = output_tbl.take(indexer).drop([tmp_name])
row_ids = row_ids.take(indexer)
except ImportError:
import tempfile
import lance
# TODO Use "memory://" instead once that's supported
with tempfile.TemporaryDirectory() as tmp:
ds = lance.write_dataset(output_tbl, tmp)
output_tbl = ds.to_table(filter=self._where)
indexer = output_tbl[tmp_name]
row_ids = row_ids.take(indexer)
output_tbl = output_tbl.drop([tmp_name])
if self._with_row_id:
output_tbl = output_tbl.append_column("_rowid", row_ids)
if self._reranker is not None:
output_tbl = self._reranker.rerank_fts(self._query, output_tbl)
return output_tbl
def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
"""Rerank the results using the specified reranker.

View File

@@ -191,7 +191,7 @@ def _into_pyarrow_reader(
f"Unknown data type {type(data)}. "
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
"pyarrow Table/RecordBatch, or Pydantic models. "
"See https://docs.lancedb.com/tables/ for examples."
"See https://lancedb.com/docs/tables/ for examples."
)
@@ -943,26 +943,29 @@ class Table(ABC):
Parameters
----------
field_names: str or list of str
The name of the field to index. Native FTS indexes can only be
created on a single field at a time. To search over multiple text
fields, create a separate FTS index for each field.
The name(s) of the field to index.
If ``use_tantivy`` is False (default), only a single field name
(str) is supported. To index multiple fields, create a separate
FTS index for each field.
replace: bool, default False
If True, replace the existing index if it exists. Note that this is
not yet an atomic operation; the index will be temporarily
unavailable while the new index is being created.
writer_heap_size: int, default 1GB
Deprecated legacy Tantivy parameter. Any value other than the
default raises an error.
Only available with use_tantivy=True
ordering_field_names:
Deprecated legacy Tantivy parameter. Setting this raises an error.
A list of unsigned type fields to index to optionally order
results on at search time.
only available with use_tantivy=True
tokenizer_name: str, default "default"
A compatibility alias for native tokenizer configs. Can be "raw",
"default" or the 2 letter language code followed by "_stem". So
for english it would be "en_stem".
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
language code followed by "_stem". So for english it would be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
use_tantivy: bool, default False
Deprecated legacy Tantivy parameter. Setting this to True raises an
error.
If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index.
with_position: bool, default False
Only available with use_tantivy=False
If False, do not store the positions of the terms in the text.
This can reduce the size of the index and improve indexing speed.
But it will raise an exception for phrase queries.
@@ -1743,16 +1746,6 @@ class Table(ABC):
index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
return (path, fs, index_exists)
def _ensure_no_legacy_fts_index(self):
path, _, exists = self._get_fts_index_path()
if exists:
raise ValueError(
"Legacy Tantivy FTS index detected at "
f"{path}. Tantivy-based FTS has been removed. "
"Delete the legacy index and recreate it with "
"table.create_fts_index(...)."
)
@abstractmethod
def uses_v2_manifest_paths(self) -> bool:
"""
@@ -2412,63 +2405,84 @@ class LanceTable(Table):
prefix_only: bool = False,
name: Optional[str] = None,
):
self._ensure_no_legacy_fts_index()
if not use_tantivy:
if not isinstance(field_names, str):
raise ValueError(
"Native FTS indexes can only be created on a single field "
"at a time. To search over multiple text fields, create a "
"separate FTS index for each field."
)
if use_tantivy:
raise ValueError(
"Tantivy-based FTS has been removed. "
"Remove use_tantivy and recreate the index with native FTS."
if tokenizer_name is None:
tokenizer_configs = {
"base_tokenizer": base_tokenizer,
"language": language,
"with_position": with_position,
"max_token_length": max_token_length,
"lower_case": lower_case,
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
"ngram_min_length": ngram_min_length,
"ngram_max_length": ngram_max_length,
"prefix_only": prefix_only,
}
else:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
config = FTS(
**tokenizer_configs,
)
if ordering_field_names is not None:
raise ValueError(
"ordering_field_names was only supported by the removed "
"Tantivy-based FTS implementation."
# delete the existing legacy index if it exists
if replace:
path, fs, exist = self._get_fts_index_path()
if exist:
fs.delete_dir(path)
LOOP.run(
self._table.create_index(
field_names,
replace=replace,
config=config,
name=name,
)
)
if writer_heap_size != 1024 * 1024 * 1024:
raise ValueError(
"writer_heap_size was only supported by the removed "
"Tantivy-based FTS implementation."
)
if not isinstance(field_names, str):
raise ValueError(
"Native FTS indexes can only be created on a single field "
"at a time. To search over multiple text fields, create a "
"separate FTS index for each field."
)
if "." in field_names:
raise ValueError(
"Native FTS indexes can only be created on top-level fields. "
f"Received nested field path: {field_names!r}."
return
from .fts import create_index, populate_index
if isinstance(field_names, str):
field_names = [field_names]
if isinstance(ordering_field_names, str):
ordering_field_names = [ordering_field_names]
path, fs, exist = self._get_fts_index_path()
if exist:
if not replace:
raise ValueError("Index already exists. Use replace=True to overwrite.")
fs.delete_dir(path)
if not isinstance(fs, pa_fs.LocalFileSystem):
raise NotImplementedError(
"Full-text search is only supported on the local filesystem"
)
if tokenizer_name is None:
tokenizer_configs = {
"base_tokenizer": base_tokenizer,
"language": language,
"with_position": with_position,
"max_token_length": max_token_length,
"lower_case": lower_case,
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
"ngram_min_length": ngram_min_length,
"ngram_max_length": ngram_max_length,
"prefix_only": prefix_only,
}
else:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
config = FTS(
**tokenizer_configs,
tokenizer_name = "default"
index = create_index(
path,
field_names,
ordering_fields=ordering_field_names,
tokenizer_name=tokenizer_name,
)
LOOP.run(
self._table.create_index(
field_names,
replace=replace,
config=config,
name=name,
)
populate_index(
index,
self,
field_names,
ordering_fields=ordering_field_names,
writer_heap_size=writer_heap_size,
)
@staticmethod
@@ -2915,7 +2929,6 @@ class LanceTable(Table):
namespace_path=namespace_path,
storage_options=storage_options,
location=location,
namespace_client=namespace_client,
)
)
return self

View File

@@ -180,7 +180,7 @@ def test_fts_fuzzy_query():
),
mode="overwrite",
)
table.create_fts_index("text", replace=True)
table.create_fts_index("text", use_tantivy=False, replace=True)
results = table.search(MatchQuery("foo", "text", fuzziness=1)).to_pandas()
assert len(results) == 4
@@ -230,7 +230,7 @@ def test_fts_boost_query():
),
mode="overwrite",
)
table.create_fts_index("desc", replace=True)
table.create_fts_index("desc", use_tantivy=False, replace=True)
results = table.search(
BoostQuery(
@@ -265,7 +265,7 @@ def test_fts_boolean_query(tmp_path):
],
mode="overwrite",
)
table.create_fts_index("text", replace=True)
table.create_fts_index("text", use_tantivy=False, replace=True)
# SHOULD
results = table.search(
@@ -319,7 +319,9 @@ def test_fts_native():
],
)
table.create_fts_index("text")
# passing `use_tantivy=False` to use lance FTS index
# `use_tantivy=True` by default
table.create_fts_index("text", use_tantivy=False)
table.search("puppy").limit(10).select(["text"]).to_list()
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
# ...
@@ -330,6 +332,7 @@ def test_fts_native():
# --8<-- [start:fts_config_folding]
table.create_fts_index(
"text",
use_tantivy=False,
language="French",
stem=True,
ascii_folding=True,
@@ -343,7 +346,7 @@ def test_fts_native():
table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
# --8<-- [end:fts_postfiltering]
# --8<-- [start:fts_with_position]
table.create_fts_index("text", with_position=True, replace=True)
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
# --8<-- [end:fts_with_position]
# --8<-- [start:fts_incremental_index]
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])

View File

@@ -15,7 +15,8 @@ import pytest
from lancedb.pydantic import LanceModel, Vector
def test_basic(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_basic(tmp_path, use_tantivy):
db = lancedb.connect(tmp_path)
assert db.uri == str(tmp_path)
@@ -48,7 +49,7 @@ def test_basic(tmp_path):
assert len(rs) == 1
assert rs["item"].iloc[0] == "foo"
table.create_fts_index("item")
table.create_fts_index("item", use_tantivy=use_tantivy)
rs = table.search("bar", query_type="fts").to_pandas()
assert len(rs) == 1
assert rs["item"].iloc[0] == "bar"
@@ -896,22 +897,42 @@ def test_bypass_vector_index_sync(tmp_db: lancedb.DBConnection):
def test_local_namespace_operations(tmp_path):
"""Test that local mode namespace operations work via directory namespace."""
"""Test that local mode namespace operations behave as expected."""
# Create a local database connection
db = lancedb.connect(tmp_path)
# Root namespace starts empty
assert db.list_namespaces().namespaces == []
# Test list_namespaces returns empty list for root namespace
namespaces = db.list_namespaces().namespaces
assert namespaces == []
# Create and list child namespace
db.create_namespace(["child"])
assert "child" in db.list_namespaces().namespaces
# Test list_namespaces with non-empty namespace raises NotImplementedError
with pytest.raises(
NotImplementedError,
match="Namespace operations are not supported for listing database",
):
db.list_namespaces(namespace_path=["test"])
# List namespaces under child
assert db.list_namespaces(namespace_path=["child"]).namespaces == []
# Drop namespace
db.drop_namespace(["child"])
assert db.list_namespaces().namespaces == []
def test_local_create_namespace_not_supported(tmp_path):
"""Test that create_namespace is not supported in local mode."""
db = lancedb.connect(tmp_path)
with pytest.raises(
NotImplementedError,
match="Namespace operations are not supported for listing database",
):
db.create_namespace(["test_namespace"])
def test_local_drop_namespace_not_supported(tmp_path):
"""Test that drop_namespace is not supported in local mode."""
db = lancedb.connect(tmp_path)
with pytest.raises(
NotImplementedError,
match="Namespace operations are not supported for listing database",
):
db.drop_namespace(["test_namespace"])
def test_clone_table_latest_version(tmp_path):

View File

@@ -36,6 +36,9 @@ import pytest
import pytest_asyncio
from utils import exception_output
pytest.importorskip("lancedb.fts")
tantivy = pytest.importorskip("tantivy")
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
@@ -141,53 +144,58 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
return table
@pytest.mark.parametrize(
("kwargs", "match"),
[
(
{"use_tantivy": True},
"Tantivy-based FTS has been removed",
),
(
{"ordering_field_names": ["count"]},
"ordering_field_names was only supported",
),
(
{"writer_heap_size": 128},
"writer_heap_size was only supported",
),
],
)
def test_reject_removed_tantivy_parameters(table, kwargs, match):
with pytest.raises(ValueError, match=match):
table.create_fts_index("text", **kwargs)
def test_create_index(tmp_path):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert isinstance(index, tantivy.Index)
assert os.path.exists(str(tmp_path / "index"))
def test_reject_legacy_tantivy_index(table):
path, _, _ = table._get_fts_index_path()
os.makedirs(path, exist_ok=True)
def test_create_index_with_stemming(tmp_path, table):
index = ldb.fts.create_index(
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
)
assert isinstance(index, tantivy.Index)
assert os.path.exists(str(tmp_path / "index"))
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
table.search("puppy").limit(5).to_list()
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
table.create_fts_index("text")
# Check stemming by running tokenizer on non empty table
table.create_fts_index("text", tokenizer_name="en_stem", use_tantivy=True)
@pytest.mark.parametrize("use_tantivy", [True, False])
@pytest.mark.parametrize("with_position", [True, False])
def test_create_inverted_index(table, with_position):
def test_create_inverted_index(table, use_tantivy, with_position):
if use_tantivy and not with_position:
pytest.skip("we don't support building a tantivy index without position")
table.create_fts_index(
"text",
use_tantivy=use_tantivy,
with_position=with_position,
name="custom_fts_index",
)
indices = table.list_indices()
fts_indices = [i for i in indices if i.index_type == "FTS"]
assert any(i.name == "custom_fts_index" for i in fts_indices)
if not use_tantivy:
indices = table.list_indices()
fts_indices = [i for i in indices if i.index_type == "FTS"]
assert any(i.name == "custom_fts_index" for i in fts_indices)
def test_search_fts(table):
table.create_fts_index("text")
def test_populate_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
def test_search_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
ldb.fts.populate_index(index, table, ["text"])
index.reload()
results = ldb.fts.search_index(index, query="puppy", limit=5)
assert len(results) == 2
assert len(results[0]) == 5 # row_ids
assert len(results[1]) == 5 # _score
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_search_fts(table, use_tantivy):
table.create_fts_index("text", use_tantivy=use_tantivy)
results = table.search("puppy").select(["id", "text"]).limit(5).to_list()
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
@@ -196,52 +204,53 @@ def test_search_fts(table):
results = table.search("puppy").select(["id", "text"]).to_list()
assert len(results) == 10
# Test with a query
results = (
table.search(MatchQuery("puppy", "text"))
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
# Test boost query
results = (
table.search(
BoostQuery(
MatchQuery("puppy", "text"),
MatchQuery("runs", "text"),
)
if not use_tantivy:
# Test with a query
results = (
table.search(MatchQuery("puppy", "text"))
.select(["id", "text"])
.limit(5)
.to_list()
)
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
assert len(results) == 5
# Test multi match query
table.create_fts_index("text2")
results = (
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
# Test boost query
results = (
table.search(
BoostQuery(
MatchQuery("puppy", "text"),
MatchQuery("runs", "text"),
)
)
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
# Test boolean query
results = (
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
for r in results:
assert "puppy" in r["text"]
assert "runs" in r["text"]
# Test multi match query
table.create_fts_index("text2", use_tantivy=use_tantivy)
results = (
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
# Test boolean query
results = (
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
.select(["id", "text"])
.limit(5)
.to_list()
)
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
for r in results:
assert "puppy" in r["text"]
assert "runs" in r["text"]
@pytest.mark.asyncio
@@ -309,13 +318,13 @@ async def test_fts_select_async(async_table):
def test_search_fts_phrase_query(table):
table.create_fts_index("text", with_position=False)
table.create_fts_index("text", use_tantivy=False, with_position=False)
try:
phrase_results = table.search('"puppy runs"').limit(100).to_list()
assert False
except Exception:
pass
table.create_fts_index("text", with_position=True, replace=True)
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
results = table.search("puppy").limit(100).to_list()
# Test with quotation marks
@@ -366,8 +375,8 @@ async def test_search_fts_phrase_query_async(async_table):
def test_search_fts_specify_column(table):
table.create_fts_index("text")
table.create_fts_index("text2")
table.create_fts_index("text", use_tantivy=False)
table.create_fts_index("text2", use_tantivy=False)
results = table.search("puppy", fts_columns="text").limit(5).to_list()
assert len(results) == 5
@@ -461,8 +470,42 @@ async def test_search_fts_specify_column_async(async_table):
pass
def test_create_index_from_table(tmp_path, table):
table.create_fts_index("text")
def test_search_ordering_field_index_table(tmp_path, table):
table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
rows = (
table.search("puppy", ordering_field_name="count")
.limit(20)
.select(["text", "count"])
.to_list()
)
for r in rows:
assert "puppy" in r["text"]
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
def test_search_ordering_field_index(tmp_path, table):
index = ldb.fts.create_index(
str(tmp_path / "index"), ["text"], ordering_fields=["count"]
)
ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
index.reload()
results = ldb.fts.search_index(
index, query="puppy", limit=5, ordering_field="count"
)
assert len(results) == 2
assert len(results[0]) == 5 # row_ids
assert len(results[1]) == 5 # _distance
rows = table.to_lance().take(results[0]).to_pylist()
for r in rows:
assert "puppy" in r["text"]
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_create_index_from_table(tmp_path, table, use_tantivy):
table.create_fts_index("text", use_tantivy=use_tantivy)
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
assert len(df) <= 5
assert "text" in df.columns
@@ -482,24 +525,36 @@ def test_create_index_from_table(tmp_path, table):
)
with pytest.raises(Exception, match="already exists"):
table.create_fts_index("text")
table.create_fts_index("text", use_tantivy=use_tantivy)
table.create_fts_index("text", replace=True)
table.create_fts_index("text", replace=True, use_tantivy=use_tantivy)
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
def test_create_index_multiple_columns(tmp_path, table):
with pytest.raises(ValueError, match="Native FTS indexes can only be created"):
table.create_fts_index(["text", "text2"])
table.create_fts_index(["text", "text2"], use_tantivy=True)
df = table.search("puppy").limit(5).to_pandas()
assert len(df) == 5
assert "text" in df.columns
assert "text2" in df.columns
def test_empty_rs(tmp_path, table, mocker):
table.create_fts_index(["text", "text2"], use_tantivy=True)
mocker.patch("lancedb.fts.search_index", return_value=([], []))
df = table.search("puppy").limit(5).to_pandas()
assert len(df) == 0
def test_nested_schema(tmp_path, table):
with pytest.raises(ValueError, match="top-level fields"):
table.create_fts_index("nested.text")
table.create_fts_index("nested.text", use_tantivy=True)
rs = table.search("puppy").limit(5).to_list()
assert len(rs) == 5
def test_search_index_with_filter(table):
table.create_fts_index("text")
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_search_index_with_filter(table, use_tantivy):
table.create_fts_index("text", use_tantivy=use_tantivy)
orig_import = __import__
def import_mock(name, *args):
@@ -529,7 +584,8 @@ def test_search_index_with_filter(table):
assert r["_rowid"] is not None
def test_null_input(table):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_null_input(table, use_tantivy):
table.add(
[
{
@@ -542,13 +598,14 @@ def test_null_input(table):
}
]
)
table.create_fts_index("text")
table.create_fts_index("text", use_tantivy=use_tantivy)
def test_syntax(table):
# https://github.com/lancedb/lancedb/issues/769
table.create_fts_index("text")
table.search("they could have been dogs OR").limit(10).to_list()
table.create_fts_index("text", use_tantivy=True)
with pytest.raises(ValueError, match="Syntax Error"):
table.search("they could have been dogs OR").limit(10).to_list()
# these should work
@@ -559,7 +616,6 @@ def test_syntax(table):
).to_list()
# phrase queries
table.create_fts_index("text", with_position=True, replace=True)
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
table.search('"they could have been dogs OR cats"').limit(10).to_list()
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
@@ -583,7 +639,7 @@ def test_language(mem_db: DBConnection):
table = mem_db.create_table("test", data=data)
with pytest.raises(ValueError) as e:
table.create_fts_index("text", language="klingon")
table.create_fts_index("text", use_tantivy=False, language="klingon")
assert exception_output(e) == (
"ValueError: LanceDB does not support the requested language: 'klingon'\n"
@@ -594,6 +650,7 @@ def test_language(mem_db: DBConnection):
table.create_fts_index(
"text",
use_tantivy=False,
language="French",
stem=True,
ascii_folding=True,
@@ -633,7 +690,7 @@ def test_fts_on_list(mem_db: DBConnection):
}
)
table = mem_db.create_table("test", data=data)
table.create_fts_index("text", with_position=True)
table.create_fts_index("text", use_tantivy=False, with_position=True)
res = table.search("lance").limit(5).to_list()
assert len(res) == 3
@@ -645,7 +702,7 @@ def test_fts_on_list(mem_db: DBConnection):
def test_fts_ngram(mem_db: DBConnection):
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
table = mem_db.create_table("test", data=data)
table.create_fts_index("text", base_tokenizer="ngram")
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
results = table.search("lan", query_type="fts").limit(10).to_list()
assert len(results) == 2
@@ -664,6 +721,7 @@ def test_fts_ngram(mem_db: DBConnection):
# test setting min_ngram_length and prefix_only
table.create_fts_index(
"text",
use_tantivy=False,
base_tokenizer="ngram",
replace=True,
ngram_min_length=2,
@@ -828,7 +886,7 @@ def test_fts_query_to_json():
def test_fts_fast_search(table):
table.create_fts_index("text")
table.create_fts_index("text", use_tantivy=False)
# Insert some unindexed data
table.add(

View File

@@ -28,7 +28,7 @@ def sync_table(tmpdir_factory) -> Table:
}
)
table = db.create_table("test", data)
table.create_fts_index("text", with_position=False)
table.create_fts_index("text", with_position=False, use_tantivy=False)
return table
@@ -192,7 +192,7 @@ def table_with_id(tmpdir_factory) -> Table:
}
)
table = db.create_table("test_with_id", data)
table.create_fts_index("text", with_position=False)
table.create_fts_index("text", with_position=False, use_tantivy=False)
return table

View File

@@ -681,7 +681,7 @@ class TestPushdownOperations:
{"root": self.temp_dir},
namespace_client_pushdown_operations=["QueryTable"],
)
assert "QueryTable" in db._namespace_client_pushdown_operations
assert "QueryTable" in db._pushdown_operations
def test_create_table_pushdown_stored(self):
"""Test that CreateTable pushdown is stored on sync connection."""
@@ -690,7 +690,7 @@ class TestPushdownOperations:
{"root": self.temp_dir},
namespace_client_pushdown_operations=["CreateTable"],
)
assert "CreateTable" in db._namespace_client_pushdown_operations
assert "CreateTable" in db._pushdown_operations
def test_both_pushdowns_stored(self):
"""Test that both pushdown operations can be set together."""
@@ -699,13 +699,13 @@ class TestPushdownOperations:
{"root": self.temp_dir},
namespace_client_pushdown_operations=["QueryTable", "CreateTable"],
)
assert "QueryTable" in db._namespace_client_pushdown_operations
assert "CreateTable" in db._namespace_client_pushdown_operations
assert "QueryTable" in db._pushdown_operations
assert "CreateTable" in db._pushdown_operations
def test_pushdown_defaults_to_empty(self):
"""Test that pushdown operations default to empty."""
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
assert len(db._namespace_client_pushdown_operations) == 0
assert len(db._pushdown_operations) == 0
@pytest.mark.asyncio
@@ -727,7 +727,7 @@ class TestAsyncPushdownOperations:
{"root": self.temp_dir},
namespace_client_pushdown_operations=["QueryTable"],
)
assert "QueryTable" in db._namespace_client_pushdown_operations
assert "QueryTable" in db._pushdown_operations
async def test_async_create_table_pushdown_stored(self):
"""Test that CreateTable pushdown is stored on async connection."""
@@ -736,9 +736,9 @@ class TestAsyncPushdownOperations:
{"root": self.temp_dir},
namespace_client_pushdown_operations=["CreateTable"],
)
assert "CreateTable" in db._namespace_client_pushdown_operations
assert "CreateTable" in db._pushdown_operations
async def test_async_pushdown_defaults_to_empty(self):
"""Test that pushdown operations default to empty on async connection."""
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
assert len(db._namespace_client_pushdown_operations) == 0
assert len(db._pushdown_operations) == 0

View File

@@ -18,9 +18,6 @@ Tests verify:
"""
import copy
import shutil
import sys
import tempfile
import time
import uuid
from typing import Dict, Optional
@@ -390,66 +387,6 @@ def test_namespace_open_table_with_provider(s3_bucket: str, use_custom: bool):
assert get_describe_call_count(inner_ns_client) == describe_count_after_open
@pytest.mark.skipif(
sys.platform == "win32",
reason="TODO: fix schema-only namespace metrics test on Windows",
)
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
def test_namespace_create_schema_only_with_provider(use_custom: bool):
"""
Test creating a schema-only table through namespace with storage options provider.
Verifies:
- declare_table is called once to reserve the location
- describe_table is not needed during create in create mode
- the table can be reopened successfully afterward
- opening the table triggers exactly one describe_table call
"""
temp_dir = tempfile.mkdtemp()
try:
ns_client, inner_ns_client = create_tracking_namespace(
bucket_name=temp_dir,
storage_options={},
credential_expires_in_seconds=3600,
use_custom=use_custom,
)
db = LanceNamespaceDBConnection(ns_client)
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
db.create_namespace([namespace_name])
table_name = f"test_table_{uuid.uuid4().hex}"
namespace_path = [namespace_name]
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("text", pa.string()),
]
)
assert get_declare_call_count(inner_ns_client) == 0
assert get_describe_call_count(inner_ns_client) == 0
table = db.create_table(
table_name, schema=schema, namespace_path=namespace_path
)
assert table.name == table_name
assert table.namespace == namespace_path
assert get_declare_call_count(inner_ns_client) == 1
assert get_describe_call_count(inner_ns_client) == 0
reopened_table = db.open_table(table_name, namespace_path=namespace_path)
assert reopened_table.schema == schema
assert get_declare_call_count(inner_ns_client) == 1
assert get_describe_call_count(inner_ns_client) == 1
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
@pytest.mark.s3_test
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
def test_namespace_credential_refresh_on_read(s3_bucket: str, use_custom: bool):

View File

@@ -1095,60 +1095,3 @@ def test_getitems_invalid_offset(some_permutation: Permutation):
"""Test __getitems__ with an out-of-range offset raises an error."""
with pytest.raises(Exception):
some_permutation.__getitems__([999999])
def test_from_table_identity(mem_db):
"""Permutation.from_table without ops behaves like identity."""
tbl = mem_db.create_table("tbl", pa.table({"x": range(10)}))
perm = Permutation.from_table(tbl)
assert perm.num_rows == 10
assert perm.column_names == ["x"]
def test_from_table_shuffle_seeded(mem_db):
"""from_table().shuffle(seed=...) is reproducible and reorders rows."""
tbl = mem_db.create_table("tbl", pa.table({"x": range(100)}))
perm = Permutation.from_table(tbl).shuffle(seed=42)
rows = [r["x"] for r in perm.__getitems__(list(range(100)))]
assert sorted(rows) == list(range(100))
assert rows != list(range(100))
# Same seed → same order
rows2 = [
r["x"]
for r in Permutation.from_table(tbl)
.shuffle(seed=42)
.__getitems__(list(range(100)))
]
assert rows == rows2
def test_from_table_filter(mem_db):
"""from_table().filter(...) limits the rows."""
tbl = mem_db.create_table("tbl", pa.table({"x": range(100)}))
perm = Permutation.from_table(tbl).filter("x < 25")
assert perm.num_rows == 25
def test_from_table_chained_ops(mem_db):
"""Chained shuffle + filter materializes once."""
tbl = mem_db.create_table("tbl", pa.table({"x": range(100)}))
perm = Permutation.from_table(tbl).filter("x >= 50").shuffle(seed=7)
assert perm.num_rows == 50
rows = [r["x"] for r in perm.__getitems__(list(range(50)))]
assert sorted(rows) == list(range(50, 100))
def test_from_table_forwards_read_methods(mem_db):
"""from_table() result transparently forwards Permutation read methods."""
tbl = mem_db.create_table("tbl", pa.table({"x": range(10), "y": range(10)}))
perm = Permutation.from_table(tbl).select_columns(["x"])
assert perm.column_names == ["x"]
def test_from_table_split_random(mem_db):
"""from_table().split_random(...) returns rows from the first split."""
tbl = mem_db.create_table("tbl", pa.table({"x": range(100)}))
perm = Permutation.from_table(tbl).split_random(ratios=[0.3, 0.7], seed=1)
# Default split is 0 — ratio 0.3 → ~30 rows
assert 25 <= perm.num_rows <= 35

View File

@@ -1385,7 +1385,7 @@ def test_query_timeout(tmp_path):
}
)
table = db.create_table("test", data)
table.create_fts_index("text")
table.create_fts_index("text", use_tantivy=False)
with pytest.raises(Exception, match="Query timeout"):
table.search().where("text = 'a'").to_list(timeout=timedelta(0))

View File

@@ -26,8 +26,11 @@ from lancedb.rerankers import (
)
from lancedb.table import LanceTable
# Tests rely on FTS index
pytest.importorskip("lancedb.fts")
def get_test_table(tmp_path):
def get_test_table(tmp_path, use_tantivy):
db = lancedb.connect(tmp_path)
# Create a LanceDB table schema with a vector and a text column
emb = EmbeddingFunctionRegistry.get_instance().get("test").create()
@@ -95,7 +98,7 @@ def get_test_table(tmp_path):
)
# Create a fts index
table.create_fts_index("text", replace=True)
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
return table, MyTable
@@ -205,8 +208,8 @@ def _run_test_reranker(reranker, table, query, query_vector, schema):
assert len(result) == 20 and result == result_arrow
def _run_test_hybrid_reranker(reranker, tmp_path):
table, schema = get_test_table(tmp_path)
def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
table, schema = get_test_table(tmp_path, use_tantivy)
# The default reranker
result1 = (
table.search(
@@ -282,7 +285,8 @@ def _run_test_hybrid_reranker(reranker, tmp_path):
)
def test_linear_combination(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_linear_combination(tmp_path, use_tantivy):
reranker = LinearCombinationReranker()
vector_results = pa.Table.from_pydict(
@@ -309,20 +313,22 @@ def test_linear_combination(tmp_path):
assert "_score" not in combined_results.column_names
assert "_relevance_score" in combined_results.column_names
_run_test_hybrid_reranker(reranker, tmp_path)
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
def test_rrf_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_rrf_reranker(tmp_path, use_tantivy):
reranker = RRFReranker()
_run_test_hybrid_reranker(reranker, tmp_path)
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
def test_mrr_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_mrr_reranker(tmp_path, use_tantivy):
reranker = MRRReranker()
_run_test_hybrid_reranker(reranker, tmp_path)
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
# Test multi-vector part
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
query = "single player experience"
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
rs2 = (
@@ -357,7 +363,7 @@ def test_rrf_reranker_distance():
table = db.create_table("test", data)
table.create_index(num_partitions=1, num_sub_vectors=2)
table.create_fts_index("text")
table.create_fts_index("text", use_tantivy=False)
reranker = RRFReranker(return_score="all")
@@ -416,31 +422,35 @@ def test_rrf_reranker_distance():
@pytest.mark.skipif(
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
)
def test_cohere_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_cohere_reranker(tmp_path, use_tantivy):
pytest.importorskip("cohere")
reranker = CohereReranker()
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
_run_test_reranker(reranker, table, "single player experience", None, schema)
def test_cross_encoder_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_cross_encoder_reranker(tmp_path, use_tantivy):
pytest.importorskip("sentence_transformers")
reranker = CrossEncoderReranker()
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
_run_test_reranker(reranker, table, "single player experience", None, schema)
def test_colbert_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_colbert_reranker(tmp_path, use_tantivy):
pytest.importorskip("rerankers")
reranker = ColbertReranker()
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
_run_test_reranker(reranker, table, "single player experience", None, schema)
def test_answerdotai_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_answerdotai_reranker(tmp_path, use_tantivy):
pytest.importorskip("rerankers")
reranker = AnswerdotaiRerankers()
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
_run_test_reranker(reranker, table, "single player experience", None, schema)
@@ -449,9 +459,10 @@ def test_answerdotai_reranker(tmp_path):
or os.environ.get("OPENAI_BASE_URL") is not None,
reason="OPENAI_API_KEY not set",
)
def test_openai_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_openai_reranker(tmp_path, use_tantivy):
pytest.importorskip("openai")
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
reranker = OpenaiReranker()
_run_test_reranker(reranker, table, "single player experience", None, schema)
@@ -459,9 +470,10 @@ def test_openai_reranker(tmp_path):
@pytest.mark.skipif(
os.environ.get("JINA_API_KEY") is None, reason="JINA_API_KEY not set"
)
def test_jina_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_jina_reranker(tmp_path, use_tantivy):
pytest.importorskip("jina")
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
reranker = JinaReranker()
_run_test_reranker(reranker, table, "single player experience", None, schema)
@@ -469,10 +481,11 @@ def test_jina_reranker(tmp_path):
@pytest.mark.skipif(
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
)
def test_voyageai_reranker(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_voyageai_reranker(tmp_path, use_tantivy):
pytest.importorskip("voyageai")
reranker = VoyageAIReranker(model_name="rerank-2.5")
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
_run_test_reranker(reranker, table, "single player experience", None, schema)
@@ -491,7 +504,7 @@ def test_empty_result_reranker():
# Create empty table with schema
empty_table = db.create_table("empty_table", schema=schema, mode="overwrite")
empty_table.create_fts_index("text", replace=True)
empty_table.create_fts_index("text", use_tantivy=False, replace=True)
for reranker in [
CrossEncoderReranker(),
# ColbertReranker(),
@@ -590,10 +603,11 @@ def test_empty_hybrid_result_reranker():
assert "_rowid" in result.column_names
def test_cross_encoder_reranker_return_all(tmp_path):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
pytest.importorskip("sentence_transformers")
reranker = CrossEncoderReranker(return_score="all")
table, schema = get_test_table(tmp_path)
table, schema = get_test_table(tmp_path, use_tantivy)
query = "single player experience"
result = (
table.search(query, query_type="hybrid", vector_column_name="vector")

View File

@@ -242,8 +242,8 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
# FTS indices should error since they are not supported yet.
with pytest.raises(
ValueError,
match="Tantivy-based FTS has been removed",
NotImplementedError,
match="Full-text search is only supported on the local filesystem",
):
table.create_fts_index("x", use_tantivy=True)

View File

@@ -3,7 +3,6 @@
import os
import sys
from datetime import date, datetime, timedelta
from time import sleep
from typing import List
@@ -1948,6 +1947,7 @@ def setup_hybrid_search_table(db: DBConnection, embedding_func):
def test_hybrid_search(tmp_db: DBConnection):
# This test uses an FTS index
pytest.importorskip("lancedb.fts")
pytest.importorskip("lance")
table, MyTable, emb = setup_hybrid_search_table(tmp_db, "test")
@@ -2018,6 +2018,7 @@ def test_hybrid_search(tmp_db: DBConnection):
def test_hybrid_search_metric_type(tmp_db: DBConnection):
# This test uses an FTS index
pytest.importorskip("lancedb.fts")
pytest.importorskip("lance")
# Need to use nonnorm as the embedding function so l2 and dot results
@@ -2039,13 +2040,6 @@ def test_hybrid_search_metric_type(tmp_db: DBConnection):
@pytest.mark.parametrize(
"consistency_interval", [None, timedelta(seconds=0), timedelta(seconds=0.1)]
)
@pytest.mark.skipif(
sys.platform == "win32",
reason=(
"TODO: directory namespace is not supported on Windows yet; "
"re-enable after that is fixed."
),
)
def test_consistency(tmp_path, consistency_interval):
db = lancedb.connect(tmp_path)
table = db.create_table("my_table", data=[{"id": 0}])
@@ -2066,6 +2060,7 @@ def test_consistency(tmp_path, consistency_interval):
elif consistency_interval == timedelta(seconds=0):
assert table2.version == table.version
else:
# (consistency_interval == timedelta(seconds=0.1)
assert table2.version == table.version - 1
sleep(0.1)
assert table2.version == table.version

View File

@@ -1,17 +1,11 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use std::{
collections::{HashMap, HashSet},
sync::Arc,
time::Duration,
};
use std::{collections::HashMap, sync::Arc, time::Duration};
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
use lancedb::{
connection::Connection as LanceConnection,
connection::NamespaceClientPushdownOperation,
database::namespace::LanceNamespaceDatabase,
database::{CreateTableMode, Database, ReadConsistency},
};
use pyo3::{
@@ -45,29 +39,6 @@ impl Connection {
}
}
fn parse_namespace_client_pushdown_operations(
operations: Option<Vec<String>>,
) -> PyResult<HashSet<NamespaceClientPushdownOperation>> {
let mut parsed = HashSet::new();
for operation in operations.unwrap_or_default() {
match operation.as_str() {
"QueryTable" => {
parsed.insert(NamespaceClientPushdownOperation::QueryTable);
}
"CreateTable" => {
parsed.insert(NamespaceClientPushdownOperation::CreateTable);
}
_ => {
return Err(PyValueError::new_err(format!(
"Invalid pushdown operation: {}",
operation
)));
}
}
}
Ok(parsed)
}
impl Connection {
fn parse_create_mode_str(mode: &str) -> PyResult<CreateTableMode> {
match mode {
@@ -525,7 +496,7 @@ impl Connection {
}
#[pyfunction]
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None, manifest_enabled=false, namespace_client_properties=None))]
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None))]
#[allow(clippy::too_many_arguments)]
pub fn connect(
py: Python<'_>,
@@ -537,8 +508,6 @@ pub fn connect(
client_config: Option<PyClientConfig>,
storage_options: Option<HashMap<String, String>>,
session: Option<crate::session::Session>,
manifest_enabled: bool,
namespace_client_properties: Option<HashMap<String, String>>,
) -> PyResult<Bound<'_, PyAny>> {
future_into_py(py, async move {
let mut builder = lancedb::connect(&uri);
@@ -558,12 +527,6 @@ pub fn connect(
if let Some(storage_options) = storage_options {
builder = builder.storage_options(storage_options);
}
if manifest_enabled {
builder = builder.manifest_enabled(true);
}
if let Some(namespace_client_properties) = namespace_client_properties {
builder = builder.namespace_client_properties(namespace_client_properties);
}
#[cfg(feature = "remote")]
if let Some(client_config) = client_config {
builder = builder.client_config(client_config.into());
@@ -575,52 +538,6 @@ pub fn connect(
})
}
#[pyfunction]
#[pyo3(signature = (
namespace_client,
read_consistency_interval=None,
storage_options=None,
session=None,
namespace_client_pushdown_operations=None,
namespace_client_impl=None,
namespace_client_properties=None,
))]
#[allow(clippy::too_many_arguments)]
pub fn connect_namespace_client(
py: Python<'_>,
namespace_client: Py<PyAny>,
read_consistency_interval: Option<f64>,
storage_options: Option<HashMap<String, String>>,
session: Option<crate::session::Session>,
namespace_client_pushdown_operations: Option<Vec<String>>,
namespace_client_impl: Option<String>,
namespace_client_properties: Option<HashMap<String, String>>,
) -> PyResult<Connection> {
let namespace_client = extract_namespace_arc(py, namespace_client)?;
let read_consistency_interval = read_consistency_interval.map(Duration::from_secs_f64);
let namespace_client_pushdown_operations =
parse_namespace_client_pushdown_operations(namespace_client_pushdown_operations)?;
let ns_impl = namespace_client_impl.unwrap_or_else(|| "python".to_string());
let ns_properties = namespace_client_properties.unwrap_or_default();
let storage_options = storage_options.unwrap_or_default();
let session = session.map(|s| s.inner.clone());
let database = LanceNamespaceDatabase::from_namespace_client(
namespace_client,
ns_impl,
ns_properties,
storage_options,
read_consistency_interval,
session,
namespace_client_pushdown_operations,
);
Ok(Connection::new(LanceConnection::new(
Arc::new(database),
Arc::new(lancedb::embeddings::MemoryRegistry::new()),
)))
}
#[derive(FromPyObject)]
pub struct PyClientConfig {
user_agent: String,

View File

@@ -2,7 +2,7 @@
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use arrow::RecordBatchStream;
use connection::{Connection, connect, connect_namespace_client};
use connection::{Connection, connect};
use env_logger::Env;
use expr::{PyExpr, expr_col, expr_func, expr_lit};
use index::IndexConfig;
@@ -58,7 +58,6 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyPermutationReader>()?;
m.add_class::<PyExpr>()?;
m.add_function(wrap_pyfunction!(connect, m)?)?;
m.add_function(wrap_pyfunction!(connect_namespace_client, m)?)?;
m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
m.add_function(wrap_pyfunction!(query::fts_query_to_json, m)?)?;

40
python/uv.lock generated
View File

@@ -1996,6 +1996,7 @@ tests = [
{ name = "pytest-mock" },
{ name = "pytz" },
{ name = "requests" },
{ name = "tantivy" },
]
[package.metadata]
@@ -2049,6 +2050,7 @@ requires-dist = [
{ name = "sentence-transformers", marker = "extra == 'embeddings'", specifier = ">=2.2.0" },
{ name = "sentencepiece", marker = "extra == 'embeddings'", specifier = ">=0.1.99" },
{ name = "sentencepiece", marker = "extra == 'siglip'" },
{ name = "tantivy", marker = "extra == 'tests'", specifier = ">=0.20.0" },
{ name = "torch", marker = "extra == 'clip'" },
{ name = "torch", marker = "extra == 'embeddings'", specifier = ">=2.0.0" },
{ name = "torch", marker = "extra == 'siglip'" },
@@ -4777,6 +4779,44 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
]
[[package]]
name = "tantivy"
version = "0.25.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/1b/f9/0cd3955d155d3e3ef74b864769514dd191e5dacba9f0beb7af2d914942ce/tantivy-0.25.1.tar.gz", hash = "sha256:68a3314699a7d18fcf338b52bae8ce46a97dde1128a3e47e33fa4db7f71f265e", size = 75120, upload-time = "2025-12-02T11:57:12.997Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/80/f7/2276bed3bed983ce2970dc70e3571f372587fe4f5f2bac1d6d617df08fa3/tantivy-0.25.1-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:7aa587a3dc9470584cacf5e3640fee93d12ec5f10109669c1f47c4e90820b958", size = 7638510, upload-time = "2025-12-02T11:56:08.754Z" },
{ url = "https://files.pythonhosted.org/packages/20/8c/078dc50570e243414356b05633f52fe544b85179281ffa9f1fe05d76bbd8/tantivy-0.25.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:56d77fe667595693d9fa5f0b4545776d84da9526bab0273b3fc6c7536dc0d8a2", size = 3932659, upload-time = "2025-12-02T11:56:10.621Z" },
{ url = "https://files.pythonhosted.org/packages/bd/dc/281c48436a1e3178b58fe463af314434fe0f3a4ec0c7588a362900e0c69e/tantivy-0.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba8c347cd48595fcaeabb28a909ebce92cf9c5e5c84ab5ba1136a280a307b5c", size = 4197430, upload-time = "2025-12-02T11:56:12.65Z" },
{ url = "https://files.pythonhosted.org/packages/7b/6c/61e6e0b0a350007d10a9b66a35703361d3345e14e7a7cc83494776b2a054/tantivy-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa7c4932e8fde1f09f2d46226060e827e197c2749abdc6129d73a752773adc38", size = 4184055, upload-time = "2025-12-02T11:56:14.647Z" },
{ url = "https://files.pythonhosted.org/packages/5f/fd/0eb059b12f0b6f91623a54a46448a83b7f716d08f3bca68c095d697b85da/tantivy-0.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:afcfc5dbb0bcd5d24531f4471737ae0896f33528426ab0b1dad3e427c19120f6", size = 3424134, upload-time = "2025-12-02T11:56:16.242Z" },
{ url = "https://files.pythonhosted.org/packages/4e/7a/8a277f377e8a151fc0e71d4ffc1114aefb6e5e1c7dd609fed0955cf34ed8/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:d363d7b4207d3a5aa7f0d212420df35bed18bdb6bae26a2a8bd57428388b7c29", size = 7637033, upload-time = "2025-12-02T11:56:18.104Z" },
{ url = "https://files.pythonhosted.org/packages/71/31/8b4acdedfc9f9a2d04b1340d07eef5213d6f151d1e18da0cb423e5f090d2/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8f4389cf1d889a1df7c5a3195806b4b56c37cee10d8a26faaa0dea35a867b5ff", size = 3932180, upload-time = "2025-12-02T11:56:19.833Z" },
{ url = "https://files.pythonhosted.org/packages/2f/dc/3e8499c21b4b9795e8f2fc54c68ce5b92905aaeadadaa56ecfa9180b11b1/tantivy-0.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99864c09fc54652c3c2486cdf13f86cdc8200f4b481569cb291e095ca5d496e5", size = 4197620, upload-time = "2025-12-02T11:56:21.496Z" },
{ url = "https://files.pythonhosted.org/packages/f8/8e/f2ce62fffc811eb62bead92c7b23c2e218f817cbd54c4f3b802e03ba1438/tantivy-0.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05abf37ddbc5063c575548be0d62931629c086bff7a5a1b67cf5a8f5ebf4cd8c", size = 4183794, upload-time = "2025-12-02T11:56:23.215Z" },
{ url = "https://files.pythonhosted.org/packages/de/64/24e2891b0ba3fd9853e10c296095a33b89bf3efd65e29da1ee5dae736040/tantivy-0.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:f307ee8ad21597b0be23af83008fd66cfd5f958cdfa24ec0aaa08a38e86bbef4", size = 3424235, upload-time = "2025-12-02T11:56:25.172Z" },
{ url = "https://files.pythonhosted.org/packages/41/e7/6849c713ed0996c7628324c60512c4882006f0a62145e56c624a93407f90/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:90fd919e5f611809f746560ecf36eb9be824dec62e21ae17a27243759edb9aa1", size = 7621494, upload-time = "2025-12-02T11:56:27.069Z" },
{ url = "https://files.pythonhosted.org/packages/c5/22/c3d8294600dc6e7fa350daef9ff337d3c06e132b81df727de9f7a50c692a/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:4613c7cf6c23f3a97989819690a0f956d799354957de7a204abcc60083cebe02", size = 3925219, upload-time = "2025-12-02T11:56:29.403Z" },
{ url = "https://files.pythonhosted.org/packages/41/fc/cbb1df71dd44c9110eff4eaaeda9d44f2d06182fe0452193be20ddfba93f/tantivy-0.25.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c477bd20b4df804d57dfc5033431bef27cde605695ae141b03abbf6ebc069129", size = 4198699, upload-time = "2025-12-02T11:56:31.359Z" },
{ url = "https://files.pythonhosted.org/packages/47/4d/71abb78b774073c3ce12a4faa4351a9d910a71ffa3659526affba163873d/tantivy-0.25.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9b1a1ba1113c523c7ff7b10f282d6c4074006f7ef8d71e1d973d51bf7291ddb", size = 4183585, upload-time = "2025-12-02T11:56:33.317Z" },
{ url = "https://files.pythonhosted.org/packages/be/16/3f00cd7ec458b92a0e977960af9ddfbeb762127d9acc68da9094a1fda556/tantivy-0.25.1-cp312-cp312-win_amd64.whl", hash = "sha256:9de0bafd3bd7ac9f8f82d53e17562e9db11a5af308fe5185c4bd86feaddbe4a6", size = 3424622, upload-time = "2025-12-02T11:56:34.788Z" },
{ url = "https://files.pythonhosted.org/packages/3d/25/73cfbcf1a8ea49be6c42817431cac46b70a119fe64da903fcc2d92b5b511/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f51ff7196c6f31719202080ed8372d5e3d51e92c749c032fb8234f012e99744c", size = 7622530, upload-time = "2025-12-02T11:56:36.839Z" },
{ url = "https://files.pythonhosted.org/packages/12/c8/c0d7591cdf4f7e7a9fc4da786d1ca8cd1aacffaa2be16ea6d401a8e4a566/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:550e63321bfcacc003859f2fa29c1e8e56450807b3c9a501c1add27cfb9236d9", size = 3925637, upload-time = "2025-12-02T11:56:38.425Z" },
{ url = "https://files.pythonhosted.org/packages/3a/09/bedfc223bffec7641b417dd7ab071134b2ef8f8550e9b1fb6014657ef52e/tantivy-0.25.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fde31cc8d6e122faf7902aeea32bc008a429a6e8904e34d3468126a3ec01b016", size = 4197322, upload-time = "2025-12-02T11:56:40.411Z" },
{ url = "https://files.pythonhosted.org/packages/f5/f1/1fa5183500c8042200c9f2b840d34f5bbcfb434a1ee750e7132262d2a5c9/tantivy-0.25.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b11bd5a518b0be645320b47af8493f6a40c4f3234313e37adcf4534a564d27dd", size = 4183143, upload-time = "2025-12-02T11:56:42.048Z" },
{ url = "https://files.pythonhosted.org/packages/d5/74/a4c4f4eb95888ccb784da3b017aa0625ab1ac411bf5d022a9a797d9a2334/tantivy-0.25.1-cp313-cp313-win_amd64.whl", hash = "sha256:cc7fe88853e06b3251ee4fa42b7a2038727f850c8765bcc8167cfc73585dd24e", size = 3423491, upload-time = "2025-12-02T11:56:43.858Z" },
{ url = "https://files.pythonhosted.org/packages/8b/2f/581519492226f97d23bd0adc95dad991ebeaa73ea6abc8bff389a3096d9a/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:dae99e75b7eaa9bf5bd16ab106b416370f08c135aed0e117d62a3201cd1ffe36", size = 7610316, upload-time = "2025-12-02T11:56:45.927Z" },
{ url = "https://files.pythonhosted.org/packages/91/40/5d7bc315ab9e6a22c5572656e8ada1c836cfa96dccf533377504fbc3c9d9/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:506e9533c5ef4d3df43bad64ffecc0aa97c76e361ea610815dc3a20a9d6b30b3", size = 3919882, upload-time = "2025-12-02T11:56:48.469Z" },
{ url = "https://files.pythonhosted.org/packages/02/b9/e0ef2f57a6a72444cb66c2ffbc310ab33ffaace275f1c4b0319d84ea3f18/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dbd4f8f264dacbcc9dee542832da2173fd53deaaea03f082d95214f8b5ed6bc", size = 4196031, upload-time = "2025-12-02T11:56:50.151Z" },
{ url = "https://files.pythonhosted.org/packages/1e/02/bf3f8cacfd08642e14a73f7956a3fb95d58119132c98c121b9065a1f8615/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:824c643ccb640dd9e35e00c5d5054ddf3323f56fe4219d57d428a9eeea13d22c", size = 4183437, upload-time = "2025-12-02T11:56:51.818Z" },
{ url = "https://files.pythonhosted.org/packages/9c/83/afa90e570198e2d1139dd567bec3c9cf44d8c54f63a649f16d711ede02f5/tantivy-0.25.1-cp313-cp313t-win_amd64.whl", hash = "sha256:09c987b840afcebac817836ac08407eff17272d8aa60ce6e291f89c81830221d", size = 3419409, upload-time = "2025-12-02T11:56:53.451Z" },
{ url = "https://files.pythonhosted.org/packages/ff/44/9f1d67aa5030f7eebc966c863d1316a510a971dd8bb45651df4acdfae9ed/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:7f5d29ae85dd0f23df8d15b3e7b341d4f9eb5a446bbb9640df48ac1f6d9e0c6c", size = 7623723, upload-time = "2025-12-02T11:56:55.066Z" },
{ url = "https://files.pythonhosted.org/packages/db/30/6e085bd3ed9d12da3c91c185854abd70f9dfd35fb36a75ea98428d42c30b/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:f2d2938fb69a74fc1bb36edfaf7f0d1596fa1264db0f377bda2195c58bcb6245", size = 3926243, upload-time = "2025-12-02T11:56:57.058Z" },
{ url = "https://files.pythonhosted.org/packages/32/f5/a00d65433430f51718e5cc6938df571765d7c4e03aedec5aef4ab567aa9b/tantivy-0.25.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f5ff124c4802558e627091e780b362ca944169736caba5a372eef39a79d0ae0", size = 4207186, upload-time = "2025-12-02T11:56:58.803Z" },
{ url = "https://files.pythonhosted.org/packages/19/63/61bdb12fc95f2a7f77bd419a5149bfa9f28caa76cb569bf2b6b06e1d033e/tantivy-0.25.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43b80ef62a340416139c93d19264e5f808da48e04f9305f1092b8ed22be0a5be", size = 4187312, upload-time = "2025-12-02T11:57:00.595Z" },
{ url = "https://files.pythonhosted.org/packages/b7/de/e39c0b01d59019bf5c38face8b81defbc4a68cebf5e0c53bcb2cd715a449/tantivy-0.25.1-cp314-cp314-win_amd64.whl", hash = "sha256:286b654f40c70c1e6b64b9bc7031ed0bf5c440f5bffeaeeee21a0ee6cc39f0e2", size = 3436535, upload-time = "2025-12-02T11:57:02.267Z" },
]
[[package]]
name = "threadpoolctl"
version = "3.6.0"

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.28.0-beta.10"
version = "0.28.0-beta.4"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -111,12 +111,7 @@ default = []
aws = ["lance/aws", "lance-io/aws", "lance-namespace-impls/dir-aws"]
oss = ["lance/oss", "lance-io/oss", "lance-namespace-impls/dir-oss"]
gcs = ["lance/gcp", "lance-io/gcp", "lance-namespace-impls/dir-gcp"]
azure = [
"lance/azure",
"lance-io/azure",
"lance-namespace-impls/dir-azure",
"lance-namespace-impls/credential-vendor-azure",
]
azure = ["lance/azure", "lance-io/azure", "lance-namespace-impls/dir-azure"]
huggingface = [
"lance/huggingface",
"lance-io/huggingface",

View File

@@ -171,7 +171,7 @@ impl OpenTableBuilder {
/// Options already set on the connection will be inherited by the table,
/// but can be overridden here.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
let store_params = self
.request
@@ -188,7 +188,7 @@ impl OpenTableBuilder {
/// Options already set on the connection will be inherited by the table,
/// but can be overridden here.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_options(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
@@ -582,23 +582,6 @@ pub struct ConnectRequest {
/// Database specific options
pub options: HashMap<String, String>,
/// Extra properties for the equivalent namespace client.
///
/// For a local [`ListingDatabase`], these are merged into the backing
/// `DirectoryNamespace` properties. This is useful for namespace-specific
/// settings such as `table_version_tracking_enabled` that are distinct from
/// storage options.
pub namespace_client_properties: HashMap<String, String>,
/// Use directory namespace manifests as the source of truth for native
/// LanceDB table metadata.
///
/// When enabled for a local/native connection, LanceDB returns a
/// namespace-backed database directly. Directory listing fallback remains
/// enabled for migration, and directory-listing-to-manifest migration is
/// forced on.
pub manifest_enabled: bool,
/// The interval at which to check for updates from other processes.
///
/// If None, then consistency is not checked. For performance
@@ -638,8 +621,6 @@ impl ConnectBuilder {
client_config: Default::default(),
read_consistency_interval: None,
options: HashMap::new(),
namespace_client_properties: HashMap::new(),
manifest_enabled: false,
session: None,
},
embedding_registry: None,
@@ -757,7 +738,7 @@ impl ConnectBuilder {
/// Set an option for the storage layer.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
self.request.options.insert(key.into(), value.into());
self
@@ -765,7 +746,7 @@ impl ConnectBuilder {
/// Set multiple options for the storage layer.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_options(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
@@ -776,42 +757,6 @@ impl ConnectBuilder {
self
}
/// Set an additional property for the equivalent namespace client.
pub fn namespace_client_property(
mut self,
key: impl Into<String>,
value: impl Into<String>,
) -> Self {
self.request
.namespace_client_properties
.insert(key.into(), value.into());
self
}
/// Set multiple additional properties for the equivalent namespace client.
pub fn namespace_client_properties(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
) -> Self {
for (key, value) in pairs {
self.request
.namespace_client_properties
.insert(key.into(), value.into());
}
self
}
/// Enable or disable manifest-backed directory namespace mode for local
/// native connections.
///
/// When enabled, the connection uses the directory namespace database
/// directly for all table operations and forces
/// `dir_listing_to_manifest_migration_enabled=true`.
pub fn manifest_enabled(mut self, enabled: bool) -> Self {
self.request.manifest_enabled = enabled;
self
}
/// The interval at which to check for updates from other processes. This
/// only affects LanceDB OSS.
///
@@ -907,16 +852,6 @@ impl ConnectBuilder {
pub async fn execute(self) -> Result<Connection> {
if self.request.uri.starts_with("db") {
self.execute_remote()
} else if self.request.manifest_enabled {
let internal = Arc::new(
ListingDatabase::connect_manifest_enabled_namespace_database(&self.request).await?,
);
Ok(Connection {
internal,
embedding_registry: self
.embedding_registry
.unwrap_or_else(|| Arc::new(MemoryRegistry::new())),
})
} else {
let internal = Arc::new(ListingDatabase::connect_with_options(&self.request).await?);
Ok(Connection {
@@ -946,7 +881,7 @@ use std::collections::HashSet;
/// These operations will be executed on the namespace server instead of locally
/// when enabled via [`ConnectNamespaceBuilder::pushdown_operations`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NamespaceClientPushdownOperation {
pub enum PushdownOperation {
/// Execute queries on the namespace server via `query_table()` instead of locally.
QueryTable,
/// Execute table creation on the namespace server via `create_table()`
@@ -958,11 +893,10 @@ pub struct ConnectNamespaceBuilder {
ns_impl: String,
properties: HashMap<String, String>,
storage_options: HashMap<String, String>,
namespace_client_properties: HashMap<String, String>,
read_consistency_interval: Option<std::time::Duration>,
embedding_registry: Option<Arc<dyn EmbeddingRegistry>>,
session: Option<Arc<lance::session::Session>>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pushdown_operations: HashSet<PushdownOperation>,
}
impl ConnectNamespaceBuilder {
@@ -971,7 +905,6 @@ impl ConnectNamespaceBuilder {
ns_impl: ns_impl.to_string(),
properties,
storage_options: HashMap::new(),
namespace_client_properties: HashMap::new(),
read_consistency_interval: None,
embedding_registry: None,
session: None,
@@ -981,7 +914,7 @@ impl ConnectNamespaceBuilder {
/// Set an option for the storage layer.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
self.storage_options.insert(key.into(), value.into());
self
@@ -989,7 +922,7 @@ impl ConnectNamespaceBuilder {
/// Set multiple options for the storage layer.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_options(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
@@ -1000,29 +933,6 @@ impl ConnectNamespaceBuilder {
self
}
/// Set an additional namespace client property.
pub fn namespace_client_property(
mut self,
key: impl Into<String>,
value: impl Into<String>,
) -> Self {
self.namespace_client_properties
.insert(key.into(), value.into());
self
}
/// Set multiple additional namespace client properties.
pub fn namespace_client_properties(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
) -> Self {
for (key, value) in pairs {
self.namespace_client_properties
.insert(key.into(), value.into());
}
self
}
/// The interval at which to check for updates from other processes.
///
/// If left unset, consistency is not checked. For maximum read
@@ -1060,11 +970,11 @@ impl ConnectNamespaceBuilder {
/// and leveraging server-side compute resources.
///
/// Available operations:
/// - [`NamespaceClientPushdownOperation::QueryTable`]: Execute queries via `namespace.query_table()`
/// - [`NamespaceClientPushdownOperation::CreateTable`]: Execute table creation via `namespace.create_table()`
/// - [`PushdownOperation::QueryTable`]: Execute queries via `namespace.query_table()`
/// - [`PushdownOperation::CreateTable`]: Execute table creation via `namespace.create_table()`
///
/// By default, no operations are pushed down (all executed locally).
pub fn pushdown_operation(mut self, operation: NamespaceClientPushdownOperation) -> Self {
pub fn pushdown_operation(mut self, operation: PushdownOperation) -> Self {
self.pushdown_operations.insert(operation);
self
}
@@ -1074,7 +984,7 @@ impl ConnectNamespaceBuilder {
/// See [`Self::pushdown_operation`] for details.
pub fn pushdown_operations(
mut self,
operations: impl IntoIterator<Item = NamespaceClientPushdownOperation>,
operations: impl IntoIterator<Item = PushdownOperation>,
) -> Self {
self.pushdown_operations.extend(operations);
self
@@ -1084,13 +994,10 @@ impl ConnectNamespaceBuilder {
pub async fn execute(self) -> Result<Connection> {
use crate::database::namespace::LanceNamespaceDatabase;
let mut properties = self.properties;
properties.extend(self.namespace_client_properties);
let internal = Arc::new(
LanceNamespaceDatabase::connect(
&self.ns_impl,
properties,
self.properties,
self.storage_options,
self.read_consistency_interval,
self.session,
@@ -1163,9 +1070,6 @@ mod tests {
use lance_testing::datagen::{BatchGenerator, IncrementingInt32};
use tempfile::tempdir;
use crate::database::listing::{ListingDatabaseOptions, OPT_NEW_TABLE_V2_MANIFEST_PATHS};
use crate::database::namespace::LanceNamespaceDatabase;
use crate::table::NativeTable;
use crate::test_utils::connection::new_test_connection;
use super::*;
@@ -1213,172 +1117,6 @@ mod tests {
assert_eq!(db.uri(), relative_uri.to_str().unwrap().to_string());
}
#[tokio::test]
async fn test_connect_with_namespace_client_properties() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let db = connect(uri)
.namespace_client_property("table_version_tracking_enabled", "true")
.namespace_client_property("manifest_enabled", "true")
.execute()
.await
.unwrap();
let (ns_impl, properties) = db.namespace_client_config().await.unwrap();
assert_eq!(ns_impl, "dir");
assert_eq!(properties.get("root"), Some(&uri.to_string()));
assert_eq!(
properties.get("table_version_tracking_enabled"),
Some(&"true".to_string())
);
assert_eq!(
properties.get("manifest_enabled"),
Some(&"true".to_string())
);
}
#[tokio::test]
async fn test_connect_with_manifest_enabled_uses_directory_namespace() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let db = connect(uri)
.manifest_enabled(true)
.storage_option("timeout", "30s")
.namespace_client_property("manifest_enabled", "false")
.namespace_client_property("dir_listing_to_manifest_migration_enabled", "false")
.execute()
.await
.unwrap();
assert!(
db.database()
.as_any()
.downcast_ref::<LanceNamespaceDatabase>()
.is_some()
);
assert_eq!(db.uri(), uri);
let (ns_impl, properties) = db.namespace_client_config().await.unwrap();
assert_eq!(ns_impl, "dir");
assert_eq!(properties.get("root"), Some(&uri.to_string()));
assert_eq!(
properties.get("manifest_enabled"),
Some(&"true".to_string())
);
assert_eq!(
properties.get("dir_listing_to_manifest_migration_enabled"),
Some(&"true".to_string())
);
assert_eq!(properties.get("storage.timeout"), Some(&"30s".to_string()));
}
#[tokio::test]
async fn test_manifest_enabled_rejects_commit_engine_uri() {
let Err(err) = connect("s3+ddb://bucket/db?ddbTableName=manifest")
.manifest_enabled(true)
.execute()
.await
else {
panic!("expected manifest-enabled s3+ddb connection to fail");
};
assert!(
matches!(err, Error::NotSupported { message } if message.contains("commit engine URI schemes"))
);
let Err(err) = connect("s3://bucket/db?engine=ddb&ddbTableName=manifest")
.manifest_enabled(true)
.execute()
.await
else {
panic!("expected manifest-enabled engine query connection to fail");
};
assert!(
matches!(err, Error::NotSupported { message } if message.contains("commit engine"))
);
}
#[tokio::test]
async fn test_manifest_enabled_connection_migrates_root_listing_table() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
connect(uri)
.execute()
.await
.unwrap()
.create_empty_table("legacy", schema)
.execute()
.await
.unwrap();
let db = connect(uri).manifest_enabled(true).execute().await.unwrap();
let tables = db.table_names().execute().await.unwrap();
assert_eq!(tables, vec!["legacy".to_string()]);
db.open_table("legacy").execute().await.unwrap();
}
#[tokio::test]
async fn test_manifest_enabled_preserves_new_table_options() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let options = ListingDatabaseOptions::builder()
.enable_v2_manifest_paths(true)
.build();
let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
let table = connect(uri)
.manifest_enabled(true)
.database_options(&options)
.execute()
.await
.unwrap()
.create_empty_table("v1_manifest", schema)
.storage_option(OPT_NEW_TABLE_V2_MANIFEST_PATHS, "false")
.execute()
.await
.unwrap();
let native_table = table
.base_table()
.as_any()
.downcast_ref::<NativeTable>()
.unwrap();
assert!(!native_table.uses_v2_manifest_paths().await.unwrap());
}
#[tokio::test]
async fn test_manifest_enabled_vend_input_storage_options() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
let table = connect(uri)
.manifest_enabled(true)
.storage_option("test_storage_option", "test_value")
.namespace_client_property("vend_input_storage_options", "true")
.namespace_client_property(
"vend_input_storage_options_refresh_interval_millis",
"60000",
)
.execute()
.await
.unwrap()
.create_empty_table("vended", schema)
.execute()
.await
.unwrap();
let storage_options = table.latest_storage_options().await.unwrap().unwrap();
assert_eq!(
storage_options.get("test_storage_option"),
Some(&"test_value".to_string())
);
assert!(storage_options.contains_key("expires_at_millis"));
}
#[tokio::test]
async fn test_table_names() {
let tc = new_test_connection().await.unwrap();

View File

@@ -55,7 +55,7 @@ impl CreateTableBuilder {
/// Options already set on the connection will be inherited by the table,
/// but can be overridden here.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
let store_params = self
.request
@@ -73,7 +73,7 @@ impl CreateTableBuilder {
/// Options already set on the connection will be inherited by the table,
/// but can be overridden here.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_options(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,

View File

@@ -20,7 +20,6 @@ use snafu::ResultExt;
use crate::connection::ConnectRequest;
use crate::database::ReadConsistency;
use crate::database::namespace::LanceNamespaceDatabase;
use crate::error::{CreateDirSnafu, Error, Result};
use crate::io::object_store::MirroringObjectStoreWrapper;
use crate::table::NativeTable;
@@ -74,7 +73,7 @@ pub struct ListingDatabaseOptions {
/// These are used to create/list tables and they are inherited by all tables
/// opened by this database.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub storage_options: HashMap<String, String>,
}
@@ -186,7 +185,7 @@ impl ListingDatabaseOptionsBuilder {
/// Set an option for the storage layer.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
self.options
.storage_options
@@ -196,7 +195,7 @@ impl ListingDatabaseOptionsBuilder {
/// Set multiple options for the storage layer.
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
pub fn storage_options(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
@@ -256,9 +255,6 @@ pub struct ListingDatabase {
// Session for object stores and caching
session: Arc<lance::session::Session>,
// Namespace-backed database for child namespace operations
namespace_database: Arc<LanceNamespaceDatabase>,
}
impl std::fmt::Display for ListingDatabase {
@@ -285,175 +281,6 @@ const MIRRORED_STORE: &str = "mirroredStore";
/// A connection to LanceDB
impl ListingDatabase {
pub(crate) fn build_namespace_client_properties(
uri: &str,
storage_options: &HashMap<String, String>,
namespace_client_properties: HashMap<String, String>,
) -> HashMap<String, String> {
let mut properties = namespace_client_properties;
properties.insert("root".to_string(), uri.to_string());
for (key, value) in storage_options {
properties.insert(format!("storage.{}", key), value.clone());
}
properties
}
pub(crate) fn build_manifest_enabled_namespace_client_properties(
uri: &str,
storage_options: &HashMap<String, String>,
namespace_client_properties: HashMap<String, String>,
) -> HashMap<String, String> {
let mut properties = Self::build_namespace_client_properties(
uri,
storage_options,
namespace_client_properties,
);
properties.insert("manifest_enabled".to_string(), "true".to_string());
properties.insert(
"dir_listing_to_manifest_migration_enabled".to_string(),
"true".to_string(),
);
properties
}
async fn connect_namespace_database(
uri: &str,
storage_options: HashMap<String, String>,
namespace_client_properties: HashMap<String, String>,
read_consistency_interval: Option<std::time::Duration>,
session: Arc<lance::session::Session>,
) -> Result<Arc<LanceNamespaceDatabase>> {
let ns_properties = Self::build_namespace_client_properties(
uri,
&storage_options,
namespace_client_properties,
);
Ok(Arc::new(
LanceNamespaceDatabase::connect(
"dir",
ns_properties,
storage_options,
read_consistency_interval,
Some(session),
HashSet::new(),
)
.await?,
))
}
async fn prepare_namespace_root(
uri: &str,
storage_options: &HashMap<String, String>,
session: Arc<lance::session::Session>,
) -> Result<String> {
match url::Url::parse(uri) {
Ok(url) if url.scheme().len() == 1 && cfg!(windows) => {
let (object_store, _) = ObjectStore::from_uri_and_params(
session.store_registry(),
uri,
&ObjectStoreParams::default(),
)
.await?;
if object_store.is_local() {
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
}
Ok(uri.to_string())
}
Ok(mut url) => {
if url.scheme().contains('+') {
return Err(Error::NotSupported {
message: "commit engine URI schemes are not supported for manifest-enabled namespace connections".to_string(),
});
}
for (key, value) in url.query_pairs() {
if key == ENGINE {
return Err(Error::NotSupported {
message: format!(
"commit engine '{}' is not supported for manifest-enabled namespace connections",
value
),
});
} else if key == MIRRORED_STORE {
return Err(Error::NotSupported {
message: "mirrored store is not supported for manifest-enabled namespace connections"
.to_string(),
});
}
}
url.set_query(None);
let plain_uri = url.to_string();
let os_params = ObjectStoreParams {
storage_options_accessor: if storage_options.is_empty() {
None
} else {
Some(Arc::new(StorageOptionsAccessor::with_static_options(
storage_options.clone(),
)))
},
..Default::default()
};
let (object_store, _) = ObjectStore::from_uri_and_params(
session.store_registry(),
&plain_uri,
&os_params,
)
.await?;
if object_store.is_local() {
Self::try_create_dir(&plain_uri).context(CreateDirSnafu {
path: plain_uri.clone(),
})?;
}
Ok(plain_uri)
}
Err(_) => {
let (object_store, _) = ObjectStore::from_uri_and_params(
session.store_registry(),
uri,
&ObjectStoreParams::default(),
)
.await?;
if object_store.is_local() {
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
}
Ok(uri.to_string())
}
}
}
pub(crate) async fn connect_manifest_enabled_namespace_database(
request: &ConnectRequest,
) -> Result<LanceNamespaceDatabase> {
let options = ListingDatabaseOptions::parse_from_map(&request.options)?;
let session = request
.session
.clone()
.unwrap_or_else(|| Arc::new(lance::session::Session::default()));
let namespace_root =
Self::prepare_namespace_root(&request.uri, &options.storage_options, session.clone())
.await?;
let ns_properties = Self::build_manifest_enabled_namespace_client_properties(
&namespace_root,
&options.storage_options,
request.namespace_client_properties.clone(),
);
LanceNamespaceDatabase::connect_with_new_table_config(
"dir",
ns_properties,
options.storage_options,
request.read_consistency_interval,
Some(session),
HashSet::new(),
options.new_table_config,
)
.await
.map(|db| db.with_uri(request.uri.clone()))
}
/// Connect to a listing database
///
/// The URI should be a path to a directory where the tables are stored.
@@ -473,7 +300,6 @@ impl ListingDatabase {
uri,
request.read_consistency_interval,
options.new_table_config,
request.namespace_client_properties.clone(),
request.session.clone(),
)
.await
@@ -561,15 +387,6 @@ impl ListingDatabase {
None => None,
};
let namespace_database = Self::connect_namespace_database(
&table_base_uri,
options.storage_options.clone(),
request.namespace_client_properties.clone(),
request.read_consistency_interval,
session.clone(),
)
.await?;
Ok(Self {
uri: table_base_uri,
query_string,
@@ -581,7 +398,6 @@ impl ListingDatabase {
storage_options_provider: None,
new_table_config: options.new_table_config,
session,
namespace_database,
})
}
Err(_) => {
@@ -589,7 +405,6 @@ impl ListingDatabase {
uri,
request.read_consistency_interval,
options.new_table_config,
request.namespace_client_properties.clone(),
request.session.clone(),
)
.await
@@ -601,7 +416,6 @@ impl ListingDatabase {
path: &str,
read_consistency_interval: Option<std::time::Duration>,
new_table_config: NewTableConfig,
namespace_client_properties: HashMap<String, String>,
session: Option<Arc<lance::session::Session>>,
) -> Result<Self> {
let session = session.unwrap_or_else(|| Arc::new(lance::session::Session::default()));
@@ -615,15 +429,6 @@ impl ListingDatabase {
Self::try_create_dir(path).context(CreateDirSnafu { path })?;
}
let namespace_database = Self::connect_namespace_database(
path,
HashMap::new(),
namespace_client_properties,
read_consistency_interval,
session.clone(),
)
.await?;
Ok(Self {
uri: path.to_string(),
query_string: None,
@@ -635,7 +440,6 @@ impl ListingDatabase {
storage_options_provider: None,
new_table_config,
session,
namespace_database,
})
}
@@ -693,10 +497,6 @@ impl ListingDatabase {
Ok(uri)
}
fn namespace_database(&self) -> Arc<LanceNamespaceDatabase> {
self.namespace_database.clone()
}
async fn drop_tables(&self, names: Vec<String>) -> Result<()> {
let object_store_params = ObjectStoreParams {
storage_options_accessor: if self.storage_options.is_empty() {
@@ -821,12 +621,15 @@ impl ListingDatabase {
store_params.storage_options_accessor = Some(Arc::new(accessor));
}
write_params.data_storage_version = storage_version_override
.or(write_params.data_storage_version)
.or(self.new_table_config.data_storage_version);
write_params.data_storage_version = self
.new_table_config
.data_storage_version
.or(storage_version_override);
if let Some(enable_v2_manifest_paths) =
v2_manifest_override.or(self.new_table_config.enable_v2_manifest_paths)
if let Some(enable_v2_manifest_paths) = self
.new_table_config
.enable_v2_manifest_paths
.or(v2_manifest_override)
{
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
}
@@ -893,7 +696,16 @@ impl Database for ListingDatabase {
&self,
request: ListNamespacesRequest,
) -> Result<ListNamespacesResponse> {
self.namespace_database().list_namespaces(request).await
if request.id.as_ref().map(|v| !v.is_empty()).unwrap_or(false) {
return Err(Error::NotSupported {
message: "Namespace operations are not supported for listing database".into(),
});
}
Ok(ListNamespacesResponse {
namespaces: Vec::new(),
page_token: None,
})
}
fn uri(&self) -> &str {
@@ -914,26 +726,36 @@ impl Database for ListingDatabase {
async fn create_namespace(
&self,
request: CreateNamespaceRequest,
_request: CreateNamespaceRequest,
) -> Result<CreateNamespaceResponse> {
self.namespace_database().create_namespace(request).await
Err(Error::NotSupported {
message: "Namespace operations are not supported for listing database".into(),
})
}
async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result<DropNamespaceResponse> {
self.namespace_database().drop_namespace(request).await
async fn drop_namespace(
&self,
_request: DropNamespaceRequest,
) -> Result<DropNamespaceResponse> {
Err(Error::NotSupported {
message: "Namespace operations are not supported for listing database".into(),
})
}
async fn describe_namespace(
&self,
request: DescribeNamespaceRequest,
_request: DescribeNamespaceRequest,
) -> Result<DescribeNamespaceResponse> {
self.namespace_database().describe_namespace(request).await
Err(Error::NotSupported {
message: "Namespace operations are not supported for listing database".into(),
})
}
#[allow(deprecated)]
async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>> {
if !request.namespace_path.is_empty() {
return self.namespace_database().table_names(request).await;
return Err(Error::NotSupported {
message: "Namespace parameter is not supported for listing database. Only root namespace is supported.".into(),
});
}
let mut f = self
.object_store
@@ -966,7 +788,9 @@ impl Database for ListingDatabase {
async fn list_tables(&self, request: ListTablesRequest) -> Result<ListTablesResponse> {
if request.id.as_ref().map(|v| !v.is_empty()).unwrap_or(false) {
return self.namespace_database().list_tables(request).await;
return Err(Error::NotSupported {
message: "Namespace parameter is not supported for listing database. Only root namespace is supported.".into(),
});
}
let mut f = self
.object_store
@@ -1014,8 +838,11 @@ impl Database for ListingDatabase {
}
async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
if !request.namespace_path.is_empty() {
return self.namespace_database().create_table(request).await;
// When namespace is not empty, location must be provided
if !request.namespace_path.is_empty() && request.location.is_none() {
return Err(Error::InvalidInput {
message: "Location must be provided when namespace is not empty".into(),
});
}
// Use provided location if available, otherwise derive from table name
let table_uri = request
@@ -1132,8 +959,11 @@ impl Database for ListingDatabase {
}
async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
if !request.namespace_path.is_empty() {
return self.namespace_database().open_table(request).await;
// When namespace is not empty, location must be provided
if !request.namespace_path.is_empty() && request.location.is_none() {
return Err(Error::InvalidInput {
message: "Location must be provided when namespace is not empty".into(),
});
}
// Use provided location if available, otherwise derive from table name
let table_uri = request
@@ -1229,10 +1059,9 @@ impl Database for ListingDatabase {
async fn drop_table(&self, name: &str, namespace_path: &[String]) -> Result<()> {
if !namespace_path.is_empty() {
return self
.namespace_database()
.drop_table(name, namespace_path)
.await;
return Err(Error::NotSupported {
message: "Namespace parameter is not supported for listing database.".into(),
});
}
self.drop_tables(vec![name.to_string()]).await
}
@@ -1241,10 +1070,9 @@ impl Database for ListingDatabase {
async fn drop_all_tables(&self, namespace_path: &[String]) -> Result<()> {
// Check if namespace parameter is provided
if !namespace_path.is_empty() {
return self
.namespace_database()
.drop_all_tables(namespace_path)
.await;
return Err(Error::NotSupported {
message: "Namespace parameter is not supported for listing database.".into(),
});
}
let tables = self.table_names(TableNamesRequest::default()).await?;
self.drop_tables(tables).await
@@ -1255,11 +1083,30 @@ impl Database for ListingDatabase {
}
async fn namespace_client(&self) -> Result<Arc<dyn lance_namespace::LanceNamespace>> {
self.namespace_database.namespace_client().await
// Create a DirectoryNamespace pointing to the same root with the same storage options
let mut builder = lance_namespace_impls::DirectoryNamespaceBuilder::new(&self.uri);
// Add storage options
if !self.storage_options.is_empty() {
builder = builder.storage_options(self.storage_options.clone());
}
// Use the same session
builder = builder.session(self.session.clone());
let namespace = builder.build().await.map_err(|e| Error::Runtime {
message: format!("Failed to create namespace client: {}", e),
})?;
Ok(Arc::new(namespace) as Arc<dyn lance_namespace::LanceNamespace>)
}
async fn namespace_client_config(&self) -> Result<(String, HashMap<String, String>)> {
self.namespace_database.namespace_client_config().await
let mut properties = HashMap::new();
properties.insert("root".to_string(), self.uri.clone());
for (key, value) in &self.storage_options {
properties.insert(format!("storage.{}", key), value.clone());
}
Ok(("dir".to_string(), properties))
}
}
@@ -1285,8 +1132,6 @@ mod tests {
#[cfg(feature = "remote")]
client_config: Default::default(),
options: Default::default(),
namespace_client_properties: Default::default(),
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
@@ -1420,8 +1265,6 @@ mod tests {
#[cfg(feature = "remote")]
client_config: Default::default(),
options: options.clone(),
namespace_client_properties: Default::default(),
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
@@ -1956,8 +1799,6 @@ mod tests {
#[cfg(feature = "remote")]
client_config: Default::default(),
options,
namespace_client_properties: Default::default(),
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
@@ -2063,8 +1904,6 @@ mod tests {
#[cfg(feature = "remote")]
client_config: Default::default(),
options,
namespace_client_properties: Default::default(),
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
@@ -2136,8 +1975,6 @@ mod tests {
#[cfg(feature = "remote")]
client_config: Default::default(),
options,
namespace_client_properties: Default::default(),
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
@@ -2271,210 +2108,4 @@ mod tests {
assert!(tables.contains(&"table1".to_string()));
assert!(tables.contains(&"table2".to_string()));
}
#[tokio::test]
async fn test_listing_database_namespace_operations() {
let (_tempdir, db) = setup_database().await;
db.create_namespace(CreateNamespaceRequest {
id: Some(vec!["parent".to_string()]),
..Default::default()
})
.await
.unwrap();
db.create_namespace(CreateNamespaceRequest {
id: Some(vec!["parent".to_string(), "child".to_string()]),
..Default::default()
})
.await
.unwrap();
let root_namespaces = db
.list_namespaces(ListNamespacesRequest {
id: Some(vec![]),
..Default::default()
})
.await
.unwrap();
assert!(root_namespaces.namespaces.contains(&"parent".to_string()));
let child_namespaces = db
.list_namespaces(ListNamespacesRequest {
id: Some(vec!["parent".to_string()]),
..Default::default()
})
.await
.unwrap();
assert!(child_namespaces.namespaces.contains(&"child".to_string()));
db.describe_namespace(DescribeNamespaceRequest {
id: Some(vec!["parent".to_string(), "child".to_string()]),
..Default::default()
})
.await
.unwrap();
}
#[tokio::test]
#[cfg(not(windows))] // TODO: support Windows once directory namespace-backed listing DB tests are supported.
async fn test_listing_database_with_namespace_client_properties() {
let tempdir = tempdir().unwrap();
let uri = tempdir.path().to_str().unwrap();
let mut namespace_client_properties = HashMap::new();
namespace_client_properties.insert(
"table_version_tracking_enabled".to_string(),
"true".to_string(),
);
namespace_client_properties.insert("manifest_enabled".to_string(), "true".to_string());
let request = ConnectRequest {
uri: uri.to_string(),
#[cfg(feature = "remote")]
client_config: Default::default(),
options: Default::default(),
namespace_client_properties,
manifest_enabled: false,
read_consistency_interval: None,
session: None,
};
let db = ListingDatabase::connect_with_options(&request)
.await
.unwrap();
let namespace_path = vec!["test_ns".to_string()];
db.create_namespace(CreateNamespaceRequest {
id: Some(namespace_path.clone()),
..Default::default()
})
.await
.unwrap();
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, false),
]));
db.create_table(CreateTableRequest {
name: "managed_table".to_string(),
namespace_path: namespace_path.clone(),
data: Box::new(RecordBatch::new_empty(schema)) as Box<dyn Scannable>,
mode: CreateTableMode::Create,
write_options: Default::default(),
location: None,
namespace_client: None,
})
.await
.unwrap();
let namespace_client = db.namespace_client().await.unwrap();
let describe = namespace_client
.describe_table(lance_namespace::models::DescribeTableRequest {
id: Some(vec!["test_ns".to_string(), "managed_table".to_string()]),
..Default::default()
})
.await
.unwrap();
assert_eq!(describe.managed_versioning, Some(true));
}
#[tokio::test]
async fn test_listing_database_nested_namespace_table_ops() {
let (_tempdir, db) = setup_database().await;
let namespace_path = vec!["parent".to_string(), "child".to_string()];
db.create_namespace(CreateNamespaceRequest {
id: Some(vec!["parent".to_string()]),
..Default::default()
})
.await
.unwrap();
db.create_namespace(CreateNamespaceRequest {
id: Some(namespace_path.clone()),
..Default::default()
})
.await
.unwrap();
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, false),
]));
db.create_table(CreateTableRequest {
name: "nested_table".to_string(),
namespace_path: namespace_path.clone(),
data: Box::new(RecordBatch::new_empty(schema)) as Box<dyn Scannable>,
mode: CreateTableMode::Create,
write_options: Default::default(),
location: None,
namespace_client: None,
})
.await
.unwrap();
let namespace_client = db.namespace_client().await.unwrap();
let describe = namespace_client
.describe_table(lance_namespace::models::DescribeTableRequest {
id: Some(vec![
"parent".to_string(),
"child".to_string(),
"nested_table".to_string(),
]),
..Default::default()
})
.await
.unwrap();
assert!(describe.location.is_some());
let table = db
.open_table(OpenTableRequest {
name: "nested_table".to_string(),
namespace_path: namespace_path.clone(),
index_cache_size: None,
lance_read_params: None,
location: None,
namespace_client: None,
managed_versioning: None,
})
.await
.unwrap();
assert_eq!(table.name(), "nested_table");
#[allow(deprecated)]
let table_names = db
.table_names(TableNamesRequest {
namespace_path: namespace_path.clone(),
start_after: None,
limit: None,
})
.await
.unwrap();
assert_eq!(table_names, vec!["nested_table".to_string()]);
let list_tables = db
.list_tables(ListTablesRequest {
id: Some(namespace_path.clone()),
..Default::default()
})
.await
.unwrap();
assert_eq!(list_tables.tables, vec!["nested_table".to_string()]);
db.drop_table("nested_table", &namespace_path)
.await
.unwrap();
let post_drop = db
.list_tables(ListTablesRequest {
id: Some(namespace_path),
..Default::default()
})
.await
.unwrap();
assert!(post_drop.tables.is_empty());
}
}

View File

@@ -22,15 +22,10 @@ use lance_namespace_impls::ConnectBuilder;
use lance_table::io::commit::CommitHandler;
use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler;
use crate::connection::NamespaceClientPushdownOperation;
use crate::connection::PushdownOperation;
use crate::database::ReadConsistency;
use crate::database::listing::{
NewTableConfig, OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, OPT_NEW_TABLE_STORAGE_VERSION,
OPT_NEW_TABLE_V2_MANIFEST_PATHS,
};
use crate::error::{Error, Result};
use crate::table::NativeTable;
use lance::dataset::WriteMode;
use super::{
BaseTable, CloneTableRequest, CreateTableMode, CreateTableRequest as DbCreateTableRequest,
@@ -49,71 +44,21 @@ pub struct LanceNamespaceDatabase {
// database URI
uri: String,
// Operations to push down to the namespace server
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pushdown_operations: HashSet<PushdownOperation>,
// Namespace implementation type (e.g., "dir", "rest")
ns_impl: String,
// Namespace properties used to construct the namespace client
ns_properties: HashMap<String, String>,
// Options for tables created by this connection
new_table_config: NewTableConfig,
}
impl LanceNamespaceDatabase {
pub fn from_namespace_client(
namespace_client: Arc<dyn LanceNamespace>,
namespace_client_impl: String,
namespace_client_properties: HashMap<String, String>,
storage_options: HashMap<String, String>,
read_consistency_interval: Option<std::time::Duration>,
session: Option<Arc<lance::session::Session>>,
namespace_client_pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
) -> Self {
Self {
namespace: namespace_client,
storage_options,
read_consistency_interval,
session,
uri: format!("namespace://{}", namespace_client_impl),
pushdown_operations: namespace_client_pushdown_operations,
ns_impl: namespace_client_impl,
ns_properties: namespace_client_properties,
new_table_config: NewTableConfig::default(),
}
}
pub(crate) fn with_uri(mut self, uri: impl Into<String>) -> Self {
self.uri = uri.into();
self
}
pub async fn connect(
ns_impl: &str,
ns_properties: HashMap<String, String>,
storage_options: HashMap<String, String>,
read_consistency_interval: Option<std::time::Duration>,
session: Option<Arc<lance::session::Session>>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
) -> Result<Self> {
Self::connect_with_new_table_config(
ns_impl,
ns_properties,
storage_options,
read_consistency_interval,
session,
pushdown_operations,
NewTableConfig::default(),
)
.await
}
pub(crate) async fn connect_with_new_table_config(
ns_impl: &str,
ns_properties: HashMap<String, String>,
storage_options: HashMap<String, String>,
read_consistency_interval: Option<std::time::Duration>,
session: Option<Arc<lance::session::Session>>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
new_table_config: NewTableConfig,
pushdown_operations: HashSet<PushdownOperation>,
) -> Result<Self> {
let mut builder = ConnectBuilder::new(ns_impl);
for (key, value) in ns_properties.clone() {
@@ -135,79 +80,8 @@ impl LanceNamespaceDatabase {
pushdown_operations,
ns_impl: ns_impl.to_string(),
ns_properties,
new_table_config,
})
}
fn extract_storage_overrides(
&self,
request: &DbCreateTableRequest,
) -> Result<(
Option<lance_encoding::version::LanceFileVersion>,
Option<bool>,
Option<bool>,
)> {
let storage_options = request
.write_options
.lance_write_params
.as_ref()
.and_then(|p| p.store_params.as_ref())
.and_then(|sp| sp.storage_options());
let storage_version_override = storage_options
.and_then(|opts| opts.get(OPT_NEW_TABLE_STORAGE_VERSION))
.map(|s| s.parse::<lance_encoding::version::LanceFileVersion>())
.transpose()?;
let v2_manifest_override = storage_options
.and_then(|opts| opts.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS))
.map(|s| s.parse::<bool>())
.transpose()
.map_err(|_| Error::InvalidInput {
message: "enable_v2_manifest_paths must be a boolean".to_string(),
})?;
let stable_row_ids_override = storage_options
.and_then(|opts| opts.get(OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS))
.map(|s| s.parse::<bool>())
.transpose()
.map_err(|_| Error::InvalidInput {
message: "enable_stable_row_ids must be a boolean".to_string(),
})?;
Ok((
storage_version_override,
v2_manifest_override,
stable_row_ids_override,
))
}
fn apply_new_table_config(
&self,
params: &mut lance::dataset::WriteParams,
request: &DbCreateTableRequest,
) -> Result<()> {
let (storage_version_override, v2_manifest_override, stable_row_ids_override) =
self.extract_storage_overrides(request)?;
params.data_storage_version = storage_version_override
.or(params.data_storage_version)
.or(self.new_table_config.data_storage_version);
if let Some(enable_v2_manifest_paths) =
v2_manifest_override.or(self.new_table_config.enable_v2_manifest_paths)
{
params.enable_v2_manifest_paths = enable_v2_manifest_paths;
}
if let Some(enable_stable_row_ids) =
stable_row_ids_override.or(self.new_table_config.enable_stable_row_ids)
{
params.enable_stable_row_ids = enable_stable_row_ids;
}
Ok(())
}
}
impl std::fmt::Debug for LanceNamespaceDatabase {
@@ -289,23 +163,37 @@ impl Database for LanceNamespaceDatabase {
async fn create_table(&self, request: DbCreateTableRequest) -> Result<Arc<dyn BaseTable>> {
let mut table_id = request.namespace_path.clone();
table_id.push(request.name.clone());
let mut existing_table = None;
let describe_request = DescribeTableRequest {
id: Some(table_id.clone()),
..Default::default()
};
let describe_result = self.namespace.describe_table(describe_request).await;
match request.mode {
CreateTableMode::Create => {}
CreateTableMode::Create => {
if describe_result.is_ok() {
return Err(Error::TableAlreadyExists {
name: request.name.clone(),
});
}
}
CreateTableMode::Overwrite => {
let describe_request = DescribeTableRequest {
id: Some(table_id.clone()),
..Default::default()
};
existing_table = self.namespace.describe_table(describe_request).await.ok();
if describe_result.is_ok() {
// Drop the existing table - must succeed
let drop_request = DropTableRequest {
id: Some(table_id.clone()),
..Default::default()
};
self.namespace
.drop_table(drop_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to drop existing table for overwrite: {}", e),
})?;
}
}
CreateTableMode::ExistOk(_) => {
let describe_request = DescribeTableRequest {
id: Some(table_id.clone()),
..Default::default()
};
let describe_result = self.namespace.describe_table(describe_request).await;
if describe_result.is_ok() {
let native_table = NativeTable::open_from_namespace(
self.namespace.clone(),
@@ -333,86 +221,20 @@ impl Database for LanceNamespaceDatabase {
};
let (location, initial_storage_options, managed_versioning) = {
if let Some(response) = existing_table {
let loc = response.location.ok_or_else(|| Error::Runtime {
message: "Table location is missing from describe_table response".to_string(),
})?;
let opts = response
.storage_options
.or_else(|| Some(self.storage_options.clone()))
.filter(|o| !o.is_empty());
(loc, opts, response.managed_versioning)
} else {
match self.namespace.declare_table(declare_request).await {
Ok(response) => {
let loc = response.location.ok_or_else(|| Error::Runtime {
message: "Table location is missing from declare_table response"
.to_string(),
})?;
let opts = response
.storage_options
.or_else(|| Some(self.storage_options.clone()))
.filter(|o: &HashMap<String, String>| !o.is_empty());
(loc, opts, response.managed_versioning)
}
Err(e)
if matches!(request.mode, CreateTableMode::Create) && {
let err_str = e.to_string();
err_str.contains("already exists")
|| err_str.contains("TableAlreadyExists")
|| err_str.contains("table already exists")
} =>
{
let response = self
.namespace
.describe_table(DescribeTableRequest {
id: Some(table_id.clone()),
..Default::default()
})
.await
.map_err(|describe_err| Error::Runtime {
message: format!(
"Failed to describe existing declared table after declare conflict: {}",
describe_err
),
})?;
if response.version.is_some() && response.schema.is_some() {
return Err(Error::TableAlreadyExists {
name: request.name.clone(),
});
}
let loc = response.location.ok_or_else(|| Error::Runtime {
message: "Table location is missing from describe_table response"
.to_string(),
})?;
let opts = response
.storage_options
.or_else(|| Some(self.storage_options.clone()))
.filter(|o: &HashMap<String, String>| !o.is_empty());
(loc, opts, response.managed_versioning)
}
Err(e) => {
return Err(Error::Runtime {
message: format!("Failed to declare table: {}", e),
});
}
}
}
let response = self.namespace.declare_table(declare_request).await?;
let loc = response.location.ok_or_else(|| Error::Runtime {
message: "Table location is missing from declare_table response".to_string(),
})?;
// Use storage options from response, fall back to self.storage_options
let opts = response
.storage_options
.or_else(|| Some(self.storage_options.clone()))
.filter(|o| !o.is_empty());
(loc, opts, response.managed_versioning)
};
// Build write params with storage options and commit handler
let mut params = request
.write_options
.lance_write_params
.clone()
.unwrap_or_default();
self.apply_new_table_config(&mut params, &request)?;
if matches!(request.mode, CreateTableMode::Overwrite) {
params.mode = WriteMode::Overwrite;
}
let mut params = request.write_options.lance_write_params.unwrap_or_default();
// Set up storage options if provided
if let Some(storage_opts) = initial_storage_options {
@@ -628,47 +450,6 @@ mod tests {
));
}
#[tokio::test]
async fn test_namespace_connection_with_namespace_client_properties() {
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.namespace_client_property("table_version_tracking_enabled", "true")
.namespace_client_property("manifest_enabled", "true")
.execute()
.await
.expect("Failed to connect to namespace");
conn.create_namespace(CreateNamespaceRequest {
id: Some(vec!["test_ns".into()]),
..Default::default()
})
.await
.expect("Failed to create namespace");
let test_data = create_test_data();
conn.create_table("test_table", test_data)
.namespace(vec!["test_ns".into()])
.execute()
.await
.expect("Failed to create table");
let namespace_client = conn.namespace_client().await.unwrap();
let describe = namespace_client
.describe_table(DescribeTableRequest {
id: Some(vec!["test_ns".into(), "test_table".into()]),
..Default::default()
})
.await
.expect("Failed to describe table");
assert_eq!(describe.managed_versioning, Some(true));
}
#[tokio::test]
async fn test_namespace_create_table_basic() {
// Setup: Create a temporary directory for the namespace
@@ -870,58 +651,6 @@ mod tests {
assert_eq!(id_col.value(2), 30);
}
#[tokio::test]
async fn test_namespace_create_table_after_declare_conflict() {
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
conn.create_namespace(CreateNamespaceRequest {
id: Some(vec!["test_ns".into()]),
..Default::default()
})
.await
.expect("Failed to create namespace");
let namespace_client = conn.namespace_client().await.unwrap();
namespace_client
.declare_table(DeclareTableRequest {
id: Some(vec!["test_ns".into(), "declared_test".into()]),
..Default::default()
})
.await
.expect("Failed to declare table");
let test_data = create_test_data();
let table = conn
.create_table("declared_test", test_data)
.namespace(vec!["test_ns".into()])
.execute()
.await
.expect("Failed to create table after declare conflict");
let results = table
.query()
.execute()
.await
.expect("Failed to query table")
.try_collect::<Vec<_>>()
.await
.expect("Failed to collect results");
assert_eq!(results.len(), 1);
assert_eq!(results[0].num_rows(), 5);
assert_eq!(table.namespace(), &["test_ns"]);
assert_eq!(table.id(), "test_ns$declared_test");
}
#[tokio::test]
async fn test_namespace_create_table_exist_ok_mode() {
// Setup: Create a temporary directory for the namespace

View File

@@ -69,7 +69,7 @@
//! It treats [`FixedSizeList<Float16/Float32>`](https://docs.rs/arrow/latest/arrow/array/struct.FixedSizeListArray.html)
//! columns as vector columns.
//!
//! For more details, please refer to the [LanceDB documentation](https://docs.lancedb.com).
//! For more details, please refer to the [LanceDB documentation](https://lancedb.com/docs).
//!
//! #### Create a table
//!

View File

@@ -16,7 +16,7 @@ use crate::remote::retry::{ResolvedRetryConfig, RetryCounter};
const REQUEST_ID_HEADER: HeaderName = HeaderName::from_static("x-request-id");
/// Configuration for TLS/mTLS settings.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct TlsConfig {
/// Path to the client certificate file (PEM format)
pub cert_file: Option<String>,
@@ -24,22 +24,10 @@ pub struct TlsConfig {
pub key_file: Option<String>,
/// Path to the CA certificate file for server verification (PEM format)
pub ssl_ca_cert: Option<String>,
/// Whether to verify the hostname in the server's certificate.
/// Defaults to `true`.
/// Whether to verify the hostname in the server's certificate
pub assert_hostname: bool,
}
impl Default for TlsConfig {
fn default() -> Self {
Self {
cert_file: None,
key_file: None,
ssl_ca_cert: None,
assert_hostname: true,
}
}
}
/// Trait for providing custom headers for each request
#[async_trait::async_trait]
pub trait HeaderProvider: Send + Sync + std::fmt::Debug {
@@ -938,7 +926,7 @@ mod tests {
assert!(config.cert_file.is_none());
assert!(config.key_file.is_none());
assert!(config.ssl_ca_cert.is_none());
assert!(config.assert_hostname);
assert!(!config.assert_hostname);
}
#[test]

View File

@@ -97,7 +97,7 @@ pub struct RemoteDatabaseOptions {
pub host_override: Option<String>,
/// Storage options configure the storage layer (e.g. S3, GCS, Azure, etc.)
///
/// See available options at <https://docs.lancedb.com/storage/>
/// See available options at <https://lancedb.com/docs/storage/>
///
/// These options are only used for LanceDB Enterprise and only a subset of options
/// are supported.

View File

@@ -47,7 +47,7 @@ use std::format;
use std::path::Path;
use std::sync::Arc;
use crate::connection::NamespaceClientPushdownOperation;
use crate::connection::PushdownOperation;
use crate::data::scannable::{PeekedScannable, Scannable, estimate_write_partitions};
use crate::database::Database;
@@ -1272,7 +1272,7 @@ pub struct NativeTable {
pub(crate) namespace_client: Option<Arc<dyn LanceNamespace>>,
// Operations to push down to the namespace server.
// pub(crate) so query.rs can access the field for server-side query execution.
pub(crate) pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pub(crate) pushdown_operations: HashSet<PushdownOperation>,
}
impl std::fmt::Debug for NativeTable {
@@ -1359,7 +1359,7 @@ impl NativeTable {
params: Option<ReadParams>,
read_consistency_interval: Option<std::time::Duration>,
namespace_client: Option<Arc<dyn LanceNamespace>>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pushdown_operations: HashSet<PushdownOperation>,
managed_versioning: Option<bool>,
) -> Result<Self> {
let params = params.unwrap_or_default();
@@ -1470,7 +1470,7 @@ impl NativeTable {
write_store_wrapper: Option<Arc<dyn WrappingObjectStore>>,
params: Option<ReadParams>,
read_consistency_interval: Option<std::time::Duration>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pushdown_operations: HashSet<PushdownOperation>,
session: Option<Arc<lance::session::Session>>,
) -> Result<Self> {
let mut params = params.unwrap_or_default();
@@ -1518,7 +1518,7 @@ impl NativeTable {
let id = Self::build_id(&namespace, name);
let stored_namespace_client =
if pushdown_operations.contains(&NamespaceClientPushdownOperation::QueryTable) {
if pushdown_operations.contains(&PushdownOperation::QueryTable) {
Some(namespace_client)
} else {
None
@@ -1588,7 +1588,7 @@ impl NativeTable {
params: Option<WriteParams>,
read_consistency_interval: Option<std::time::Duration>,
namespace_client: Option<Arc<dyn LanceNamespace>>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pushdown_operations: HashSet<PushdownOperation>,
) -> Result<Self> {
// Default params uses format v1.
let params = params.unwrap_or(WriteParams {
@@ -1635,7 +1635,7 @@ impl NativeTable {
params: Option<WriteParams>,
read_consistency_interval: Option<std::time::Duration>,
namespace_client: Option<Arc<dyn LanceNamespace>>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pushdown_operations: HashSet<PushdownOperation>,
) -> Result<Self> {
let data: Box<dyn Scannable> = Box::new(RecordBatch::new_empty(schema));
Self::create(
@@ -1685,7 +1685,7 @@ impl NativeTable {
write_store_wrapper: Option<Arc<dyn WrappingObjectStore>>,
params: Option<WriteParams>,
read_consistency_interval: Option<std::time::Duration>,
pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
pushdown_operations: HashSet<PushdownOperation>,
session: Option<Arc<lance::session::Session>>,
) -> Result<Self> {
// Build table_id from namespace + name for the storage options provider
@@ -1738,7 +1738,7 @@ impl NativeTable {
let id = Self::build_id(&namespace, name);
let stored_namespace_client =
if pushdown_operations.contains(&NamespaceClientPushdownOperation::QueryTable) {
if pushdown_operations.contains(&PushdownOperation::QueryTable) {
Some(namespace_client)
} else {
None

View File

@@ -285,10 +285,7 @@ mod tests {
use super::*;
use crate::{
connect, io::object_store::io_tracking::IoStatsHolder, table::WriteOptions,
utils::background_cache::clock,
};
use crate::{connect, io::object_store::io_tracking::IoStatsHolder, table::WriteOptions};
async fn create_test_dataset(uri: &str) -> Dataset {
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
@@ -466,10 +463,6 @@ mod tests {
let uri = dir.path().to_str().unwrap();
let ds = create_test_dataset(uri).await;
// Other tests use a thread-local mock clock. Simulate leaked state from a
// previous test to ensure this wrapper starts from real time.
clock::advance_by(Duration::from_secs(60));
let wrapper = DatasetConsistencyWrapper::new_latest(ds, Some(Duration::from_millis(200)));
// Populate the cache

View File

@@ -4,7 +4,7 @@
use std::sync::Arc;
use super::NativeTable;
use crate::connection::NamespaceClientPushdownOperation;
use crate::connection::PushdownOperation;
use crate::error::{Error, Result};
use crate::expr::expr_to_sql_string;
use crate::query::{
@@ -44,7 +44,7 @@ pub async fn execute_query(
// If QueryTable pushdown is enabled and namespace client is configured, use server-side query execution
if table
.pushdown_operations
.contains(&NamespaceClientPushdownOperation::QueryTable)
.contains(&PushdownOperation::QueryTable)
&& let Some(ref namespace_client) = table.namespace_client
{
return execute_namespace_query(table, namespace_client.clone(), query, options).await;

View File

@@ -107,14 +107,6 @@ where
refresh_window < ttl,
"refresh_window ({refresh_window:?}) must be less than ttl ({ttl:?})"
);
#[cfg(test)]
{
// Tests may advance the thread-local mock clock and leave it behind for
// the next test that happens to run on the same worker thread. Each new
// cache should start from a clean clock state instead of inheriting
// unrelated mock time from a previous test.
clock::clear_mock();
}
Self {
inner: Arc::new(Mutex::new(CacheInner {
state: State::Empty,