mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-23 06:50:40 +00:00
Compare commits
69 Commits
toddfarmer
...
v0.29.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d45fd80c3f | ||
|
|
a1351a3c4e | ||
|
|
64978c8419 | ||
|
|
241420239b | ||
|
|
9d67ea2bb0 | ||
|
|
011fdd5c94 | ||
|
|
650f173236 | ||
|
|
9b21c136c6 | ||
|
|
694aa48e19 | ||
|
|
455ba5abbf | ||
|
|
5338aeb006 | ||
|
|
47a34f5cca | ||
|
|
a17c241e86 | ||
|
|
1fc23e5473 | ||
|
|
87b831bcae | ||
|
|
59db036118 | ||
|
|
c091243d5b | ||
|
|
a2aea7b4e5 | ||
|
|
4a5341edb1 | ||
|
|
25dfe2cfd4 | ||
|
|
4dcd7f4314 | ||
|
|
2e36cd9dad | ||
|
|
f31e27768a | ||
|
|
b84150a53e | ||
|
|
d135c18db6 | ||
|
|
ef399de092 | ||
|
|
0d767abd0e | ||
|
|
a92ae0ded5 | ||
|
|
c54888a83a | ||
|
|
ba6c44abc9 | ||
|
|
75b0a8e0a3 | ||
|
|
2a886141f7 | ||
|
|
2a1df8edcf | ||
|
|
fd98b845ea | ||
|
|
be48ada352 | ||
|
|
9ad2dfe601 | ||
|
|
f909df3e87 | ||
|
|
d715bbb588 | ||
|
|
5ce3d8d141 | ||
|
|
5eaac178b1 | ||
|
|
11af763fcd | ||
|
|
2ed5452e1c | ||
|
|
b7c0b5987c | ||
|
|
97a4b38f19 | ||
|
|
10879d99b8 | ||
|
|
4e6a1d5dce | ||
|
|
13d2759356 | ||
|
|
7f52ec8c36 | ||
|
|
c6ae0de3ee | ||
|
|
231f0655ce | ||
|
|
8c52977c59 | ||
|
|
359710a0bf | ||
|
|
1f1726369d | ||
|
|
df354abae4 | ||
|
|
11bc674548 | ||
|
|
5593460823 | ||
|
|
2807ad6854 | ||
|
|
4761fa9bcb | ||
|
|
4c2939d66e | ||
|
|
a813ce2f71 | ||
|
|
a898dc81c2 | ||
|
|
de3f8097e7 | ||
|
|
0ac59de5f1 | ||
|
|
d082c2d2ac | ||
|
|
9d8699f99e | ||
|
|
aa2c7b3591 | ||
|
|
590c0c1e77 | ||
|
|
382ecd65e3 | ||
|
|
e26b22bcca |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.27.2"
|
||||
current_version = "0.29.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/documentation.yml
vendored
2
.github/ISSUE_TEMPLATE/documentation.yml
vendored
@@ -18,6 +18,6 @@ body:
|
||||
label: Link
|
||||
description: >
|
||||
Provide a link to the existing documentation, if applicable.
|
||||
placeholder: ex. https://lancedb.com/docs/tables/...
|
||||
placeholder: ex. https://docs.lancedb.com/tables/...
|
||||
validations:
|
||||
required: false
|
||||
|
||||
18
.github/dependabot.yml
vendored
Normal file
18
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
version: 2
|
||||
|
||||
# Scope: the root Cargo workspace, which produces the Rust binaries we
|
||||
# ship to users (the Node.js and Python native extensions). The
|
||||
# `rust/lancedb` library crate shares the same lockfile; its consumers
|
||||
# pick their own dependency versions, but bumping transitive deps here
|
||||
# keeps the binaries we ship current.
|
||||
updates:
|
||||
- package-ecosystem: cargo
|
||||
directory: /
|
||||
schedule:
|
||||
interval: weekly
|
||||
open-pull-requests-limit: 10
|
||||
groups:
|
||||
rust-minor-patch:
|
||||
update-types:
|
||||
- minor
|
||||
- patch
|
||||
3
.github/workflows/dev.yml
vendored
3
.github/workflows/dev.yml
vendored
@@ -8,6 +8,9 @@ concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
labeler:
|
||||
permissions:
|
||||
|
||||
8
.github/workflows/java-publish.yml
vendored
8
.github/workflows/java-publish.yml
vendored
@@ -19,6 +19,9 @@ on:
|
||||
paths:
|
||||
- .github/workflows/java-publish.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
name: Build and Publish
|
||||
@@ -40,7 +43,7 @@ jobs:
|
||||
server-username: SONATYPE_USER
|
||||
server-password: SONATYPE_TOKEN
|
||||
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
|
||||
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
|
||||
gpg-passphrase: MAVEN_GPG_PASSPHRASE
|
||||
- name: Set git config
|
||||
run: |
|
||||
git config --global user.email "dev+gha@lancedb.com"
|
||||
@@ -55,10 +58,11 @@ jobs:
|
||||
echo "use-agent" >> ~/.gnupg/gpg.conf
|
||||
echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
|
||||
export GPG_TTY=$(tty)
|
||||
./mvnw --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -pl lancedb-core -am -P deploy-to-ossrh
|
||||
./mvnw --batch-mode -DskipTests -DpushChanges=false deploy -pl lancedb-core -am -P deploy-to-ossrh
|
||||
env:
|
||||
SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
|
||||
SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
|
||||
MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
|
||||
|
||||
report-failure:
|
||||
name: Report Workflow Failure
|
||||
|
||||
4
.github/workflows/java.yml
vendored
4
.github/workflows/java.yml
vendored
@@ -16,6 +16,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release/**
|
||||
paths:
|
||||
- java/**
|
||||
- .github/workflows/java.yml
|
||||
@@ -24,6 +25,9 @@ on:
|
||||
- java/**
|
||||
- .github/workflows/java.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-java:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
5
.github/workflows/license-header-check.yml
vendored
5
.github/workflows/license-header-check.yml
vendored
@@ -3,6 +3,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release/**
|
||||
pull_request:
|
||||
paths:
|
||||
- rust/**
|
||||
@@ -10,6 +11,10 @@ on:
|
||||
- nodejs/**
|
||||
- java/**
|
||||
- .github/workflows/license-header-check.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
check-licenses:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
5
.github/workflows/nodejs.yml
vendored
5
.github/workflows/nodejs.yml
vendored
@@ -4,16 +4,21 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release/**
|
||||
pull_request:
|
||||
paths:
|
||||
- Cargo.toml
|
||||
- Cargo.lock
|
||||
- rust-toolchain.toml
|
||||
- nodejs/**
|
||||
- rust/**
|
||||
- docs/src/js/**
|
||||
- .github/workflows/nodejs.yml
|
||||
- docker-compose.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
15
.github/workflows/pypi-publish.yml
vendored
15
.github/workflows/pypi-publish.yml
vendored
@@ -14,10 +14,16 @@ on:
|
||||
env:
|
||||
PIP_EXTRA_INDEX_URL: "https://pypi.fury.io/lance-format/ https://pypi.fury.io/lancedb/"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
|
||||
timeout-minutes: 60
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
@@ -57,10 +63,12 @@ jobs:
|
||||
- uses: ./.github/workflows/upload_wheel
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
with:
|
||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||
mac:
|
||||
timeout-minutes: 90
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -85,10 +93,12 @@ jobs:
|
||||
- uses: ./.github/workflows/upload_wheel
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
with:
|
||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||
windows:
|
||||
timeout-minutes: 60
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -107,7 +117,6 @@ jobs:
|
||||
- uses: ./.github/workflows/upload_wheel
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
with:
|
||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||
gh-release:
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
|
||||
7
.github/workflows/python.yml
vendored
7
.github/workflows/python.yml
vendored
@@ -4,10 +4,12 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release/**
|
||||
pull_request:
|
||||
paths:
|
||||
- Cargo.toml
|
||||
- Cargo.lock
|
||||
- rust-toolchain.toml
|
||||
- python/**
|
||||
- rust/**
|
||||
- .github/workflows/python.yml
|
||||
@@ -16,6 +18,9 @@ on:
|
||||
- .github/workflows/build_windows_wheel/**
|
||||
- .github/workflows/run_tests/**
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
@@ -107,7 +112,6 @@ jobs:
|
||||
- name: Install
|
||||
run: |
|
||||
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests,dev,embeddings]
|
||||
pip install tantivy
|
||||
pip install mlx
|
||||
- name: Doctest
|
||||
run: pytest --doctest-modules python/lancedb
|
||||
@@ -226,6 +230,5 @@ jobs:
|
||||
pip install "pydantic<2"
|
||||
pip install pyarrow==16
|
||||
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
|
||||
pip install tantivy
|
||||
- name: Run tests
|
||||
run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/tests
|
||||
|
||||
19
.github/workflows/rust.yml
vendored
19
.github/workflows/rust.yml
vendored
@@ -4,13 +4,21 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release/**
|
||||
pull_request:
|
||||
paths:
|
||||
- Cargo.toml
|
||||
- Cargo.lock
|
||||
- rust-toolchain.toml
|
||||
- deny.toml
|
||||
- rust/**
|
||||
- nodejs/Cargo.toml
|
||||
- python/Cargo.toml
|
||||
- .github/workflows/rust.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
@@ -52,6 +60,17 @@ jobs:
|
||||
- name: Run clippy (without remote feature)
|
||||
run: cargo clippy --profile ci --workspace --tests -- -D warnings
|
||||
|
||||
deny:
|
||||
# Supply-chain checks: advisories, licenses, banned crates, and source
|
||||
# restrictions. Configuration lives in `deny.toml` at the workspace root.
|
||||
timeout-minutes: 10
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: EmbarkStudios/cargo-deny-action@v2
|
||||
with:
|
||||
command: check advisories bans licenses sources
|
||||
|
||||
build-no-lock:
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 30
|
||||
|
||||
@@ -3,6 +3,9 @@ name: Update package-lock.json
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -3,6 +3,9 @@ name: Update NodeJs package-lock.json
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
31
.github/workflows/upload_wheel/action.yml
vendored
31
.github/workflows/upload_wheel/action.yml
vendored
@@ -2,9 +2,6 @@ name: upload-wheel
|
||||
|
||||
description: "Upload wheels to Pypi"
|
||||
inputs:
|
||||
pypi_token:
|
||||
required: true
|
||||
description: "release token for the repo"
|
||||
fury_token:
|
||||
required: true
|
||||
description: "release token for the fury repo"
|
||||
@@ -12,12 +9,6 @@ inputs:
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
shell: bash
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install twine
|
||||
python3 -m pip install --upgrade pkginfo
|
||||
- name: Choose repo
|
||||
shell: bash
|
||||
id: choose_repo
|
||||
@@ -27,19 +18,17 @@ runs:
|
||||
else
|
||||
echo "repo=pypi" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Fury
|
||||
if: steps.choose_repo.outputs.repo == 'fury'
|
||||
shell: bash
|
||||
env:
|
||||
FURY_TOKEN: ${{ inputs.fury_token }}
|
||||
PYPI_TOKEN: ${{ inputs.pypi_token }}
|
||||
run: |
|
||||
if [[ ${{ steps.choose_repo.outputs.repo }} == fury ]]; then
|
||||
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
|
||||
echo "Uploading $WHEEL to Fury"
|
||||
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
|
||||
else
|
||||
twine upload --repository ${{ steps.choose_repo.outputs.repo }} \
|
||||
--username __token__ \
|
||||
--password $PYPI_TOKEN \
|
||||
target/wheels/lancedb-*.whl
|
||||
fi
|
||||
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
|
||||
echo "Uploading $WHEEL to Fury"
|
||||
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
|
||||
- name: Publish to PyPI
|
||||
if: steps.choose_repo.outputs.repo == 'pypi'
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
packages-dir: target/wheels/
|
||||
|
||||
1274
Cargo.lock
generated
1274
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
64
Cargo.toml
64
Cargo.toml
@@ -1,7 +1,5 @@
|
||||
[workspace]
|
||||
members = ["rust/lancedb", "nodejs", "python"]
|
||||
# Python package needs to be built by maturin.
|
||||
exclude = ["python"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.package]
|
||||
@@ -15,40 +13,40 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.91.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { version = "=4.0.0", default-features = false }
|
||||
lance-core = { version = "=4.0.0" }
|
||||
lance-datagen = { version = "=4.0.0" }
|
||||
lance-file = { version = "=4.0.0" }
|
||||
lance-io = { version = "=4.0.0", default-features = false }
|
||||
lance-index = { version = "=4.0.0" }
|
||||
lance-linalg = { version = "=4.0.0" }
|
||||
lance-namespace = { version = "=4.0.0" }
|
||||
lance-namespace-impls = { version = "=4.0.0", default-features = false }
|
||||
lance-table = { version = "=4.0.0" }
|
||||
lance-testing = { version = "=4.0.0" }
|
||||
lance-datafusion = { version = "=4.0.0" }
|
||||
lance-encoding = { version = "=4.0.0" }
|
||||
lance-arrow = { version = "=4.0.0" }
|
||||
lance = { "version" = "=6.0.0", default-features = false }
|
||||
lance-core = "=6.0.0"
|
||||
lance-datagen = "=6.0.0"
|
||||
lance-file = "=6.0.0"
|
||||
lance-io = { "version" = "=6.0.0", default-features = false }
|
||||
lance-index = "=6.0.0"
|
||||
lance-linalg = "=6.0.0"
|
||||
lance-namespace = "=6.0.0"
|
||||
lance-namespace-impls = { "version" = "=6.0.0", default-features = false }
|
||||
lance-table = "=6.0.0"
|
||||
lance-testing = "=6.0.0"
|
||||
lance-datafusion = "=6.0.0"
|
||||
lance-encoding = "=6.0.0"
|
||||
lance-arrow = "=6.0.0"
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "57.2", optional = false }
|
||||
arrow-array = "57.2"
|
||||
arrow-data = "57.2"
|
||||
arrow-ipc = "57.2"
|
||||
arrow-ord = "57.2"
|
||||
arrow-schema = "57.2"
|
||||
arrow-select = "57.2"
|
||||
arrow-cast = "57.2"
|
||||
arrow = { version = "58.0.0", optional = false }
|
||||
arrow-array = "58.0.0"
|
||||
arrow-data = "58.0.0"
|
||||
arrow-ipc = "58.0.0"
|
||||
arrow-ord = "58.0.0"
|
||||
arrow-schema = "58.0.0"
|
||||
arrow-select = "58.0.0"
|
||||
arrow-cast = "58.0.0"
|
||||
async-trait = "0"
|
||||
datafusion = { version = "52.1", default-features = false }
|
||||
datafusion-catalog = "52.1"
|
||||
datafusion-common = { version = "52.1", default-features = false }
|
||||
datafusion-execution = "52.1"
|
||||
datafusion-expr = "52.1"
|
||||
datafusion-functions = "52.1"
|
||||
datafusion-physical-plan = "52.1"
|
||||
datafusion-physical-expr = "52.1"
|
||||
datafusion-sql = "52.1"
|
||||
datafusion = { version = "53.0.0", default-features = false }
|
||||
datafusion-catalog = "53.0.0"
|
||||
datafusion-common = { version = "53.0.0", default-features = false }
|
||||
datafusion-execution = "53.0.0"
|
||||
datafusion-expr = "53.0.0"
|
||||
datafusion-functions = "53.0.0"
|
||||
datafusion-physical-plan = "53.0.0"
|
||||
datafusion-physical-expr = "53.0.0"
|
||||
datafusion-sql = "53.0.0"
|
||||
env_logger = "0.11"
|
||||
half = { "version" = "2.7.1", default-features = false, features = [
|
||||
"num-traits",
|
||||
|
||||
10
README.md
10
README.md
@@ -1,3 +1,9 @@
|
||||
<a href="https://cloud.lancedb.com" target="_blank">
|
||||
<img src="https://github.com/user-attachments/assets/92dad0a2-2a37-4ce1-b783-0d1b4f30a00c" alt="LanceDB Cloud Public Beta" width="100%" style="max-width: 100%;">
|
||||
</a>
|
||||
<div align="center">
|
||||
|
||||
[](https://lancedb.com)
|
||||
[](https://lancedb.com/)
|
||||
[](https://blog.lancedb.com/)
|
||||
[](https://discord.gg/zMM32dvNtd)
|
||||
@@ -9,7 +15,7 @@
|
||||
|
||||
# **The Multimodal AI Lakehouse**
|
||||
|
||||
[**How to Install** ](#how-to-install) ✦ [**Detailed Documentation**](https://lancedb.com/docs) ✦ [**Tutorials and Recipes**](https://github.com/lancedb/vectordb-recipes/tree/main) ✦ [**Contributors**](#contributors)
|
||||
[**How to Install** ](#how-to-install) ✦ [**Detailed Documentation**](https://docs.lancedb.com) ✦ [**Tutorials and Recipes**](https://github.com/lancedb/vectordb-recipes/tree/main) ✦ [**Contributors**](#contributors)
|
||||
|
||||
**The ultimate multimodal data platform for AI/ML applications.**
|
||||
|
||||
@@ -51,7 +57,7 @@ LanceDB is a central location where developers can build, train and analyze thei
|
||||
|
||||
## **How to Install**:
|
||||
|
||||
Follow the [Quickstart](https://lancedb.com/docs/quickstart/) doc to set up LanceDB locally.
|
||||
Follow the [Quickstart](https://docs.lancedb.com/quickstart) doc to set up LanceDB locally.
|
||||
|
||||
**API & SDK:** We also support Python, TypeScript and Rust SDKs
|
||||
|
||||
|
||||
196
deny.toml
Normal file
196
deny.toml
Normal file
@@ -0,0 +1,196 @@
|
||||
# cargo-deny configuration for LanceDB.
|
||||
#
|
||||
# Run locally with `cargo deny check`. See
|
||||
# https://embarkstudios.github.io/cargo-deny/ for the full reference.
|
||||
|
||||
# The set of target triples we care about. cargo-deny will only consider
|
||||
# dependencies that are used on at least one of these targets. Keeping this
|
||||
# explicit avoids noise from platform-specific crates (e.g. wasm, android,
|
||||
# ios) that we never actually ship.
|
||||
[graph]
|
||||
targets = [
|
||||
"x86_64-unknown-linux-gnu",
|
||||
"aarch64-unknown-linux-gnu",
|
||||
"x86_64-apple-darwin",
|
||||
"aarch64-apple-darwin",
|
||||
"x86_64-pc-windows-msvc",
|
||||
"aarch64-pc-windows-msvc",
|
||||
]
|
||||
all-features = true
|
||||
|
||||
[output]
|
||||
feature-depth = 1
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Advisories: security vulnerabilities and yanked crates.
|
||||
# ---------------------------------------------------------------------------
|
||||
[advisories]
|
||||
version = 2
|
||||
# Fail the check if any crate in the lockfile has been yanked from crates.io.
|
||||
# Yanked crates are a signal the author retracted the release (often due to
|
||||
# bugs or security issues) and should not be depended on.
|
||||
yanked = "deny"
|
||||
# Advisory IDs we have explicitly reviewed and chosen to accept. Every
|
||||
# entry must include a rationale and, where possible, an upstream issue
|
||||
# pointing to a fix. Revisit this list whenever dependencies are updated.
|
||||
ignore = [
|
||||
# rsa: Marvin Attack timing side-channel in PKCS#1 v1.5 decryption.
|
||||
# Reached only through opendal → reqsign → rsa. We do not use RSA
|
||||
# decryption in LanceDB ourselves; this is dormant in the signing path.
|
||||
# No fixed release exists upstream as of this writing.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2023-0071
|
||||
{ id = "RUSTSEC-2023-0071", reason = "rsa crate via opendal/reqsign; no fixed upstream release" },
|
||||
|
||||
# instant: unmaintained. Pulled in via backoff → instant. Upstream
|
||||
# recommends switching to `web-time`; fix has to come from backoff.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0384
|
||||
{ id = "RUSTSEC-2024-0384", reason = "transitive via backoff; waiting on backoff replacement" },
|
||||
|
||||
# paste: unmaintained (author archived the repo). Used transitively by
|
||||
# datafusion and the arrow ecosystem; widespread, no drop-in replacement.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0436
|
||||
{ id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" },
|
||||
|
||||
# encoding: unmaintained. Reached through lindera-dictionary, which is
|
||||
# required by the native Lindera tokenizer path. Lindera has not migrated
|
||||
# off this crate yet.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2021-0153
|
||||
{ id = "RUSTSEC-2021-0153", reason = "transitive via lindera-dictionary for native Lindera tokenizer" },
|
||||
|
||||
# fast-float: unsound and unmaintained. Reached only through polars-arrow
|
||||
# from the optional Polars integration; replacement requires a Polars
|
||||
# dependency upgrade.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0379
|
||||
{ id = "RUSTSEC-2024-0379", reason = "transitive via polars-arrow; waiting on Polars migration" },
|
||||
|
||||
# tantivy: segfault on malformed input due to missing bounds check.
|
||||
# Pulled in via lance for full-text search. We only feed tantivy
|
||||
# documents we construct ourselves, not attacker-controlled bytes.
|
||||
# Tracked for a lance dependency bump.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0003
|
||||
{ id = "RUSTSEC-2025-0003", reason = "tantivy via lance; inputs are internally produced, not user-supplied bytes" },
|
||||
|
||||
# backoff: unmaintained. Reached only via async-openai. Replacement
|
||||
# requires async-openai to migrate (or us to drop async-openai).
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0012
|
||||
{ id = "RUSTSEC-2025-0012", reason = "transitive via async-openai; waiting on upstream migration" },
|
||||
|
||||
# number_prefix: unmaintained. Transitive via indicatif → hf-hub.
|
||||
# No security impact, just maintenance status.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0119
|
||||
{ id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" },
|
||||
|
||||
# bincode: unmaintained. Reached through lindera and lindera-dictionary,
|
||||
# which are required by the native Lindera tokenizer path. Lindera has not
|
||||
# migrated to another serialization format yet.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0141
|
||||
{ id = "RUSTSEC-2025-0141", reason = "transitive via lindera/lindera-dictionary for native Lindera tokenizer" },
|
||||
|
||||
# lru: soundness issue in IterMut. Reached only through aws-sdk-s3 in
|
||||
# LanceDB's dev-dependency graph; LanceDB does not use that iterator
|
||||
# directly. Clearing this requires the AWS SDK chain to update lru.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0002
|
||||
{ id = "RUSTSEC-2026-0002", reason = "transitive via aws-sdk-s3 dev-dependency; waiting on AWS SDK lru upgrade" },
|
||||
|
||||
# rustls-webpki 0.101.7 (old major line): name-constraint checks for
|
||||
# URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain
|
||||
# from aws-smithy-http-client. The 0.103 line we actively use is patched.
|
||||
# Clearing the 0.101 copy requires the aws-sdk chain to migrate off
|
||||
# rustls 0.21.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0098
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0099
|
||||
{ id = "RUSTSEC-2026-0098", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||
{ id = "RUSTSEC-2026-0099", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||
|
||||
# rustls-webpki 0.101.7: reachable panic in CRL parsing. Same legacy
|
||||
# rustls 0.21 chain from aws-smithy-http-client as above. The 0.103 line
|
||||
# we actively use is upgraded to 0.103.13 which contains the fix.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0104
|
||||
{ id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||
|
||||
# rand 0.8.5: soundness issue only when ThreadRng reseeds inside a custom
|
||||
# logger. Reached through several transitive chains. LanceDB does not use
|
||||
# rand from a custom logger; upgrade once all pinned chains accept 0.8.6+.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0097
|
||||
{ id = "RUSTSEC-2026-0097", reason = "transitive rand 0.8.5; LanceDB does not call ThreadRng from custom logging" },
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Licenses: only allow licenses we've reviewed as compatible with Apache-2.0.
|
||||
# ---------------------------------------------------------------------------
|
||||
[licenses]
|
||||
version = 2
|
||||
# SPDX identifiers for licenses that are compatible with our Apache-2.0
|
||||
# distribution. Additions require legal review.
|
||||
allow = [
|
||||
"Apache-2.0",
|
||||
"Apache-2.0 WITH LLVM-exception",
|
||||
"MIT",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"ISC",
|
||||
"Unicode-3.0",
|
||||
"Unicode-DFS-2016",
|
||||
"Zlib",
|
||||
"CC0-1.0",
|
||||
"MPL-2.0",
|
||||
"BSL-1.0",
|
||||
"OpenSSL",
|
||||
# 0BSD ("BSD Zero Clause") is effectively public domain — no attribution
|
||||
# required. Pulled in by `mock_instant`.
|
||||
"0BSD",
|
||||
# bzip2-1.0.6 is the permissive upstream bzip2 license (BSD-like). Pulled
|
||||
# in by `libbz2-rs-sys`, the pure-Rust bzip2 implementation.
|
||||
"bzip2-1.0.6",
|
||||
# CDLA-Permissive-2.0 is a permissive data license used by `webpki-roots`
|
||||
# for the Mozilla CA root bundle. Data-only, distribution-compatible.
|
||||
"CDLA-Permissive-2.0",
|
||||
]
|
||||
confidence-threshold = 0.8
|
||||
# Crates whose license cannot be determined from Cargo metadata but whose
|
||||
# license we've manually confirmed from upstream. Keep this list minimal.
|
||||
[[licenses.clarify]]
|
||||
# polars-arrow-format omits the `license` field in its Cargo.toml, but the
|
||||
# upstream repo (pola-rs/polars-arrow-format) is dual-licensed Apache-2.0 OR
|
||||
# MIT. See https://github.com/pola-rs/polars-arrow-format/blob/main/LICENSE
|
||||
crate = "polars-arrow-format"
|
||||
expression = "Apache-2.0 OR MIT"
|
||||
license-files = []
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bans: disallow specific crates and flag dependency hygiene issues.
|
||||
# ---------------------------------------------------------------------------
|
||||
[bans]
|
||||
# Warn (not deny) on duplicate versions of the same crate. In a large
|
||||
# workspace like this one, duplicates are common and often unavoidable
|
||||
# transitively. We surface them to discourage growth, but don't fail CI.
|
||||
multiple-versions = "warn"
|
||||
# Wildcard version requirements (`foo = "*"`) are a footgun — they let any
|
||||
# future release in without review. Ban them outright.
|
||||
wildcards = "deny"
|
||||
# Internal workspace crates reference each other via `path = "..."`, which
|
||||
# cargo-deny sees as a wildcard version. That's fine for private workspace
|
||||
# members (not published to crates.io), so allow it specifically for paths.
|
||||
allow-wildcard-paths = true
|
||||
# Crates that, if present in the dependency graph, should cause the check to fail.
|
||||
deny = []
|
||||
# Crates to skip when checking for duplicate versions.
|
||||
skip = []
|
||||
# Similar to `skip`, but also skips the entire transitive subtree.
|
||||
skip-tree = []
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sources: restrict where crates can come from.
|
||||
# ---------------------------------------------------------------------------
|
||||
[sources]
|
||||
# Deny any registry other than the ones explicitly listed below.
|
||||
unknown-registry = "deny"
|
||||
# Deny any git dependency whose repository URL isn't in the allow-list below. This
|
||||
# prevents accidental pulls from arbitrary forks.
|
||||
unknown-git = "deny"
|
||||
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
|
||||
# Lance is developed in a sibling repo and pulled as a git dependency until
|
||||
# releases are cut to crates.io. Allow that specific repository.
|
||||
allow-git = [
|
||||
"https://github.com/lance-format/lance",
|
||||
]
|
||||
@@ -24,4 +24,4 @@ RUN python --version && \
|
||||
rustc --version && \
|
||||
protoc --version
|
||||
|
||||
RUN pip install --no-cache-dir tantivy lancedb
|
||||
RUN pip install --no-cache-dir lancedb
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# LanceDB Documentation
|
||||
|
||||
LanceDB docs are available at [lancedb.com/docs](https://lancedb.com/docs).
|
||||
LanceDB docs are available at [docs.lancedb.com](https://docs.lancedb.com).
|
||||
|
||||
The SDK docs are built and deployed automatically by [GitHub Actions](../.github/workflows/docs.yml)
|
||||
whenever a commit is pushed to the `main` branch. So it is possible for the docs to show
|
||||
|
||||
@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-core</artifactId>
|
||||
<version>0.27.2</version>
|
||||
<version>0.29.0</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@@ -57,32 +57,32 @@ LanceNamespace namespaceClient = LanceDbNamespaceClientBuilder.newBuilder()
|
||||
|
||||
## Metadata Operations
|
||||
|
||||
### Creating a Namespace
|
||||
### Creating a Namespace Path
|
||||
|
||||
Namespaces organize tables hierarchically. Create a namespace before creating tables within it:
|
||||
Namespace paths organize tables hierarchically. Create the desired namespace path before creating tables within it:
|
||||
|
||||
```java
|
||||
import org.lance.namespace.model.CreateNamespaceRequest;
|
||||
import org.lance.namespace.model.CreateNamespaceResponse;
|
||||
|
||||
// Create a child namespace
|
||||
// Create a child namespace path
|
||||
CreateNamespaceRequest request = new CreateNamespaceRequest();
|
||||
request.setId(Arrays.asList("my_namespace"));
|
||||
|
||||
CreateNamespaceResponse response = namespaceClient.createNamespace(request);
|
||||
```
|
||||
|
||||
You can also create nested namespaces:
|
||||
You can also create nested namespace paths:
|
||||
|
||||
```java
|
||||
// Create a nested namespace: parent/child
|
||||
// Create a nested namespace path: parent/child
|
||||
CreateNamespaceRequest request = new CreateNamespaceRequest();
|
||||
request.setId(Arrays.asList("parent_namespace", "child_namespace"));
|
||||
|
||||
CreateNamespaceResponse response = namespaceClient.createNamespace(request);
|
||||
```
|
||||
|
||||
### Describing a Namespace
|
||||
### Describing a Namespace Path
|
||||
|
||||
```java
|
||||
import org.lance.namespace.model.DescribeNamespaceRequest;
|
||||
@@ -95,22 +95,22 @@ DescribeNamespaceResponse response = namespaceClient.describeNamespace(request);
|
||||
System.out.println("Namespace properties: " + response.getProperties());
|
||||
```
|
||||
|
||||
### Listing Namespaces
|
||||
### Listing Namespace Paths
|
||||
|
||||
```java
|
||||
import org.lance.namespace.model.ListNamespacesRequest;
|
||||
import org.lance.namespace.model.ListNamespacesResponse;
|
||||
|
||||
// List all namespaces at root level
|
||||
// List all namespace paths at the root level
|
||||
ListNamespacesRequest request = new ListNamespacesRequest();
|
||||
request.setId(Arrays.asList()); // Empty for root
|
||||
|
||||
ListNamespacesResponse response = namespaceClient.listNamespaces(request);
|
||||
for (String ns : response.getNamespaces()) {
|
||||
System.out.println("Namespace: " + ns);
|
||||
System.out.println("Namespace path: " + ns);
|
||||
}
|
||||
|
||||
// List child namespaces under a parent
|
||||
// List child namespace paths under a parent path
|
||||
ListNamespacesRequest childRequest = new ListNamespacesRequest();
|
||||
childRequest.setId(Arrays.asList("parent_namespace"));
|
||||
|
||||
@@ -123,7 +123,7 @@ ListNamespacesResponse childResponse = namespaceClient.listNamespaces(childReque
|
||||
import org.lance.namespace.model.ListTablesRequest;
|
||||
import org.lance.namespace.model.ListTablesResponse;
|
||||
|
||||
// List tables in a namespace
|
||||
// List tables in a namespace path
|
||||
ListTablesRequest request = new ListTablesRequest();
|
||||
request.setId(Arrays.asList("my_namespace"));
|
||||
|
||||
@@ -133,7 +133,7 @@ for (String table : response.getTables()) {
|
||||
}
|
||||
```
|
||||
|
||||
### Dropping a Namespace
|
||||
### Dropping a Namespace Path
|
||||
|
||||
```java
|
||||
import org.lance.namespace.model.DropNamespaceRequest;
|
||||
@@ -175,7 +175,7 @@ DropTableResponse response = namespaceClient.dropTable(request);
|
||||
|
||||
### Creating a Table
|
||||
|
||||
Tables are created within a namespace by providing data in Apache Arrow IPC format:
|
||||
Tables are created within a namespace path by providing data in Apache Arrow IPC format:
|
||||
|
||||
```java
|
||||
import org.lance.namespace.LanceNamespace;
|
||||
@@ -242,7 +242,7 @@ try (BufferAllocator allocator = new RootAllocator();
|
||||
}
|
||||
byte[] tableData = out.toByteArray();
|
||||
|
||||
// Create table in a namespace
|
||||
// Create a table in a namespace path
|
||||
CreateTableRequest request = new CreateTableRequest();
|
||||
request.setId(Arrays.asList("my_namespace", "my_table"));
|
||||
CreateTableResponse response = namespaceClient.createTable(request, tableData);
|
||||
|
||||
@@ -34,7 +34,7 @@ const results = await table.vectorSearch([0.1, 0.3]).limit(20).toArray();
|
||||
console.log(results);
|
||||
```
|
||||
|
||||
The [quickstart](https://lancedb.com/docs/quickstart/basic-usage/) contains more complete examples.
|
||||
The [quickstart](https://docs.lancedb.com/quickstart/) contains more complete examples.
|
||||
|
||||
## Development
|
||||
|
||||
|
||||
@@ -61,8 +61,8 @@ sharing the same data, deletion, and index files.
|
||||
* **options.sourceVersion?**: `number`
|
||||
The version of the source table to clone.
|
||||
|
||||
* **options.targetNamespace?**: `string`[]
|
||||
The namespace for the target table (defaults to root namespace).
|
||||
* **options.targetNamespacePath?**: `string`[]
|
||||
The namespace path for the target table (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -116,13 +116,13 @@ Creates a new empty Table
|
||||
|
||||
`Promise`<[`Table`](Table.md)>
|
||||
|
||||
#### createEmptyTable(name, schema, namespace, options)
|
||||
#### createEmptyTable(name, schema, namespacePath, options)
|
||||
|
||||
```ts
|
||||
abstract createEmptyTable(
|
||||
name,
|
||||
schema,
|
||||
namespace?,
|
||||
namespacePath?,
|
||||
options?): Promise<Table>
|
||||
```
|
||||
|
||||
@@ -136,8 +136,8 @@ Creates a new empty Table
|
||||
* **schema**: [`SchemaLike`](../type-aliases/SchemaLike.md)
|
||||
The schema of the table
|
||||
|
||||
* **namespace?**: `string`[]
|
||||
The namespace to create the table in (defaults to root namespace)
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to create the table in (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
Additional options
|
||||
@@ -150,10 +150,10 @@ Creates a new empty Table
|
||||
|
||||
### createTable()
|
||||
|
||||
#### createTable(options, namespace)
|
||||
#### createTable(options, namespacePath)
|
||||
|
||||
```ts
|
||||
abstract createTable(options, namespace?): Promise<Table>
|
||||
abstract createTable(options, namespacePath?): Promise<Table>
|
||||
```
|
||||
|
||||
Creates a new Table and initializes it with new data.
|
||||
@@ -163,8 +163,8 @@ Creates a new Table and initialize it with new data.
|
||||
* **options**: `object` & `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
The options object.
|
||||
|
||||
* **namespace?**: `string`[]
|
||||
The namespace to create the table in (defaults to root namespace)
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to create the table in (defaults to root namespace)
|
||||
|
||||
##### Returns
|
||||
|
||||
@@ -197,13 +197,13 @@ Creates a new Table and initialize it with new data.
|
||||
|
||||
`Promise`<[`Table`](Table.md)>
|
||||
|
||||
#### createTable(name, data, namespace, options)
|
||||
#### createTable(name, data, namespacePath, options)
|
||||
|
||||
```ts
|
||||
abstract createTable(
|
||||
name,
|
||||
data,
|
||||
namespace?,
|
||||
namespacePath?,
|
||||
options?): Promise<Table>
|
||||
```
|
||||
|
||||
@@ -218,8 +218,8 @@ Creates a new Table and initialize it with new data.
|
||||
Non-empty Array of Records
|
||||
to be inserted into the table
|
||||
|
||||
* **namespace?**: `string`[]
|
||||
The namespace to create the table in (defaults to root namespace)
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to create the table in (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
Additional options
|
||||
@@ -247,15 +247,15 @@ Return a brief description of the connection
|
||||
### dropAllTables()
|
||||
|
||||
```ts
|
||||
abstract dropAllTables(namespace?): Promise<void>
|
||||
abstract dropAllTables(namespacePath?): Promise<void>
|
||||
```
|
||||
|
||||
Drop all tables in the database.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **namespace?**: `string`[]
|
||||
The namespace to drop tables from (defaults to root namespace).
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to drop tables from (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -266,7 +266,7 @@ Drop all tables in the database.
|
||||
### dropTable()
|
||||
|
||||
```ts
|
||||
abstract dropTable(name, namespace?): Promise<void>
|
||||
abstract dropTable(name, namespacePath?): Promise<void>
|
||||
```
|
||||
|
||||
Drop an existing table.
|
||||
@@ -276,8 +276,8 @@ Drop an existing table.
|
||||
* **name**: `string`
|
||||
The name of the table to drop.
|
||||
|
||||
* **namespace?**: `string`[]
|
||||
The namespace of the table (defaults to root namespace).
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path of the table (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -304,7 +304,7 @@ Return true if the connection has not been closed
|
||||
```ts
|
||||
abstract openTable(
|
||||
name,
|
||||
namespace?,
|
||||
namespacePath?,
|
||||
options?): Promise<Table>
|
||||
```
|
||||
|
||||
@@ -315,8 +315,8 @@ Open a table in the database.
|
||||
* **name**: `string`
|
||||
The name of the table
|
||||
|
||||
* **namespace?**: `string`[]
|
||||
The namespace of the table (defaults to root namespace)
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path of the table (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`OpenTableOptions`](../interfaces/OpenTableOptions.md)>
|
||||
Additional options
|
||||
@@ -349,10 +349,10 @@ Tables will be returned in lexicographical order.
|
||||
|
||||
`Promise`<`string`[]>
|
||||
|
||||
#### tableNames(namespace, options)
|
||||
#### tableNames(namespacePath, options)
|
||||
|
||||
```ts
|
||||
abstract tableNames(namespace?, options?): Promise<string[]>
|
||||
abstract tableNames(namespacePath?, options?): Promise<string[]>
|
||||
```
|
||||
|
||||
List all the table names in this database.
|
||||
@@ -361,8 +361,8 @@ Tables will be returned in lexicographical order.
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **namespace?**: `string`[]
|
||||
The namespace to list tables from (defaults to root namespace)
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to list tables from (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`TableNamesOptions`](../interfaces/TableNamesOptions.md)>
|
||||
options to control the
|
||||
|
||||
@@ -501,6 +501,34 @@ Modeled after ``VACUUM`` in PostgreSQL.
|
||||
|
||||
***
|
||||
|
||||
### prewarmData()
|
||||
|
||||
```ts
|
||||
abstract prewarmData(columns?): Promise<void>
|
||||
```
|
||||
|
||||
Prewarm one or more columns of data in the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columns?**: `string`[]
|
||||
The columns to prewarm. If undefined, all columns are prewarmed.
|
||||
This will load the column data into the page cache so that future queries that
|
||||
read those columns avoid the initial cold-start latency. This call initiates
|
||||
prewarming and returns once the request is accepted; the warming itself may
|
||||
continue in the background. Calling it on already-prewarmed columns is a
|
||||
no-op on the server.
|
||||
Prewarming is generally useful for columns used in filters or projections.
|
||||
Large columns (e.g. high-dimensional vectors or binary data) may not be
|
||||
practical to prewarm.
|
||||
This feature is currently only supported on remote tables.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`void`>
|
||||
|
||||
***
|
||||
|
||||
### prewarmIndex()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -53,3 +53,18 @@ optional tlsConfig: TlsConfig;
|
||||
```ts
|
||||
optional userAgent: string;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### userId?
|
||||
|
||||
```ts
|
||||
optional userId: string;
|
||||
```
|
||||
|
||||
User identifier for tracking purposes.
|
||||
|
||||
This is sent as the `x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
|
||||
It can be set directly, or via the `LANCEDB_USER_ID` environment variable.
|
||||
Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another environment
|
||||
variable that contains the user ID value.
|
||||
|
||||
@@ -41,6 +41,29 @@ for testing purposes.
|
||||
|
||||
***
|
||||
|
||||
### manifestEnabled?
|
||||
|
||||
```ts
|
||||
optional manifestEnabled: boolean;
|
||||
```
|
||||
|
||||
(For LanceDB OSS only): use directory namespace manifests as the source
|
||||
of truth for table metadata. Existing directory-listed root tables are
|
||||
migrated into the manifest on access.
|
||||
|
||||
***
|
||||
|
||||
### namespaceClientProperties?
|
||||
|
||||
```ts
|
||||
optional namespaceClientProperties: Record<string, string>;
|
||||
```
|
||||
|
||||
(For LanceDB OSS only): extra properties for the backing namespace
|
||||
client used by manifest-enabled native connections.
|
||||
|
||||
***
|
||||
|
||||
### readConsistencyInterval?
|
||||
|
||||
```ts
|
||||
@@ -89,4 +112,4 @@ optional storageOptions: Record<string, string>;
|
||||
|
||||
(For LanceDB OSS only): configuration for object storage.
|
||||
|
||||
The available options are described at https://lancedb.com/docs/storage/
|
||||
The available options are described at https://docs.lancedb.com/storage/
|
||||
|
||||
@@ -97,4 +97,4 @@ Configuration for object storage.
|
||||
Options already set on the connection will be inherited by the table,
|
||||
but can be overridden here.
|
||||
|
||||
The available options are described at https://lancedb.com/docs/storage/
|
||||
The available options are described at https://docs.lancedb.com/storage/
|
||||
|
||||
@@ -42,4 +42,4 @@ Configuration for object storage.
|
||||
Options already set on the connection will be inherited by the table,
|
||||
but can be overridden here.
|
||||
|
||||
The available options are described at https://lancedb.com/docs/storage/
|
||||
The available options are described at https://docs.lancedb.com/storage/
|
||||
|
||||
@@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and
|
||||
|
||||
## Full text search
|
||||
|
||||
::: lancedb.fts.create_index
|
||||
Use [lancedb.table.Table.create_fts_index][] for the synchronous API or
|
||||
[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the
|
||||
asynchronous API.
|
||||
|
||||
::: lancedb.fts.populate_index
|
||||
|
||||
::: lancedb.fts.search_index
|
||||
::: lancedb.index.FTS
|
||||
|
||||
## Utilities
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.27.2-final.0</version>
|
||||
<version>0.29.0-final.0</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.27.2-final.0</version>
|
||||
<version>0.29.0-final.0</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
@@ -28,7 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-core.version>3.0.1</lance-core.version>
|
||||
<lance-core.version>6.0.0</lance-core.version>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.27.2"
|
||||
version = "0.29.0"
|
||||
publish = false
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
@@ -15,7 +16,7 @@ crate-type = ["cdylib"]
|
||||
async-trait.workspace = true
|
||||
arrow-ipc.workspace = true
|
||||
arrow-array.workspace = true
|
||||
arrow-buffer = "57.2"
|
||||
arrow-buffer = "58.0.0"
|
||||
half.workspace = true
|
||||
arrow-schema.workspace = true
|
||||
env_logger.workspace = true
|
||||
@@ -31,8 +32,8 @@ lzma-sys = { version = "0.1", features = ["static"] }
|
||||
log.workspace = true
|
||||
|
||||
# Pin to resolve build failures; update periodically for security patches.
|
||||
aws-lc-sys = "=0.38.0"
|
||||
aws-lc-rs = "=1.16.1"
|
||||
aws-lc-sys = "=0.40.0"
|
||||
aws-lc-rs = "=1.16.3"
|
||||
|
||||
[build-dependencies]
|
||||
napi-build = "2.3.1"
|
||||
|
||||
@@ -30,7 +30,7 @@ const results = await table.vectorSearch([0.1, 0.3]).limit(20).toArray();
|
||||
console.log(results);
|
||||
```
|
||||
|
||||
The [quickstart](https://lancedb.com/docs/quickstart/basic-usage/) contains more complete examples.
|
||||
The [quickstart](https://docs.lancedb.com/quickstart/) contains more complete examples.
|
||||
|
||||
## Development
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import { spawn } from "node:child_process";
|
||||
import * as path from "node:path";
|
||||
import { RecordBatch } from "apache-arrow";
|
||||
import * as tmp from "tmp";
|
||||
import { Connection, Index, Table, connect, makeArrowTable } from "../lancedb";
|
||||
@@ -76,4 +78,91 @@ describe("rerankers", function () {
|
||||
|
||||
expect(result).toHaveLength(2);
|
||||
});
|
||||
|
||||
it("does not keep process alive after rerank query", async function () {
|
||||
const script = `
|
||||
import * as lancedb from "./dist/index.js";
|
||||
import * as os from "node:os";
|
||||
import * as path from "node:path";
|
||||
import * as fs from "node:fs/promises";
|
||||
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "lancedb-rerank-exit-"));
|
||||
const db = await lancedb.connect(dir);
|
||||
const table = await db.createTable("test", [{ text: "hello", vector: [1, 2, 3] }], {
|
||||
mode: "overwrite",
|
||||
});
|
||||
await table.createIndex("text", { config: lancedb.Index.fts() });
|
||||
await table.waitForIndex(["text_idx"], 30);
|
||||
|
||||
const reranker = await lancedb.rerankers.RRFReranker.create();
|
||||
await table
|
||||
.query()
|
||||
.nearestTo([1, 2, 3])
|
||||
.fullTextSearch("hello")
|
||||
.rerank(reranker)
|
||||
.toArray();
|
||||
|
||||
table.close();
|
||||
db.close();
|
||||
`;
|
||||
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
const child = spawn(
|
||||
process.execPath,
|
||||
["--input-type=module", "-e", script],
|
||||
{
|
||||
cwd: path.resolve(__dirname, ".."),
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
},
|
||||
);
|
||||
|
||||
let stdout = "";
|
||||
let stderr = "";
|
||||
|
||||
child.stdout.on("data", (chunk) => {
|
||||
stdout += chunk.toString();
|
||||
});
|
||||
|
||||
child.stderr.on("data", (chunk) => {
|
||||
stderr += chunk.toString();
|
||||
});
|
||||
|
||||
const timeout = setTimeout(() => {
|
||||
child.kill();
|
||||
reject(
|
||||
new Error(
|
||||
`child process did not exit in time\nstdout:\n${stdout}\nstderr:\n${stderr}`,
|
||||
),
|
||||
);
|
||||
}, 20_000);
|
||||
|
||||
child.on("error", (err) => {
|
||||
clearTimeout(timeout);
|
||||
reject(err);
|
||||
});
|
||||
|
||||
child.on("exit", (code, signal) => {
|
||||
clearTimeout(timeout);
|
||||
if (signal !== null) {
|
||||
reject(
|
||||
new Error(
|
||||
`child process exited with signal ${signal}\nstdout:\n${stdout}\nstderr:\n${stderr}`,
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code !== 0) {
|
||||
reject(
|
||||
new Error(
|
||||
`child process exited with code ${code}\nstdout:\n${stdout}\nstderr:\n${stderr}`,
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
resolve();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -103,7 +103,7 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
},
|
||||
numIndices: 0,
|
||||
numRows: 3,
|
||||
totalBytes: 24,
|
||||
totalBytes: 44,
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1870,6 +1870,25 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(results.length).toBe(3);
|
||||
});
|
||||
|
||||
test("prewarmData errors on local tables", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "alpha", vector: [0.1, 0.2, 0.3] },
|
||||
{ text: "beta", vector: [0.4, 0.5, 0.6] },
|
||||
];
|
||||
const table = await db.createTable("prewarm_data_test", data);
|
||||
|
||||
// prewarmData is only supported on remote tables. We verify the call
|
||||
// is wired through napi and surfaces the expected error for both
|
||||
// arg shapes (undefined and string[]).
|
||||
await expect(table.prewarmData()).rejects.toThrow(
|
||||
"prewarm_data is currently only supported on remote tables",
|
||||
);
|
||||
await expect(table.prewarmData(["text"])).rejects.toThrow(
|
||||
"prewarm_data is currently only supported on remote tables",
|
||||
);
|
||||
});
|
||||
|
||||
test("full text index on list", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
|
||||
@@ -42,7 +42,7 @@ export interface CreateTableOptions {
|
||||
* Options already set on the connection will be inherited by the table,
|
||||
* but can be overridden here.
|
||||
*
|
||||
* The available options are described at https://lancedb.com/docs/storage/
|
||||
* The available options are described at https://docs.lancedb.com/storage/
|
||||
*/
|
||||
storageOptions?: Record<string, string>;
|
||||
|
||||
@@ -78,7 +78,7 @@ export interface OpenTableOptions {
|
||||
* Options already set on the connection will be inherited by the table,
|
||||
* but can be overridden here.
|
||||
*
|
||||
* The available options are described at https://lancedb.com/docs/storage/
|
||||
* The available options are described at https://docs.lancedb.com/storage/
|
||||
*/
|
||||
storageOptions?: Record<string, string>;
|
||||
/**
|
||||
@@ -166,25 +166,25 @@ export abstract class Connection {
|
||||
* List all the table names in this database.
|
||||
*
|
||||
* Tables will be returned in lexicographical order.
|
||||
* @param {string[]} namespace - The namespace to list tables from (defaults to root namespace)
|
||||
* @param {string[]} namespacePath - The namespace path to list tables from (defaults to root namespace)
|
||||
* @param {Partial<TableNamesOptions>} options - options to control the
|
||||
* paging / start point
|
||||
*
|
||||
*/
|
||||
abstract tableNames(
|
||||
namespace?: string[],
|
||||
namespacePath?: string[],
|
||||
options?: Partial<TableNamesOptions>,
|
||||
): Promise<string[]>;
|
||||
|
||||
/**
|
||||
* Open a table in the database.
|
||||
* @param {string} name - The name of the table
|
||||
* @param {string[]} namespace - The namespace of the table (defaults to root namespace)
|
||||
* @param {string[]} namespacePath - The namespace path of the table (defaults to root namespace)
|
||||
* @param {Partial<OpenTableOptions>} options - Additional options
|
||||
*/
|
||||
abstract openTable(
|
||||
name: string,
|
||||
namespace?: string[],
|
||||
namespacePath?: string[],
|
||||
options?: Partial<OpenTableOptions>,
|
||||
): Promise<Table>;
|
||||
|
||||
@@ -193,7 +193,7 @@ export abstract class Connection {
|
||||
* @param {object} options - The options object.
|
||||
* @param {string} options.name - The name of the table.
|
||||
* @param {Data} options.data - Non-empty Array of Records to be inserted into the table
|
||||
* @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
|
||||
* @param {string[]} namespacePath - The namespace path to create the table in (defaults to root namespace)
|
||||
*
|
||||
*/
|
||||
abstract createTable(
|
||||
@@ -201,7 +201,7 @@ export abstract class Connection {
|
||||
name: string;
|
||||
data: Data;
|
||||
} & Partial<CreateTableOptions>,
|
||||
namespace?: string[],
|
||||
namespacePath?: string[],
|
||||
): Promise<Table>;
|
||||
/**
|
||||
* Creates a new Table and initializes it with new data.
|
||||
@@ -220,13 +220,13 @@ export abstract class Connection {
|
||||
* @param {string} name - The name of the table.
|
||||
* @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
|
||||
* to be inserted into the table
|
||||
* @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
|
||||
* @param {string[]} namespacePath - The namespace path to create the table in (defaults to root namespace)
|
||||
* @param {Partial<CreateTableOptions>} options - Additional options
|
||||
*/
|
||||
abstract createTable(
|
||||
name: string,
|
||||
data: Record<string, unknown>[] | TableLike,
|
||||
namespace?: string[],
|
||||
namespacePath?: string[],
|
||||
options?: Partial<CreateTableOptions>,
|
||||
): Promise<Table>;
|
||||
|
||||
@@ -245,28 +245,28 @@ export abstract class Connection {
|
||||
* Creates a new empty Table
|
||||
* @param {string} name - The name of the table.
|
||||
* @param {Schema} schema - The schema of the table
|
||||
* @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
|
||||
* @param {string[]} namespacePath - The namespace path to create the table in (defaults to root namespace)
|
||||
* @param {Partial<CreateTableOptions>} options - Additional options
|
||||
*/
|
||||
abstract createEmptyTable(
|
||||
name: string,
|
||||
schema: import("./arrow").SchemaLike,
|
||||
namespace?: string[],
|
||||
namespacePath?: string[],
|
||||
options?: Partial<CreateTableOptions>,
|
||||
): Promise<Table>;
|
||||
|
||||
/**
|
||||
* Drop an existing table.
|
||||
* @param {string} name The name of the table to drop.
|
||||
* @param {string[]} namespace The namespace of the table (defaults to root namespace).
|
||||
* @param {string[]} namespacePath The namespace path of the table (defaults to root namespace).
|
||||
*/
|
||||
abstract dropTable(name: string, namespace?: string[]): Promise<void>;
|
||||
abstract dropTable(name: string, namespacePath?: string[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Drop all tables in the database.
|
||||
* @param {string[]} namespace The namespace to drop tables from (defaults to root namespace).
|
||||
* @param {string[]} namespacePath The namespace path to drop tables from (defaults to root namespace).
|
||||
*/
|
||||
abstract dropAllTables(namespace?: string[]): Promise<void>;
|
||||
abstract dropAllTables(namespacePath?: string[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Clone a table from a source table.
|
||||
@@ -279,7 +279,7 @@ export abstract class Connection {
|
||||
* @param {string} targetTableName - The name of the target table to create.
|
||||
* @param {string} sourceUri - The URI of the source table to clone from.
|
||||
* @param {object} options - Clone options.
|
||||
* @param {string[]} options.targetNamespace - The namespace for the target table (defaults to root namespace).
|
||||
* @param {string[]} options.targetNamespacePath - The namespace path for the target table (defaults to root namespace).
|
||||
* @param {number} options.sourceVersion - The version of the source table to clone.
|
||||
* @param {string} options.sourceTag - The tag of the source table to clone.
|
||||
* @param {boolean} options.isShallow - Whether to perform a shallow clone (defaults to true).
|
||||
@@ -288,7 +288,7 @@ export abstract class Connection {
|
||||
targetTableName: string,
|
||||
sourceUri: string,
|
||||
options?: {
|
||||
targetNamespace?: string[];
|
||||
targetNamespacePath?: string[];
|
||||
sourceVersion?: number;
|
||||
sourceTag?: string;
|
||||
isShallow?: boolean;
|
||||
@@ -319,25 +319,25 @@ export class LocalConnection extends Connection {
|
||||
}
|
||||
|
||||
async tableNames(
|
||||
namespaceOrOptions?: string[] | Partial<TableNamesOptions>,
|
||||
namespacePathOrOptions?: string[] | Partial<TableNamesOptions>,
|
||||
options?: Partial<TableNamesOptions>,
|
||||
): Promise<string[]> {
|
||||
// Detect if first argument is namespace array or options object
|
||||
let namespace: string[] | undefined;
|
||||
// Detect if first argument is namespacePath array or options object
|
||||
let namespacePath: string[] | undefined;
|
||||
let tableNamesOptions: Partial<TableNamesOptions> | undefined;
|
||||
|
||||
if (Array.isArray(namespaceOrOptions)) {
|
||||
// First argument is namespace array
|
||||
namespace = namespaceOrOptions;
|
||||
if (Array.isArray(namespacePathOrOptions)) {
|
||||
// First argument is namespacePath array
|
||||
namespacePath = namespacePathOrOptions;
|
||||
tableNamesOptions = options;
|
||||
} else {
|
||||
// First argument is options object (backwards compatibility)
|
||||
namespace = undefined;
|
||||
tableNamesOptions = namespaceOrOptions;
|
||||
namespacePath = undefined;
|
||||
tableNamesOptions = namespacePathOrOptions;
|
||||
}
|
||||
|
||||
return this.inner.tableNames(
|
||||
namespace ?? [],
|
||||
namespacePath ?? [],
|
||||
tableNamesOptions?.startAfter,
|
||||
tableNamesOptions?.limit,
|
||||
);
|
||||
@@ -345,12 +345,12 @@ export class LocalConnection extends Connection {
|
||||
|
||||
async openTable(
|
||||
name: string,
|
||||
namespace?: string[],
|
||||
namespacePath?: string[],
|
||||
options?: Partial<OpenTableOptions>,
|
||||
): Promise<Table> {
|
||||
const innerTable = await this.inner.openTable(
|
||||
name,
|
||||
namespace ?? [],
|
||||
namespacePath ?? [],
|
||||
cleanseStorageOptions(options?.storageOptions),
|
||||
options?.indexCacheSize,
|
||||
);
|
||||
@@ -362,7 +362,7 @@ export class LocalConnection extends Connection {
|
||||
targetTableName: string,
|
||||
sourceUri: string,
|
||||
options?: {
|
||||
targetNamespace?: string[];
|
||||
targetNamespacePath?: string[];
|
||||
sourceVersion?: number;
|
||||
sourceTag?: string;
|
||||
isShallow?: boolean;
|
||||
@@ -371,7 +371,7 @@ export class LocalConnection extends Connection {
|
||||
const innerTable = await this.inner.cloneTable(
|
||||
targetTableName,
|
||||
sourceUri,
|
||||
options?.targetNamespace ?? [],
|
||||
options?.targetNamespacePath ?? [],
|
||||
options?.sourceVersion ?? null,
|
||||
options?.sourceTag ?? null,
|
||||
options?.isShallow ?? true,
|
||||
@@ -406,42 +406,42 @@ export class LocalConnection extends Connection {
|
||||
nameOrOptions:
|
||||
| string
|
||||
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
|
||||
dataOrNamespace?: Record<string, unknown>[] | TableLike | string[],
|
||||
namespaceOrOptions?: string[] | Partial<CreateTableOptions>,
|
||||
dataOrNamespacePath?: Record<string, unknown>[] | TableLike | string[],
|
||||
namespacePathOrOptions?: string[] | Partial<CreateTableOptions>,
|
||||
options?: Partial<CreateTableOptions>,
|
||||
): Promise<Table> {
|
||||
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
|
||||
// First overload: createTable(options, namespace?)
|
||||
// First overload: createTable(options, namespacePath?)
|
||||
const { name, data, ...createOptions } = nameOrOptions;
|
||||
const namespace = dataOrNamespace as string[] | undefined;
|
||||
return this._createTableImpl(name, data, namespace, createOptions);
|
||||
const namespacePath = dataOrNamespacePath as string[] | undefined;
|
||||
return this._createTableImpl(name, data, namespacePath, createOptions);
|
||||
}
|
||||
|
||||
// Second overload: createTable(name, data, namespace?, options?)
|
||||
// Second overload: createTable(name, data, namespacePath?, options?)
|
||||
const name = nameOrOptions;
|
||||
const data = dataOrNamespace as Record<string, unknown>[] | TableLike;
|
||||
const data = dataOrNamespacePath as Record<string, unknown>[] | TableLike;
|
||||
|
||||
// Detect if third argument is namespace array or options object
|
||||
let namespace: string[] | undefined;
|
||||
// Detect if third argument is namespacePath array or options object
|
||||
let namespacePath: string[] | undefined;
|
||||
let createOptions: Partial<CreateTableOptions> | undefined;
|
||||
|
||||
if (Array.isArray(namespaceOrOptions)) {
|
||||
// Third argument is namespace array
|
||||
namespace = namespaceOrOptions;
|
||||
if (Array.isArray(namespacePathOrOptions)) {
|
||||
// Third argument is namespacePath array
|
||||
namespacePath = namespacePathOrOptions;
|
||||
createOptions = options;
|
||||
} else {
|
||||
// Third argument is options object (backwards compatibility)
|
||||
namespace = undefined;
|
||||
createOptions = namespaceOrOptions;
|
||||
namespacePath = undefined;
|
||||
createOptions = namespacePathOrOptions;
|
||||
}
|
||||
|
||||
return this._createTableImpl(name, data, namespace, createOptions);
|
||||
return this._createTableImpl(name, data, namespacePath, createOptions);
|
||||
}
|
||||
|
||||
private async _createTableImpl(
|
||||
name: string,
|
||||
data: Data,
|
||||
namespace?: string[],
|
||||
namespacePath?: string[],
|
||||
options?: Partial<CreateTableOptions>,
|
||||
): Promise<Table> {
|
||||
if (data === undefined) {
|
||||
@@ -455,7 +455,7 @@ export class LocalConnection extends Connection {
|
||||
name,
|
||||
buf,
|
||||
mode,
|
||||
namespace ?? [],
|
||||
namespacePath ?? [],
|
||||
storageOptions,
|
||||
);
|
||||
|
||||
@@ -465,21 +465,21 @@ export class LocalConnection extends Connection {
|
||||
async createEmptyTable(
|
||||
name: string,
|
||||
schema: import("./arrow").SchemaLike,
|
||||
namespaceOrOptions?: string[] | Partial<CreateTableOptions>,
|
||||
namespacePathOrOptions?: string[] | Partial<CreateTableOptions>,
|
||||
options?: Partial<CreateTableOptions>,
|
||||
): Promise<Table> {
|
||||
// Detect if third argument is namespace array or options object
|
||||
let namespace: string[] | undefined;
|
||||
// Detect if third argument is namespacePath array or options object
|
||||
let namespacePath: string[] | undefined;
|
||||
let createOptions: Partial<CreateTableOptions> | undefined;
|
||||
|
||||
if (Array.isArray(namespaceOrOptions)) {
|
||||
// Third argument is namespace array
|
||||
namespace = namespaceOrOptions;
|
||||
if (Array.isArray(namespacePathOrOptions)) {
|
||||
// Third argument is namespacePath array
|
||||
namespacePath = namespacePathOrOptions;
|
||||
createOptions = options;
|
||||
} else {
|
||||
// Third argument is options object (backwards compatibility)
|
||||
namespace = undefined;
|
||||
createOptions = namespaceOrOptions;
|
||||
namespacePath = undefined;
|
||||
createOptions = namespacePathOrOptions;
|
||||
}
|
||||
|
||||
let mode: string = createOptions?.mode ?? "create";
|
||||
@@ -502,18 +502,18 @@ export class LocalConnection extends Connection {
|
||||
name,
|
||||
buf,
|
||||
mode,
|
||||
namespace ?? [],
|
||||
namespacePath ?? [],
|
||||
storageOptions,
|
||||
);
|
||||
return new LocalTable(innerTable);
|
||||
}
|
||||
|
||||
async dropTable(name: string, namespace?: string[]): Promise<void> {
|
||||
return this.inner.dropTable(name, namespace ?? []);
|
||||
async dropTable(name: string, namespacePath?: string[]): Promise<void> {
|
||||
return this.inner.dropTable(name, namespacePath ?? []);
|
||||
}
|
||||
|
||||
async dropAllTables(namespace?: string[]): Promise<void> {
|
||||
return this.inner.dropAllTables(namespace ?? []);
|
||||
async dropAllTables(namespacePath?: string[]): Promise<void> {
|
||||
return this.inner.dropAllTables(namespacePath ?? []);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -285,6 +285,25 @@ export abstract class Table {
|
||||
*/
|
||||
abstract prewarmIndex(name: string): Promise<void>;
|
||||
|
||||
/**
|
||||
* Prewarm one or more columns of data in the table.
|
||||
*
|
||||
* @param columns The columns to prewarm. If undefined, all columns are prewarmed.
|
||||
*
|
||||
* This will load the column data into the page cache so that future queries that
|
||||
* read those columns avoid the initial cold-start latency. This call initiates
|
||||
* prewarming and returns once the request is accepted; the warming itself may
|
||||
* continue in the background. Calling it on already-prewarmed columns is a
|
||||
* no-op on the server.
|
||||
*
|
||||
* Prewarming is generally useful for columns used in filters or projections.
|
||||
* Large columns (e.g. high-dimensional vectors or binary data) may not be
|
||||
* practical to prewarm.
|
||||
*
|
||||
* This feature is currently only supported on remote tables.
|
||||
*/
|
||||
abstract prewarmData(columns?: string[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Waits for asynchronous indexing to complete on the table.
|
||||
*
|
||||
@@ -710,6 +729,10 @@ export class LocalTable extends Table {
|
||||
await this.inner.prewarmIndex(name);
|
||||
}
|
||||
|
||||
async prewarmData(columns?: string[]): Promise<void> {
|
||||
await this.inner.prewarmData(columns);
|
||||
}
|
||||
|
||||
async waitForIndex(
|
||||
indexNames: string[],
|
||||
timeoutSeconds: number,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.27.2",
|
||||
"version": "0.28.0-beta.11",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.27.2",
|
||||
"version": "0.28.0-beta.11",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.27.2",
|
||||
"version": "0.29.0",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
@@ -75,7 +75,6 @@
|
||||
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
|
||||
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
||||
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
|
||||
"postbuild:release": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
||||
"build": "npm run build:debug && npm run tsc",
|
||||
"build-release": "npm run build:release && npm run tsc",
|
||||
"tsc": "tsc -b",
|
||||
|
||||
@@ -67,6 +67,12 @@ impl Connection {
|
||||
builder = builder.storage_option(key, value);
|
||||
}
|
||||
}
|
||||
if let Some(manifest_enabled) = options.manifest_enabled {
|
||||
builder = builder.manifest_enabled(manifest_enabled);
|
||||
}
|
||||
if let Some(namespace_client_properties) = options.namespace_client_properties {
|
||||
builder = builder.namespace_client_properties(namespace_client_properties);
|
||||
}
|
||||
|
||||
// Create client config, optionally with header provider
|
||||
let client_config = options.client_config.unwrap_or_default();
|
||||
@@ -119,12 +125,12 @@ impl Connection {
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn table_names(
|
||||
&self,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
start_after: Option<String>,
|
||||
limit: Option<u32>,
|
||||
) -> napi::Result<Vec<String>> {
|
||||
let mut op = self.get_inner()?.table_names();
|
||||
op = op.namespace(namespace);
|
||||
op = op.namespace(namespace_path.unwrap_or_default());
|
||||
if let Some(start_after) = start_after {
|
||||
op = op.start_after(start_after);
|
||||
}
|
||||
@@ -146,7 +152,7 @@ impl Connection {
|
||||
name: String,
|
||||
buf: Buffer,
|
||||
mode: String,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
) -> napi::Result<Table> {
|
||||
let batches = ipc_file_to_batches(buf.to_vec())
|
||||
@@ -154,7 +160,7 @@ impl Connection {
|
||||
let mode = Self::parse_create_mode_str(&mode)?;
|
||||
let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);
|
||||
|
||||
builder = builder.namespace(namespace);
|
||||
builder = builder.namespace(namespace_path.unwrap_or_default());
|
||||
|
||||
if let Some(storage_options) = storage_options {
|
||||
for (key, value) in storage_options {
|
||||
@@ -171,7 +177,7 @@ impl Connection {
|
||||
name: String,
|
||||
schema_buf: Buffer,
|
||||
mode: String,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
) -> napi::Result<Table> {
|
||||
let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
|
||||
@@ -183,7 +189,7 @@ impl Connection {
|
||||
.create_empty_table(&name, schema)
|
||||
.mode(mode);
|
||||
|
||||
builder = builder.namespace(namespace);
|
||||
builder = builder.namespace(namespace_path.unwrap_or_default());
|
||||
|
||||
if let Some(storage_options) = storage_options {
|
||||
for (key, value) in storage_options {
|
||||
@@ -198,13 +204,13 @@ impl Connection {
|
||||
pub async fn open_table(
|
||||
&self,
|
||||
name: String,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
index_cache_size: Option<u32>,
|
||||
) -> napi::Result<Table> {
|
||||
let mut builder = self.get_inner()?.open_table(&name);
|
||||
|
||||
builder = builder.namespace(namespace);
|
||||
builder = builder.namespace(namespace_path.unwrap_or_default());
|
||||
|
||||
if let Some(storage_options) = storage_options {
|
||||
for (key, value) in storage_options {
|
||||
@@ -223,7 +229,7 @@ impl Connection {
|
||||
&self,
|
||||
target_table_name: String,
|
||||
source_uri: String,
|
||||
target_namespace: Vec<String>,
|
||||
target_namespace_path: Option<Vec<String>>,
|
||||
source_version: Option<i64>,
|
||||
source_tag: Option<String>,
|
||||
is_shallow: bool,
|
||||
@@ -232,7 +238,7 @@ impl Connection {
|
||||
.get_inner()?
|
||||
.clone_table(&target_table_name, &source_uri);
|
||||
|
||||
builder = builder.target_namespace(target_namespace);
|
||||
builder = builder.target_namespace(target_namespace_path.unwrap_or_default());
|
||||
|
||||
if let Some(version) = source_version {
|
||||
builder = builder.source_version(version as u64);
|
||||
@@ -250,18 +256,21 @@ impl Connection {
|
||||
|
||||
/// Drop table with the name. Or raise an error if the table does not exist.
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn drop_table(&self, name: String, namespace: Vec<String>) -> napi::Result<()> {
|
||||
pub async fn drop_table(
|
||||
&self,
|
||||
name: String,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
) -> napi::Result<()> {
|
||||
let ns = namespace_path.unwrap_or_default();
|
||||
self.get_inner()?
|
||||
.drop_table(&name, &namespace)
|
||||
.drop_table(&name, &ns)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn drop_all_tables(&self, namespace: Vec<String>) -> napi::Result<()> {
|
||||
self.get_inner()?
|
||||
.drop_all_tables(&namespace)
|
||||
.await
|
||||
.default_error()
|
||||
pub async fn drop_all_tables(&self, namespace_path: Option<Vec<String>>) -> napi::Result<()> {
|
||||
let ns = namespace_path.unwrap_or_default();
|
||||
self.get_inner()?.drop_all_tables(&ns).await.default_error()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,8 +35,15 @@ pub struct ConnectionOptions {
|
||||
pub read_consistency_interval: Option<f64>,
|
||||
/// (For LanceDB OSS only): configuration for object storage.
|
||||
///
|
||||
/// The available options are described at https://lancedb.com/docs/storage/
|
||||
/// The available options are described at https://docs.lancedb.com/storage/
|
||||
pub storage_options: Option<HashMap<String, String>>,
|
||||
/// (For LanceDB OSS only): use directory namespace manifests as the source
|
||||
/// of truth for table metadata. Existing directory-listed root tables are
|
||||
/// migrated into the manifest on access.
|
||||
pub manifest_enabled: Option<bool>,
|
||||
/// (For LanceDB OSS only): extra properties for the backing namespace
|
||||
/// client used by manifest-enabled native connections.
|
||||
pub namespace_client_properties: Option<HashMap<String, String>>,
|
||||
/// (For LanceDB OSS only): the session to use for this connection. Holds
|
||||
/// shared caches and other session-specific state.
|
||||
pub session: Option<session::Session>,
|
||||
|
||||
@@ -92,6 +92,13 @@ pub struct ClientConfig {
|
||||
pub extra_headers: Option<HashMap<String, String>>,
|
||||
pub id_delimiter: Option<String>,
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
/// User identifier for tracking purposes.
|
||||
///
|
||||
/// This is sent as the `x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
|
||||
/// It can be set directly, or via the `LANCEDB_USER_ID` environment variable.
|
||||
/// Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another environment
|
||||
/// variable that contains the user ID value.
|
||||
pub user_id: Option<String>,
|
||||
}
|
||||
|
||||
impl From<TimeoutConfig> for lancedb::remote::TimeoutConfig {
|
||||
@@ -145,6 +152,7 @@ impl From<ClientConfig> for lancedb::remote::ClientConfig {
|
||||
id_delimiter: config.id_delimiter,
|
||||
tls_config: config.tls_config.map(Into::into),
|
||||
header_provider: None, // the header provider is set separately later
|
||||
user_id: config.user_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@ type RerankHybridFn = ThreadsafeFunction<
|
||||
RerankHybridCallbackArgs,
|
||||
Status,
|
||||
false,
|
||||
true,
|
||||
>;
|
||||
|
||||
/// Reranker implementation that "wraps" a NodeJS Reranker implementation.
|
||||
@@ -32,7 +33,10 @@ impl Reranker {
|
||||
pub fn new(
|
||||
rerank_hybrid: Function<RerankHybridCallbackArgs, Promise<Buffer>>,
|
||||
) -> napi::Result<Self> {
|
||||
let rerank_hybrid = rerank_hybrid.build_threadsafe_function().build()?;
|
||||
let rerank_hybrid = rerank_hybrid
|
||||
.build_threadsafe_function()
|
||||
.weak::<true>()
|
||||
.build()?;
|
||||
Ok(Self { rerank_hybrid })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,6 +159,14 @@ impl Table {
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn prewarm_data(&self, columns: Option<Vec<String>>) -> napi::Result<()> {
|
||||
self.inner_ref()?
|
||||
.prewarm_data(columns)
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn wait_for_index(&self, index_names: Vec<String>, timeout_s: i64) -> Result<()> {
|
||||
let timeout = std::time::Duration::from_secs(timeout_s.try_into().unwrap());
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.30.2"
|
||||
current_version = "0.32.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.30.2"
|
||||
version = "0.32.0"
|
||||
publish = false
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
@@ -14,7 +15,7 @@ name = "_lancedb"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
arrow = { version = "57.2", features = ["pyarrow"] }
|
||||
arrow = { version = "58.0.0", features = ["pyarrow"] }
|
||||
async-trait = "0.1"
|
||||
bytes = "1"
|
||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||
@@ -24,8 +25,8 @@ lance-namespace-impls.workspace = true
|
||||
lance-io.workspace = true
|
||||
env_logger.workspace = true
|
||||
log.workspace = true
|
||||
pyo3 = { version = "0.26", features = ["extension-module", "abi3-py39"] }
|
||||
pyo3-async-runtimes = { version = "0.26", features = [
|
||||
pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
|
||||
pyo3-async-runtimes = { version = "0.28", features = [
|
||||
"attributes",
|
||||
"tokio-runtime",
|
||||
] }
|
||||
@@ -34,10 +35,11 @@ futures.workspace = true
|
||||
serde = "1"
|
||||
serde_json = "1"
|
||||
snafu.workspace = true
|
||||
tokio = { version = "1.40", features = ["sync"] }
|
||||
tokio = { version = "1.40", features = ["sync", "rt-multi-thread"] }
|
||||
libc = "0.2"
|
||||
|
||||
[build-dependencies]
|
||||
pyo3-build-config = { version = "0.26", features = [
|
||||
pyo3-build-config = { version = "0.28", features = [
|
||||
"extension-module",
|
||||
"abi3-py39",
|
||||
] }
|
||||
|
||||
@@ -183,7 +183,6 @@
|
||||
| stack-data | 0.6.3 | MIT License | http://github.com/alexmojaki/stack_data |
|
||||
| sympy | 1.14.0 | BSD License | https://sympy.org |
|
||||
| tabulate | 0.9.0 | MIT License | https://github.com/astanin/python-tabulate |
|
||||
| tantivy | 0.25.1 | UNKNOWN | UNKNOWN |
|
||||
| threadpoolctl | 3.6.0 | BSD License | https://github.com/joblib/threadpoolctl |
|
||||
| timm | 1.0.24 | Apache Software License | https://github.com/huggingface/pytorch-image-models |
|
||||
| tinycss2 | 1.4.0 | BSD License | https://www.courtbouillon.org/tinycss2 |
|
||||
|
||||
@@ -45,7 +45,7 @@ repository = "https://github.com/lancedb/lancedb"
|
||||
|
||||
[project.optional-dependencies]
|
||||
pylance = [
|
||||
"pylance>=4.0.0b7",
|
||||
"pylance>=6.0.0",
|
||||
]
|
||||
tests = [
|
||||
"aiohttp>=3.9.0",
|
||||
@@ -57,9 +57,8 @@ tests = [
|
||||
"duckdb>=0.9.0",
|
||||
"pytz>=2023.3",
|
||||
"polars>=0.19, <=1.3.0",
|
||||
"tantivy>=0.20.0",
|
||||
"pyarrow-stubs>=16.0",
|
||||
"pylance>=4.0.0b7",
|
||||
"pylance>=6.0.0",
|
||||
"requests>=2.31.0",
|
||||
"datafusion>=52,<53",
|
||||
]
|
||||
@@ -83,7 +82,7 @@ embeddings = [
|
||||
"colpali-engine>=0.3.10",
|
||||
"huggingface_hub>=0.19.0",
|
||||
"InstructorEmbedding>=1.0.1",
|
||||
"google.generativeai>=0.3.0",
|
||||
"google-genai>=1.0.0",
|
||||
"boto3>=1.28.57",
|
||||
"awscli>=1.44.38",
|
||||
"botocore>=1.31.57",
|
||||
|
||||
@@ -6,8 +6,7 @@ import importlib.metadata
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import timedelta
|
||||
from typing import Dict, Optional, Union, Any
|
||||
import warnings
|
||||
from typing import Dict, Optional, Union, Any, List
|
||||
|
||||
__version__ = importlib.metadata.version("lancedb")
|
||||
|
||||
@@ -15,7 +14,6 @@ from ._lancedb import connect as lancedb_connect
|
||||
from .common import URI, sanitize_uri
|
||||
from urllib.parse import urlparse
|
||||
from .db import AsyncConnection, DBConnection, LanceDBConnection
|
||||
from .io import StorageOptionsProvider
|
||||
from .remote import ClientConfig
|
||||
from .remote.db import RemoteDBConnection
|
||||
from .expr import Expr, col, lit, func
|
||||
@@ -64,7 +62,7 @@ def _check_s3_bucket_with_dots(
|
||||
|
||||
|
||||
def connect(
|
||||
uri: URI,
|
||||
uri: Optional[URI] = None,
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
region: str = "us-east-1",
|
||||
@@ -74,14 +72,19 @@ def connect(
|
||||
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
manifest_enabled: bool = False,
|
||||
namespace_client_impl: Optional[str] = None,
|
||||
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||
namespace_client_pushdown_operations: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> DBConnection:
|
||||
"""Connect to a LanceDB database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
uri: str or Path
|
||||
The uri of the database.
|
||||
uri: str or Path, optional
|
||||
The uri of the database. When ``namespace_client_impl`` is provided you may
|
||||
omit ``uri`` and connect through a namespace client instead.
|
||||
api_key: str, optional
|
||||
If presented, connect to LanceDB cloud.
|
||||
Otherwise, connect to a database on file system or cloud storage.
|
||||
@@ -107,13 +110,29 @@ def connect(
|
||||
default configuration is used.
|
||||
storage_options: dict, optional
|
||||
Additional options for the storage backend. See available options at
|
||||
<https://lancedb.com/docs/storage/>
|
||||
<https://docs.lancedb.com/storage/>
|
||||
manifest_enabled : bool, default False
|
||||
When true for local/native connections, use directory namespace
|
||||
manifests as the source of truth for table metadata. Existing
|
||||
directory-listed root tables are migrated into the manifest on access.
|
||||
session: Session, optional
|
||||
(For LanceDB OSS only)
|
||||
A session to use for this connection. Sessions allow you to configure
|
||||
cache sizes for index and metadata caches, which can significantly
|
||||
impact memory use and performance. They can also be re-used across
|
||||
multiple connections to share the same cache state.
|
||||
namespace_client_impl : str, optional
|
||||
When provided along with ``namespace_client_properties``, ``connect``
|
||||
returns a namespace-backed connection by delegating to
|
||||
:func:`connect_namespace`. The value identifies which namespace
|
||||
implementation to load (e.g., ``"dir"`` or ``"rest"``).
|
||||
namespace_client_properties : dict, optional
|
||||
Configuration to pass to the namespace client implementation. Required
|
||||
when ``namespace_client_impl`` is set.
|
||||
namespace_client_pushdown_operations : list[str], optional
|
||||
Only used when ``namespace_client_properties`` is provided. Forwards to
|
||||
:func:`connect_namespace` to control which operations are executed on the
|
||||
namespace service (e.g., ``["QueryTable", "CreateTable"]``).
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -133,11 +152,48 @@ def connect(
|
||||
>>> db = lancedb.connect("db://my_database", api_key="ldb_...",
|
||||
... client_config={"retry_config": {"retries": 5}})
|
||||
|
||||
Connect to a namespace-backed database:
|
||||
|
||||
>>> db = lancedb.connect(namespace_client_impl="dir",
|
||||
... namespace_client_properties={"root": "/tmp/ns"})
|
||||
|
||||
Returns
|
||||
-------
|
||||
conn : DBConnection
|
||||
A connection to a LanceDB database.
|
||||
"""
|
||||
if namespace_client_impl is not None:
|
||||
if namespace_client_properties is None:
|
||||
raise ValueError(
|
||||
"namespace_client_properties must be provided when "
|
||||
"namespace_client_impl is set"
|
||||
)
|
||||
if kwargs:
|
||||
raise ValueError(f"Unknown keyword arguments: {kwargs}")
|
||||
return connect_namespace(
|
||||
namespace_client_impl,
|
||||
namespace_client_properties,
|
||||
read_consistency_interval=read_consistency_interval,
|
||||
storage_options=storage_options,
|
||||
session=session,
|
||||
namespace_client_pushdown_operations=namespace_client_pushdown_operations,
|
||||
)
|
||||
|
||||
if namespace_client_properties is not None and not manifest_enabled:
|
||||
raise ValueError(
|
||||
"namespace_client_impl must be provided when using "
|
||||
"namespace_client_properties unless manifest_enabled=True"
|
||||
)
|
||||
|
||||
if namespace_client_pushdown_operations is not None:
|
||||
raise ValueError(
|
||||
"namespace_client_pushdown_operations is only valid when "
|
||||
"connecting through a namespace"
|
||||
)
|
||||
if uri is None:
|
||||
raise ValueError(
|
||||
"uri is required when not connecting through a namespace client"
|
||||
)
|
||||
if isinstance(uri, str) and uri.startswith("db://"):
|
||||
if api_key is None:
|
||||
api_key = os.environ.get("LANCEDB_API_KEY")
|
||||
@@ -166,9 +222,92 @@ def connect(
|
||||
read_consistency_interval=read_consistency_interval,
|
||||
storage_options=storage_options,
|
||||
session=session,
|
||||
manifest_enabled=manifest_enabled,
|
||||
namespace_client_properties=namespace_client_properties,
|
||||
)
|
||||
|
||||
|
||||
WORKER_PROPERTY_PREFIX = "_lancedb_worker_"
|
||||
|
||||
|
||||
def _apply_worker_overrides(props: dict[str, str]) -> dict[str, str]:
|
||||
"""Apply worker property overrides.
|
||||
|
||||
Any key starting with ``_lancedb_worker_`` is extracted, the prefix
|
||||
is stripped, and the resulting key-value pair is put back into the
|
||||
map (overriding the existing value if present). The original
|
||||
prefixed key is removed.
|
||||
"""
|
||||
worker_keys = [k for k in props if k.startswith(WORKER_PROPERTY_PREFIX)]
|
||||
if not worker_keys:
|
||||
return props
|
||||
result = dict(props)
|
||||
for key in worker_keys:
|
||||
value = result.pop(key)
|
||||
real_key = key[len(WORKER_PROPERTY_PREFIX) :]
|
||||
result[real_key] = value
|
||||
return result
|
||||
|
||||
|
||||
def deserialize_conn(
|
||||
data: str,
|
||||
*,
|
||||
for_worker: bool = False,
|
||||
) -> DBConnection:
|
||||
"""Reconstruct a DBConnection from a serialized string.
|
||||
|
||||
The string must have been produced by
|
||||
:meth:`DBConnection.serialize`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : str
|
||||
String produced by ``serialize()``.
|
||||
for_worker : bool, default False
|
||||
When ``True``, any namespace client property whose key starts
|
||||
with ``_lancedb_worker_`` has that prefix stripped and the
|
||||
value overrides the corresponding property. For example,
|
||||
``_lancedb_worker_uri`` replaces ``uri``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DBConnection
|
||||
A new connection matching the serialized state.
|
||||
"""
|
||||
import json
|
||||
|
||||
parsed = json.loads(data)
|
||||
connection_type = parsed.get("connection_type")
|
||||
|
||||
rci_secs = parsed.get("read_consistency_interval_seconds")
|
||||
rci = timedelta(seconds=rci_secs) if rci_secs is not None else None
|
||||
storage_options = parsed.get("storage_options")
|
||||
|
||||
if connection_type == "namespace":
|
||||
props = dict(parsed.get("namespace_client_properties") or {})
|
||||
if for_worker:
|
||||
props = _apply_worker_overrides(props)
|
||||
return connect_namespace(
|
||||
namespace_client_impl=parsed["namespace_client_impl"],
|
||||
namespace_client_properties=props,
|
||||
read_consistency_interval=rci,
|
||||
storage_options=storage_options,
|
||||
namespace_client_pushdown_operations=parsed.get(
|
||||
"namespace_client_pushdown_operations"
|
||||
),
|
||||
)
|
||||
elif connection_type == "local":
|
||||
return LanceDBConnection(
|
||||
parsed["uri"],
|
||||
read_consistency_interval=rci,
|
||||
storage_options=storage_options,
|
||||
manifest_enabled=parsed.get("manifest_enabled", False),
|
||||
namespace_client_properties=parsed.get("namespace_client_properties"),
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown connection_type: {connection_type}")
|
||||
|
||||
|
||||
async def connect_async(
|
||||
uri: URI,
|
||||
*,
|
||||
@@ -179,6 +318,8 @@ async def connect_async(
|
||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
manifest_enabled: bool = False,
|
||||
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncConnection:
|
||||
"""Connect to a LanceDB database.
|
||||
|
||||
@@ -211,13 +352,20 @@ async def connect_async(
|
||||
default configuration is used.
|
||||
storage_options: dict, optional
|
||||
Additional options for the storage backend. See available options at
|
||||
<https://lancedb.com/docs/storage/>
|
||||
<https://docs.lancedb.com/storage/>
|
||||
session: Session, optional
|
||||
(For LanceDB OSS only)
|
||||
A session to use for this connection. Sessions allow you to configure
|
||||
cache sizes for index and metadata caches, which can significantly
|
||||
impact memory use and performance. They can also be re-used across
|
||||
multiple connections to share the same cache state.
|
||||
manifest_enabled : bool, default False
|
||||
When true for local/native connections, use directory namespace
|
||||
manifests as the source of truth for table metadata. Existing
|
||||
directory-listed root tables are migrated into the manifest on access.
|
||||
namespace_client_properties : dict, optional
|
||||
Additional directory namespace client properties to use with
|
||||
``manifest_enabled=True``.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -260,6 +408,8 @@ async def connect_async(
|
||||
client_config,
|
||||
storage_options,
|
||||
session,
|
||||
manifest_enabled,
|
||||
namespace_client_properties,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -284,17 +434,6 @@ __all__ = [
|
||||
"LanceNamespaceDBConnection",
|
||||
"RemoteDBConnection",
|
||||
"Session",
|
||||
"StorageOptionsProvider",
|
||||
"Table",
|
||||
"__version__",
|
||||
]
|
||||
|
||||
|
||||
def __warn_on_fork():
|
||||
warnings.warn(
|
||||
"lance is not fork-safe. If you are using multiprocessing, use spawn instead.",
|
||||
)
|
||||
|
||||
|
||||
if hasattr(os, "register_at_fork"):
|
||||
os.register_at_fork(before=__warn_on_fork) # type: ignore[attr-defined]
|
||||
|
||||
@@ -12,9 +12,9 @@ from .index import (
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from .io import StorageOptionsProvider
|
||||
from lance_namespace import (
|
||||
ListNamespacesResponse,
|
||||
CreateNamespaceResponse,
|
||||
@@ -26,6 +26,7 @@ from .remote import ClientConfig
|
||||
|
||||
IvfHnswPq: type[HnswPq] = HnswPq
|
||||
IvfHnswSq: type[HnswSq] = HnswSq
|
||||
IvfHnswFlat: type[HnswFlat] = HnswFlat
|
||||
|
||||
class PyExpr:
|
||||
"""A type-safe DataFusion expression node (Rust-side handle)."""
|
||||
@@ -72,35 +73,35 @@ class Connection(object):
|
||||
async def close(self): ...
|
||||
async def list_namespaces(
|
||||
self,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
page_token: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> ListNamespacesResponse: ...
|
||||
async def create_namespace(
|
||||
self,
|
||||
namespace: List[str],
|
||||
namespace_path: List[str],
|
||||
mode: Optional[str] = None,
|
||||
properties: Optional[Dict[str, str]] = None,
|
||||
) -> CreateNamespaceResponse: ...
|
||||
async def drop_namespace(
|
||||
self,
|
||||
namespace: List[str],
|
||||
namespace_path: List[str],
|
||||
mode: Optional[str] = None,
|
||||
behavior: Optional[str] = None,
|
||||
) -> DropNamespaceResponse: ...
|
||||
async def describe_namespace(
|
||||
self,
|
||||
namespace: List[str],
|
||||
namespace_path: List[str],
|
||||
) -> DescribeNamespaceResponse: ...
|
||||
async def list_tables(
|
||||
self,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
page_token: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> ListTablesResponse: ...
|
||||
async def table_names(
|
||||
self,
|
||||
namespace: Optional[List[str]],
|
||||
namespace_path: Optional[List[str]],
|
||||
start_after: Optional[str],
|
||||
limit: Optional[int],
|
||||
) -> list[str]: ... # Deprecated: Use list_tables instead
|
||||
@@ -109,9 +110,8 @@ class Connection(object):
|
||||
name: str,
|
||||
mode: str,
|
||||
data: pa.RecordBatchReader,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
storage_options_provider: Optional[StorageOptionsProvider] = None,
|
||||
location: Optional[str] = None,
|
||||
) -> Table: ...
|
||||
async def create_empty_table(
|
||||
@@ -119,17 +119,15 @@ class Connection(object):
|
||||
name: str,
|
||||
mode: str,
|
||||
schema: pa.Schema,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
storage_options_provider: Optional[StorageOptionsProvider] = None,
|
||||
location: Optional[str] = None,
|
||||
) -> Table: ...
|
||||
async def open_table(
|
||||
self,
|
||||
name: str,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
storage_options_provider: Optional[StorageOptionsProvider] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
location: Optional[str] = None,
|
||||
) -> Table: ...
|
||||
@@ -137,7 +135,7 @@ class Connection(object):
|
||||
self,
|
||||
target_table_name: str,
|
||||
source_uri: str,
|
||||
target_namespace: Optional[List[str]] = None,
|
||||
target_namespace_path: Optional[List[str]] = None,
|
||||
source_version: Optional[int] = None,
|
||||
source_tag: Optional[str] = None,
|
||||
is_shallow: bool = True,
|
||||
@@ -146,13 +144,18 @@ class Connection(object):
|
||||
self,
|
||||
cur_name: str,
|
||||
new_name: str,
|
||||
cur_namespace: Optional[List[str]] = None,
|
||||
new_namespace: Optional[List[str]] = None,
|
||||
cur_namespace_path: Optional[List[str]] = None,
|
||||
new_namespace_path: Optional[List[str]] = None,
|
||||
) -> None: ...
|
||||
async def drop_table(
|
||||
self, name: str, namespace: Optional[List[str]] = None
|
||||
self, name: str, namespace_path: Optional[List[str]] = None
|
||||
) -> None: ...
|
||||
async def drop_all_tables(self, namespace: Optional[List[str]] = None) -> None: ...
|
||||
async def drop_all_tables(
|
||||
self, namespace_path: Optional[List[str]] = None
|
||||
) -> None: ...
|
||||
async def namespace_client_config(
|
||||
self,
|
||||
) -> Dict[str, Any]: ...
|
||||
|
||||
class Table:
|
||||
def name(self) -> str: ...
|
||||
@@ -179,6 +182,7 @@ class Table:
|
||||
IvfPq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
@@ -241,6 +245,8 @@ async def connect(
|
||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
|
||||
storage_options: Optional[Dict[str, str]],
|
||||
session: Optional[Session],
|
||||
manifest_enabled: bool = False,
|
||||
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||
) -> Connection: ...
|
||||
|
||||
class RecordBatchStream:
|
||||
@@ -439,7 +445,7 @@ class AsyncPermutationBuilder:
|
||||
async def execute(self) -> Table: ...
|
||||
|
||||
def async_permutation_builder(
|
||||
table: Table, dest_table_name: str
|
||||
table: Table,
|
||||
) -> AsyncPermutationBuilder: ...
|
||||
def fts_query_to_json(query: Any) -> str: ...
|
||||
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import threading
|
||||
import warnings
|
||||
|
||||
|
||||
class BackgroundEventLoop:
|
||||
@@ -13,6 +15,9 @@ class BackgroundEventLoop:
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._start()
|
||||
|
||||
def _start(self):
|
||||
self.loop = asyncio.new_event_loop()
|
||||
self.thread = threading.Thread(
|
||||
target=self.loop.run_forever,
|
||||
@@ -31,3 +36,30 @@ class BackgroundEventLoop:
|
||||
|
||||
|
||||
LOOP = BackgroundEventLoop()
|
||||
|
||||
_FORK_WARNED = False
|
||||
|
||||
|
||||
def _reset_after_fork():
|
||||
# Threads do not survive fork(), so the asyncio loop in LOOP.thread is
|
||||
# dead in the child. Re-initialize the singleton in place so existing
|
||||
# `from .background_loop import LOOP` references in other modules see
|
||||
# the new state. The Rust-side tokio runtime is reset analogously by a
|
||||
# pthread_atfork hook installed in the _lancedb extension.
|
||||
LOOP._start()
|
||||
global _FORK_WARNED
|
||||
if not _FORK_WARNED:
|
||||
_FORK_WARNED = True
|
||||
warnings.warn(
|
||||
"lancedb fork support is experimental: the internal async "
|
||||
"runtime has been reset in the forked child, but a small chance "
|
||||
"of deadlock remains if other state was mid-operation at fork "
|
||||
"time. The 'forkserver' or 'spawn' multiprocessing start method "
|
||||
"is likely a safer alternative.",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
if hasattr(os, "register_at_fork"):
|
||||
os.register_at_fork(after_in_child=_reset_after_fork)
|
||||
|
||||
@@ -96,7 +96,7 @@ def data_to_reader(
|
||||
f"Unknown data type {type(data)}. "
|
||||
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
||||
"pyarrow Table/RecordBatch, or Pydantic models. "
|
||||
"See https://lancedb.com/docs/tables/ for examples."
|
||||
"See https://docs.lancedb.com/tables/ for examples."
|
||||
)
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -19,10 +19,10 @@ from .utils import TEXT, api_key_not_found_help
|
||||
@register("gemini-text")
|
||||
class GeminiText(TextEmbeddingFunction):
|
||||
"""
|
||||
An embedding function that uses the Google's Gemini API. Requires GOOGLE_API_KEY to
|
||||
An embedding function that uses Google's Gemini API. Requires GOOGLE_API_KEY to
|
||||
be set.
|
||||
|
||||
https://ai.google.dev/docs/embeddings_guide
|
||||
https://ai.google.dev/gemini-api/docs/embeddings
|
||||
|
||||
Supports various tasks types:
|
||||
| Task Type | Description |
|
||||
@@ -46,9 +46,12 @@ class GeminiText(TextEmbeddingFunction):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str, default "models/embedding-001"
|
||||
The name of the model to use. See the Gemini documentation for a list of
|
||||
available models.
|
||||
name: str, default "gemini-embedding-001"
|
||||
The name of the model to use. Supported models include:
|
||||
- "gemini-embedding-001" (768 dimensions)
|
||||
|
||||
Note: The legacy "models/embedding-001" format is also supported but
|
||||
"gemini-embedding-001" is recommended.
|
||||
|
||||
query_task_type: str, default "retrieval_query"
|
||||
Sets the task type for the queries.
|
||||
@@ -77,7 +80,7 @@ class GeminiText(TextEmbeddingFunction):
|
||||
|
||||
"""
|
||||
|
||||
name: str = "models/embedding-001"
|
||||
name: str = "gemini-embedding-001"
|
||||
query_task_type: str = "retrieval_query"
|
||||
source_task_type: str = "retrieval_document"
|
||||
|
||||
@@ -114,23 +117,48 @@ class GeminiText(TextEmbeddingFunction):
|
||||
texts: list[str] or np.ndarray (of str)
|
||||
The texts to embed
|
||||
"""
|
||||
if (
|
||||
kwargs.get("task_type") == "retrieval_document"
|
||||
): # Provide a title to use existing API design
|
||||
title = "Embedding of a document"
|
||||
kwargs["title"] = title
|
||||
from google.genai import types
|
||||
|
||||
return [
|
||||
self.client.embed_content(model=self.name, content=text, **kwargs)[
|
||||
"embedding"
|
||||
]
|
||||
for text in texts
|
||||
]
|
||||
task_type = kwargs.get("task_type")
|
||||
|
||||
# Build content objects for embed_content
|
||||
contents = []
|
||||
for text in texts:
|
||||
if task_type == "retrieval_document":
|
||||
# Provide a title for retrieval_document task
|
||||
contents.append(
|
||||
{"parts": [{"text": "Embedding of a document"}, {"text": text}]}
|
||||
)
|
||||
else:
|
||||
contents.append({"parts": [{"text": text}]})
|
||||
|
||||
# Build config
|
||||
config_kwargs = {}
|
||||
if task_type:
|
||||
config_kwargs["task_type"] = task_type.upper() # API expects uppercase
|
||||
|
||||
# Call embed_content for each content
|
||||
embeddings = []
|
||||
for content in contents:
|
||||
config = (
|
||||
types.EmbedContentConfig(**config_kwargs) if config_kwargs else None
|
||||
)
|
||||
response = self.client.models.embed_content(
|
||||
model=self.name,
|
||||
contents=content,
|
||||
config=config,
|
||||
)
|
||||
embeddings.append(response.embeddings[0].values)
|
||||
|
||||
return embeddings
|
||||
|
||||
@cached_property
|
||||
def client(self):
|
||||
genai = attempt_import_or_raise("google.generativeai", "google.generativeai")
|
||||
attempt_import_or_raise("google.genai", "google-genai")
|
||||
|
||||
if not os.environ.get("GOOGLE_API_KEY"):
|
||||
api_key_not_found_help("google")
|
||||
return genai
|
||||
|
||||
from google import genai as genai_module
|
||||
|
||||
return genai_module.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
|
||||
|
||||
@@ -1,201 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
"""Full text search index using tantivy-py"""
|
||||
|
||||
import os
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
try:
|
||||
import tantivy
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
|
||||
)
|
||||
|
||||
from .table import LanceTable
|
||||
|
||||
|
||||
def create_index(
|
||||
index_path: str,
|
||||
text_fields: List[str],
|
||||
ordering_fields: Optional[List[str]] = None,
|
||||
tokenizer_name: str = "default",
|
||||
) -> tantivy.Index:
|
||||
"""
|
||||
Create a new Index (not populated)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index_path : str
|
||||
Path to the index directory
|
||||
text_fields : List[str]
|
||||
List of text fields to index
|
||||
ordering_fields: List[str]
|
||||
List of unsigned type fields to order by at search time
|
||||
tokenizer_name : str, default "default"
|
||||
The tokenizer to use
|
||||
|
||||
Returns
|
||||
-------
|
||||
index : tantivy.Index
|
||||
The index object (not yet populated)
|
||||
"""
|
||||
if ordering_fields is None:
|
||||
ordering_fields = []
|
||||
# Declaring our schema.
|
||||
schema_builder = tantivy.SchemaBuilder()
|
||||
# special field that we'll populate with row_id
|
||||
schema_builder.add_integer_field("doc_id", stored=True)
|
||||
# data fields
|
||||
for name in text_fields:
|
||||
schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
|
||||
if ordering_fields:
|
||||
for name in ordering_fields:
|
||||
schema_builder.add_unsigned_field(name, fast=True)
|
||||
schema = schema_builder.build()
|
||||
os.makedirs(index_path, exist_ok=True)
|
||||
index = tantivy.Index(schema, path=index_path)
|
||||
return index
|
||||
|
||||
|
||||
def populate_index(
|
||||
index: tantivy.Index,
|
||||
table: LanceTable,
|
||||
fields: List[str],
|
||||
writer_heap_size: Optional[int] = None,
|
||||
ordering_fields: Optional[List[str]] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Populate an index with data from a LanceTable
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : tantivy.Index
|
||||
The index object
|
||||
table : LanceTable
|
||||
The table to index
|
||||
fields : List[str]
|
||||
List of fields to index
|
||||
writer_heap_size : int
|
||||
The writer heap size in bytes, defaults to 1GB
|
||||
|
||||
Returns
|
||||
-------
|
||||
int
|
||||
The number of rows indexed
|
||||
"""
|
||||
if ordering_fields is None:
|
||||
ordering_fields = []
|
||||
writer_heap_size = writer_heap_size or 1024 * 1024 * 1024
|
||||
# first check the fields exist and are string or large string type
|
||||
nested = []
|
||||
|
||||
for name in fields:
|
||||
try:
|
||||
f = table.schema.field(name) # raises KeyError if not found
|
||||
except KeyError:
|
||||
f = resolve_path(table.schema, name)
|
||||
nested.append(name)
|
||||
|
||||
if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
|
||||
raise TypeError(f"Field {name} is not a string type")
|
||||
|
||||
# create a tantivy writer
|
||||
writer = index.writer(heap_size=writer_heap_size)
|
||||
# write data into index
|
||||
dataset = table.to_lance()
|
||||
row_id = 0
|
||||
|
||||
max_nested_level = 0
|
||||
if len(nested) > 0:
|
||||
max_nested_level = max([len(name.split(".")) for name in nested])
|
||||
|
||||
for b in dataset.to_batches(columns=fields + ordering_fields):
|
||||
if max_nested_level > 0:
|
||||
b = pa.Table.from_batches([b])
|
||||
for _ in range(max_nested_level - 1):
|
||||
b = b.flatten()
|
||||
for i in range(b.num_rows):
|
||||
doc = tantivy.Document()
|
||||
for name in fields:
|
||||
value = b[name][i].as_py()
|
||||
if value is not None:
|
||||
doc.add_text(name, value)
|
||||
for name in ordering_fields:
|
||||
value = b[name][i].as_py()
|
||||
if value is not None:
|
||||
doc.add_unsigned(name, value)
|
||||
if not doc.is_empty:
|
||||
doc.add_integer("doc_id", row_id)
|
||||
writer.add_document(doc)
|
||||
row_id += 1
|
||||
# commit changes
|
||||
writer.commit()
|
||||
return row_id
|
||||
|
||||
|
||||
def resolve_path(schema, field_name: str) -> pa.Field:
|
||||
"""
|
||||
Resolve a nested field path to a list of field names
|
||||
|
||||
Parameters
|
||||
----------
|
||||
field_name : str
|
||||
The field name to resolve
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[str]
|
||||
The resolved path
|
||||
"""
|
||||
path = field_name.split(".")
|
||||
field = schema.field(path.pop(0))
|
||||
for segment in path:
|
||||
if pa.types.is_struct(field.type):
|
||||
field = field.type.field(segment)
|
||||
else:
|
||||
raise KeyError(f"field {field_name} not found in schema {schema}")
|
||||
return field
|
||||
|
||||
|
||||
def search_index(
|
||||
index: tantivy.Index, query: str, limit: int = 10, ordering_field=None
|
||||
) -> Tuple[Tuple[int], Tuple[float]]:
|
||||
"""
|
||||
Search an index for a query
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : tantivy.Index
|
||||
The index object
|
||||
query : str
|
||||
The query string
|
||||
limit : int
|
||||
The maximum number of results to return
|
||||
|
||||
Returns
|
||||
-------
|
||||
ids_and_score: list[tuple[int], tuple[float]]
|
||||
A tuple of two tuples, the first containing the document ids
|
||||
and the second containing the scores
|
||||
"""
|
||||
searcher = index.searcher()
|
||||
query = index.parse_query(query)
|
||||
# get top results
|
||||
if ordering_field:
|
||||
results = searcher.search(query, limit, order_by_field=ordering_field)
|
||||
else:
|
||||
results = searcher.search(query, limit)
|
||||
if results.count == 0:
|
||||
return tuple(), tuple()
|
||||
return tuple(
|
||||
zip(
|
||||
*[
|
||||
(searcher.doc(doc_address)["doc_id"][0], score)
|
||||
for score, doc_address in results.hits
|
||||
]
|
||||
)
|
||||
)
|
||||
@@ -7,6 +7,7 @@ from typing import Literal, Optional
|
||||
from ._lancedb import (
|
||||
IndexConfig,
|
||||
)
|
||||
from .types import BaseTokenizerType
|
||||
|
||||
lang_mapping = {
|
||||
"ar": "Arabic",
|
||||
@@ -111,8 +112,12 @@ class FTS:
|
||||
- "simple": Splits text by whitespace and punctuation.
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-gram tokenizer for substring-style matching.
|
||||
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
|
||||
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not the
|
||||
primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -127,10 +132,17 @@ class FTS:
|
||||
ascii_folding : bool, default True
|
||||
Whether to fold ASCII characters. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models in Lance's language model home. Set
|
||||
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
|
||||
directory under ``lance/language_models``.
|
||||
"""
|
||||
|
||||
with_position: bool = False
|
||||
base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
|
||||
base_tokenizer: BaseTokenizerType = "simple"
|
||||
language: str = "English"
|
||||
max_token_length: Optional[int] = 40
|
||||
lower_case: bool = True
|
||||
@@ -376,9 +388,98 @@ class HnswSq:
|
||||
target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class HnswFlat:
|
||||
"""Describe a HNSW-FLAT index configuration.
|
||||
|
||||
HNSW-FLAT stands for Hierarchical Navigable Small World without quantization.
|
||||
It stores raw vectors in the HNSW graph, providing the highest recall among
|
||||
the IVF_HNSW family at the cost of more memory and disk space compared to
|
||||
:class:`HnswSq` or :class:`HnswPq`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
distance_type: str, default "l2"
|
||||
|
||||
The distance metric used to train the index.
|
||||
|
||||
The following distance types are available:
|
||||
|
||||
"l2" - Euclidean distance. This is a very common distance metric that
|
||||
accounts for both magnitude and direction when determining the distance
|
||||
between vectors. l2 distance has a range of [0, ∞).
|
||||
|
||||
"cosine" - Cosine distance. Cosine distance is a distance metric
|
||||
calculated from the cosine similarity between two vectors. Cosine
|
||||
similarity is a measure of similarity between two non-zero vectors of an
|
||||
inner product space. It is defined to equal the cosine of the angle
|
||||
between them. Unlike l2, the cosine distance is not affected by the
|
||||
magnitude of the vectors. Cosine distance has a range of [0, 2].
|
||||
|
||||
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
|
||||
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
|
||||
l2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
|
||||
num_partitions, default sqrt(num_rows)
|
||||
|
||||
The number of IVF partitions to create.
|
||||
|
||||
For HNSW, we recommend a small number of partitions. Setting this to 1
|
||||
works well for most tables. For very large tables, training just one HNSW
|
||||
graph will require too much memory. Each partition becomes its own HNSW
|
||||
graph, so setting this value higher reduces the peak memory use of
|
||||
training.
|
||||
|
||||
max_iterations, default 50
|
||||
|
||||
Max iterations to train kmeans.
|
||||
|
||||
When training an IVF index we use kmeans to calculate the partitions.
|
||||
This parameter controls how many iterations of kmeans to run.
|
||||
|
||||
sample_rate, default 256
|
||||
|
||||
The rate used to calculate the number of training vectors for kmeans.
|
||||
|
||||
m, default 20
|
||||
|
||||
The number of neighbors to select for each vector in the HNSW graph.
|
||||
|
||||
This value controls the tradeoff between search speed and accuracy.
|
||||
The higher the value the more accurate the search but the slower it
|
||||
will be.
|
||||
|
||||
ef_construction, default 300
|
||||
|
||||
The number of candidates to evaluate during the construction of the HNSW
|
||||
graph.
|
||||
|
||||
This value controls the tradeoff between build speed and accuracy.
|
||||
The higher the value the more accurate the build but the slower it will
|
||||
be. 150 to 300 is the typical range. 100 is a minimum for good quality
|
||||
search results. In most cases, there is no benefit to setting this higher
|
||||
than 500. This value should be set to a value that is not less than `ef`
|
||||
in the search phase.
|
||||
|
||||
target_partition_size, default is 1,048,576
|
||||
|
||||
The target size of each partition.
|
||||
"""
|
||||
|
||||
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
||||
num_partitions: Optional[int] = None
|
||||
max_iterations: int = 50
|
||||
sample_rate: int = 256
|
||||
m: int = 20
|
||||
ef_construction: int = 300
|
||||
target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
# Backwards-compatible aliases
|
||||
IvfHnswPq = HnswPq
|
||||
IvfHnswSq = HnswSq
|
||||
IvfHnswFlat = HnswFlat
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -698,11 +799,13 @@ __all__ = [
|
||||
"IvfPq",
|
||||
"IvfHnswPq",
|
||||
"IvfHnswSq",
|
||||
"IvfHnswFlat",
|
||||
"IvfSq",
|
||||
"IvfRq",
|
||||
"IvfFlat",
|
||||
"HnswPq",
|
||||
"HnswSq",
|
||||
"HnswFlat",
|
||||
"IndexConfig",
|
||||
"FTS",
|
||||
"Bitmap",
|
||||
|
||||
@@ -2,70 +2,3 @@
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
"""I/O utilities and interfaces for LanceDB."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict
|
||||
|
||||
|
||||
class StorageOptionsProvider(ABC):
|
||||
"""Abstract base class for providing storage options to LanceDB tables.
|
||||
|
||||
Storage options providers enable automatic credential refresh for cloud
|
||||
storage backends (e.g., AWS S3, Azure Blob Storage, GCS). When credentials
|
||||
have an expiration time, the provider's fetch_storage_options() method will
|
||||
be called periodically to get fresh credentials before they expire.
|
||||
|
||||
Example
|
||||
-------
|
||||
>>> class MyProvider(StorageOptionsProvider):
|
||||
... def fetch_storage_options(self) -> Dict[str, str]:
|
||||
... # Fetch fresh credentials from your credential manager
|
||||
... return {
|
||||
... "aws_access_key_id": "...",
|
||||
... "aws_secret_access_key": "...",
|
||||
... "expires_at_millis": "1234567890000" # Optional
|
||||
... }
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def fetch_storage_options(self) -> Dict[str, str]:
|
||||
"""Fetch fresh storage credentials.
|
||||
|
||||
This method is called by LanceDB when credentials need to be refreshed.
|
||||
If the returned dictionary contains an "expires_at_millis" key with a
|
||||
Unix timestamp in milliseconds, LanceDB will automatically refresh the
|
||||
credentials before that time. If the key is not present, credentials
|
||||
are assumed to not expire.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Dict[str, str]
|
||||
Dictionary containing cloud storage credentials and optionally an
|
||||
expiration time:
|
||||
- "expires_at_millis" (optional): Unix timestamp in milliseconds when
|
||||
credentials expire
|
||||
- Provider-specific credential keys (e.g., aws_access_key_id,
|
||||
aws_secret_access_key, etc.)
|
||||
|
||||
Raises
|
||||
------
|
||||
RuntimeError
|
||||
If credentials cannot be fetched or are invalid
|
||||
"""
|
||||
pass
|
||||
|
||||
def provider_id(self) -> str:
|
||||
"""Return a human-readable unique identifier for this provider instance.
|
||||
|
||||
This identifier is used for caching and equality comparison. Two providers
|
||||
with the same ID will share the same cached object store connection.
|
||||
|
||||
The default implementation uses the class name and string representation.
|
||||
Override this method if you need custom identification logic.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
A unique identifier for this provider instance
|
||||
"""
|
||||
return f"{self.__class__.__name__} {{ repr: {str(self)!r} }}"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
from deprecation import deprecated
|
||||
from lancedb import AsyncConnection, DBConnection
|
||||
import pyarrow as pa
|
||||
import copy
|
||||
import json
|
||||
|
||||
from deprecation import deprecated
|
||||
import pyarrow as pa
|
||||
|
||||
from ._lancedb import async_permutation_builder, PermutationReader
|
||||
from .table import LanceTable
|
||||
from .background_loop import LOOP
|
||||
@@ -36,10 +37,7 @@ class PermutationBuilder:
|
||||
be referenced by name in the future. If names are not provided then they can only
|
||||
be referenced by their ordinal index. There is no requirement to name every split.
|
||||
|
||||
By default, the permutation will be stored in memory and will be lost when the
|
||||
program exits. To persist the permutation (for very large datasets or to share
|
||||
the permutation across multiple workers) use the [persist](#persist) method to
|
||||
create a permanent table.
|
||||
The permutation is stored in memory and will be lost when the program exits.
|
||||
"""
|
||||
|
||||
def __init__(self, table: LanceTable):
|
||||
@@ -51,15 +49,6 @@ class PermutationBuilder:
|
||||
"""
|
||||
self._async = async_permutation_builder(table)
|
||||
|
||||
def persist(
|
||||
self, database: Union[DBConnection, AsyncConnection], table_name: str
|
||||
) -> "PermutationBuilder":
|
||||
"""
|
||||
Persist the permutation to the given database.
|
||||
"""
|
||||
self._async.persist(database, table_name)
|
||||
return self
|
||||
|
||||
def split_random(
|
||||
self,
|
||||
*,
|
||||
@@ -284,9 +273,8 @@ class Permutations:
|
||||
self.permutation_table = permutation_table
|
||||
|
||||
if permutation_table.schema.metadata is not None:
|
||||
split_names = permutation_table.schema.metadata.get(
|
||||
b"split_names", None
|
||||
).decode("utf-8")
|
||||
raw = permutation_table.schema.metadata.get(b"split_names")
|
||||
split_names = raw.decode("utf-8") if raw is not None else None
|
||||
if split_names is not None:
|
||||
self.split_names = json.loads(split_names)
|
||||
self.split_dict = {
|
||||
@@ -381,20 +369,44 @@ class Permutation:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
reader: PermutationReader,
|
||||
base_table: LanceTable,
|
||||
permutation_table: Optional[LanceTable],
|
||||
split: int,
|
||||
selection: dict[str, str],
|
||||
batch_size: int,
|
||||
transform_fn: Callable[pa.RecordBatch, Any],
|
||||
offset: Optional[int] = None,
|
||||
limit: Optional[int] = None,
|
||||
connection_factory: Optional[Callable[[str], LanceTable]] = None,
|
||||
_reader: Optional[PermutationReader] = None,
|
||||
):
|
||||
"""
|
||||
Internal constructor. Use [from_tables](#from_tables) instead.
|
||||
"""
|
||||
assert reader is not None, "reader is required"
|
||||
assert base_table is not None, "base_table is required"
|
||||
assert selection is not None, "selection is required"
|
||||
self.reader = reader
|
||||
self.base_table = base_table
|
||||
self.permutation_table = permutation_table
|
||||
self.split = split
|
||||
self.selection = selection
|
||||
self.transform_fn = transform_fn
|
||||
self.batch_size = batch_size
|
||||
self.offset = offset
|
||||
self.limit = limit
|
||||
self.connection_factory = connection_factory
|
||||
if _reader is None:
|
||||
_reader = LOOP.run(self._build_reader())
|
||||
self.reader: PermutationReader = _reader
|
||||
|
||||
async def _build_reader(self) -> PermutationReader:
|
||||
reader = await PermutationReader.from_tables(
|
||||
self.base_table, self.permutation_table, self.split
|
||||
)
|
||||
if self.offset is not None:
|
||||
reader = await reader.with_offset(self.offset)
|
||||
if self.limit is not None:
|
||||
reader = await reader.with_limit(self.limit)
|
||||
return reader
|
||||
|
||||
def _with_selection(self, selection: dict[str, str]) -> "Permutation":
|
||||
"""
|
||||
@@ -403,21 +415,97 @@ class Permutation:
|
||||
Does not validation of the selection and it replaces it entirely. This is not
|
||||
intended for public use.
|
||||
"""
|
||||
return Permutation(self.reader, selection, self.batch_size, self.transform_fn)
|
||||
|
||||
def _with_reader(self, reader: PermutationReader) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation with the given reader
|
||||
|
||||
This is an internal method and should not be used directly.
|
||||
"""
|
||||
return Permutation(reader, self.selection, self.batch_size, self.transform_fn)
|
||||
new = copy.copy(self)
|
||||
new.selection = selection
|
||||
return new
|
||||
|
||||
def with_batch_size(self, batch_size: int) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation with the given batch size
|
||||
"""
|
||||
return Permutation(self.reader, self.selection, batch_size, self.transform_fn)
|
||||
new = copy.copy(self)
|
||||
new.batch_size = batch_size
|
||||
return new
|
||||
|
||||
def with_connection_factory(
|
||||
self, connection_factory: Callable[[str], LanceTable]
|
||||
) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation that will use ``connection_factory`` to reopen
|
||||
the base table when this permutation is unpickled in a worker process.
|
||||
|
||||
The factory is a callable that takes a single argument — the base table
|
||||
name — and returns a [LanceTable]. It must be picklable; the worker
|
||||
will pickle it via standard ``pickle`` and call it to recover the base
|
||||
table. Picklable callables in practice means top-level (module-level)
|
||||
functions, ``functools.partial`` of such functions, or instances of
|
||||
picklable classes implementing ``__call__``. Lambdas and closures over
|
||||
local variables don't pickle with the default protocol.
|
||||
|
||||
Setting a factory is necessary when the URI alone is not enough to
|
||||
re-open the connection — most importantly for LanceDB Cloud (``db://``)
|
||||
connections, where ``api_key`` and ``region`` aren't recoverable from
|
||||
the connection object after construction.
|
||||
|
||||
For local file or cloud-storage paths the factory is optional: if not
|
||||
set, ``__getstate__`` falls back to capturing
|
||||
``(uri, storage_options, namespace_path)`` and re-opening via
|
||||
``lancedb.connect(uri, storage_options=...)``.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Basic native (file-system path), parameterized via ``functools.partial``::
|
||||
|
||||
import functools, lancedb
|
||||
from lancedb.permutation import Permutation
|
||||
|
||||
def open_native_table(uri: str, table_name: str):
|
||||
return lancedb.connect(uri).open_table(table_name)
|
||||
|
||||
factory = functools.partial(open_native_table, "/data/lance_db")
|
||||
permutation = Permutation.identity(
|
||||
factory("training")
|
||||
).with_connection_factory(factory)
|
||||
|
||||
Native via :func:`lancedb.connect_namespace` (e.g. a directory- or
|
||||
REST-backed namespace client). The factory takes the
|
||||
implementation name and properties dict as partial-bound args so
|
||||
the worker can rebuild the same namespace connection::
|
||||
|
||||
def open_via_namespace(
|
||||
impl: str, properties: dict[str, str], table_name: str,
|
||||
):
|
||||
return lancedb.connect_namespace(impl, properties).open_table(
|
||||
table_name,
|
||||
)
|
||||
|
||||
factory = functools.partial(
|
||||
open_via_namespace,
|
||||
"dir",
|
||||
{"root": "/data/lance_db"},
|
||||
)
|
||||
|
||||
LanceDB Cloud, reading credentials from env vars at worker startup
|
||||
so secrets aren't pickled into the dataset::
|
||||
|
||||
import os, lancedb
|
||||
|
||||
def open_remote_table(table_name: str):
|
||||
db = lancedb.connect(
|
||||
"db://my-database",
|
||||
api_key=os.environ["LANCEDB_API_KEY"],
|
||||
region=os.environ.get("LANCEDB_REGION", "us-east-1"),
|
||||
)
|
||||
return db.open_table(table_name)
|
||||
|
||||
permutation = Permutation.identity(
|
||||
open_remote_table("training")
|
||||
).with_connection_factory(open_remote_table)
|
||||
"""
|
||||
assert connection_factory is not None, "connection_factory is required"
|
||||
new = copy.copy(self)
|
||||
new.connection_factory = connection_factory
|
||||
return new
|
||||
|
||||
@classmethod
|
||||
def identity(cls, table: LanceTable) -> "Permutation":
|
||||
@@ -460,9 +548,8 @@ class Permutation:
|
||||
f"Cannot create a permutation on split `{split}`"
|
||||
" because no split names are defined in the permutation table"
|
||||
)
|
||||
split_names = permutation_table.schema.metadata.get(
|
||||
b"split_names", None
|
||||
).decode("utf-8")
|
||||
raw = permutation_table.schema.metadata.get(b"split_names")
|
||||
split_names = raw.decode("utf-8") if raw is not None else None
|
||||
if split_names is None:
|
||||
raise ValueError(
|
||||
f"Cannot create a permutation on split `{split}`"
|
||||
@@ -491,11 +578,126 @@ class Permutation:
|
||||
schema = await reader.output_schema(None)
|
||||
initial_selection = {name: name for name in schema.names}
|
||||
return cls(
|
||||
reader, initial_selection, DEFAULT_BATCH_SIZE, Transforms.arrow2python
|
||||
base_table,
|
||||
permutation_table,
|
||||
split,
|
||||
initial_selection,
|
||||
DEFAULT_BATCH_SIZE,
|
||||
Transforms.arrow2python,
|
||||
_reader=reader,
|
||||
)
|
||||
|
||||
return LOOP.run(do_from_tables())
|
||||
|
||||
def __getstate__(self) -> dict[str, Any]:
|
||||
"""Build a picklable state dict for this permutation.
|
||||
|
||||
The base table is captured either via a user-supplied
|
||||
``connection_factory`` (see [with_connection_factory]) or, as a
|
||||
fallback, by introspecting ``(uri, storage_options, namespace_path)``
|
||||
on the connection. The permutation table — always an in-memory
|
||||
LanceDB table — is captured as a pyarrow Table (which pickles via
|
||||
Arrow IPC natively). The reader is dropped from the wire format;
|
||||
``__setstate__`` rebuilds it from the restored tables.
|
||||
"""
|
||||
permutation_data: Optional[pa.Table] = None
|
||||
if self.permutation_table is not None:
|
||||
permutation_data = self.permutation_table.to_arrow()
|
||||
|
||||
common = {
|
||||
"base_table_name": self.base_table.name,
|
||||
"permutation_data": permutation_data,
|
||||
"split": self.split,
|
||||
"selection": self.selection,
|
||||
"batch_size": self.batch_size,
|
||||
"transform_fn": self.transform_fn,
|
||||
"offset": self.offset,
|
||||
"limit": self.limit,
|
||||
"connection_factory": self.connection_factory,
|
||||
}
|
||||
|
||||
if self.connection_factory is not None:
|
||||
# The factory carries enough state to recover the base table on
|
||||
# its own; we don't need to capture the URI / storage options /
|
||||
# namespace from the existing connection.
|
||||
return common
|
||||
|
||||
# URI-introspection fallback: only viable for native (OSS) connections
|
||||
# where (uri, storage_options) is enough to reopen. Remote / cloud
|
||||
# connections don't expose recoverable api_key / region — those users
|
||||
# must call with_connection_factory().
|
||||
try:
|
||||
base_uri = self.base_table._conn.uri
|
||||
storage_options = self.base_table._conn.storage_options
|
||||
except AttributeError as e:
|
||||
raise ValueError(
|
||||
"Cannot pickle this Permutation: the base table's connection "
|
||||
"does not expose a uri/storage_options, which usually means it "
|
||||
"is a remote (LanceDB Cloud) connection. Call "
|
||||
"Permutation.with_connection_factory(...) first to provide a "
|
||||
"picklable callable that re-opens the base table from a worker "
|
||||
"process."
|
||||
) from e
|
||||
|
||||
if base_uri.startswith("memory://"):
|
||||
# In-memory base tables don't exist in any worker process by
|
||||
# default, so dump the entire base table into the pickle. This
|
||||
# can be expensive for large datasets — users with large
|
||||
# in-memory base tables should either persist them or set a
|
||||
# connection_factory.
|
||||
return {
|
||||
**common,
|
||||
"base_table_data": self.base_table.to_arrow(),
|
||||
}
|
||||
|
||||
return {
|
||||
**common,
|
||||
"base_table_uri": base_uri,
|
||||
"base_table_namespace": self.base_table._namespace_path,
|
||||
"base_table_storage_options": storage_options,
|
||||
}
|
||||
|
||||
def __setstate__(self, state: dict[str, Any]) -> None:
|
||||
from . import connect
|
||||
|
||||
connection_factory = state["connection_factory"]
|
||||
if connection_factory is not None:
|
||||
base_table = connection_factory(state["base_table_name"])
|
||||
elif "base_table_data" in state:
|
||||
# In-memory base table inlined into the pickle; rebuild the same
|
||||
# way we rebuild the in-memory permutation table.
|
||||
mem_db = connect("memory://")
|
||||
base_table = mem_db.create_table(
|
||||
state["base_table_name"], state["base_table_data"]
|
||||
)
|
||||
else:
|
||||
base_db = connect(
|
||||
state["base_table_uri"],
|
||||
storage_options=state["base_table_storage_options"],
|
||||
)
|
||||
base_table = base_db.open_table(
|
||||
state["base_table_name"],
|
||||
namespace_path=state["base_table_namespace"] or None,
|
||||
)
|
||||
|
||||
permutation_table: Optional[LanceTable] = None
|
||||
if state["permutation_data"] is not None:
|
||||
mem_db = connect("memory://")
|
||||
permutation_table = mem_db.create_table(
|
||||
"permutation", state["permutation_data"]
|
||||
)
|
||||
|
||||
self.base_table = base_table
|
||||
self.permutation_table = permutation_table
|
||||
self.split = state["split"]
|
||||
self.selection = state["selection"]
|
||||
self.batch_size = state["batch_size"]
|
||||
self.transform_fn = state["transform_fn"]
|
||||
self.offset = state["offset"]
|
||||
self.limit = state["limit"]
|
||||
self.connection_factory = connection_factory
|
||||
self.reader = LOOP.run(self._build_reader())
|
||||
|
||||
@property
|
||||
def schema(self) -> pa.Schema:
|
||||
async def do_output_schema():
|
||||
@@ -762,7 +964,9 @@ class Permutation:
|
||||
for expensive operations such as image decoding.
|
||||
"""
|
||||
assert transform is not None, "transform is required"
|
||||
return Permutation(self.reader, self.selection, self.batch_size, transform)
|
||||
new = copy.copy(self)
|
||||
new.transform_fn = transform
|
||||
return new
|
||||
|
||||
def __getitem__(self, index: int) -> Any:
|
||||
"""
|
||||
@@ -797,12 +1001,10 @@ class Permutation:
|
||||
"""
|
||||
Skip the first `skip` rows of the permutation
|
||||
"""
|
||||
|
||||
async def do_with_skip():
|
||||
reader = await self.reader.with_offset(skip)
|
||||
return self._with_reader(reader)
|
||||
|
||||
return LOOP.run(do_with_skip())
|
||||
new = copy.copy(self)
|
||||
new.offset = skip
|
||||
new.reader = LOOP.run(new._build_reader())
|
||||
return new
|
||||
|
||||
@deprecated(details="Use with_take instead")
|
||||
def take(self, limit: int) -> "Permutation":
|
||||
@@ -820,12 +1022,10 @@ class Permutation:
|
||||
"""
|
||||
Limit the permutation to `limit` rows (following any `skip`)
|
||||
"""
|
||||
|
||||
async def do_with_take():
|
||||
reader = await self.reader.with_limit(limit)
|
||||
return self._with_reader(reader)
|
||||
|
||||
return LOOP.run(do_with_take())
|
||||
new = copy.copy(self)
|
||||
new.limit = limit
|
||||
new.reader = LOOP.run(new._build_reader())
|
||||
return new
|
||||
|
||||
@deprecated(details="Use with_repeat instead")
|
||||
def repeat(self, times: int) -> "Permutation":
|
||||
|
||||
@@ -10,6 +10,7 @@ import sys
|
||||
import types
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import date, datetime
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
@@ -314,6 +315,19 @@ def _pydantic_type_to_arrow_type(tp: Any, field: FieldInfo) -> pa.DataType:
|
||||
return pa.list_(pa.list_(tp.value_arrow_type(), tp.dim()))
|
||||
# For regular Vector
|
||||
return pa.list_(tp.value_arrow_type(), tp.dim())
|
||||
if _safe_issubclass(tp, Enum):
|
||||
# Map Enum to the Arrow type of its value.
|
||||
# For string-valued enums, use dictionary encoding for efficiency.
|
||||
# For integer enums, use the native type.
|
||||
# Fall back to utf8 for mixed-type or empty enums.
|
||||
value_types = {type(m.value) for m in tp}
|
||||
if len(value_types) == 1:
|
||||
value_type = value_types.pop()
|
||||
if value_type is str:
|
||||
# Use dictionary encoding for string enums
|
||||
return pa.dictionary(pa.int32(), pa.utf8())
|
||||
return _py_type_to_arrow_type(value_type, field)
|
||||
return pa.utf8()
|
||||
return _py_type_to_arrow_type(tp, field)
|
||||
|
||||
|
||||
|
||||
@@ -25,7 +25,6 @@ import deprecation
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
import pyarrow.fs as pa_fs
|
||||
import pydantic
|
||||
|
||||
from lancedb.pydantic import PYDANTIC_VERSION
|
||||
@@ -1526,9 +1525,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
return self._table._output_schema(self.to_query_object())
|
||||
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
path, fs, exist = self._table._get_fts_index_path()
|
||||
if exist:
|
||||
return self.tantivy_to_arrow()
|
||||
self._table._ensure_no_legacy_fts_index()
|
||||
|
||||
query = self._query
|
||||
if self._phrase_query:
|
||||
@@ -1552,90 +1549,6 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
):
|
||||
raise NotImplementedError("to_batches on an FTS query")
|
||||
|
||||
def tantivy_to_arrow(self) -> pa.Table:
|
||||
try:
|
||||
import tantivy
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
|
||||
)
|
||||
|
||||
from .fts import search_index
|
||||
|
||||
# get the index path
|
||||
path, fs, exist = self._table._get_fts_index_path()
|
||||
|
||||
# check if the index exist
|
||||
if not exist:
|
||||
raise FileNotFoundError(
|
||||
"Fts index does not exist. "
|
||||
"Please first call table.create_fts_index(['<field_names>']) to "
|
||||
"create the fts index."
|
||||
)
|
||||
|
||||
# Check that we are on local filesystem
|
||||
if not isinstance(fs, pa_fs.LocalFileSystem):
|
||||
raise NotImplementedError(
|
||||
"Tantivy-based full text search "
|
||||
"is only supported on the local filesystem"
|
||||
)
|
||||
# open the index
|
||||
index = tantivy.Index.open(path)
|
||||
# get the scores and doc ids
|
||||
query = self._query
|
||||
if self._phrase_query:
|
||||
query = query.replace('"', "'")
|
||||
query = f'"{query}"'
|
||||
limit = self._limit if self._limit is not None else 10
|
||||
row_ids, scores = search_index(
|
||||
index, query, limit, ordering_field=self.ordering_field_name
|
||||
)
|
||||
if len(row_ids) == 0:
|
||||
empty_schema = pa.schema([pa.field("_score", pa.float32())])
|
||||
return pa.Table.from_batches([], schema=empty_schema)
|
||||
scores = pa.array(scores)
|
||||
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
|
||||
output_tbl = output_tbl.append_column("_score", scores)
|
||||
# this needs to match vector search results which are uint64
|
||||
row_ids = pa.array(row_ids, type=pa.uint64())
|
||||
|
||||
if self._where is not None:
|
||||
tmp_name = "__lancedb__duckdb__indexer__"
|
||||
output_tbl = output_tbl.append_column(
|
||||
tmp_name, pa.array(range(len(output_tbl)))
|
||||
)
|
||||
try:
|
||||
# TODO would be great to have Substrait generate pyarrow compute
|
||||
# expressions or conversely have pyarrow support SQL expressions
|
||||
# using Substrait
|
||||
import duckdb
|
||||
|
||||
indexer = duckdb.sql(
|
||||
f"SELECT {tmp_name} FROM output_tbl WHERE {self._where}"
|
||||
).to_arrow_table()[tmp_name]
|
||||
output_tbl = output_tbl.take(indexer).drop([tmp_name])
|
||||
row_ids = row_ids.take(indexer)
|
||||
|
||||
except ImportError:
|
||||
import tempfile
|
||||
|
||||
import lance
|
||||
|
||||
# TODO Use "memory://" instead once that's supported
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
ds = lance.write_dataset(output_tbl, tmp)
|
||||
output_tbl = ds.to_table(filter=self._where)
|
||||
indexer = output_tbl[tmp_name]
|
||||
row_ids = row_ids.take(indexer)
|
||||
output_tbl = output_tbl.drop([tmp_name])
|
||||
|
||||
if self._with_row_id:
|
||||
output_tbl = output_tbl.append_column("_rowid", row_ids)
|
||||
|
||||
if self._reranker is not None:
|
||||
output_tbl = self._reranker.rerank_fts(self._query, output_tbl)
|
||||
return output_tbl
|
||||
|
||||
def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
|
||||
"""Rerank the results using the specified reranker.
|
||||
|
||||
@@ -1730,7 +1643,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
def _validate_query(self, query, vector=None, text=None):
|
||||
if query is not None and (vector is not None or text is not None):
|
||||
raise ValueError(
|
||||
"You can either provide a string query in search() method"
|
||||
"You can either provide a string query in search() method "
|
||||
"or set `vector()` and `text()` explicitly for hybrid search."
|
||||
"But not both."
|
||||
)
|
||||
|
||||
@@ -145,6 +145,33 @@ class TlsConfig:
|
||||
|
||||
@dataclass
|
||||
class ClientConfig:
|
||||
"""Configuration for the LanceDB Cloud HTTP client.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
user_agent: str
|
||||
User agent string sent with requests.
|
||||
retry_config: RetryConfig
|
||||
Configuration for retrying failed requests.
|
||||
timeout_config: Optional[TimeoutConfig]
|
||||
Configuration for request timeouts.
|
||||
extra_headers: Optional[dict]
|
||||
Additional headers to include in requests.
|
||||
id_delimiter: Optional[str]
|
||||
The delimiter to use when constructing object identifiers.
|
||||
tls_config: Optional[TlsConfig]
|
||||
TLS/mTLS configuration for secure connections.
|
||||
header_provider: Optional[HeaderProvider]
|
||||
Provider for dynamic headers to be added to each request.
|
||||
user_id: Optional[str]
|
||||
User identifier for tracking purposes. This is sent as the
|
||||
`x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
|
||||
|
||||
This can also be set via the `LANCEDB_USER_ID` environment variable.
|
||||
Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another
|
||||
environment variable that contains the user ID value.
|
||||
"""
|
||||
|
||||
user_agent: str = f"LanceDB-Python-Client/{__version__}"
|
||||
retry_config: RetryConfig = field(default_factory=RetryConfig)
|
||||
timeout_config: Optional[TimeoutConfig] = field(default_factory=TimeoutConfig)
|
||||
@@ -152,6 +179,7 @@ class ClientConfig:
|
||||
id_delimiter: Optional[str] = None
|
||||
tls_config: Optional[TlsConfig] = None
|
||||
header_provider: Optional["HeaderProvider"] = None
|
||||
user_id: Optional[str] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if isinstance(self.retry_config, dict):
|
||||
|
||||
@@ -24,6 +24,7 @@ from ..common import DATA
|
||||
from ..db import DBConnection, LOOP
|
||||
from ..embeddings import EmbeddingFunctionConfig
|
||||
from lance_namespace import (
|
||||
LanceNamespace,
|
||||
CreateNamespaceResponse,
|
||||
DescribeNamespaceResponse,
|
||||
DropNamespaceResponse,
|
||||
@@ -111,7 +112,7 @@ class RemoteDBConnection(DBConnection):
|
||||
@override
|
||||
def list_namespaces(
|
||||
self,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
page_token: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> ListNamespacesResponse:
|
||||
@@ -119,7 +120,7 @@ class RemoteDBConnection(DBConnection):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str], optional
|
||||
namespace_path: List[str], optional
|
||||
The parent namespace to list namespaces in.
|
||||
None or empty list represents root namespace.
|
||||
page_token: str, optional
|
||||
@@ -133,18 +134,18 @@ class RemoteDBConnection(DBConnection):
|
||||
ListNamespacesResponse
|
||||
Response containing namespace names and optional page_token for pagination.
|
||||
"""
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
return LOOP.run(
|
||||
self._conn.list_namespaces(
|
||||
namespace=namespace, page_token=page_token, limit=limit
|
||||
namespace_path=namespace_path, page_token=page_token, limit=limit
|
||||
)
|
||||
)
|
||||
|
||||
@override
|
||||
def create_namespace(
|
||||
self,
|
||||
namespace: List[str],
|
||||
namespace_path: List[str],
|
||||
mode: Optional[str] = None,
|
||||
properties: Optional[Dict[str, str]] = None,
|
||||
) -> CreateNamespaceResponse:
|
||||
@@ -152,7 +153,7 @@ class RemoteDBConnection(DBConnection):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str]
|
||||
namespace_path: List[str]
|
||||
The namespace identifier to create.
|
||||
mode: str, optional
|
||||
Creation mode - "create" (fail if exists), "exist_ok" (skip if exists),
|
||||
@@ -167,14 +168,14 @@ class RemoteDBConnection(DBConnection):
|
||||
"""
|
||||
return LOOP.run(
|
||||
self._conn.create_namespace(
|
||||
namespace=namespace, mode=mode, properties=properties
|
||||
namespace_path=namespace_path, mode=mode, properties=properties
|
||||
)
|
||||
)
|
||||
|
||||
@override
|
||||
def drop_namespace(
|
||||
self,
|
||||
namespace: List[str],
|
||||
namespace_path: List[str],
|
||||
mode: Optional[str] = None,
|
||||
behavior: Optional[str] = None,
|
||||
) -> DropNamespaceResponse:
|
||||
@@ -182,7 +183,7 @@ class RemoteDBConnection(DBConnection):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str]
|
||||
namespace_path: List[str]
|
||||
The namespace identifier to drop.
|
||||
mode: str, optional
|
||||
Whether to skip if not exists ("SKIP") or fail ("FAIL"). Case insensitive.
|
||||
@@ -196,16 +197,20 @@ class RemoteDBConnection(DBConnection):
|
||||
Response containing properties and transaction_id if applicable.
|
||||
"""
|
||||
return LOOP.run(
|
||||
self._conn.drop_namespace(namespace=namespace, mode=mode, behavior=behavior)
|
||||
self._conn.drop_namespace(
|
||||
namespace_path=namespace_path, mode=mode, behavior=behavior
|
||||
)
|
||||
)
|
||||
|
||||
@override
|
||||
def describe_namespace(self, namespace: List[str]) -> DescribeNamespaceResponse:
|
||||
def describe_namespace(
|
||||
self, namespace_path: List[str]
|
||||
) -> DescribeNamespaceResponse:
|
||||
"""Describe a namespace.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str]
|
||||
namespace_path: List[str]
|
||||
The namespace identifier to describe.
|
||||
|
||||
Returns
|
||||
@@ -213,12 +218,12 @@ class RemoteDBConnection(DBConnection):
|
||||
DescribeNamespaceResponse
|
||||
Response containing the namespace properties.
|
||||
"""
|
||||
return LOOP.run(self._conn.describe_namespace(namespace=namespace))
|
||||
return LOOP.run(self._conn.describe_namespace(namespace_path=namespace_path))
|
||||
|
||||
@override
|
||||
def list_tables(
|
||||
self,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
page_token: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> ListTablesResponse:
|
||||
@@ -226,7 +231,7 @@ class RemoteDBConnection(DBConnection):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str], optional
|
||||
namespace_path: List[str], optional
|
||||
The namespace to list tables in.
|
||||
None or empty list represents root namespace.
|
||||
page_token: str, optional
|
||||
@@ -240,11 +245,11 @@ class RemoteDBConnection(DBConnection):
|
||||
ListTablesResponse
|
||||
Response containing table names and optional page_token for pagination.
|
||||
"""
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
return LOOP.run(
|
||||
self._conn.list_tables(
|
||||
namespace=namespace, page_token=page_token, limit=limit
|
||||
namespace_path=namespace_path, page_token=page_token, limit=limit
|
||||
)
|
||||
)
|
||||
|
||||
@@ -254,7 +259,7 @@ class RemoteDBConnection(DBConnection):
|
||||
page_token: Optional[str] = None,
|
||||
limit: int = 10,
|
||||
*,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
) -> Iterable[str]:
|
||||
"""List the names of all tables in the database.
|
||||
|
||||
@@ -263,7 +268,7 @@ class RemoteDBConnection(DBConnection):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
namespace: List[str], default []
|
||||
namespace_path: List[str], default []
|
||||
The namespace to list tables in.
|
||||
Empty list represents root namespace.
|
||||
page_token: str
|
||||
@@ -282,11 +287,11 @@ class RemoteDBConnection(DBConnection):
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
return LOOP.run(
|
||||
self._conn.table_names(
|
||||
namespace=namespace, start_after=page_token, limit=limit
|
||||
namespace_path=namespace_path, start_after=page_token, limit=limit
|
||||
)
|
||||
)
|
||||
|
||||
@@ -295,7 +300,7 @@ class RemoteDBConnection(DBConnection):
|
||||
self,
|
||||
name: str,
|
||||
*,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
) -> Table:
|
||||
@@ -305,7 +310,7 @@ class RemoteDBConnection(DBConnection):
|
||||
----------
|
||||
name: str
|
||||
The name of the table.
|
||||
namespace: List[str], optional
|
||||
namespace_path: List[str], optional
|
||||
The namespace to open the table from.
|
||||
None or empty list represents root namespace.
|
||||
|
||||
@@ -315,15 +320,15 @@ class RemoteDBConnection(DBConnection):
|
||||
"""
|
||||
from .table import RemoteTable
|
||||
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
if index_cache_size is not None:
|
||||
logging.info(
|
||||
"index_cache_size is ignored in LanceDb Cloud"
|
||||
" (there is no local cache to configure)"
|
||||
)
|
||||
|
||||
table = LOOP.run(self._conn.open_table(name, namespace=namespace))
|
||||
table = LOOP.run(self._conn.open_table(name, namespace_path=namespace_path))
|
||||
return RemoteTable(table, self.db_name)
|
||||
|
||||
def clone_table(
|
||||
@@ -331,7 +336,7 @@ class RemoteDBConnection(DBConnection):
|
||||
target_table_name: str,
|
||||
source_uri: str,
|
||||
*,
|
||||
target_namespace: Optional[List[str]] = None,
|
||||
target_namespace_path: Optional[List[str]] = None,
|
||||
source_version: Optional[int] = None,
|
||||
source_tag: Optional[str] = None,
|
||||
is_shallow: bool = True,
|
||||
@@ -344,7 +349,7 @@ class RemoteDBConnection(DBConnection):
|
||||
The name of the target table to create.
|
||||
source_uri: str
|
||||
The URI of the source table to clone from.
|
||||
target_namespace: List[str], optional
|
||||
target_namespace_path: List[str], optional
|
||||
The namespace for the target table.
|
||||
None or empty list represents root namespace.
|
||||
source_version: int, optional
|
||||
@@ -361,13 +366,13 @@ class RemoteDBConnection(DBConnection):
|
||||
"""
|
||||
from .table import RemoteTable
|
||||
|
||||
if target_namespace is None:
|
||||
target_namespace = []
|
||||
if target_namespace_path is None:
|
||||
target_namespace_path = []
|
||||
table = LOOP.run(
|
||||
self._conn.clone_table(
|
||||
target_table_name,
|
||||
source_uri,
|
||||
target_namespace=target_namespace,
|
||||
target_namespace_path=target_namespace_path,
|
||||
source_version=source_version,
|
||||
source_tag=source_tag,
|
||||
is_shallow=is_shallow,
|
||||
@@ -387,7 +392,7 @@ class RemoteDBConnection(DBConnection):
|
||||
exist_ok: bool = False,
|
||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||
*,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
) -> Table:
|
||||
"""Create a [Table][lancedb.table.Table] in the database.
|
||||
|
||||
@@ -395,7 +400,7 @@ class RemoteDBConnection(DBConnection):
|
||||
----------
|
||||
name: str
|
||||
The name of the table.
|
||||
namespace: List[str], optional
|
||||
namespace_path: List[str], optional
|
||||
The namespace to create the table in.
|
||||
None or empty list represents root namespace.
|
||||
data: The data to initialize the table, *optional*
|
||||
@@ -495,8 +500,8 @@ class RemoteDBConnection(DBConnection):
|
||||
mode = "exist_ok"
|
||||
elif not mode:
|
||||
mode = "exist_ok"
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
validate_table_name(name)
|
||||
if embedding_functions is not None:
|
||||
logging.warning(
|
||||
@@ -511,7 +516,7 @@ class RemoteDBConnection(DBConnection):
|
||||
self._conn.create_table(
|
||||
name,
|
||||
data,
|
||||
namespace=namespace,
|
||||
namespace_path=namespace_path,
|
||||
mode=mode,
|
||||
schema=schema,
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
@@ -521,28 +526,28 @@ class RemoteDBConnection(DBConnection):
|
||||
return RemoteTable(table, self.db_name)
|
||||
|
||||
@override
|
||||
def drop_table(self, name: str, namespace: Optional[List[str]] = None):
|
||||
def drop_table(self, name: str, namespace_path: Optional[List[str]] = None):
|
||||
"""Drop a table from the database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
The name of the table.
|
||||
namespace: List[str], optional
|
||||
namespace_path: List[str], optional
|
||||
The namespace to drop the table from.
|
||||
None or empty list represents root namespace.
|
||||
"""
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
LOOP.run(self._conn.drop_table(name, namespace=namespace))
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
LOOP.run(self._conn.drop_table(name, namespace_path=namespace_path))
|
||||
|
||||
@override
|
||||
def rename_table(
|
||||
self,
|
||||
cur_name: str,
|
||||
new_name: str,
|
||||
cur_namespace: Optional[List[str]] = None,
|
||||
new_namespace: Optional[List[str]] = None,
|
||||
cur_namespace_path: Optional[List[str]] = None,
|
||||
new_namespace_path: Optional[List[str]] = None,
|
||||
):
|
||||
"""Rename a table in the database.
|
||||
|
||||
@@ -553,19 +558,32 @@ class RemoteDBConnection(DBConnection):
|
||||
new_name: str
|
||||
The new name of the table.
|
||||
"""
|
||||
if cur_namespace is None:
|
||||
cur_namespace = []
|
||||
if new_namespace is None:
|
||||
new_namespace = []
|
||||
if cur_namespace_path is None:
|
||||
cur_namespace_path = []
|
||||
if new_namespace_path is None:
|
||||
new_namespace_path = []
|
||||
LOOP.run(
|
||||
self._conn.rename_table(
|
||||
cur_name,
|
||||
new_name,
|
||||
cur_namespace=cur_namespace,
|
||||
new_namespace=new_namespace,
|
||||
cur_namespace_path=cur_namespace_path,
|
||||
new_namespace_path=new_namespace_path,
|
||||
)
|
||||
)
|
||||
|
||||
@override
|
||||
def namespace_client(self) -> LanceNamespace:
|
||||
"""Get the equivalent namespace client for this connection.
|
||||
|
||||
Returns a RestNamespace with the same URI and authentication headers.
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceNamespace
|
||||
The namespace client for this connection.
|
||||
"""
|
||||
return LOOP.run(self._conn.namespace_client())
|
||||
|
||||
async def close(self):
|
||||
"""Close the connection to the database."""
|
||||
self._conn.close()
|
||||
|
||||
@@ -22,6 +22,7 @@ from lancedb.index import (
|
||||
FTS,
|
||||
BTree,
|
||||
Bitmap,
|
||||
HnswFlat,
|
||||
HnswSq,
|
||||
IvfFlat,
|
||||
IvfPq,
|
||||
@@ -39,6 +40,7 @@ from lancedb.table import _normalize_progress
|
||||
|
||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
||||
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
||||
from ..types import BaseTokenizerType
|
||||
|
||||
|
||||
class RemoteTable(Table):
|
||||
@@ -167,7 +169,7 @@ class RemoteTable(Table):
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
with_position: bool = False,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
base_tokenizer: BaseTokenizerType = "simple",
|
||||
language: str = "English",
|
||||
max_token_length: Optional[int] = 40,
|
||||
lower_case: bool = True,
|
||||
@@ -284,13 +286,15 @@ class RemoteTable(Table):
|
||||
)
|
||||
elif index_type == "IVF_HNSW_SQ":
|
||||
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_HNSW_FLAT":
|
||||
config = HnswFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_FLAT":
|
||||
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown vector index type: {index_type}. Valid options are"
|
||||
" 'IVF_FLAT', 'IVF_PQ', 'IVF_RQ', 'IVF_SQ',"
|
||||
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
|
||||
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ', 'IVF_HNSW_FLAT'"
|
||||
)
|
||||
|
||||
LOOP.run(
|
||||
|
||||
@@ -57,6 +57,7 @@ from .index import (
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from .merge import LanceMergeInsertBuilder
|
||||
@@ -86,10 +87,62 @@ from .util import (
|
||||
)
|
||||
from .index import lang_mapping
|
||||
|
||||
_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera")
|
||||
_MODEL_BACKED_TOKENIZER_ERRORS = (
|
||||
"unknown base tokenizer",
|
||||
"Invalid directory path:",
|
||||
"Failed to load Jieba",
|
||||
"Failed to load tokenizer config",
|
||||
"Failed to initialize default tokenizer",
|
||||
)
|
||||
|
||||
|
||||
def _add_unique_note(exception: BaseException, note: str) -> None:
|
||||
existing_notes = getattr(exception, "__notes__", ()) or ()
|
||||
message = (
|
||||
exception.args[0]
|
||||
if exception.args and isinstance(exception.args[0], str)
|
||||
else ""
|
||||
)
|
||||
if note not in existing_notes and note not in message:
|
||||
add_note(exception, note)
|
||||
|
||||
|
||||
def _is_model_backed_tokenizer(base_tokenizer: str) -> bool:
|
||||
return any(
|
||||
base_tokenizer == prefix or base_tokenizer.startswith(f"{prefix}/")
|
||||
for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES
|
||||
)
|
||||
|
||||
|
||||
def _maybe_add_fts_error_note(
|
||||
exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None
|
||||
) -> None:
|
||||
message = str(exception)
|
||||
if language is not None and "not support the requested language" in message:
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
_add_unique_note(exception, f"Supported languages: {supported_langs}")
|
||||
return
|
||||
|
||||
if not _is_model_backed_tokenizer(base_tokenizer):
|
||||
return
|
||||
|
||||
if not any(marker in message for marker in _MODEL_BACKED_TOKENIZER_ERRORS):
|
||||
return
|
||||
|
||||
_add_unique_note(
|
||||
exception,
|
||||
"Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' "
|
||||
"require tokenizer models in Lance's language model home. Set "
|
||||
"LANCE_LANGUAGE_MODEL_HOME to override the default platform data "
|
||||
"directory under 'lance/language_models'. Expected layouts include "
|
||||
"'<model-home>/jieba/default/...' and "
|
||||
"'<model-home>/lindera/ipadic/...'.",
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .db import LanceDBConnection
|
||||
from .io import StorageOptionsProvider
|
||||
from ._lancedb import (
|
||||
Table as LanceDBTable,
|
||||
OptimizeStats,
|
||||
@@ -192,7 +245,7 @@ def _into_pyarrow_reader(
|
||||
f"Unknown data type {type(data)}. "
|
||||
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
||||
"pyarrow Table/RecordBatch, or Pydantic models. "
|
||||
"See https://lancedb.com/docs/tables/ for examples."
|
||||
"See https://docs.lancedb.com/tables/ for examples."
|
||||
)
|
||||
|
||||
|
||||
@@ -271,15 +324,17 @@ def _sanitize_data(
|
||||
reader,
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
target_schema=target_schema,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
if target_schema is None:
|
||||
target_schema, reader = _infer_target_schema(reader)
|
||||
|
||||
if metadata:
|
||||
new_metadata = target_schema.metadata or {}
|
||||
new_metadata.update(metadata)
|
||||
target_schema = target_schema.with_metadata(new_metadata)
|
||||
target_schema = target_schema.with_metadata(
|
||||
_merge_metadata(target_schema.metadata, metadata)
|
||||
)
|
||||
|
||||
_validate_schema(target_schema)
|
||||
reader = _cast_to_target_schema(reader, target_schema, allow_subschema)
|
||||
@@ -295,7 +350,7 @@ def _cast_to_target_schema(
|
||||
# pa.Table.cast expects field order not to be changed.
|
||||
# Lance doesn't care about field order, so we don't need to rearrange fields
|
||||
# to match the target schema. We just need to correctly cast the fields.
|
||||
if reader.schema == target_schema:
|
||||
if reader.schema.equals(target_schema, check_metadata=True):
|
||||
# Fast path when the schemas are already the same
|
||||
return reader
|
||||
|
||||
@@ -315,7 +370,13 @@ def _cast_to_target_schema(
|
||||
def gen():
|
||||
for batch in reader:
|
||||
# Table but not RecordBatch has cast.
|
||||
yield pa.Table.from_batches([batch]).cast(reordered_schema).to_batches()[0]
|
||||
cast_batches = (
|
||||
pa.Table.from_batches([batch]).cast(reordered_schema).to_batches()
|
||||
)
|
||||
if cast_batches:
|
||||
yield pa.RecordBatch.from_arrays(
|
||||
cast_batches[0].columns, schema=reordered_schema
|
||||
)
|
||||
|
||||
return pa.RecordBatchReader.from_batches(reordered_schema, gen())
|
||||
|
||||
@@ -333,37 +394,51 @@ def _align_field_types(
|
||||
if target_field is None:
|
||||
raise ValueError(f"Field '{field.name}' not found in target schema")
|
||||
if pa.types.is_struct(target_field.type):
|
||||
new_type = pa.struct(
|
||||
_align_field_types(
|
||||
field.type.fields,
|
||||
target_field.type.fields,
|
||||
if pa.types.is_struct(field.type):
|
||||
new_type = pa.struct(
|
||||
_align_field_types(
|
||||
field.type.fields,
|
||||
target_field.type.fields,
|
||||
)
|
||||
)
|
||||
)
|
||||
else:
|
||||
new_type = target_field.type
|
||||
elif pa.types.is_list(target_field.type):
|
||||
new_type = pa.list_(
|
||||
_align_field_types(
|
||||
[field.type.value_field],
|
||||
[target_field.type.value_field],
|
||||
)[0]
|
||||
)
|
||||
if _is_list_like(field.type):
|
||||
new_type = pa.list_(
|
||||
_align_field_types(
|
||||
[field.type.value_field],
|
||||
[target_field.type.value_field],
|
||||
)[0]
|
||||
)
|
||||
else:
|
||||
new_type = target_field.type
|
||||
elif pa.types.is_large_list(target_field.type):
|
||||
new_type = pa.large_list(
|
||||
_align_field_types(
|
||||
[field.type.value_field],
|
||||
[target_field.type.value_field],
|
||||
)[0]
|
||||
)
|
||||
if _is_list_like(field.type):
|
||||
new_type = pa.large_list(
|
||||
_align_field_types(
|
||||
[field.type.value_field],
|
||||
[target_field.type.value_field],
|
||||
)[0]
|
||||
)
|
||||
else:
|
||||
new_type = target_field.type
|
||||
elif pa.types.is_fixed_size_list(target_field.type):
|
||||
new_type = pa.list_(
|
||||
_align_field_types(
|
||||
[field.type.value_field],
|
||||
[target_field.type.value_field],
|
||||
)[0],
|
||||
target_field.type.list_size,
|
||||
)
|
||||
if _is_list_like(field.type):
|
||||
new_type = pa.list_(
|
||||
_align_field_types(
|
||||
[field.type.value_field],
|
||||
[target_field.type.value_field],
|
||||
)[0],
|
||||
target_field.type.list_size,
|
||||
)
|
||||
else:
|
||||
new_type = target_field.type
|
||||
else:
|
||||
new_type = target_field.type
|
||||
new_fields.append(pa.field(field.name, new_type, field.nullable))
|
||||
new_fields.append(
|
||||
pa.field(field.name, new_type, field.nullable, target_field.metadata)
|
||||
)
|
||||
return new_fields
|
||||
|
||||
|
||||
@@ -441,6 +516,7 @@ def sanitize_create_table(
|
||||
schema = data.schema
|
||||
|
||||
if metadata:
|
||||
metadata = _merge_metadata(schema.metadata, metadata)
|
||||
schema = schema.with_metadata(metadata)
|
||||
# Need to apply metadata to the data as well
|
||||
if isinstance(data, pa.Table):
|
||||
@@ -493,9 +569,9 @@ def _append_vector_columns(
|
||||
vector columns to the table.
|
||||
"""
|
||||
if schema is None:
|
||||
metadata = metadata or {}
|
||||
metadata = _merge_metadata(metadata)
|
||||
else:
|
||||
metadata = schema.metadata or metadata or {}
|
||||
metadata = _merge_metadata(schema.metadata, metadata)
|
||||
functions = EmbeddingFunctionRegistry.get_instance().parse_functions(metadata)
|
||||
|
||||
if not functions:
|
||||
@@ -921,29 +997,29 @@ class Table(ABC):
|
||||
Parameters
|
||||
----------
|
||||
field_names: str or list of str
|
||||
The name(s) of the field to index.
|
||||
If ``use_tantivy`` is False (default), only a single field name
|
||||
(str) is supported. To index multiple fields, create a separate
|
||||
FTS index for each field.
|
||||
The name of the field to index. Native FTS indexes can only be
|
||||
created on a single field at a time. To search over multiple text
|
||||
fields, create a separate FTS index for each field.
|
||||
replace: bool, default False
|
||||
If True, replace the existing index if it exists. Note that this is
|
||||
not yet an atomic operation; the index will be temporarily
|
||||
unavailable while the new index is being created.
|
||||
writer_heap_size: int, default 1GB
|
||||
Only available with use_tantivy=True
|
||||
Deprecated legacy Tantivy parameter. Any value other than the
|
||||
default raises an error.
|
||||
ordering_field_names:
|
||||
A list of unsigned type fields to index to optionally order
|
||||
results on at search time.
|
||||
only available with use_tantivy=True
|
||||
Deprecated legacy Tantivy parameter. Setting this raises an error.
|
||||
tokenizer_name: str, default "default"
|
||||
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
|
||||
language code followed by "_stem". So for english it would be "en_stem".
|
||||
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
|
||||
A compatibility alias for native tokenizer configs. Can be "raw",
|
||||
"default" or the 2 letter language code followed by "_stem". So
|
||||
for english it would be "en_stem". For new native FTS indexes, use
|
||||
``base_tokenizer`` directly; ``tokenizer_name`` is a legacy
|
||||
compatibility alias and does not expose model-backed tokenizer names
|
||||
such as ``jieba/default`` or ``lindera/ipadic``.
|
||||
use_tantivy: bool, default False
|
||||
If True, use the legacy full-text search implementation based on tantivy.
|
||||
If False, use the new full-text search implementation based on lance-index.
|
||||
Deprecated legacy Tantivy parameter. Setting this to True raises an
|
||||
error.
|
||||
with_position: bool, default False
|
||||
Only available with use_tantivy=False
|
||||
If False, do not store the positions of the terms in the text.
|
||||
This can reduce the size of the index and improve indexing speed.
|
||||
But it will raise an exception for phrase queries.
|
||||
@@ -953,8 +1029,11 @@ class Table(ABC):
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-Gram tokenizer.
|
||||
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
|
||||
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not
|
||||
the primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -980,6 +1059,13 @@ class Table(ABC):
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
The name of the index. If not provided, a default name will be generated.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models in Lance's language model home. Set
|
||||
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
|
||||
directory under ``lance/language_models``.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -1724,6 +1810,16 @@ class Table(ABC):
|
||||
index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
|
||||
return (path, fs, index_exists)
|
||||
|
||||
def _ensure_no_legacy_fts_index(self):
|
||||
path, _, exists = self._get_fts_index_path()
|
||||
if exists:
|
||||
raise ValueError(
|
||||
"Legacy Tantivy FTS index detected at "
|
||||
f"{path}. Tantivy-based FTS has been removed. "
|
||||
"Delete the legacy index and recreate it with "
|
||||
"table.create_fts_index(...)."
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def uses_v2_manifest_paths(self) -> bool:
|
||||
"""
|
||||
@@ -1776,30 +1872,30 @@ class LanceTable(Table):
|
||||
connection: "LanceDBConnection",
|
||||
name: str,
|
||||
*,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
storage_options_provider: Optional["StorageOptionsProvider"] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
location: Optional[str] = None,
|
||||
namespace_client: Optional[Any] = None,
|
||||
managed_versioning: Optional[bool] = None,
|
||||
pushdown_operations: Optional[set] = None,
|
||||
_async: AsyncTable = None,
|
||||
):
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
self._conn = connection
|
||||
self._namespace = namespace
|
||||
self._namespace_path = namespace_path
|
||||
self._location = location # Store location for use in _dataset_path
|
||||
self._namespace_client = namespace_client
|
||||
self._pushdown_operations = pushdown_operations or set()
|
||||
if _async is not None:
|
||||
self._table = _async
|
||||
else:
|
||||
self._table = LOOP.run(
|
||||
connection._conn.open_table(
|
||||
name,
|
||||
namespace=namespace,
|
||||
namespace_path=namespace_path,
|
||||
storage_options=storage_options,
|
||||
storage_options_provider=storage_options_provider,
|
||||
index_cache_size=index_cache_size,
|
||||
location=location,
|
||||
namespace_client=namespace_client,
|
||||
@@ -1814,13 +1910,13 @@ class LanceTable(Table):
|
||||
@property
|
||||
def namespace(self) -> List[str]:
|
||||
"""Return the namespace path of the table."""
|
||||
return self._namespace
|
||||
return self._namespace_path
|
||||
|
||||
@property
|
||||
def id(self) -> str:
|
||||
"""Return the full identifier of the table (namespace$name)."""
|
||||
if self._namespace:
|
||||
return "$".join(self._namespace + [self.name])
|
||||
if self._namespace_path:
|
||||
return "$".join(self._namespace_path + [self.name])
|
||||
return self.name
|
||||
|
||||
@classmethod
|
||||
@@ -1841,26 +1937,26 @@ class LanceTable(Table):
|
||||
db,
|
||||
name,
|
||||
*,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
storage_options_provider: Optional["StorageOptionsProvider"] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
location: Optional[str] = None,
|
||||
namespace_client: Optional[Any] = None,
|
||||
managed_versioning: Optional[bool] = None,
|
||||
pushdown_operations: Optional[set] = None,
|
||||
):
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
tbl = cls(
|
||||
db,
|
||||
name,
|
||||
namespace=namespace,
|
||||
namespace_path=namespace_path,
|
||||
storage_options=storage_options,
|
||||
storage_options_provider=storage_options_provider,
|
||||
index_cache_size=index_cache_size,
|
||||
location=location,
|
||||
namespace_client=namespace_client,
|
||||
managed_versioning=managed_versioning,
|
||||
pushdown_operations=pushdown_operations,
|
||||
)
|
||||
|
||||
# check the dataset exists
|
||||
@@ -1893,11 +1989,11 @@ class LanceTable(Table):
|
||||
)
|
||||
|
||||
if self._namespace_client is not None:
|
||||
table_id = self._namespace + [self.name]
|
||||
table_id = self._namespace_path + [self.name]
|
||||
return lance.dataset(
|
||||
version=self.version,
|
||||
storage_options=self._conn.storage_options,
|
||||
namespace=self._namespace_client,
|
||||
namespace_client=self._namespace_client,
|
||||
table_id=table_id,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -2141,7 +2237,13 @@ class LanceTable(Table):
|
||||
index_cache_size: Optional[int] = None,
|
||||
num_bits: int = 8,
|
||||
index_type: Literal[
|
||||
"IVF_FLAT", "IVF_SQ", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
|
||||
"IVF_FLAT",
|
||||
"IVF_SQ",
|
||||
"IVF_PQ",
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
] = "IVF_PQ",
|
||||
max_iterations: int = 50,
|
||||
sample_rate: int = 256,
|
||||
@@ -2228,6 +2330,16 @@ class LanceTable(Table):
|
||||
ef_construction=ef_construction,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
elif index_type == "IVF_HNSW_FLAT":
|
||||
config = HnswFlat(
|
||||
distance_type=metric,
|
||||
num_partitions=num_partitions,
|
||||
max_iterations=max_iterations,
|
||||
sample_rate=sample_rate,
|
||||
m=m,
|
||||
ef_construction=ef_construction,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown index type {index_type}")
|
||||
|
||||
@@ -2383,41 +2495,57 @@ class LanceTable(Table):
|
||||
prefix_only: bool = False,
|
||||
name: Optional[str] = None,
|
||||
):
|
||||
if not use_tantivy:
|
||||
if not isinstance(field_names, str):
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on a single field "
|
||||
"at a time. To search over multiple text fields, create a "
|
||||
"separate FTS index for each field."
|
||||
)
|
||||
self._ensure_no_legacy_fts_index()
|
||||
|
||||
if tokenizer_name is None:
|
||||
tokenizer_configs = {
|
||||
"base_tokenizer": base_tokenizer,
|
||||
"language": language,
|
||||
"with_position": with_position,
|
||||
"max_token_length": max_token_length,
|
||||
"lower_case": lower_case,
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
"ngram_min_length": ngram_min_length,
|
||||
"ngram_max_length": ngram_max_length,
|
||||
"prefix_only": prefix_only,
|
||||
}
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
|
||||
config = FTS(
|
||||
**tokenizer_configs,
|
||||
if use_tantivy:
|
||||
raise ValueError(
|
||||
"Tantivy-based FTS has been removed. "
|
||||
"Remove use_tantivy and recreate the index with native FTS."
|
||||
)
|
||||
if ordering_field_names is not None:
|
||||
raise ValueError(
|
||||
"ordering_field_names was only supported by the removed "
|
||||
"Tantivy-based FTS implementation."
|
||||
)
|
||||
if writer_heap_size != 1024 * 1024 * 1024:
|
||||
raise ValueError(
|
||||
"writer_heap_size was only supported by the removed "
|
||||
"Tantivy-based FTS implementation."
|
||||
)
|
||||
if not isinstance(field_names, str):
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on a single field "
|
||||
"at a time. To search over multiple text fields, create a "
|
||||
"separate FTS index for each field."
|
||||
)
|
||||
if "." in field_names:
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on top-level fields. "
|
||||
f"Received nested field path: {field_names!r}."
|
||||
)
|
||||
|
||||
# delete the existing legacy index if it exists
|
||||
if replace:
|
||||
path, fs, exist = self._get_fts_index_path()
|
||||
if exist:
|
||||
fs.delete_dir(path)
|
||||
if tokenizer_name is None:
|
||||
tokenizer_configs = {
|
||||
"base_tokenizer": base_tokenizer,
|
||||
"language": language,
|
||||
"with_position": with_position,
|
||||
"max_token_length": max_token_length,
|
||||
"lower_case": lower_case,
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
"ngram_min_length": ngram_min_length,
|
||||
"ngram_max_length": ngram_max_length,
|
||||
"prefix_only": prefix_only,
|
||||
}
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
|
||||
config = FTS(
|
||||
**tokenizer_configs,
|
||||
)
|
||||
|
||||
try:
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
field_names,
|
||||
@@ -2426,42 +2554,13 @@ class LanceTable(Table):
|
||||
name=name,
|
||||
)
|
||||
)
|
||||
return
|
||||
|
||||
from .fts import create_index, populate_index
|
||||
|
||||
if isinstance(field_names, str):
|
||||
field_names = [field_names]
|
||||
|
||||
if isinstance(ordering_field_names, str):
|
||||
ordering_field_names = [ordering_field_names]
|
||||
|
||||
path, fs, exist = self._get_fts_index_path()
|
||||
if exist:
|
||||
if not replace:
|
||||
raise ValueError("Index already exists. Use replace=True to overwrite.")
|
||||
fs.delete_dir(path)
|
||||
|
||||
if not isinstance(fs, pa_fs.LocalFileSystem):
|
||||
raise NotImplementedError(
|
||||
"Full-text search is only supported on the local filesystem"
|
||||
except (ValueError, RuntimeError) as e:
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
|
||||
if tokenizer_name is None:
|
||||
tokenizer_name = "default"
|
||||
index = create_index(
|
||||
path,
|
||||
field_names,
|
||||
ordering_fields=ordering_field_names,
|
||||
tokenizer_name=tokenizer_name,
|
||||
)
|
||||
populate_index(
|
||||
index,
|
||||
self,
|
||||
field_names,
|
||||
ordering_fields=ordering_field_names,
|
||||
writer_heap_size=writer_heap_size,
|
||||
)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
|
||||
@@ -2803,13 +2902,13 @@ class LanceTable(Table):
|
||||
fill_value: float = 0.0,
|
||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||
*,
|
||||
namespace: Optional[List[str]] = None,
|
||||
namespace_path: Optional[List[str]] = None,
|
||||
storage_options: Optional[Dict[str, str | bool]] = None,
|
||||
storage_options_provider: Optional["StorageOptionsProvider"] = None,
|
||||
data_storage_version: Optional[str] = None,
|
||||
enable_v2_manifest_paths: Optional[bool] = None,
|
||||
location: Optional[str] = None,
|
||||
namespace_client: Optional[Any] = None,
|
||||
pushdown_operations: Optional[set] = None,
|
||||
):
|
||||
"""
|
||||
Create a new table.
|
||||
@@ -2864,13 +2963,14 @@ class LanceTable(Table):
|
||||
Deprecated. Set `storage_options` when connecting to the database and set
|
||||
`new_table_enable_v2_manifest_paths` in the options.
|
||||
"""
|
||||
if namespace is None:
|
||||
namespace = []
|
||||
if namespace_path is None:
|
||||
namespace_path = []
|
||||
self = cls.__new__(cls)
|
||||
self._conn = db
|
||||
self._namespace = namespace
|
||||
self._namespace_path = namespace_path
|
||||
self._location = location
|
||||
self._namespace_client = namespace_client
|
||||
self._pushdown_operations = pushdown_operations or set()
|
||||
|
||||
if data_storage_version is not None:
|
||||
warnings.warn(
|
||||
@@ -2903,10 +3003,10 @@ class LanceTable(Table):
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
embedding_functions=embedding_functions,
|
||||
namespace=namespace,
|
||||
namespace_path=namespace_path,
|
||||
storage_options=storage_options,
|
||||
storage_options_provider=storage_options_provider,
|
||||
location=location,
|
||||
namespace_client=namespace_client,
|
||||
)
|
||||
)
|
||||
return self
|
||||
@@ -2974,6 +3074,15 @@ class LanceTable(Table):
|
||||
batch_size: Optional[int] = None,
|
||||
timeout: Optional[timedelta] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
if (
|
||||
"QueryTable" in self._pushdown_operations
|
||||
and self._namespace_client is not None
|
||||
):
|
||||
from lancedb.namespace import _execute_server_side_query
|
||||
|
||||
table_id = self._namespace_path + [self.name]
|
||||
return _execute_server_side_query(self._namespace_client, table_id, query)
|
||||
|
||||
async_iter = LOOP.run(
|
||||
self._table._execute_query(query, batch_size=batch_size, timeout=timeout)
|
||||
)
|
||||
@@ -3203,43 +3312,157 @@ def _handle_bad_vectors(
|
||||
reader: pa.RecordBatchReader,
|
||||
on_bad_vectors: Literal["error", "drop", "fill", "null"] = "error",
|
||||
fill_value: float = 0.0,
|
||||
target_schema: Optional[pa.Schema] = None,
|
||||
metadata: Optional[dict] = None,
|
||||
) -> pa.RecordBatchReader:
|
||||
vector_columns = []
|
||||
vector_columns = _find_vector_columns(reader.schema, target_schema, metadata)
|
||||
if not vector_columns:
|
||||
return reader
|
||||
|
||||
for field in reader.schema:
|
||||
# They can provide a 'vector' column that isn't yet a FSL
|
||||
named_vector_col = (
|
||||
(
|
||||
pa.types.is_list(field.type)
|
||||
or pa.types.is_large_list(field.type)
|
||||
or pa.types.is_fixed_size_list(field.type)
|
||||
)
|
||||
and pa.types.is_floating(field.type.value_type)
|
||||
and field.name == VECTOR_COLUMN_NAME
|
||||
)
|
||||
# TODO: we're making an assumption that fixed size list of 10 or more
|
||||
# is a vector column. This is definitely a bit hacky.
|
||||
likely_vector_col = (
|
||||
pa.types.is_fixed_size_list(field.type)
|
||||
and pa.types.is_floating(field.type.value_type)
|
||||
and (field.type.list_size >= 10)
|
||||
)
|
||||
|
||||
if named_vector_col or likely_vector_col:
|
||||
vector_columns.append(field.name)
|
||||
output_schema = _vector_output_schema(reader.schema, vector_columns)
|
||||
|
||||
def gen():
|
||||
for batch in reader:
|
||||
for name in vector_columns:
|
||||
pending_dims = []
|
||||
for vector_column in vector_columns:
|
||||
dim = vector_column["expected_dim"]
|
||||
if target_schema is not None and dim is None:
|
||||
dim = _infer_vector_dim(batch[vector_column["name"]])
|
||||
pending_dims.append(vector_column)
|
||||
batch = _handle_bad_vector_column(
|
||||
batch,
|
||||
vector_column_name=name,
|
||||
vector_column_name=vector_column["name"],
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
expected_dim=dim,
|
||||
expected_value_type=vector_column["expected_value_type"],
|
||||
)
|
||||
yield batch
|
||||
for vector_column in pending_dims:
|
||||
if vector_column["expected_dim"] is None:
|
||||
vector_column["expected_dim"] = _infer_vector_dim(
|
||||
batch[vector_column["name"]]
|
||||
)
|
||||
if batch.schema.equals(output_schema, check_metadata=True):
|
||||
yield batch
|
||||
continue
|
||||
|
||||
return pa.RecordBatchReader.from_batches(reader.schema, gen())
|
||||
cast_batches = (
|
||||
pa.Table.from_batches([batch]).cast(output_schema).to_batches()
|
||||
)
|
||||
if cast_batches:
|
||||
yield pa.RecordBatch.from_arrays(
|
||||
cast_batches[0].columns,
|
||||
schema=output_schema,
|
||||
)
|
||||
|
||||
return pa.RecordBatchReader.from_batches(output_schema, gen())
|
||||
|
||||
|
||||
def _find_vector_columns(
    reader_schema: pa.Schema,
    target_schema: Optional[pa.Schema],
    metadata: Optional[dict],
) -> List[dict]:
    """
    Describe the columns of ``reader_schema`` that are treated as vectors.

    Parameters
    ----------
    reader_schema: pa.Schema
        Schema of the incoming data.
    target_schema: pa.Schema, optional
        Schema the data is being written against. When None, detection uses
        ``reader_schema`` alone.
    metadata: dict, optional
        Extra schema metadata, merged over ``target_schema.metadata`` before
        embedding functions are parsed. Unused when ``target_schema`` is None.

    Returns
    -------
    List[dict]
        One dict per detected column with the keys ``"name"``,
        ``"expected_dim"`` and ``"expected_value_type"``. Without a target
        schema both expectations are None; with one, they are taken from the
        target field (``expected_dim`` only for fixed-size lists).
    """
    if target_schema is None:

        def _looks_like_vector(candidate: pa.Field) -> bool:
            dtype = candidate.type
            # Only list-like types expose ``value_type``; bail out first.
            if not _is_list_like(dtype):
                return False
            if not pa.types.is_floating(dtype.value_type):
                return False
            if candidate.name == VECTOR_COLUMN_NAME:
                return True
            # Otherwise only a float fixed-size list of 10 or more qualifies.
            return pa.types.is_fixed_size_list(dtype) and dtype.list_size >= 10

        return [
            {
                "name": candidate.name,
                "expected_dim": None,
                "expected_value_type": None,
            }
            for candidate in reader_schema
            if _looks_like_vector(candidate)
        ]

    available = set(reader_schema.names)
    combined_metadata = _merge_metadata(target_schema.metadata, metadata)
    registry = EmbeddingFunctionRegistry.get_instance()
    embedded_names = set(registry.parse_functions(combined_metadata).keys())

    found = []
    for target_field in target_schema:
        column_name = target_field.name
        target_type = target_field.type

        # Skip target columns the incoming data does not provide.
        if column_name not in available:
            continue
        # Only float-valued list-like target columns are candidates.
        if not (
            _is_list_like(target_type)
            and pa.types.is_floating(target_type.value_type)
        ):
            continue

        source_type = reader_schema.field(column_name).type
        target_is_fixed = pa.types.is_fixed_size_list(target_type)

        selected_by_name = (
            column_name in embedded_names
            or column_name == VECTOR_COLUMN_NAME
            or (column_name == "embedding" and target_is_fixed)
        )
        selected_by_type = (
            pa.types.is_fixed_size_list(source_type)
            and pa.types.is_floating(source_type.value_type)
            and source_type.list_size >= 10
        )
        if not (selected_by_name or selected_by_type):
            continue

        found.append(
            {
                "name": column_name,
                "expected_dim": target_type.list_size if target_is_fixed else None,
                "expected_value_type": target_type.value_type,
            }
        )

    return found
|
||||
|
||||
|
||||
def _vector_output_schema(
    reader_schema: pa.Schema,
    vector_columns: List[dict],
) -> pa.Schema:
    """
    Build the schema produced after vector columns have been processed.

    Each field keeps its name, nullability and field metadata. Fields named in
    ``vector_columns`` get the type chosen by ``_vector_output_type``; all
    other fields keep their type. The schema-level metadata of
    ``reader_schema`` is carried over.

    Parameters
    ----------
    reader_schema: pa.Schema
        Schema of the incoming data.
    vector_columns: List[dict]
        Column descriptions, each with at least a ``"name"`` key.
    """
    described = {entry["name"]: entry for entry in vector_columns}

    def _rebuild(source_field: pa.Field) -> pa.Field:
        entry = described.get(source_field.name)
        resolved_type = (
            source_field.type
            if entry is None
            else _vector_output_type(source_field, entry)
        )
        return pa.field(
            source_field.name,
            resolved_type,
            source_field.nullable,
            source_field.metadata,
        )

    return pa.schema(
        [_rebuild(source_field) for source_field in reader_schema],
        metadata=reader_schema.metadata,
    )
|
||||
|
||||
|
||||
def _vector_output_type(field: pa.Field, vector_column: dict) -> pa.DataType:
    """
    Pick the Arrow type a vector column is declared with in the output schema.

    Parameters
    ----------
    field: pa.Field
        The column's field as it appears in the incoming reader schema.
    vector_column: dict
        Column description with the keys ``"expected_value_type"`` and
        ``"expected_dim"``; either value may be None.

    Returns
    -------
    pa.DataType
        ``field.type`` unchanged, except in two cases that both yield a
        variable-length ``pa.list_`` type:

        - an expected value type is given and the incoming values are null or
          integer typed: a list of the expected value type;
        - an expected dimension is given and the incoming type is a fixed-size
          list of a different size: a list of the incoming value type.
    """
    if not _is_list_like(field.type):
        return field.type

    value_type = field.type.value_type
    expected_value_type = vector_column["expected_value_type"]
    # ``pa.types.is_integer`` matches signed and unsigned integers alike, so
    # a separate unsigned check would be redundant.
    if expected_value_type is not None and (
        pa.types.is_null(value_type) or pa.types.is_integer(value_type)
    ):
        return pa.list_(expected_value_type)

    expected_dim = vector_column["expected_dim"]
    if (
        expected_dim is not None
        and pa.types.is_fixed_size_list(field.type)
        and field.type.list_size != expected_dim
    ):
        return pa.list_(value_type)

    return field.type
|
||||
|
||||
|
||||
def _handle_bad_vector_column(
|
||||
@@ -3247,6 +3470,8 @@ def _handle_bad_vector_column(
|
||||
vector_column_name: str,
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
expected_dim: Optional[int] = None,
|
||||
expected_value_type: Optional[pa.DataType] = None,
|
||||
) -> pa.RecordBatch:
|
||||
"""
|
||||
Ensure that the vector column exists and has type fixed_size_list(float)
|
||||
@@ -3263,14 +3488,39 @@ def _handle_bad_vector_column(
|
||||
fill_value: float, default 0.0
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
"""
|
||||
position = data.column_names.index(vector_column_name)
|
||||
vec_arr = data[vector_column_name]
|
||||
if not _is_list_like(vec_arr.type):
|
||||
return data
|
||||
|
||||
has_nan = has_nan_values(vec_arr)
|
||||
if (
|
||||
expected_dim is not None
|
||||
and pa.types.is_fixed_size_list(vec_arr.type)
|
||||
and vec_arr.type.list_size != expected_dim
|
||||
):
|
||||
vec_arr = pa.array(vec_arr.to_pylist(), type=pa.list_(vec_arr.type.value_type))
|
||||
data = data.set_column(position, vector_column_name, vec_arr)
|
||||
|
||||
if pa.types.is_fixed_size_list(vec_arr.type):
|
||||
if expected_value_type is not None and (
|
||||
pa.types.is_integer(vec_arr.type.value_type)
|
||||
or pa.types.is_unsigned_integer(vec_arr.type.value_type)
|
||||
):
|
||||
vec_arr = pa.array(vec_arr.to_pylist(), type=pa.list_(expected_value_type))
|
||||
data = data.set_column(position, vector_column_name, vec_arr)
|
||||
|
||||
if pa.types.is_floating(vec_arr.type.value_type):
|
||||
has_nan = has_nan_values(vec_arr)
|
||||
else:
|
||||
has_nan = pa.array([False] * len(vec_arr))
|
||||
|
||||
if expected_dim is not None:
|
||||
dim = expected_dim
|
||||
elif pa.types.is_fixed_size_list(vec_arr.type):
|
||||
dim = vec_arr.type.list_size
|
||||
else:
|
||||
dim = _modal_list_size(vec_arr)
|
||||
dim = _infer_vector_dim(vec_arr)
|
||||
if dim is None:
|
||||
return data
|
||||
has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
|
||||
|
||||
has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()
|
||||
@@ -3308,13 +3558,12 @@ def _handle_bad_vector_column(
|
||||
)
|
||||
vec_arr = pc.if_else(
|
||||
is_bad,
|
||||
pa.scalar([fill_value] * dim),
|
||||
pa.scalar([fill_value] * dim, type=vec_arr.type),
|
||||
vec_arr,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid value for on_bad_vectors: {on_bad_vectors}")
|
||||
|
||||
position = data.column_names.index(vector_column_name)
|
||||
return data.set_column(position, vector_column_name, vec_arr)
|
||||
|
||||
|
||||
@@ -3335,6 +3584,28 @@ def has_nan_values(arr: Union[pa.ListArray, pa.ChunkedArray]) -> pa.BooleanArray
|
||||
return pc.is_in(indices, has_nan_indices)
|
||||
|
||||
|
||||
def _is_list_like(data_type: pa.DataType) -> bool:
    """Return True for Arrow list, large-list and fixed-size-list types."""
    list_checks = (
        pa.types.is_list,
        pa.types.is_large_list,
        pa.types.is_fixed_size_list,
    )
    return any(check(data_type) for check in list_checks)
|
||||
|
||||
|
||||
def _merge_metadata(*metadata_dicts: Optional[dict]) -> dict:
|
||||
merged = {}
|
||||
for metadata in metadata_dicts:
|
||||
if metadata is None:
|
||||
continue
|
||||
for key, value in metadata.items():
|
||||
if isinstance(key, str):
|
||||
key = key.encode("utf-8")
|
||||
if isinstance(value, str):
|
||||
value = value.encode("utf-8")
|
||||
merged[key] = value
|
||||
return merged
|
||||
|
||||
|
||||
def _name_suggests_vector_column(field_name: str) -> bool:
|
||||
"""Check if a field name indicates a vector column."""
|
||||
name_lower = field_name.lower()
|
||||
@@ -3402,6 +3673,16 @@ def _modal_list_size(arr: Union[pa.ListArray, pa.ChunkedArray]) -> int:
|
||||
return pc.mode(pc.list_value_length(arr))[0].as_py()["mode"]
|
||||
|
||||
|
||||
def _infer_vector_dim(arr: Union[pa.Array, pa.ChunkedArray]) -> Optional[int]:
    """
    Infer a vector dimension as the mode of the positive list lengths in ``arr``.

    Returns None when ``arr`` is not list-like (per ``_is_list_like``) or when
    no length greater than 0 remains after filtering.
    """
    if _is_list_like(arr.type):
        per_row = pc.list_value_length(arr)
        positive = pc.filter(per_row, pc.greater(per_row, 0))
        if len(positive) > 0:
            most_common = pc.mode(positive)[0]
            return most_common.as_py()["mode"]
    return None
|
||||
|
||||
|
||||
def _validate_schema(schema: pa.Schema):
|
||||
"""
|
||||
Make sure the metadata is valid utf8
|
||||
@@ -3609,7 +3890,18 @@ class AsyncTable:
|
||||
*,
|
||||
replace: Optional[bool] = None,
|
||||
config: Optional[
|
||||
Union[IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||
Union[
|
||||
IvfFlat,
|
||||
IvfPq,
|
||||
IvfRq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
FTS,
|
||||
]
|
||||
] = None,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
name: Optional[str] = None,
|
||||
@@ -3656,6 +3948,7 @@ class AsyncTable:
|
||||
IvfRq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
@@ -3675,11 +3968,13 @@ class AsyncTable:
|
||||
name=name,
|
||||
train=train,
|
||||
)
|
||||
except ValueError as e:
|
||||
if "not support the requested language" in str(e):
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
help_msg = f"Supported languages: {supported_langs}"
|
||||
add_note(e, help_msg)
|
||||
except (ValueError, RuntimeError) as e:
|
||||
if isinstance(config, FTS):
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
raise e
|
||||
|
||||
async def drop_index(self, name: str) -> None:
|
||||
@@ -4824,6 +5119,7 @@ class IndexStatistics:
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
"BITMAP",
|
||||
|
||||
@@ -24,6 +24,7 @@ VectorIndexType = Literal[
|
||||
"IVF_PQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"IVF_RQ",
|
||||
]
|
||||
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
|
||||
@@ -31,6 +32,7 @@ IndexType = Literal[
|
||||
"IVF_PQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"IVF_SQ",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
@@ -40,4 +42,5 @@ IndexType = Literal[
|
||||
]
|
||||
|
||||
# Tokenizer literals
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BaseTokenizerType = BuiltinTokenizerType | str
|
||||
|
||||
@@ -180,7 +180,7 @@ def test_fts_fuzzy_query():
|
||||
),
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
results = table.search(MatchQuery("foo", "text", fuzziness=1)).to_pandas()
|
||||
assert len(results) == 4
|
||||
@@ -230,7 +230,7 @@ def test_fts_boost_query():
|
||||
),
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("desc", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("desc", replace=True)
|
||||
|
||||
results = table.search(
|
||||
BoostQuery(
|
||||
@@ -265,7 +265,7 @@ def test_fts_boolean_query(tmp_path):
|
||||
],
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
# SHOULD
|
||||
results = table.search(
|
||||
@@ -319,9 +319,7 @@ def test_fts_native():
|
||||
],
|
||||
)
|
||||
|
||||
# passing `use_tantivy=False` to use lance FTS index
|
||||
# `use_tantivy=True` by default
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
table.search("puppy").limit(10).select(["text"]).to_list()
|
||||
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
|
||||
# ...
|
||||
@@ -332,7 +330,6 @@ def test_fts_native():
|
||||
# --8<-- [start:fts_config_folding]
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
language="French",
|
||||
stem=True,
|
||||
ascii_folding=True,
|
||||
@@ -346,7 +343,7 @@ def test_fts_native():
|
||||
table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
|
||||
# --8<-- [end:fts_postfiltering]
|
||||
# --8<-- [start:fts_with_position]
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
# --8<-- [end:fts_with_position]
|
||||
# --8<-- [start:fts_incremental_index]
|
||||
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
|
||||
|
||||
8
python/python/tests/models/jieba/default/dict.txt
Normal file
8
python/python/tests/models/jieba/default/dict.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
我们 98740 r
|
||||
都 202780 d
|
||||
有 423765 v
|
||||
光明 1219 n
|
||||
的 318825 uj
|
||||
前途 1263 n
|
||||
前 62779 f
|
||||
途 857 n
|
||||
4
python/python/tests/models/lindera/ipadic/config.yml
Normal file
4
python/python/tests/models/lindera/ipadic/config.yml
Normal file
@@ -0,0 +1,4 @@
|
||||
segmenter:
|
||||
mode: "normal"
|
||||
dictionary:
|
||||
path: "./python/tests/models/lindera/ipadic/main"
|
||||
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
Binary file not shown.
@@ -3,6 +3,7 @@
|
||||
|
||||
|
||||
import re
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
import os
|
||||
|
||||
@@ -14,8 +15,7 @@ import pytest
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_basic(tmp_path, use_tantivy):
|
||||
def test_basic(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
assert db.uri == str(tmp_path)
|
||||
@@ -48,7 +48,7 @@ def test_basic(tmp_path, use_tantivy):
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "foo"
|
||||
|
||||
table.create_fts_index("item", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("item")
|
||||
rs = table.search("bar", query_type="fts").to_pandas()
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "bar"
|
||||
@@ -183,8 +183,8 @@ def test_table_names(tmp_db: lancedb.DBConnection):
|
||||
result = list(tmp_db.table_names("test2", limit=2))
|
||||
assert result == ["test3"], f"Expected ['test3'], got {result}"
|
||||
|
||||
# Test that namespace parameter can be passed as keyword
|
||||
result = list(tmp_db.table_names(namespace=[]))
|
||||
# Test that namespace_path parameter can be passed as keyword
|
||||
result = list(tmp_db.table_names(namespace_path=[]))
|
||||
assert len(result) == 3
|
||||
|
||||
|
||||
@@ -896,42 +896,22 @@ def test_bypass_vector_index_sync(tmp_db: lancedb.DBConnection):
|
||||
|
||||
|
||||
def test_local_namespace_operations(tmp_path):
|
||||
"""Test that local mode namespace operations behave as expected."""
|
||||
# Create a local database connection
|
||||
"""Test that local mode namespace operations work via directory namespace."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
# Test list_namespaces returns empty list for root namespace
|
||||
namespaces = db.list_namespaces().namespaces
|
||||
assert namespaces == []
|
||||
# Root namespace starts empty
|
||||
assert db.list_namespaces().namespaces == []
|
||||
|
||||
# Test list_namespaces with non-empty namespace raises NotImplementedError
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Namespace operations are not supported for listing database",
|
||||
):
|
||||
db.list_namespaces(namespace=["test"])
|
||||
# Create and list child namespace
|
||||
db.create_namespace(["child"])
|
||||
assert "child" in db.list_namespaces().namespaces
|
||||
|
||||
# List namespaces under child
|
||||
assert db.list_namespaces(namespace_path=["child"]).namespaces == []
|
||||
|
||||
def test_local_create_namespace_not_supported(tmp_path):
|
||||
"""Test that create_namespace is not supported in local mode."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Namespace operations are not supported for listing database",
|
||||
):
|
||||
db.create_namespace(["test_namespace"])
|
||||
|
||||
|
||||
def test_local_drop_namespace_not_supported(tmp_path):
|
||||
"""Test that drop_namespace is not supported in local mode."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Namespace operations are not supported for listing database",
|
||||
):
|
||||
db.drop_namespace(["test_namespace"])
|
||||
# Drop namespace
|
||||
db.drop_namespace(["child"])
|
||||
assert db.list_namespaces().namespaces == []
|
||||
|
||||
|
||||
def test_clone_table_latest_version(tmp_path):
|
||||
@@ -1048,3 +1028,59 @@ def test_clone_table_deep_clone_fails(tmp_path):
|
||||
source_uri = os.path.join(tmp_path, "source.lance")
|
||||
with pytest.raises(Exception, match="Deep clone is not yet implemented"):
|
||||
db.clone_table("cloned", source_uri, is_shallow=False)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||
def test_namespace_client_native_storage(tmp_path):
|
||||
"""Test namespace_client() returns DirectoryNamespace for native storage."""
|
||||
from lance.namespace import DirectoryNamespace
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
ns_client = db.namespace_client()
|
||||
|
||||
assert isinstance(ns_client, DirectoryNamespace)
|
||||
assert str(tmp_path) in ns_client.namespace_id()
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||
def test_namespace_client_with_storage_options(tmp_path):
|
||||
"""Test namespace_client() preserves storage options."""
|
||||
from lance.namespace import DirectoryNamespace
|
||||
|
||||
storage_options = {"timeout": "10s"}
|
||||
db = lancedb.connect(tmp_path, storage_options=storage_options)
|
||||
ns_client = db.namespace_client()
|
||||
|
||||
assert isinstance(ns_client, DirectoryNamespace)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||
def test_namespace_client_operations(tmp_path):
|
||||
"""Test that namespace_client() returns a functional namespace client."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
ns_client = db.namespace_client()
|
||||
|
||||
# Create a table through the main db connection
|
||||
data = [{"id": 1, "text": "hello", "vector": [1.0, 2.0]}]
|
||||
db.create_table("test_table", data=data)
|
||||
|
||||
# Verify the namespace client can see the table
|
||||
from lance_namespace import ListTablesRequest
|
||||
|
||||
# id=[] means root namespace
|
||||
response = ns_client.list_tables(ListTablesRequest(id=[]))
|
||||
# Tables can be strings or objects with name attribute
|
||||
table_names = [t.name if hasattr(t, "name") else t for t in response.tables]
|
||||
assert "test_table" in table_names
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||
def test_namespace_client_namespace_connection(tmp_path):
|
||||
"""Test namespace_client() returns the backing client for namespace connections."""
|
||||
from lance.namespace import DirectoryNamespace
|
||||
|
||||
db = lancedb.connect_namespace("dir", {"root": str(tmp_path)})
|
||||
ns_client = db.namespace_client()
|
||||
|
||||
assert isinstance(ns_client, DirectoryNamespace)
|
||||
assert str(tmp_path) in ns_client.namespace_id()
|
||||
|
||||
@@ -15,7 +15,10 @@
|
||||
# limitations under the License.
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
|
||||
import lancedb as ldb
|
||||
from lancedb.db import DBConnection
|
||||
@@ -36,8 +39,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
from utils import exception_output
|
||||
|
||||
pytest.importorskip("lancedb.fts")
|
||||
tantivy = pytest.importorskip("tantivy")
|
||||
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -92,6 +94,40 @@ def table(tmp_path) -> ldb.table.LanceTable:
|
||||
return table
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def language_model_home(monkeypatch, tmp_path):
|
||||
model_home = tmp_path / "language-models"
|
||||
shutil.copytree(TEST_LANGUAGE_MODEL_HOME, model_home)
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(model_home))
|
||||
return model_home
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lindera_ipadic(language_model_home):
|
||||
model_path = language_model_home / "lindera" / "ipadic"
|
||||
extracted_model = model_path / "main"
|
||||
config_path = model_path / "config.yml"
|
||||
|
||||
if extracted_model.exists():
|
||||
shutil.rmtree(extracted_model)
|
||||
|
||||
with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
|
||||
zip_ref.extractall(model_path)
|
||||
config_path.write_text(
|
||||
"segmenter:\n"
|
||||
' mode: "normal"\n'
|
||||
" dictionary:\n"
|
||||
f' path: "{extracted_model.resolve().as_posix()}"\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
if extracted_model.exists():
|
||||
shutil.rmtree(extracted_model)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
# Use local random state to avoid affecting other tests
|
||||
@@ -144,58 +180,53 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
return table
|
||||
|
||||
|
||||
def test_create_index(tmp_path):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert isinstance(index, tantivy.Index)
|
||||
assert os.path.exists(str(tmp_path / "index"))
|
||||
@pytest.mark.parametrize(
|
||||
("kwargs", "match"),
|
||||
[
|
||||
(
|
||||
{"use_tantivy": True},
|
||||
"Tantivy-based FTS has been removed",
|
||||
),
|
||||
(
|
||||
{"ordering_field_names": ["count"]},
|
||||
"ordering_field_names was only supported",
|
||||
),
|
||||
(
|
||||
{"writer_heap_size": 128},
|
||||
"writer_heap_size was only supported",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_reject_removed_tantivy_parameters(table, kwargs, match):
|
||||
with pytest.raises(ValueError, match=match):
|
||||
table.create_fts_index("text", **kwargs)
|
||||
|
||||
|
||||
def test_create_index_with_stemming(tmp_path, table):
|
||||
index = ldb.fts.create_index(
|
||||
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
|
||||
)
|
||||
assert isinstance(index, tantivy.Index)
|
||||
assert os.path.exists(str(tmp_path / "index"))
|
||||
def test_reject_legacy_tantivy_index(table):
|
||||
path, _, _ = table._get_fts_index_path()
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
# Check stemming by running tokenizer on non empty table
|
||||
table.create_fts_index("text", tokenizer_name="en_stem", use_tantivy=True)
|
||||
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||
table.search("puppy").limit(5).to_list()
|
||||
|
||||
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
@pytest.mark.parametrize("with_position", [True, False])
|
||||
def test_create_inverted_index(table, use_tantivy, with_position):
|
||||
if use_tantivy and not with_position:
|
||||
pytest.skip("we don't support building a tantivy index without position")
|
||||
def test_create_inverted_index(table, with_position):
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=use_tantivy,
|
||||
with_position=with_position,
|
||||
name="custom_fts_index",
|
||||
)
|
||||
if not use_tantivy:
|
||||
indices = table.list_indices()
|
||||
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
||||
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
||||
indices = table.list_indices()
|
||||
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
||||
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
||||
|
||||
|
||||
def test_populate_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
|
||||
|
||||
|
||||
def test_search_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
ldb.fts.populate_index(index, table, ["text"])
|
||||
index.reload()
|
||||
results = ldb.fts.search_index(index, query="puppy", limit=5)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 5 # row_ids
|
||||
assert len(results[1]) == 5 # _score
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_search_fts(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_search_fts(table):
|
||||
table.create_fts_index("text")
|
||||
results = table.search("puppy").select(["id", "text"]).limit(5).to_list()
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
@@ -204,53 +235,52 @@ def test_search_fts(table, use_tantivy):
|
||||
results = table.search("puppy").select(["id", "text"]).to_list()
|
||||
assert len(results) == 10
|
||||
|
||||
if not use_tantivy:
|
||||
# Test with a query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
# Test with a query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test boost query
|
||||
results = (
|
||||
table.search(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
# Test boost query
|
||||
results = (
|
||||
table.search(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test multi match query
|
||||
table.create_fts_index("text2", use_tantivy=use_tantivy)
|
||||
results = (
|
||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
# Test multi match query
|
||||
table.create_fts_index("text2")
|
||||
results = (
|
||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -318,13 +348,13 @@ async def test_fts_select_async(async_table):
|
||||
|
||||
|
||||
def test_search_fts_phrase_query(table):
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
try:
|
||||
phrase_results = table.search('"puppy runs"').limit(100).to_list()
|
||||
assert False
|
||||
except Exception:
|
||||
pass
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
results = table.search("puppy").limit(100).to_list()
|
||||
|
||||
# Test with quotation marks
|
||||
@@ -375,8 +405,8 @@ async def test_search_fts_phrase_query_async(async_table):
|
||||
|
||||
|
||||
def test_search_fts_specify_column(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text2", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
table.create_fts_index("text2")
|
||||
|
||||
results = table.search("puppy", fts_columns="text").limit(5).to_list()
|
||||
assert len(results) == 5
|
||||
@@ -470,42 +500,8 @@ async def test_search_fts_specify_column_async(async_table):
|
||||
pass
|
||||
|
||||
|
||||
def test_search_ordering_field_index_table(tmp_path, table):
|
||||
table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
|
||||
rows = (
|
||||
table.search("puppy", ordering_field_name="count")
|
||||
.limit(20)
|
||||
.select(["text", "count"])
|
||||
.to_list()
|
||||
)
|
||||
for r in rows:
|
||||
assert "puppy" in r["text"]
|
||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
||||
|
||||
|
||||
def test_search_ordering_field_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(
|
||||
str(tmp_path / "index"), ["text"], ordering_fields=["count"]
|
||||
)
|
||||
|
||||
ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
|
||||
index.reload()
|
||||
results = ldb.fts.search_index(
|
||||
index, query="puppy", limit=5, ordering_field="count"
|
||||
)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 5 # row_ids
|
||||
assert len(results[1]) == 5 # _distance
|
||||
rows = table.to_lance().take(results[0]).to_pylist()
|
||||
|
||||
for r in rows:
|
||||
assert "puppy" in r["text"]
|
||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_create_index_from_table(tmp_path, table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_create_index_from_table(tmp_path, table):
|
||||
table.create_fts_index("text")
|
||||
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
|
||||
assert len(df) <= 5
|
||||
assert "text" in df.columns
|
||||
@@ -525,36 +521,24 @@ def test_create_index_from_table(tmp_path, table, use_tantivy):
|
||||
)
|
||||
|
||||
with pytest.raises(Exception, match="already exists"):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text")
|
||||
|
||||
table.create_fts_index("text", replace=True, use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text", replace=True)
|
||||
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
|
||||
|
||||
|
||||
def test_create_index_multiple_columns(tmp_path, table):
|
||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
||||
df = table.search("puppy").limit(5).to_pandas()
|
||||
assert len(df) == 5
|
||||
assert "text" in df.columns
|
||||
assert "text2" in df.columns
|
||||
|
||||
|
||||
def test_empty_rs(tmp_path, table, mocker):
|
||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
||||
mocker.patch("lancedb.fts.search_index", return_value=([], []))
|
||||
df = table.search("puppy").limit(5).to_pandas()
|
||||
assert len(df) == 0
|
||||
with pytest.raises(ValueError, match="Native FTS indexes can only be created"):
|
||||
table.create_fts_index(["text", "text2"])
|
||||
|
||||
|
||||
def test_nested_schema(tmp_path, table):
|
||||
table.create_fts_index("nested.text", use_tantivy=True)
|
||||
rs = table.search("puppy").limit(5).to_list()
|
||||
assert len(rs) == 5
|
||||
with pytest.raises(ValueError, match="top-level fields"):
|
||||
table.create_fts_index("nested.text")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_search_index_with_filter(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_search_index_with_filter(table):
|
||||
table.create_fts_index("text")
|
||||
orig_import = __import__
|
||||
|
||||
def import_mock(name, *args):
|
||||
@@ -584,8 +568,7 @@ def test_search_index_with_filter(table, use_tantivy):
|
||||
assert r["_rowid"] is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_null_input(table, use_tantivy):
|
||||
def test_null_input(table):
|
||||
table.add(
|
||||
[
|
||||
{
|
||||
@@ -598,14 +581,13 @@ def test_null_input(table, use_tantivy):
|
||||
}
|
||||
]
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
def test_syntax(table):
|
||||
# https://github.com/lancedb/lancedb/issues/769
|
||||
table.create_fts_index("text", use_tantivy=True)
|
||||
with pytest.raises(ValueError, match="Syntax Error"):
|
||||
table.search("they could have been dogs OR").limit(10).to_list()
|
||||
table.create_fts_index("text")
|
||||
table.search("they could have been dogs OR").limit(10).to_list()
|
||||
|
||||
# these should work
|
||||
|
||||
@@ -616,6 +598,7 @@ def test_syntax(table):
|
||||
).to_list()
|
||||
|
||||
# phrase queries
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
|
||||
table.search('"they could have been dogs OR cats"').limit(10).to_list()
|
||||
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
|
||||
@@ -639,7 +622,7 @@ def test_language(mem_db: DBConnection):
|
||||
table = mem_db.create_table("test", data=data)
|
||||
|
||||
with pytest.raises(ValueError) as e:
|
||||
table.create_fts_index("text", use_tantivy=False, language="klingon")
|
||||
table.create_fts_index("text", language="klingon")
|
||||
|
||||
assert exception_output(e) == (
|
||||
"ValueError: LanceDB does not support the requested language: 'klingon'\n"
|
||||
@@ -650,7 +633,6 @@ def test_language(mem_db: DBConnection):
|
||||
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
language="French",
|
||||
stem=True,
|
||||
ascii_folding=True,
|
||||
@@ -690,7 +672,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
}
|
||||
)
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True)
|
||||
table.create_fts_index("text", with_position=True)
|
||||
|
||||
res = table.search("lance").limit(5).to_list()
|
||||
assert len(res) == 3
|
||||
@@ -702,7 +684,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
def test_fts_ngram(mem_db: DBConnection):
|
||||
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
|
||||
table.create_fts_index("text", base_tokenizer="ngram")
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
@@ -721,7 +703,6 @@ def test_fts_ngram(mem_db: DBConnection):
|
||||
# test setting min_ngram_length and prefix_only
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
base_tokenizer="ngram",
|
||||
replace=True,
|
||||
ngram_min_length=2,
|
||||
@@ -742,6 +723,90 @@ def test_fts_ngram(mem_db: DBConnection):
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
|
||||
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
|
||||
data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
|
||||
table = mem_db.create_table("test_jieba", data=data)
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
results = table.search("我们", query_type="fts").limit(10).to_list()
|
||||
assert [row["text"] for row in results] == ["我们都有光明的前途"]
|
||||
|
||||
|
||||
def test_fts_jieba_missing_language_model_note(
|
||||
mem_db: DBConnection, monkeypatch, tmp_path
|
||||
):
|
||||
missing_root = tmp_path / "missing-language-models"
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
|
||||
table = mem_db.create_table(
|
||||
"test_missing_jieba_model",
|
||||
data=pa.table({"text": ["我们都有光明的前途"]}),
|
||||
)
|
||||
|
||||
with pytest.raises((ValueError, RuntimeError)) as e:
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
output = exception_output(e)
|
||||
assert "Invalid directory path:" in output
|
||||
assert "LANCE_LANGUAGE_MODEL_HOME" in output
|
||||
assert "jieba/default" in output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
|
||||
missing_root = tmp_path / "missing-language-models"
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
|
||||
db = await ldb.connect_async(tmp_path / "async-db")
|
||||
table = await db.create_table(
|
||||
"test_missing_jieba_model_async",
|
||||
data=pa.table({"text": ["我们都有光明的前途"]}),
|
||||
)
|
||||
|
||||
with pytest.raises((ValueError, RuntimeError)) as e:
|
||||
await table.create_index(
|
||||
"text",
|
||||
config=FTS(
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
),
|
||||
)
|
||||
|
||||
output = exception_output(e)
|
||||
assert "Invalid directory path:" in output
|
||||
assert "LANCE_LANGUAGE_MODEL_HOME" in output
|
||||
assert "jieba/default" in output
|
||||
|
||||
|
||||
def test_fts_lindera_tokenizer(
|
||||
mem_db: DBConnection, language_model_home, lindera_ipadic
|
||||
):
|
||||
data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
|
||||
table = mem_db.create_table("test_lindera", data=data)
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="lindera/ipadic",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
results = table.search("成田", query_type="fts").limit(10).to_list()
|
||||
assert [row["text"] for row in results] == ["成田国際空港"]
|
||||
|
||||
|
||||
def test_fts_query_to_json():
|
||||
"""Test that FTS query to_json() produces valid JSON strings with exact format."""
|
||||
|
||||
@@ -886,7 +951,7 @@ def test_fts_query_to_json():
|
||||
|
||||
|
||||
def test_fts_fast_search(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
|
||||
# Insert some unindexed data
|
||||
table.add(
|
||||
|
||||
@@ -28,7 +28,7 @@ def sync_table(tmpdir_factory) -> Table:
|
||||
}
|
||||
)
|
||||
table = db.create_table("test", data)
|
||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
return table
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ def table_with_id(tmpdir_factory) -> Table:
|
||||
}
|
||||
)
|
||||
table = db.create_table("test_with_id", data)
|
||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
return table
|
||||
|
||||
|
||||
|
||||
@@ -16,11 +16,13 @@ from lancedb.index import (
|
||||
IvfSq,
|
||||
IvfHnswPq,
|
||||
IvfHnswSq,
|
||||
IvfHnswFlat,
|
||||
IvfRq,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from lancedb.table import IndexStatistics
|
||||
@@ -250,6 +252,21 @@ async def test_create_hnswpq_alias_index(some_table: AsyncTable):
|
||||
assert indices[0].index_type in {"HnswPq", "IvfHnswPq"}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_hnswflat_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=HnswFlat(num_partitions=10))
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_hnswflat_alias_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=IvfHnswFlat(num_partitions=5))
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type in {"HnswFlat", "IvfHnswFlat"}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_ivfsq_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=IvfSq(num_partitions=10))
|
||||
@@ -295,6 +312,7 @@ def test_index_statistics_index_type_lists_all_supported_values():
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
"BITMAP",
|
||||
|
||||
@@ -33,6 +33,16 @@ class TestNamespaceConnection:
|
||||
# Initially no tables in root
|
||||
assert len(list(db.table_names())) == 0
|
||||
|
||||
def test_connect_via_connect_helper(self):
|
||||
"""Connecting via lancedb.connect should delegate to namespace connection."""
|
||||
db = lancedb.connect(
|
||||
namespace_client_impl="dir",
|
||||
namespace_client_properties={"root": self.temp_dir},
|
||||
)
|
||||
|
||||
assert isinstance(db, lancedb.LanceNamespaceDBConnection)
|
||||
assert len(list(db.table_names())) == 0
|
||||
|
||||
def test_create_table_through_namespace(self):
|
||||
"""Test creating a table through namespace."""
|
||||
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||
@@ -50,14 +60,14 @@ class TestNamespaceConnection:
|
||||
)
|
||||
|
||||
# Create empty table in child namespace
|
||||
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
||||
table = db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||
assert table is not None
|
||||
assert table.name == "test_table"
|
||||
assert table.namespace == ["test_ns"]
|
||||
assert table.id == "test_ns$test_table"
|
||||
|
||||
# Table should appear in child namespace
|
||||
table_names = list(db.table_names(namespace=["test_ns"]))
|
||||
table_names = list(db.table_names(namespace_path=["test_ns"]))
|
||||
assert "test_table" in table_names
|
||||
assert len(table_names) == 1
|
||||
|
||||
@@ -80,10 +90,10 @@ class TestNamespaceConnection:
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
]
|
||||
)
|
||||
db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
||||
db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Open the table
|
||||
table = db.open_table("test_table", namespace=["test_ns"])
|
||||
table = db.open_table("test_table", namespace_path=["test_ns"])
|
||||
assert table is not None
|
||||
assert table.name == "test_table"
|
||||
assert table.namespace == ["test_ns"]
|
||||
@@ -108,31 +118,31 @@ class TestNamespaceConnection:
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
]
|
||||
)
|
||||
db.create_table("table1", schema=schema, namespace=["test_ns"])
|
||||
db.create_table("table2", schema=schema, namespace=["test_ns"])
|
||||
db.create_table("table1", schema=schema, namespace_path=["test_ns"])
|
||||
db.create_table("table2", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Verify both tables exist in child namespace
|
||||
table_names = list(db.table_names(namespace=["test_ns"]))
|
||||
table_names = list(db.table_names(namespace_path=["test_ns"]))
|
||||
assert "table1" in table_names
|
||||
assert "table2" in table_names
|
||||
assert len(table_names) == 2
|
||||
|
||||
# Drop one table
|
||||
db.drop_table("table1", namespace=["test_ns"])
|
||||
db.drop_table("table1", namespace_path=["test_ns"])
|
||||
|
||||
# Verify only table2 remains
|
||||
table_names = list(db.table_names(namespace=["test_ns"]))
|
||||
table_names = list(db.table_names(namespace_path=["test_ns"]))
|
||||
assert "table1" not in table_names
|
||||
assert "table2" in table_names
|
||||
assert len(table_names) == 1
|
||||
|
||||
# Drop the second table
|
||||
db.drop_table("table2", namespace=["test_ns"])
|
||||
assert len(list(db.table_names(namespace=["test_ns"]))) == 0
|
||||
db.drop_table("table2", namespace_path=["test_ns"])
|
||||
assert len(list(db.table_names(namespace_path=["test_ns"]))) == 0
|
||||
|
||||
# Should not be able to open dropped table
|
||||
with pytest.raises(TableNotFoundError):
|
||||
db.open_table("table1", namespace=["test_ns"])
|
||||
db.open_table("table1", namespace_path=["test_ns"])
|
||||
|
||||
def test_create_table_with_schema(self):
|
||||
"""Test creating a table with explicit schema through namespace."""
|
||||
@@ -151,7 +161,7 @@ class TestNamespaceConnection:
|
||||
)
|
||||
|
||||
# Create table with schema in child namespace
|
||||
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
||||
table = db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||
assert table is not None
|
||||
assert table.namespace == ["test_ns"]
|
||||
|
||||
@@ -175,7 +185,7 @@ class TestNamespaceConnection:
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
]
|
||||
)
|
||||
db.create_table("old_name", schema=schema, namespace=["test_ns"])
|
||||
db.create_table("old_name", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Rename should raise NotImplementedError
|
||||
with pytest.raises(NotImplementedError, match="rename_table is not supported"):
|
||||
@@ -196,20 +206,20 @@ class TestNamespaceConnection:
|
||||
]
|
||||
)
|
||||
for i in range(3):
|
||||
db.create_table(f"table{i}", schema=schema, namespace=["test_ns"])
|
||||
db.create_table(f"table{i}", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Verify tables exist in child namespace
|
||||
assert len(list(db.table_names(namespace=["test_ns"]))) == 3
|
||||
assert len(list(db.table_names(namespace_path=["test_ns"]))) == 3
|
||||
|
||||
# Drop all tables in child namespace
|
||||
db.drop_all_tables(namespace=["test_ns"])
|
||||
db.drop_all_tables(namespace_path=["test_ns"])
|
||||
|
||||
# Verify all tables are gone from child namespace
|
||||
assert len(list(db.table_names(namespace=["test_ns"]))) == 0
|
||||
assert len(list(db.table_names(namespace_path=["test_ns"]))) == 0
|
||||
|
||||
# Test that table_names works with the keyword-only namespace_path parameter
|
||||
db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
||||
result = list(db.table_names(namespace=["test_ns"]))
|
||||
db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||
result = list(db.table_names(namespace_path=["test_ns"]))
|
||||
assert "test_table" in result
|
||||
|
||||
def test_table_operations(self):
|
||||
@@ -227,7 +237,7 @@ class TestNamespaceConnection:
|
||||
pa.field("text", pa.string()),
|
||||
]
|
||||
)
|
||||
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
||||
table = db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Verify empty table was created
|
||||
result = table.to_pandas()
|
||||
@@ -298,25 +308,25 @@ class TestNamespaceConnection:
|
||||
]
|
||||
)
|
||||
table = db.create_table(
|
||||
"test_table", schema=schema, namespace=["test_namespace"]
|
||||
"test_table", schema=schema, namespace_path=["test_namespace"]
|
||||
)
|
||||
assert table is not None
|
||||
|
||||
# Verify table exists in namespace
|
||||
tables_in_namespace = list(db.table_names(namespace=["test_namespace"]))
|
||||
tables_in_namespace = list(db.table_names(namespace_path=["test_namespace"]))
|
||||
assert "test_table" in tables_in_namespace
|
||||
assert len(tables_in_namespace) == 1
|
||||
|
||||
# Open table from namespace
|
||||
table = db.open_table("test_table", namespace=["test_namespace"])
|
||||
table = db.open_table("test_table", namespace_path=["test_namespace"])
|
||||
assert table is not None
|
||||
assert table.name == "test_table"
|
||||
|
||||
# Drop table from namespace
|
||||
db.drop_table("test_table", namespace=["test_namespace"])
|
||||
db.drop_table("test_table", namespace_path=["test_namespace"])
|
||||
|
||||
# Verify table no longer exists in namespace
|
||||
tables_in_namespace = list(db.table_names(namespace=["test_namespace"]))
|
||||
tables_in_namespace = list(db.table_names(namespace_path=["test_namespace"]))
|
||||
assert len(tables_in_namespace) == 0
|
||||
|
||||
# Drop namespace
|
||||
@@ -338,14 +348,14 @@ class TestNamespaceConnection:
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
]
|
||||
)
|
||||
db.create_table("test_table", schema=schema, namespace=["test_namespace"])
|
||||
db.create_table("test_table", schema=schema, namespace_path=["test_namespace"])
|
||||
|
||||
# Try to drop namespace with tables - should fail
|
||||
with pytest.raises(NamespaceNotEmptyError):
|
||||
db.drop_namespace(["test_namespace"])
|
||||
|
||||
# Drop table first
|
||||
db.drop_table("test_table", namespace=["test_namespace"])
|
||||
db.drop_table("test_table", namespace_path=["test_namespace"])
|
||||
|
||||
# Now dropping namespace should work
|
||||
db.drop_namespace(["test_namespace"])
|
||||
@@ -368,10 +378,10 @@ class TestNamespaceConnection:
|
||||
|
||||
# Create table with same name in both namespaces
|
||||
table_a = db.create_table(
|
||||
"same_name_table", schema=schema, namespace=["namespace_a"]
|
||||
"same_name_table", schema=schema, namespace_path=["namespace_a"]
|
||||
)
|
||||
table_b = db.create_table(
|
||||
"same_name_table", schema=schema, namespace=["namespace_b"]
|
||||
"same_name_table", schema=schema, namespace_path=["namespace_b"]
|
||||
)
|
||||
|
||||
# Add different data to each table
|
||||
@@ -389,7 +399,9 @@ class TestNamespaceConnection:
|
||||
table_b.add(data_b)
|
||||
|
||||
# Verify data in namespace_a table
|
||||
opened_table_a = db.open_table("same_name_table", namespace=["namespace_a"])
|
||||
opened_table_a = db.open_table(
|
||||
"same_name_table", namespace_path=["namespace_a"]
|
||||
)
|
||||
result_a = opened_table_a.to_pandas().sort_values("id").reset_index(drop=True)
|
||||
assert len(result_a) == 2
|
||||
assert result_a["id"].tolist() == [1, 2]
|
||||
@@ -400,7 +412,9 @@ class TestNamespaceConnection:
|
||||
assert [v.tolist() for v in result_a["vector"]] == [[1.0, 2.0], [3.0, 4.0]]
|
||||
|
||||
# Verify data in namespace_b table
|
||||
opened_table_b = db.open_table("same_name_table", namespace=["namespace_b"])
|
||||
opened_table_b = db.open_table(
|
||||
"same_name_table", namespace_path=["namespace_b"]
|
||||
)
|
||||
result_b = opened_table_b.to_pandas().sort_values("id").reset_index(drop=True)
|
||||
assert len(result_b) == 3
|
||||
assert result_b["id"].tolist() == [10, 20, 30]
|
||||
@@ -420,8 +434,8 @@ class TestNamespaceConnection:
|
||||
assert "same_name_table" not in root_tables
|
||||
|
||||
# Clean up
|
||||
db.drop_table("same_name_table", namespace=["namespace_a"])
|
||||
db.drop_table("same_name_table", namespace=["namespace_b"])
|
||||
db.drop_table("same_name_table", namespace_path=["namespace_a"])
|
||||
db.drop_table("same_name_table", namespace_path=["namespace_b"])
|
||||
db.drop_namespace(["namespace_a"])
|
||||
db.drop_namespace(["namespace_b"])
|
||||
|
||||
@@ -449,6 +463,8 @@ class TestAsyncNamespaceConnection:
|
||||
table_names = await db.table_names()
|
||||
assert len(list(table_names)) == 0
|
||||
|
||||
# Async connect via namespace helper is not enabled yet.
|
||||
|
||||
async def test_create_table_async(self):
|
||||
"""Test creating a table asynchronously through namespace."""
|
||||
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
|
||||
@@ -467,13 +483,13 @@ class TestAsyncNamespaceConnection:
|
||||
|
||||
# Create empty table in child namespace
|
||||
table = await db.create_table(
|
||||
"test_table", schema=schema, namespace=["test_ns"]
|
||||
"test_table", schema=schema, namespace_path=["test_ns"]
|
||||
)
|
||||
assert table is not None
|
||||
assert isinstance(table, lancedb.AsyncTable)
|
||||
|
||||
# Table should appear in child namespace
|
||||
table_names = await db.table_names(namespace=["test_ns"])
|
||||
table_names = await db.table_names(namespace_path=["test_ns"])
|
||||
assert "test_table" in list(table_names)
|
||||
|
||||
async def test_open_table_async(self):
|
||||
@@ -490,10 +506,10 @@ class TestAsyncNamespaceConnection:
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
]
|
||||
)
|
||||
await db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
||||
await db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Open the table
|
||||
table = await db.open_table("test_table", namespace=["test_ns"])
|
||||
table = await db.open_table("test_table", namespace_path=["test_ns"])
|
||||
assert table is not None
|
||||
assert isinstance(table, lancedb.AsyncTable)
|
||||
|
||||
@@ -547,20 +563,20 @@ class TestAsyncNamespaceConnection:
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
]
|
||||
)
|
||||
await db.create_table("table1", schema=schema, namespace=["test_ns"])
|
||||
await db.create_table("table2", schema=schema, namespace=["test_ns"])
|
||||
await db.create_table("table1", schema=schema, namespace_path=["test_ns"])
|
||||
await db.create_table("table2", schema=schema, namespace_path=["test_ns"])
|
||||
|
||||
# Verify both tables exist in child namespace
|
||||
table_names = list(await db.table_names(namespace=["test_ns"]))
|
||||
table_names = list(await db.table_names(namespace_path=["test_ns"]))
|
||||
assert "table1" in table_names
|
||||
assert "table2" in table_names
|
||||
assert len(table_names) == 2
|
||||
|
||||
# Drop one table
|
||||
await db.drop_table("table1", namespace=["test_ns"])
|
||||
await db.drop_table("table1", namespace_path=["test_ns"])
|
||||
|
||||
# Verify only table2 remains
|
||||
table_names = list(await db.table_names(namespace=["test_ns"]))
|
||||
table_names = list(await db.table_names(namespace_path=["test_ns"]))
|
||||
assert "table1" not in table_names
|
||||
assert "table2" in table_names
|
||||
assert len(table_names) == 1
|
||||
@@ -589,20 +605,24 @@ class TestAsyncNamespaceConnection:
|
||||
]
|
||||
)
|
||||
table = await db.create_table(
|
||||
"test_table", schema=schema, namespace=["test_namespace"]
|
||||
"test_table", schema=schema, namespace_path=["test_namespace"]
|
||||
)
|
||||
assert table is not None
|
||||
|
||||
# Verify table exists in namespace
|
||||
tables_in_namespace = list(await db.table_names(namespace=["test_namespace"]))
|
||||
tables_in_namespace = list(
|
||||
await db.table_names(namespace_path=["test_namespace"])
|
||||
)
|
||||
assert "test_table" in tables_in_namespace
|
||||
assert len(tables_in_namespace) == 1
|
||||
|
||||
# Drop table from namespace
|
||||
await db.drop_table("test_table", namespace=["test_namespace"])
|
||||
await db.drop_table("test_table", namespace_path=["test_namespace"])
|
||||
|
||||
# Verify table no longer exists in namespace
|
||||
tables_in_namespace = list(await db.table_names(namespace=["test_namespace"]))
|
||||
tables_in_namespace = list(
|
||||
await db.table_names(namespace_path=["test_namespace"])
|
||||
)
|
||||
assert len(tables_in_namespace) == 0
|
||||
|
||||
# Drop namespace
|
||||
@@ -627,15 +647,98 @@ class TestAsyncNamespaceConnection:
|
||||
]
|
||||
)
|
||||
for i in range(3):
|
||||
await db.create_table(f"table{i}", schema=schema, namespace=["test_ns"])
|
||||
await db.create_table(
|
||||
f"table{i}", schema=schema, namespace_path=["test_ns"]
|
||||
)
|
||||
|
||||
# Verify tables exist in child namespace
|
||||
table_names = await db.table_names(namespace=["test_ns"])
|
||||
table_names = await db.table_names(namespace_path=["test_ns"])
|
||||
assert len(list(table_names)) == 3
|
||||
|
||||
# Drop all tables in child namespace
|
||||
await db.drop_all_tables(namespace=["test_ns"])
|
||||
await db.drop_all_tables(namespace_path=["test_ns"])
|
||||
|
||||
# Verify all tables are gone from child namespace
|
||||
table_names = await db.table_names(namespace=["test_ns"])
|
||||
table_names = await db.table_names(namespace_path=["test_ns"])
|
||||
assert len(list(table_names)) == 0
|
||||
|
||||
|
||||
class TestPushdownOperations:
|
||||
"""Test pushdown operations on namespace connections."""
|
||||
|
||||
def setup_method(self):
|
||||
"""Set up test fixtures."""
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
def teardown_method(self):
|
||||
"""Clean up test fixtures."""
|
||||
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||
|
||||
def test_query_table_pushdown_stored(self):
|
||||
"""Test that QueryTable pushdown is stored on sync connection."""
|
||||
db = lancedb.connect_namespace(
|
||||
"dir",
|
||||
{"root": self.temp_dir},
|
||||
namespace_client_pushdown_operations=["QueryTable"],
|
||||
)
|
||||
assert "QueryTable" in db._namespace_client_pushdown_operations
|
||||
|
||||
def test_create_table_pushdown_stored(self):
|
||||
"""Test that CreateTable pushdown is stored on sync connection."""
|
||||
db = lancedb.connect_namespace(
|
||||
"dir",
|
||||
{"root": self.temp_dir},
|
||||
namespace_client_pushdown_operations=["CreateTable"],
|
||||
)
|
||||
assert "CreateTable" in db._namespace_client_pushdown_operations
|
||||
|
||||
def test_both_pushdowns_stored(self):
|
||||
"""Test that both pushdown operations can be set together."""
|
||||
db = lancedb.connect_namespace(
|
||||
"dir",
|
||||
{"root": self.temp_dir},
|
||||
namespace_client_pushdown_operations=["QueryTable", "CreateTable"],
|
||||
)
|
||||
assert "QueryTable" in db._namespace_client_pushdown_operations
|
||||
assert "CreateTable" in db._namespace_client_pushdown_operations
|
||||
|
||||
def test_pushdown_defaults_to_empty(self):
|
||||
"""Test that pushdown operations default to empty."""
|
||||
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||
assert len(db._namespace_client_pushdown_operations) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
class TestAsyncPushdownOperations:
    """Checks that pushdown operation names are kept on async namespace connections."""

    def setup_method(self):
        """Create a scratch directory used as the namespace root."""
        self.temp_dir = tempfile.mkdtemp()

    def teardown_method(self):
        """Remove the scratch directory, ignoring any removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _open(self, **extra):
        """Open an async "dir" namespace connection rooted at the scratch directory."""
        return lancedb.connect_namespace_async(
            "dir", {"root": self.temp_dir}, **extra
        )

    async def test_async_query_table_pushdown_stored(self):
        """A requested QueryTable pushdown is present on the async connection."""
        conn = self._open(namespace_client_pushdown_operations=["QueryTable"])
        stored = conn._namespace_client_pushdown_operations
        assert "QueryTable" in stored

    async def test_async_create_table_pushdown_stored(self):
        """A requested CreateTable pushdown is present on the async connection."""
        conn = self._open(namespace_client_pushdown_operations=["CreateTable"])
        stored = conn._namespace_client_pushdown_operations
        assert "CreateTable" in stored

    async def test_async_pushdown_defaults_to_empty(self):
        """Without the argument, no pushdown operations are recorded."""
        conn = self._open()
        stored = conn._namespace_client_pushdown_operations
        assert len(stored) == 0
|
||||
|
||||
@@ -4,9 +4,11 @@
|
||||
"""
|
||||
Integration tests for LanceDB Namespace with S3 and credential refresh.
|
||||
|
||||
This test simulates a namespace server that returns incrementing credentials
|
||||
and verifies that the credential refresh mechanism works correctly for both
|
||||
create_table and open_table operations.
|
||||
This test uses DirectoryNamespace with native ops_metrics and vend_input_storage_options
|
||||
features to track API calls and test credential refresh mechanisms.
|
||||
|
||||
Tests are parameterized to run with both DirectoryNamespace and a CustomNamespace
|
||||
wrapper to verify Python-Rust binding works correctly for custom implementations.
|
||||
|
||||
Tests verify:
|
||||
- Storage options provider is auto-created and used
|
||||
@@ -16,24 +18,141 @@ Tests verify:
|
||||
"""
|
||||
|
||||
import copy
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from threading import Lock
|
||||
from typing import Dict
|
||||
from typing import Dict, Optional
|
||||
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from lance_namespace import (
|
||||
CreateEmptyTableRequest,
|
||||
CreateEmptyTableResponse,
|
||||
from lance.namespace import (
|
||||
DeclareTableRequest,
|
||||
DeclareTableResponse,
|
||||
DescribeTableRequest,
|
||||
DescribeTableResponse,
|
||||
DirectoryNamespace,
|
||||
LanceNamespace,
|
||||
)
|
||||
from lance_namespace import (
|
||||
CreateNamespaceRequest,
|
||||
CreateNamespaceResponse,
|
||||
CreateTableRequest,
|
||||
CreateTableResponse,
|
||||
CreateTableVersionRequest,
|
||||
CreateTableVersionResponse,
|
||||
DeregisterTableRequest,
|
||||
DeregisterTableResponse,
|
||||
DescribeNamespaceRequest,
|
||||
DescribeNamespaceResponse,
|
||||
DescribeTableVersionRequest,
|
||||
DescribeTableVersionResponse,
|
||||
DropNamespaceRequest,
|
||||
DropNamespaceResponse,
|
||||
DropTableRequest,
|
||||
DropTableResponse,
|
||||
ListNamespacesRequest,
|
||||
ListNamespacesResponse,
|
||||
ListTablesRequest,
|
||||
ListTablesResponse,
|
||||
ListTableVersionsRequest,
|
||||
ListTableVersionsResponse,
|
||||
NamespaceExistsRequest,
|
||||
RegisterTableRequest,
|
||||
RegisterTableResponse,
|
||||
TableExistsRequest,
|
||||
)
|
||||
from lancedb.namespace import LanceNamespaceDBConnection
|
||||
|
||||
|
||||
class CustomNamespace(LanceNamespace):
    """A custom namespace wrapper that delegates to DirectoryNamespace.

    This class verifies that the Python-Rust binding works correctly for
    custom namespace implementations that wrap the native DirectoryNamespace.
    All methods simply delegate to the underlying DirectoryNamespace instance.
    """

    def __init__(self, inner: DirectoryNamespace):
        """Store the wrapped namespace that every call is forwarded to."""
        self._inner = inner

    def namespace_id(self) -> str:
        """Return the inner namespace id wrapped as ``CustomNamespace[...]``."""
        return f"CustomNamespace[{self._inner.namespace_id()}]"

    def create_namespace(
        self, request: CreateNamespaceRequest
    ) -> CreateNamespaceResponse:
        """Forward ``create_namespace`` to the wrapped namespace."""
        return self._inner.create_namespace(request)

    def describe_namespace(
        self, request: DescribeNamespaceRequest
    ) -> DescribeNamespaceResponse:
        """Forward ``describe_namespace`` to the wrapped namespace."""
        return self._inner.describe_namespace(request)

    def namespace_exists(self, request: NamespaceExistsRequest) -> None:
        """Forward ``namespace_exists`` to the wrapped namespace."""
        return self._inner.namespace_exists(request)

    def drop_namespace(self, request: DropNamespaceRequest) -> DropNamespaceResponse:
        """Forward ``drop_namespace`` to the wrapped namespace."""
        return self._inner.drop_namespace(request)

    def list_namespaces(self, request: ListNamespacesRequest) -> ListNamespacesResponse:
        """Forward ``list_namespaces`` to the wrapped namespace."""
        return self._inner.list_namespaces(request)

    def create_table(
        self, request: CreateTableRequest, data: bytes
    ) -> CreateTableResponse:
        """Forward ``create_table`` (request plus raw data bytes) to the wrapped namespace."""
        return self._inner.create_table(request, data)

    def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse:
        """Forward ``declare_table`` to the wrapped namespace."""
        return self._inner.declare_table(request)

    def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
        """Forward ``describe_table`` to the wrapped namespace."""
        return self._inner.describe_table(request)

    def table_exists(self, request: TableExistsRequest) -> None:
        """Forward ``table_exists`` to the wrapped namespace."""
        return self._inner.table_exists(request)

    def drop_table(self, request: DropTableRequest) -> DropTableResponse:
        """Forward ``drop_table`` to the wrapped namespace."""
        return self._inner.drop_table(request)

    def list_tables(self, request: ListTablesRequest) -> ListTablesResponse:
        """Forward ``list_tables`` to the wrapped namespace."""
        return self._inner.list_tables(request)

    def register_table(self, request: RegisterTableRequest) -> RegisterTableResponse:
        """Forward ``register_table`` to the wrapped namespace."""
        return self._inner.register_table(request)

    def deregister_table(
        self, request: DeregisterTableRequest
    ) -> DeregisterTableResponse:
        """Forward ``deregister_table`` to the wrapped namespace."""
        return self._inner.deregister_table(request)

    def list_table_versions(
        self, request: ListTableVersionsRequest
    ) -> ListTableVersionsResponse:
        """Forward ``list_table_versions`` to the wrapped namespace."""
        return self._inner.list_table_versions(request)

    def describe_table_version(
        self, request: DescribeTableVersionRequest
    ) -> DescribeTableVersionResponse:
        """Forward ``describe_table_version`` to the wrapped namespace."""
        return self._inner.describe_table_version(request)

    def create_table_version(
        self, request: CreateTableVersionRequest
    ) -> CreateTableVersionResponse:
        """Forward ``create_table_version`` to the wrapped namespace."""
        return self._inner.create_table_version(request)

    def retrieve_ops_metrics(self) -> Optional[Dict[str, int]]:
        """Return whatever the wrapped namespace reports from ``retrieve_ops_metrics``."""
        return self._inner.retrieve_ops_metrics()
|
||||
|
||||
|
||||
def _wrap_if_custom(ns_client: DirectoryNamespace, use_custom: bool):
    """Return ``ns_client`` wrapped in CustomNamespace when ``use_custom`` is truthy.

    Otherwise the client is returned unchanged.
    """
    return CustomNamespace(ns_client) if use_custom else ns_client
|
||||
|
||||
|
||||
# LocalStack S3 configuration
|
||||
CONFIG = {
|
||||
"allow_http": "true",
|
||||
@@ -89,162 +208,88 @@ def delete_bucket(s3, bucket_name):
|
||||
pass
|
||||
|
||||
|
||||
class TrackingNamespace(LanceNamespace):
|
||||
def create_tracking_namespace(
|
||||
bucket_name: str,
|
||||
storage_options: dict,
|
||||
credential_expires_in_seconds: int = 60,
|
||||
use_custom: bool = False,
|
||||
):
|
||||
"""Create a DirectoryNamespace with ops metrics and credential vending enabled.
|
||||
|
||||
Uses native DirectoryNamespace features:
|
||||
- ops_metrics_enabled=true: Tracks API call counts via retrieve_ops_metrics()
|
||||
- vend_input_storage_options=true: Returns input storage options in responses
|
||||
- vend_input_storage_options_refresh_interval_millis: Adds expires_at_millis
|
||||
|
||||
Args:
|
||||
bucket_name: S3 bucket name or local path
|
||||
storage_options: Storage options to pass through (credentials, endpoint, etc.)
|
||||
credential_expires_in_seconds: Interval in seconds for credential expiration
|
||||
use_custom: If True, wrap in CustomNamespace for testing custom implementations
|
||||
|
||||
Returns:
|
||||
Tuple of (namespace_client, inner_namespace_client) where inner is always
|
||||
the DirectoryNamespace (used for metrics retrieval)
|
||||
"""
|
||||
Mock namespace that wraps DirectoryNamespace and tracks API calls.
|
||||
# Add refresh_offset_millis to storage options so that credentials are not
|
||||
# considered expired immediately. Set to 1 second (1000ms) so that refresh
|
||||
# checks work correctly with short-lived credentials in tests.
|
||||
storage_options_with_refresh = dict(storage_options)
|
||||
storage_options_with_refresh["refresh_offset_millis"] = "1000"
|
||||
|
||||
This namespace returns incrementing credentials with each API call to simulate
|
||||
credential rotation. It also tracks the number of times each API is called
|
||||
to verify caching behavior.
|
||||
"""
|
||||
dir_props = {f"storage.{k}": v for k, v in storage_options_with_refresh.items()}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
bucket_name: str,
|
||||
storage_options: Dict[str, str],
|
||||
credential_expires_in_seconds: int = 60,
|
||||
):
|
||||
from lance.namespace import DirectoryNamespace
|
||||
if bucket_name.startswith("/") or bucket_name.startswith("file://"):
|
||||
dir_props["root"] = f"{bucket_name}/namespace_root"
|
||||
else:
|
||||
dir_props["root"] = f"s3://{bucket_name}/namespace_root"
|
||||
|
||||
self.bucket_name = bucket_name
|
||||
self.base_storage_options = storage_options
|
||||
self.credential_expires_in_seconds = credential_expires_in_seconds
|
||||
self.describe_call_count = 0
|
||||
self.create_call_count = 0
|
||||
self.lock = Lock()
|
||||
# Enable ops metrics tracking
|
||||
dir_props["ops_metrics_enabled"] = "true"
|
||||
# Enable storage options vending
|
||||
dir_props["vend_input_storage_options"] = "true"
|
||||
# Set refresh interval in milliseconds
|
||||
dir_props["vend_input_storage_options_refresh_interval_millis"] = str(
|
||||
credential_expires_in_seconds * 1000
|
||||
)
|
||||
|
||||
# Create underlying DirectoryNamespace with storage options
|
||||
dir_props = {f"storage.{k}": v for k, v in storage_options.items()}
|
||||
inner_ns_client = DirectoryNamespace(**dir_props)
|
||||
ns_client = _wrap_if_custom(inner_ns_client, use_custom)
|
||||
return ns_client, inner_ns_client
|
||||
|
||||
# Use S3 path for bucket name, local path for file paths
|
||||
if bucket_name.startswith("/") or bucket_name.startswith("file://"):
|
||||
dir_props["root"] = f"{bucket_name}/namespace_root"
|
||||
else:
|
||||
dir_props["root"] = f"s3://{bucket_name}/namespace_root"
|
||||
|
||||
self.inner = DirectoryNamespace(**dir_props)
|
||||
def get_describe_call_count(namespace_client) -> int:
    """Return the "describe_table" counter from the client's ops metrics.

    A missing counter is reported as 0.
    """
    ops_metrics = namespace_client.retrieve_ops_metrics()
    return ops_metrics.get("describe_table", 0)
|
||||
|
||||
def get_describe_call_count(self) -> int:
|
||||
"""Thread-safe getter for describe call count."""
|
||||
with self.lock:
|
||||
return self.describe_call_count
|
||||
|
||||
def get_create_call_count(self) -> int:
|
||||
"""Thread-safe getter for create call count."""
|
||||
with self.lock:
|
||||
return self.create_call_count
|
||||
|
||||
def namespace_id(self) -> str:
|
||||
"""Return namespace identifier."""
|
||||
return f"TrackingNamespace {{ inner: {self.inner.namespace_id()} }}"
|
||||
|
||||
def _modify_storage_options(
|
||||
self, storage_options: Dict[str, str], count: int
|
||||
) -> Dict[str, str]:
|
||||
"""
|
||||
Add incrementing credentials with expiration timestamp.
|
||||
|
||||
This simulates a credential rotation system where each call returns
|
||||
new credentials that expire after credential_expires_in_seconds.
|
||||
"""
|
||||
# Start from base storage options (endpoint, region, allow_http, etc.)
|
||||
# because DirectoryNamespace returns None for storage_options from
|
||||
# describe_table/declare_table when no credential vendor is configured.
|
||||
modified = copy.deepcopy(self.base_storage_options)
|
||||
if storage_options:
|
||||
modified.update(storage_options)
|
||||
|
||||
# Increment credentials to simulate rotation
|
||||
modified["aws_access_key_id"] = f"AKID_{count}"
|
||||
modified["aws_secret_access_key"] = f"SECRET_{count}"
|
||||
modified["aws_session_token"] = f"TOKEN_{count}"
|
||||
|
||||
# Set expiration time
|
||||
expires_at_millis = int(
|
||||
(time.time() + self.credential_expires_in_seconds) * 1000
|
||||
)
|
||||
modified["expires_at_millis"] = str(expires_at_millis)
|
||||
|
||||
return modified
|
||||
|
||||
def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse:
|
||||
"""Track declare_table calls and inject rotating credentials."""
|
||||
with self.lock:
|
||||
self.create_call_count += 1
|
||||
count = self.create_call_count
|
||||
|
||||
response = self.inner.declare_table(request)
|
||||
response.storage_options = self._modify_storage_options(
|
||||
response.storage_options, count
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
def create_empty_table(
|
||||
self, request: CreateEmptyTableRequest
|
||||
) -> CreateEmptyTableResponse:
|
||||
"""Track create_empty_table calls and inject rotating credentials."""
|
||||
with self.lock:
|
||||
self.create_call_count += 1
|
||||
count = self.create_call_count
|
||||
|
||||
response = self.inner.create_empty_table(request)
|
||||
response.storage_options = self._modify_storage_options(
|
||||
response.storage_options, count
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
|
||||
"""Track describe_table calls and inject rotating credentials."""
|
||||
with self.lock:
|
||||
self.describe_call_count += 1
|
||||
count = self.describe_call_count
|
||||
|
||||
response = self.inner.describe_table(request)
|
||||
response.storage_options = self._modify_storage_options(
|
||||
response.storage_options, count
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
# Pass through other methods to inner namespace
|
||||
def list_tables(self, request):
|
||||
return self.inner.list_tables(request)
|
||||
|
||||
def drop_table(self, request):
|
||||
return self.inner.drop_table(request)
|
||||
|
||||
def list_namespaces(self, request):
|
||||
return self.inner.list_namespaces(request)
|
||||
|
||||
def create_namespace(self, request):
|
||||
return self.inner.create_namespace(request)
|
||||
|
||||
def drop_namespace(self, request):
|
||||
return self.inner.drop_namespace(request)
|
||||
def get_declare_call_count(namespace_client) -> int:
    """Return the "declare_table" counter from the client's ops metrics.

    A missing counter is reported as 0.
    """
    ops_metrics = namespace_client.retrieve_ops_metrics()
    return ops_metrics.get("declare_table", 0)
|
||||
|
||||
|
||||
@pytest.mark.s3_test
|
||||
def test_namespace_create_table_with_provider(s3_bucket: str):
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_create_table_with_provider(s3_bucket: str, use_custom: bool):
|
||||
"""
|
||||
Test creating a table through namespace with storage options provider.
|
||||
|
||||
Verifies:
|
||||
- create_empty_table is called once to reserve location
|
||||
- declare_table is called once to reserve location
|
||||
- Storage options provider is auto-created
|
||||
- Table can be written successfully
|
||||
- Credentials are cached during write operations
|
||||
"""
|
||||
storage_options = copy.deepcopy(CONFIG)
|
||||
|
||||
namespace = TrackingNamespace(
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=s3_bucket,
|
||||
storage_options=storage_options,
|
||||
credential_expires_in_seconds=3600, # 1 hour
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(namespace)
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
# Create unique namespace for this test
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
@@ -254,8 +299,8 @@ def test_namespace_create_table_with_provider(s3_bucket: str):
|
||||
namespace_path = [namespace_name]
|
||||
|
||||
# Verify initial state
|
||||
assert namespace.get_create_call_count() == 0
|
||||
assert namespace.get_describe_call_count() == 0
|
||||
assert get_declare_call_count(inner_ns_client) == 0
|
||||
assert get_describe_call_count(inner_ns_client) == 0
|
||||
|
||||
# Create table with data
|
||||
data = pa.table(
|
||||
@@ -266,12 +311,12 @@ def test_namespace_create_table_with_provider(s3_bucket: str):
|
||||
}
|
||||
)
|
||||
|
||||
table = db.create_table(table_name, data, namespace=namespace_path)
|
||||
table = db.create_table(table_name, data, namespace_path=namespace_path)
|
||||
|
||||
# Verify create_empty_table was called exactly once
|
||||
assert namespace.get_create_call_count() == 1
|
||||
# Verify declare_table was called exactly once
|
||||
assert get_declare_call_count(inner_ns_client) == 1
|
||||
# describe_table should NOT be called during create in create mode
|
||||
assert namespace.get_describe_call_count() == 0
|
||||
assert get_describe_call_count(inner_ns_client) == 0
|
||||
|
||||
# Verify table was created successfully
|
||||
assert table.name == table_name
|
||||
@@ -281,7 +326,8 @@ def test_namespace_create_table_with_provider(s3_bucket: str):
|
||||
|
||||
|
||||
@pytest.mark.s3_test
|
||||
def test_namespace_open_table_with_provider(s3_bucket: str):
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_open_table_with_provider(s3_bucket: str, use_custom: bool):
|
||||
"""
|
||||
Test opening a table through namespace with storage options provider.
|
||||
|
||||
@@ -293,13 +339,14 @@ def test_namespace_open_table_with_provider(s3_bucket: str):
|
||||
"""
|
||||
storage_options = copy.deepcopy(CONFIG)
|
||||
|
||||
namespace = TrackingNamespace(
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=s3_bucket,
|
||||
storage_options=storage_options,
|
||||
credential_expires_in_seconds=3600,
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(namespace)
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
# Create unique namespace for this test
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
@@ -317,21 +364,21 @@ def test_namespace_open_table_with_provider(s3_bucket: str):
|
||||
}
|
||||
)
|
||||
|
||||
db.create_table(table_name, data, namespace=namespace_path)
|
||||
db.create_table(table_name, data, namespace_path=namespace_path)
|
||||
|
||||
initial_create_count = namespace.get_create_call_count()
|
||||
assert initial_create_count == 1
|
||||
initial_declare_count = get_declare_call_count(inner_ns_client)
|
||||
assert initial_declare_count == 1
|
||||
|
||||
# Open the table
|
||||
opened_table = db.open_table(table_name, namespace=namespace_path)
|
||||
opened_table = db.open_table(table_name, namespace_path=namespace_path)
|
||||
|
||||
# Verify describe_table was called exactly once
|
||||
assert namespace.get_describe_call_count() == 1
|
||||
# create_empty_table should not be called again
|
||||
assert namespace.get_create_call_count() == initial_create_count
|
||||
assert get_describe_call_count(inner_ns_client) == 1
|
||||
# declare_table should not be called again
|
||||
assert get_declare_call_count(inner_ns_client) == initial_declare_count
|
||||
|
||||
# Perform multiple read operations
|
||||
describe_count_after_open = namespace.get_describe_call_count()
|
||||
describe_count_after_open = get_describe_call_count(inner_ns_client)
|
||||
|
||||
for _ in range(3):
|
||||
result = opened_table.to_pandas()
|
||||
@@ -340,11 +387,72 @@ def test_namespace_open_table_with_provider(s3_bucket: str):
|
||||
assert count == 5
|
||||
|
||||
# Verify credentials were cached (no additional describe_table calls)
|
||||
assert namespace.get_describe_call_count() == describe_count_after_open
|
||||
assert get_describe_call_count(inner_ns_client) == describe_count_after_open
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform == "win32",
|
||||
reason="TODO: fix schema-only namespace metrics test on Windows",
|
||||
)
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_create_schema_only_with_provider(use_custom: bool):
|
||||
"""
|
||||
Test creating a schema-only table through namespace with storage options provider.
|
||||
|
||||
Verifies:
|
||||
- declare_table is called once to reserve the location
|
||||
- describe_table is not needed during create in create mode
|
||||
- the table can be reopened successfully afterward
|
||||
- opening the table triggers exactly one describe_table call
|
||||
"""
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
try:
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=temp_dir,
|
||||
storage_options={},
|
||||
credential_expires_in_seconds=3600,
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
db.create_namespace([namespace_name])
|
||||
|
||||
table_name = f"test_table_{uuid.uuid4().hex}"
|
||||
namespace_path = [namespace_name]
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int64()),
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("text", pa.string()),
|
||||
]
|
||||
)
|
||||
|
||||
assert get_declare_call_count(inner_ns_client) == 0
|
||||
assert get_describe_call_count(inner_ns_client) == 0
|
||||
|
||||
table = db.create_table(
|
||||
table_name, schema=schema, namespace_path=namespace_path
|
||||
)
|
||||
|
||||
assert table.name == table_name
|
||||
assert table.namespace == namespace_path
|
||||
assert get_declare_call_count(inner_ns_client) == 1
|
||||
assert get_describe_call_count(inner_ns_client) == 0
|
||||
|
||||
reopened_table = db.open_table(table_name, namespace_path=namespace_path)
|
||||
|
||||
assert reopened_table.schema == schema
|
||||
assert get_declare_call_count(inner_ns_client) == 1
|
||||
assert get_describe_call_count(inner_ns_client) == 1
|
||||
finally:
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
@pytest.mark.s3_test
|
||||
def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_credential_refresh_on_read(s3_bucket: str, use_custom: bool):
|
||||
"""
|
||||
Test credential refresh when credentials expire during read operations.
|
||||
|
||||
@@ -355,13 +463,14 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
||||
"""
|
||||
storage_options = copy.deepcopy(CONFIG)
|
||||
|
||||
namespace = TrackingNamespace(
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=s3_bucket,
|
||||
storage_options=storage_options,
|
||||
credential_expires_in_seconds=3, # Short expiration for testing
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(namespace)
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
# Create unique namespace for this test
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
@@ -378,16 +487,16 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
||||
}
|
||||
)
|
||||
|
||||
db.create_table(table_name, data, namespace=namespace_path)
|
||||
db.create_table(table_name, data, namespace_path=namespace_path)
|
||||
|
||||
# Open table (triggers describe_table)
|
||||
opened_table = db.open_table(table_name, namespace=namespace_path)
|
||||
opened_table = db.open_table(table_name, namespace_path=namespace_path)
|
||||
|
||||
# Perform an immediate read (should use credentials from open)
|
||||
result = opened_table.to_pandas()
|
||||
assert len(result) == 3
|
||||
|
||||
describe_count_after_first_read = namespace.get_describe_call_count()
|
||||
describe_count_after_first_read = get_describe_call_count(inner_ns_client)
|
||||
|
||||
# Wait for credentials to expire (3 seconds + buffer)
|
||||
time.sleep(5)
|
||||
@@ -396,7 +505,7 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
||||
result = opened_table.to_pandas()
|
||||
assert len(result) == 3
|
||||
|
||||
describe_count_after_refresh = namespace.get_describe_call_count()
|
||||
describe_count_after_refresh = get_describe_call_count(inner_ns_client)
|
||||
# Verify describe_table was called again (credential refresh)
|
||||
refresh_delta = describe_count_after_refresh - describe_count_after_first_read
|
||||
|
||||
@@ -409,7 +518,8 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
||||
|
||||
|
||||
@pytest.mark.s3_test
|
||||
def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_credential_refresh_on_write(s3_bucket: str, use_custom: bool):
|
||||
"""
|
||||
Test credential refresh when credentials expire during write operations.
|
||||
|
||||
@@ -420,13 +530,14 @@ def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
||||
"""
|
||||
storage_options = copy.deepcopy(CONFIG)
|
||||
|
||||
namespace = TrackingNamespace(
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=s3_bucket,
|
||||
storage_options=storage_options,
|
||||
credential_expires_in_seconds=3, # Short expiration
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(namespace)
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
# Create unique namespace for this test
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
@@ -443,7 +554,7 @@ def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
||||
}
|
||||
)
|
||||
|
||||
table = db.create_table(table_name, initial_data, namespace=namespace_path)
|
||||
table = db.create_table(table_name, initial_data, namespace_path=namespace_path)
|
||||
|
||||
# Add more data (should use cached credentials)
|
||||
new_data = pa.table(
|
||||
@@ -471,24 +582,26 @@ def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
||||
|
||||
|
||||
@pytest.mark.s3_test
|
||||
def test_namespace_overwrite_mode(s3_bucket: str):
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_overwrite_mode(s3_bucket: str, use_custom: bool):
|
||||
"""
|
||||
Test creating table in overwrite mode with credential tracking.
|
||||
|
||||
Verifies:
|
||||
- First create calls create_empty_table exactly once
|
||||
- First create calls declare_table exactly once
|
||||
- Overwrite mode calls describe_table exactly once to check existence
|
||||
- Storage options provider works in overwrite mode
|
||||
"""
|
||||
storage_options = copy.deepcopy(CONFIG)
|
||||
|
||||
namespace = TrackingNamespace(
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=s3_bucket,
|
||||
storage_options=storage_options,
|
||||
credential_expires_in_seconds=3600,
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(namespace)
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
# Create unique namespace for this test
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
@@ -505,11 +618,11 @@ def test_namespace_overwrite_mode(s3_bucket: str):
|
||||
}
|
||||
)
|
||||
|
||||
table = db.create_table(table_name, data1, namespace=namespace_path)
|
||||
# Exactly one create_empty_table call for initial create
|
||||
assert namespace.get_create_call_count() == 1
|
||||
table = db.create_table(table_name, data1, namespace_path=namespace_path)
|
||||
# Exactly one declare_table call for initial create
|
||||
assert get_declare_call_count(inner_ns_client) == 1
|
||||
# No describe_table calls in create mode
|
||||
assert namespace.get_describe_call_count() == 0
|
||||
assert get_describe_call_count(inner_ns_client) == 0
|
||||
assert table.count_rows() == 2
|
||||
|
||||
# Overwrite the table
|
||||
@@ -521,14 +634,14 @@ def test_namespace_overwrite_mode(s3_bucket: str):
|
||||
)
|
||||
|
||||
table2 = db.create_table(
|
||||
table_name, data2, namespace=namespace_path, mode="overwrite"
|
||||
table_name, data2, namespace_path=namespace_path, mode="overwrite"
|
||||
)
|
||||
|
||||
# Should still have only 1 create_empty_table call
|
||||
# Should still have only 1 declare_table call
|
||||
# (overwrite reuses location from describe_table)
|
||||
assert namespace.get_create_call_count() == 1
|
||||
assert get_declare_call_count(inner_ns_client) == 1
|
||||
# Should have called describe_table exactly once to get existing table location
|
||||
assert namespace.get_describe_call_count() == 1
|
||||
assert get_describe_call_count(inner_ns_client) == 1
|
||||
|
||||
# Verify new data
|
||||
assert table2.count_rows() == 3
|
||||
@@ -537,7 +650,8 @@ def test_namespace_overwrite_mode(s3_bucket: str):
|
||||
|
||||
|
||||
@pytest.mark.s3_test
|
||||
def test_namespace_multiple_tables(s3_bucket: str):
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_multiple_tables(s3_bucket: str, use_custom: bool):
|
||||
"""
|
||||
Test creating and opening multiple tables in the same namespace.
|
||||
|
||||
@@ -548,13 +662,14 @@ def test_namespace_multiple_tables(s3_bucket: str):
|
||||
"""
|
||||
storage_options = copy.deepcopy(CONFIG)
|
||||
|
||||
namespace = TrackingNamespace(
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=s3_bucket,
|
||||
storage_options=storage_options,
|
||||
credential_expires_in_seconds=3600,
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(namespace)
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
# Create unique namespace for this test
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
@@ -564,22 +679,22 @@ def test_namespace_multiple_tables(s3_bucket: str):
|
||||
# Create first table
|
||||
table1_name = f"table1_{uuid.uuid4().hex}"
|
||||
data1 = pa.table({"id": [1, 2], "value": [10, 20]})
|
||||
db.create_table(table1_name, data1, namespace=namespace_path)
|
||||
db.create_table(table1_name, data1, namespace_path=namespace_path)
|
||||
|
||||
# Create second table
|
||||
table2_name = f"table2_{uuid.uuid4().hex}"
|
||||
data2 = pa.table({"id": [3, 4], "value": [30, 40]})
|
||||
db.create_table(table2_name, data2, namespace=namespace_path)
|
||||
db.create_table(table2_name, data2, namespace_path=namespace_path)
|
||||
|
||||
# Should have 2 create calls (one per table)
|
||||
assert namespace.get_create_call_count() == 2
|
||||
# Should have 2 declare calls (one per table)
|
||||
assert get_declare_call_count(inner_ns_client) == 2
|
||||
|
||||
# Open both tables
|
||||
opened1 = db.open_table(table1_name, namespace=namespace_path)
|
||||
opened2 = db.open_table(table2_name, namespace=namespace_path)
|
||||
opened1 = db.open_table(table1_name, namespace_path=namespace_path)
|
||||
opened2 = db.open_table(table2_name, namespace_path=namespace_path)
|
||||
|
||||
# Should have 2 describe calls (one per open)
|
||||
assert namespace.get_describe_call_count() == 2
|
||||
assert get_describe_call_count(inner_ns_client) == 2
|
||||
|
||||
# Verify both tables work independently
|
||||
assert opened1.count_rows() == 2
|
||||
@@ -593,7 +708,8 @@ def test_namespace_multiple_tables(s3_bucket: str):
|
||||
|
||||
|
||||
@pytest.mark.s3_test
|
||||
def test_namespace_with_schema_only(s3_bucket: str):
|
||||
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||
def test_namespace_with_schema_only(s3_bucket: str, use_custom: bool):
|
||||
"""
|
||||
Test creating empty table with schema only (no data).
|
||||
|
||||
@@ -604,13 +720,14 @@ def test_namespace_with_schema_only(s3_bucket: str):
|
||||
"""
|
||||
storage_options = copy.deepcopy(CONFIG)
|
||||
|
||||
namespace = TrackingNamespace(
|
||||
ns_client, inner_ns_client = create_tracking_namespace(
|
||||
bucket_name=s3_bucket,
|
||||
storage_options=storage_options,
|
||||
credential_expires_in_seconds=3600,
|
||||
use_custom=use_custom,
|
||||
)
|
||||
|
||||
db = LanceNamespaceDBConnection(namespace)
|
||||
db = LanceNamespaceDBConnection(ns_client)
|
||||
|
||||
# Create unique namespace for this test
|
||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||
@@ -628,12 +745,12 @@ def test_namespace_with_schema_only(s3_bucket: str):
|
||||
]
|
||||
)
|
||||
|
||||
table = db.create_table(table_name, schema=schema, namespace=namespace_path)
|
||||
table = db.create_table(table_name, schema=schema, namespace_path=namespace_path)
|
||||
|
||||
# Should have called create_empty_table once
|
||||
assert namespace.get_create_call_count() == 1
|
||||
# Should have called declare_table once
|
||||
assert get_declare_call_count(inner_ns_client) == 1
|
||||
# Should NOT have called describe_table in create mode
|
||||
assert namespace.get_describe_call_count() == 0
|
||||
assert get_describe_call_count(inner_ns_client) == 0
|
||||
|
||||
# Verify empty table
|
||||
assert table.count_rows() == 0
|
||||
|
||||
@@ -9,21 +9,6 @@ from lancedb import DBConnection, Table, connect
|
||||
from lancedb.permutation import Permutation, Permutations, permutation_builder
|
||||
|
||||
|
||||
def test_permutation_persistence(tmp_path):
|
||||
db = connect(tmp_path)
|
||||
tbl = db.create_table("test_table", pa.table({"x": range(100), "y": range(100)}))
|
||||
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl).shuffle().persist(db, "test_permutation").execute()
|
||||
)
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
re_open = db.open_table("test_permutation")
|
||||
assert re_open.count_rows() == 100
|
||||
|
||||
assert permutation_tbl.to_arrow() == re_open.to_arrow()
|
||||
|
||||
|
||||
def test_split_random_ratios(mem_db):
|
||||
"""Test random splitting with ratios."""
|
||||
tbl = mem_db.create_table(
|
||||
@@ -522,6 +507,50 @@ def test_no_split_names(some_table: Table):
|
||||
assert permutations[1].num_rows == 500
|
||||
|
||||
|
||||
def test_permutations_metadata_without_split_names_key(mem_db: DBConnection):
|
||||
"""Regression: schema metadata present but missing split_names key must not crash.
|
||||
|
||||
Previously, `.get(b"split_names", None).decode()` was called unconditionally,
|
||||
so any permutation table whose metadata dict had other keys but no split_names
|
||||
raised AttributeError: 'NoneType' has no attribute 'decode'.
|
||||
"""
|
||||
base = mem_db.create_table("base_nosplit", pa.table({"x": range(10)}))
|
||||
|
||||
# Build a permutation-like table that carries some metadata but NOT split_names.
|
||||
raw = pa.table(
|
||||
{
|
||||
"row_id": pa.array(range(10), type=pa.uint64()),
|
||||
"split_id": pa.array([0] * 10, type=pa.uint32()),
|
||||
}
|
||||
).replace_schema_metadata({b"other_key": b"other_value"})
|
||||
perm_tbl = mem_db.create_table("perm_nosplit", raw)
|
||||
|
||||
permutations = Permutations(base, perm_tbl)
|
||||
assert permutations.split_names == []
|
||||
assert permutations.split_dict == {}
|
||||
|
||||
|
||||
def test_from_tables_string_split_missing_names_key(mem_db: DBConnection):
|
||||
"""Regression: from_tables() with a string split must raise ValueError, not
|
||||
AttributeError.
|
||||
|
||||
Previously, `.get(b"split_names", None).decode()` crashed with AttributeError
|
||||
when the metadata dict existed but had no split_names key.
|
||||
"""
|
||||
base = mem_db.create_table("base_strsplit", pa.table({"x": range(10)}))
|
||||
|
||||
raw = pa.table(
|
||||
{
|
||||
"row_id": pa.array(range(10), type=pa.uint64()),
|
||||
"split_id": pa.array([0] * 10, type=pa.uint32()),
|
||||
}
|
||||
).replace_schema_metadata({b"other_key": b"other_value"})
|
||||
perm_tbl = mem_db.create_table("perm_strsplit", raw)
|
||||
|
||||
with pytest.raises(ValueError, match="no split names are defined"):
|
||||
Permutation.from_tables(base, perm_tbl, split="train")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def some_perm_table(some_table: Table) -> Table:
|
||||
return (
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
import json
|
||||
from datetime import date, datetime
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pyarrow as pa
|
||||
@@ -673,3 +674,29 @@ async def test_aliases_in_lance_model_async(mem_db_async):
|
||||
assert hasattr(model, "name")
|
||||
assert hasattr(model, "distance")
|
||||
assert model.distance < 0.01
|
||||
|
||||
|
||||
def test_enum_types():
|
||||
"""Enum fields should map to the Arrow type of their value (issue #1846)."""
|
||||
|
||||
class StrStatus(str, Enum):
|
||||
PENDING = "pending"
|
||||
RUNNING = "running"
|
||||
DONE = "done"
|
||||
|
||||
class IntPriority(int, Enum):
|
||||
LOW = 1
|
||||
MEDIUM = 2
|
||||
HIGH = 3
|
||||
|
||||
class TestModel(pydantic.BaseModel):
|
||||
status: StrStatus
|
||||
priority: IntPriority
|
||||
opt_status: Optional[StrStatus] = None
|
||||
|
||||
schema = pydantic_to_schema(TestModel)
|
||||
|
||||
assert schema.field("status").type == pa.dictionary(pa.int32(), pa.utf8())
|
||||
assert schema.field("priority").type == pa.int64()
|
||||
assert schema.field("opt_status").type == pa.dictionary(pa.int32(), pa.utf8())
|
||||
assert schema.field("opt_status").nullable
|
||||
|
||||
@@ -1385,7 +1385,7 @@ def test_query_timeout(tmp_path):
|
||||
}
|
||||
)
|
||||
table = db.create_table("test", data)
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
|
||||
with pytest.raises(Exception, match="Query timeout"):
|
||||
table.search().where("text = 'a'").to_list(timeout=timedelta(0))
|
||||
|
||||
@@ -6,6 +6,8 @@ import contextlib
|
||||
from datetime import timedelta
|
||||
import http.server
|
||||
import json
|
||||
import multiprocessing as mp
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from unittest.mock import MagicMock, patch
|
||||
@@ -1230,3 +1232,82 @@ def test_background_loop_cancellation(exception):
|
||||
with pytest.raises(exception):
|
||||
loop.run(None)
|
||||
mock_future.cancel.assert_called_once()
|
||||
|
||||
|
||||
def _remote_fork_child(port: int, queue) -> None:
|
||||
# Build a fresh Connection in the child so we exercise the at-fork-child
|
||||
# tokio runtime reset rather than relying on an inherited reqwest client.
|
||||
db = lancedb.connect(
|
||||
"db://dev",
|
||||
api_key="fake",
|
||||
host_override=f"http://localhost:{port}",
|
||||
client_config={
|
||||
"retry_config": {"retries": 0},
|
||||
"timeout_config": {"connect_timeout": 2, "read_timeout": 2},
|
||||
},
|
||||
)
|
||||
queue.put(db.table_names())
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform != "linux",
|
||||
reason=(
|
||||
"fork() is unavailable on Windows and unsafe on macOS "
|
||||
"(Apple frameworks/TLS are not fork-safe)"
|
||||
),
|
||||
)
|
||||
def test_remote_connection_after_fork():
|
||||
"""A freshly-built remote Connection in a forked child should not hang.
|
||||
|
||||
The pyo3-async-runtimes tokio runtime would otherwise be inherited from
|
||||
the parent with dead worker threads; the at-fork-child handler in our
|
||||
runtime module rebuilds it on first use in the child.
|
||||
"""
|
||||
|
||||
def handler(request):
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(b'{"tables": []}')
|
||||
|
||||
server = http.server.HTTPServer(("localhost", 0), make_mock_http_handler(handler))
|
||||
port = server.server_address[1]
|
||||
server_thread = threading.Thread(target=server.serve_forever)
|
||||
server_thread.start()
|
||||
try:
|
||||
# Hit the server in the parent first so the runtime + LOOP are warm
|
||||
# before fork; a fresh child must still succeed.
|
||||
parent_db = lancedb.connect(
|
||||
"db://dev",
|
||||
api_key="fake",
|
||||
host_override=f"http://localhost:{port}",
|
||||
client_config={
|
||||
"retry_config": {"retries": 0},
|
||||
"timeout_config": {"connect_timeout": 2, "read_timeout": 2},
|
||||
},
|
||||
)
|
||||
assert parent_db.table_names() == []
|
||||
|
||||
ctx = mp.get_context("fork")
|
||||
queue = ctx.Queue()
|
||||
proc = ctx.Process(target=_remote_fork_child, args=(port, queue))
|
||||
proc.start()
|
||||
proc.join(timeout=15)
|
||||
|
||||
if proc.is_alive():
|
||||
proc.terminate()
|
||||
proc.join(timeout=5)
|
||||
if proc.is_alive():
|
||||
proc.kill()
|
||||
proc.join()
|
||||
pytest.fail("Remote connection hung after fork")
|
||||
|
||||
assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
|
||||
assert not queue.empty(), "child produced no result"
|
||||
assert queue.get() == []
|
||||
|
||||
# Parent connection must still be usable after the child returned.
|
||||
assert parent_db.table_names() == []
|
||||
finally:
|
||||
server.shutdown()
|
||||
server_thread.join()
|
||||
|
||||
@@ -26,11 +26,8 @@ from lancedb.rerankers import (
|
||||
)
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
# Tests rely on FTS index
|
||||
pytest.importorskip("lancedb.fts")
|
||||
|
||||
|
||||
def get_test_table(tmp_path, use_tantivy):
|
||||
def get_test_table(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
# Create a LanceDB table schema with a vector and a text column
|
||||
emb = EmbeddingFunctionRegistry.get_instance().get("test").create()
|
||||
@@ -98,7 +95,7 @@ def get_test_table(tmp_path, use_tantivy):
|
||||
)
|
||||
|
||||
# Create a fts index
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
return table, MyTable
|
||||
|
||||
@@ -208,8 +205,8 @@ def _run_test_reranker(reranker, table, query, query_vector, schema):
|
||||
assert len(result) == 20 and result == result_arrow
|
||||
|
||||
|
||||
def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
def _run_test_hybrid_reranker(reranker, tmp_path):
|
||||
table, schema = get_test_table(tmp_path)
|
||||
# The default reranker
|
||||
result1 = (
|
||||
table.search(
|
||||
@@ -285,8 +282,7 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_linear_combination(tmp_path, use_tantivy):
|
||||
def test_linear_combination(tmp_path):
|
||||
reranker = LinearCombinationReranker()
|
||||
|
||||
vector_results = pa.Table.from_pydict(
|
||||
@@ -313,22 +309,20 @@ def test_linear_combination(tmp_path, use_tantivy):
|
||||
assert "_score" not in combined_results.column_names
|
||||
assert "_relevance_score" in combined_results.column_names
|
||||
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_rrf_reranker(tmp_path, use_tantivy):
|
||||
def test_rrf_reranker(tmp_path):
|
||||
reranker = RRFReranker()
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_mrr_reranker(tmp_path, use_tantivy):
|
||||
def test_mrr_reranker(tmp_path):
|
||||
reranker = MRRReranker()
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||
|
||||
# Test multi-vector part
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
query = "single player experience"
|
||||
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
|
||||
rs2 = (
|
||||
@@ -363,7 +357,7 @@ def test_rrf_reranker_distance():
|
||||
table = db.create_table("test", data)
|
||||
|
||||
table.create_index(num_partitions=1, num_sub_vectors=2)
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
|
||||
reranker = RRFReranker(return_score="all")
|
||||
|
||||
@@ -422,35 +416,31 @@ def test_rrf_reranker_distance():
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cohere_reranker(tmp_path, use_tantivy):
|
||||
def test_cohere_reranker(tmp_path):
|
||||
pytest.importorskip("cohere")
|
||||
reranker = CohereReranker()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cross_encoder_reranker(tmp_path, use_tantivy):
|
||||
def test_cross_encoder_reranker(tmp_path):
|
||||
pytest.importorskip("sentence_transformers")
|
||||
reranker = CrossEncoderReranker()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_colbert_reranker(tmp_path, use_tantivy):
|
||||
def test_colbert_reranker(tmp_path):
|
||||
pytest.importorskip("rerankers")
|
||||
reranker = ColbertReranker()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_answerdotai_reranker(tmp_path, use_tantivy):
|
||||
def test_answerdotai_reranker(tmp_path):
|
||||
pytest.importorskip("rerankers")
|
||||
reranker = AnswerdotaiRerankers()
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@@ -459,10 +449,9 @@ def test_answerdotai_reranker(tmp_path, use_tantivy):
|
||||
or os.environ.get("OPENAI_BASE_URL") is not None,
|
||||
reason="OPENAI_API_KEY not set",
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_openai_reranker(tmp_path, use_tantivy):
|
||||
def test_openai_reranker(tmp_path):
|
||||
pytest.importorskip("openai")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
reranker = OpenaiReranker()
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
@@ -470,10 +459,9 @@ def test_openai_reranker(tmp_path, use_tantivy):
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("JINA_API_KEY") is None, reason="JINA_API_KEY not set"
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_jina_reranker(tmp_path, use_tantivy):
|
||||
def test_jina_reranker(tmp_path):
|
||||
pytest.importorskip("jina")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
reranker = JinaReranker()
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
@@ -481,11 +469,10 @@ def test_jina_reranker(tmp_path, use_tantivy):
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||
)
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_voyageai_reranker(tmp_path, use_tantivy):
|
||||
def test_voyageai_reranker(tmp_path):
|
||||
pytest.importorskip("voyageai")
|
||||
reranker = VoyageAIReranker(model_name="rerank-2.5")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||
|
||||
|
||||
@@ -504,7 +491,7 @@ def test_empty_result_reranker():
|
||||
|
||||
# Create empty table with schema
|
||||
empty_table = db.create_table("empty_table", schema=schema, mode="overwrite")
|
||||
empty_table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
empty_table.create_fts_index("text", replace=True)
|
||||
for reranker in [
|
||||
CrossEncoderReranker(),
|
||||
# ColbertReranker(),
|
||||
@@ -603,11 +590,10 @@ def test_empty_hybrid_result_reranker():
|
||||
assert "_rowid" in result.column_names
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
|
||||
def test_cross_encoder_reranker_return_all(tmp_path):
|
||||
pytest.importorskip("sentence_transformers")
|
||||
reranker = CrossEncoderReranker(return_score="all")
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
table, schema = get_test_table(tmp_path)
|
||||
query = "single player experience"
|
||||
result = (
|
||||
table.search(query, query_type="hybrid", vector_column_name="vector")
|
||||
|
||||
@@ -242,8 +242,8 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
|
||||
|
||||
# FTS indices should error since they are not supported yet.
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Full-text search is only supported on the local filesystem",
|
||||
ValueError,
|
||||
match="Tantivy-based FTS has been removed",
|
||||
):
|
||||
table.create_fts_index("x", use_tantivy=True)
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
from datetime import date, datetime, timedelta
|
||||
from time import sleep
|
||||
from typing import List
|
||||
@@ -10,7 +11,7 @@ from unittest.mock import patch
|
||||
|
||||
import lancedb
|
||||
from lancedb.dependencies import _PANDAS_AVAILABLE
|
||||
from lancedb.index import HnswPq, HnswSq, IvfPq
|
||||
from lancedb.index import HnswFlat, HnswPq, HnswSq, IvfPq
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pyarrow as pa
|
||||
@@ -916,6 +917,21 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
||||
"my_vector", replace=True, config=expected_config, name=None, train=True
|
||||
)
|
||||
|
||||
table.create_index(
|
||||
vector_column_name="my_vector",
|
||||
metric="cosine",
|
||||
index_type="IVF_HNSW_FLAT",
|
||||
sample_rate=0.1,
|
||||
m=29,
|
||||
ef_construction=10,
|
||||
)
|
||||
expected_config = HnswFlat(
|
||||
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
|
||||
)
|
||||
mock_create_index.assert_called_with(
|
||||
"my_vector", replace=True, config=expected_config, name=None, train=True
|
||||
)
|
||||
|
||||
|
||||
@patch("lancedb.table.AsyncTable.create_index")
|
||||
def test_create_index_name_and_train_parameters(
|
||||
@@ -1049,6 +1065,231 @@ def test_add_with_nans(mem_db: DBConnection):
|
||||
assert np.allclose(v, np.array([0.0, 0.0]))
|
||||
|
||||
|
||||
def test_add_with_empty_fixed_size_list_drops_bad_rows(mem_db: DBConnection):
|
||||
class Schema(LanceModel):
|
||||
text: str
|
||||
embedding: Vector(16)
|
||||
|
||||
table = mem_db.create_table("test_empty_embeddings", schema=Schema)
|
||||
table.add(
|
||||
[
|
||||
{"text": "hello", "embedding": []},
|
||||
{"text": "bar", "embedding": [0.1] * 16},
|
||||
],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
|
||||
data = table.to_arrow()
|
||||
assert data["text"].to_pylist() == ["bar"]
|
||||
assert np.allclose(data["embedding"].to_pylist()[0], np.array([0.1] * 16))
|
||||
|
||||
|
||||
def test_add_with_integer_embeddings_preserves_casting(mem_db: DBConnection):
|
||||
class Schema(LanceModel):
|
||||
text: str
|
||||
embedding: Vector(4)
|
||||
|
||||
table = mem_db.create_table("test_integer_embeddings", schema=Schema)
|
||||
table.add(
|
||||
[{"text": "foo", "embedding": [1, 2, 3, 4]}],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
|
||||
assert table.to_arrow()["embedding"].to_pylist() == [[1.0, 2.0, 3.0, 4.0]]
|
||||
|
||||
|
||||
def test_on_bad_vectors_does_not_handle_non_vector_fixed_size_lists(
|
||||
mem_db: DBConnection,
|
||||
):
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 4)),
|
||||
pa.field("bbox", pa.list_(pa.float32(), 4)),
|
||||
]
|
||||
)
|
||||
table = mem_db.create_table("test_bbox_schema", schema=schema)
|
||||
|
||||
with pytest.raises(RuntimeError, match="FixedSizeListType"):
|
||||
table.add(
|
||||
[{"vector": [1.0, 2.0, 3.0, 4.0], "bbox": [0.0, 1.0]}],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
|
||||
|
||||
def test_on_bad_vectors_does_not_handle_custom_named_fixed_size_lists(
|
||||
mem_db: DBConnection,
|
||||
):
|
||||
schema = pa.schema([pa.field("features", pa.list_(pa.float32(), 16))])
|
||||
table = mem_db.create_table("test_custom_named_fixed_size_vector", schema=schema)
|
||||
|
||||
with pytest.raises(RuntimeError, match="FixedSizeListType"):
|
||||
table.add(
|
||||
[
|
||||
{"features": []},
|
||||
{"features": [0.1] * 16},
|
||||
],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
|
||||
|
||||
def test_on_bad_vectors_with_schema_list_vector_still_sanitizes(mem_db: DBConnection):
|
||||
schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||
table = mem_db.create_table("test_schema_list_vector", schema=schema)
|
||||
table.add(
|
||||
[
|
||||
{"vector": [1.0, 2.0]},
|
||||
{"vector": [3.0]},
|
||||
{"vector": [4.0, 5.0]},
|
||||
],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
|
||||
assert table.to_arrow()["vector"].to_pylist() == [[1.0, 2.0], [4.0, 5.0]]
|
||||
|
||||
|
||||
def test_on_bad_vectors_handles_typed_custom_fixed_vectors_for_list_schema(
|
||||
mem_db: DBConnection,
|
||||
):
|
||||
schema = pa.schema([pa.field("vec", pa.list_(pa.float32()))])
|
||||
table = mem_db.create_table("test_typed_custom_fixed_vector", schema=schema)
|
||||
data = pa.table(
|
||||
{
|
||||
"vec": pa.array(
|
||||
[[float("nan")] * 16, [1.0] * 16],
|
||||
type=pa.list_(pa.float32(), 16),
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
table.add(data, on_bad_vectors="drop")
|
||||
|
||||
assert table.to_arrow()["vec"].to_pylist() == [[1.0] * 16]
|
||||
|
||||
|
||||
def test_on_bad_vectors_fill_preserves_arrow_nested_vector_type(mem_db: DBConnection):
|
||||
schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||
table = mem_db.create_table("test_fill_arrow_nested_type", schema=schema)
|
||||
data = pa.table(
|
||||
{
|
||||
"vector": pa.array(
|
||||
[[1.0, 2.0], [float("nan"), 3.0]],
|
||||
type=pa.list_(pa.float32(), 2),
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
table.add(
|
||||
data,
|
||||
on_bad_vectors="fill",
|
||||
fill_value=0.0,
|
||||
)
|
||||
|
||||
assert table.to_arrow()["vector"].to_pylist() == [[1.0, 2.0], [0.0, 0.0]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("table_name", "batch1", "expected"),
|
||||
[
|
||||
(
|
||||
"test_schema_list_vector_empty_prefix",
|
||||
pa.record_batch({"vector": [[], []]}),
|
||||
[[], [], [1.0, 2.0], [3.0, 4.0]],
|
||||
),
|
||||
(
|
||||
"test_schema_list_vector_all_bad_prefix",
|
||||
pa.record_batch({"vector": [[float("nan")] * 3, [float("nan")] * 3]}),
|
||||
[[1.0, 2.0], [3.0, 4.0]],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_on_bad_vectors_with_schema_list_vector_ignores_invalid_prefix_batches(
|
||||
mem_db: DBConnection,
|
||||
table_name: str,
|
||||
batch1: pa.RecordBatch,
|
||||
expected: list,
|
||||
):
|
||||
schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||
table = mem_db.create_table(table_name, schema=schema)
|
||||
batch2 = pa.record_batch({"vector": [[1.0, 2.0], [3.0, 4.0]]})
|
||||
reader = pa.RecordBatchReader.from_batches(batch1.schema, [batch1, batch2])
|
||||
|
||||
table.add(reader, on_bad_vectors="drop")
|
||||
|
||||
assert table.to_arrow()["vector"].to_pylist() == expected
|
||||
|
||||
|
||||
def test_on_bad_vectors_with_multiple_vectors_locks_dim_after_final_drop(
|
||||
mem_db: DBConnection,
|
||||
):
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
func = MockTextEmbeddingFunction.create()
|
||||
metadata = registry.get_table_metadata(
|
||||
[
|
||||
EmbeddingFunctionConfig(
|
||||
source_column="text1", vector_column="vec1", function=func
|
||||
),
|
||||
EmbeddingFunctionConfig(
|
||||
source_column="text2", vector_column="vec2", function=func
|
||||
),
|
||||
]
|
||||
)
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vec1", pa.list_(pa.float32())),
|
||||
pa.field("vec2", pa.list_(pa.float32())),
|
||||
],
|
||||
metadata=metadata,
|
||||
)
|
||||
table = mem_db.create_table("test_multi_vector_dim_lock", schema=schema)
|
||||
batch1 = pa.record_batch(
|
||||
{
|
||||
"vec1": [[1.0, 2.0, 3.0], [10.0, 11.0]],
|
||||
"vec2": [[float("nan"), 0.0], [5.0, 6.0]],
|
||||
}
|
||||
)
|
||||
batch2 = pa.record_batch(
|
||||
{
|
||||
"vec1": [[20.0, 21.0], [30.0, 31.0]],
|
||||
"vec2": [[7.0, 8.0], [9.0, 10.0]],
|
||||
}
|
||||
)
|
||||
reader = pa.RecordBatchReader.from_batches(batch1.schema, [batch1, batch2])
|
||||
|
||||
table.add(reader, on_bad_vectors="drop")
|
||||
|
||||
data = table.to_arrow()
|
||||
assert data["vec1"].to_pylist() == [[10.0, 11.0], [20.0, 21.0], [30.0, 31.0]]
|
||||
assert data["vec2"].to_pylist() == [[5.0, 6.0], [7.0, 8.0], [9.0, 10.0]]
|
||||
|
||||
|
||||
def test_on_bad_vectors_does_not_handle_non_vector_list_columns(mem_db: DBConnection):
|
||||
schema = pa.schema([pa.field("embedding_history", pa.list_(pa.float32()))])
|
||||
table = mem_db.create_table("test_non_vector_list_schema", schema=schema)
|
||||
table.add(
|
||||
[
|
||||
{"embedding_history": [1.0, 2.0]},
|
||||
{"embedding_history": [3.0]},
|
||||
],
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
|
||||
assert table.to_arrow()["embedding_history"].to_pylist() == [
|
||||
[1.0, 2.0],
|
||||
[3.0],
|
||||
]
|
||||
|
||||
|
||||
def test_on_bad_vectors_all_null_schema_vector_batches_do_not_crash(
|
||||
mem_db: DBConnection,
|
||||
):
|
||||
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2), nullable=True)])
|
||||
table = mem_db.create_table("test_all_null_vector_batch", schema=schema)
|
||||
|
||||
table.add([{"vector": None}], on_bad_vectors="drop")
|
||||
|
||||
assert table.to_arrow()["vector"].to_pylist() == [None]
|
||||
|
||||
|
||||
def test_restore(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"my_table",
|
||||
@@ -1722,7 +1963,6 @@ def setup_hybrid_search_table(db: DBConnection, embedding_func):
|
||||
|
||||
def test_hybrid_search(tmp_db: DBConnection):
|
||||
# This test uses an FTS index
|
||||
pytest.importorskip("lancedb.fts")
|
||||
pytest.importorskip("lance")
|
||||
|
||||
table, MyTable, emb = setup_hybrid_search_table(tmp_db, "test")
|
||||
@@ -1793,7 +2033,6 @@ def test_hybrid_search(tmp_db: DBConnection):
|
||||
|
||||
def test_hybrid_search_metric_type(tmp_db: DBConnection):
|
||||
# This test uses an FTS index
|
||||
pytest.importorskip("lancedb.fts")
|
||||
pytest.importorskip("lance")
|
||||
|
||||
# Need to use nonnorm as the embedding function so l2 and dot results
|
||||
@@ -1815,6 +2054,13 @@ def test_hybrid_search_metric_type(tmp_db: DBConnection):
|
||||
@pytest.mark.parametrize(
|
||||
"consistency_interval", [None, timedelta(seconds=0), timedelta(seconds=0.1)]
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
sys.platform == "win32",
|
||||
reason=(
|
||||
"TODO: directory namespace is not supported on Windows yet; "
|
||||
"re-enable after that is fixed."
|
||||
),
|
||||
)
|
||||
def test_consistency(tmp_path, consistency_interval):
|
||||
db = lancedb.connect(tmp_path)
|
||||
table = db.create_table("my_table", data=[{"id": 0}])
|
||||
@@ -1835,7 +2081,6 @@ def test_consistency(tmp_path, consistency_interval):
|
||||
elif consistency_interval == timedelta(seconds=0):
|
||||
assert table2.version == table.version
|
||||
else:
|
||||
# (consistency_interval == timedelta(seconds=0.1)
|
||||
assert table2.version == table.version - 1
|
||||
sleep(0.1)
|
||||
assert table2.version == table.version
|
||||
@@ -2108,7 +2353,7 @@ def test_stats(mem_db: DBConnection):
|
||||
stats = table.stats()
|
||||
print(f"{stats=}")
|
||||
assert stats == {
|
||||
"total_bytes": 38,
|
||||
"total_bytes": 60,
|
||||
"num_rows": 2,
|
||||
"num_indices": 0,
|
||||
"fragment_stats": {
|
||||
|
||||
@@ -1,14 +1,29 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import functools
|
||||
import multiprocessing as mp
|
||||
import pickle
|
||||
import sys
|
||||
|
||||
import lancedb
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from lancedb.permutation import Permutation, Permutations, permutation_builder
|
||||
from lancedb.util import tbl_to_tensor
|
||||
from lancedb.permutation import Permutation
|
||||
|
||||
torch = pytest.importorskip("torch")
|
||||
|
||||
|
||||
def _open_native_table(uri: str, table_name: str):
|
||||
"""Top-level connection factory used by the explicit-factory pickle test.
|
||||
|
||||
Defined at module scope so that pickle can resolve it by name in the
|
||||
worker / unpickling process.
|
||||
"""
|
||||
return lancedb.connect(uri).open_table(table_name)
|
||||
|
||||
|
||||
def test_table_dataloader(mem_db):
|
||||
table = mem_db.create_table("test_table", pa.table({"a": range(1000)}))
|
||||
dataloader = torch.utils.data.DataLoader(
|
||||
@@ -40,3 +55,156 @@ def test_permutation_dataloader(mem_db):
|
||||
for batch in dataloader:
|
||||
assert batch.size(0) == 1
|
||||
assert batch.size(1) == 10
|
||||
|
||||
|
||||
def test_permutation_is_picklable(tmp_db):
|
||||
"""A Permutation must be picklable so it can be used with PyTorch's
|
||||
DataLoader when num_workers > 0 (which uses multiprocessing and pickles
|
||||
the dataset to pass it to worker processes)."""
|
||||
table = tmp_db.create_table("test_table", pa.table({"a": range(1000)}))
|
||||
permutation = Permutation.identity(table)
|
||||
|
||||
pickled = pickle.dumps(permutation)
|
||||
restored = pickle.loads(pickled)
|
||||
|
||||
assert len(restored) == 1000
|
||||
rows = restored.__getitems__([0, 1, 2])
|
||||
assert rows == [{"a": 0}, {"a": 1}, {"a": 2}]
|
||||
|
||||
|
||||
def test_permutation_with_memory_base_is_picklable(mem_db):
|
||||
"""An in-memory base table is inlined into the pickle as Arrow IPC bytes
|
||||
and rebuilt on the other side as an in-memory LanceTable, so the
|
||||
Permutation round-trips even though the original database can't be
|
||||
reopened across processes."""
|
||||
table = mem_db.create_table("test_table", pa.table({"a": range(50)}))
|
||||
permutation = Permutation.identity(table)
|
||||
|
||||
restored = pickle.loads(pickle.dumps(permutation))
|
||||
|
||||
assert len(restored) == 50
|
||||
assert restored.__getitems__([0, 10, 49]) == [{"a": 0}, {"a": 10}, {"a": 49}]
|
||||
|
||||
|
||||
def test_permutation_dataloader_multiprocessing(tmp_db):
|
||||
"""Using a Permutation with a PyTorch DataLoader that has num_workers > 0
|
||||
must work end-to-end. Each worker process gets a pickled copy of the
|
||||
dataset and reads batches from it."""
|
||||
table = tmp_db.create_table("test_table", pa.table({"a": range(1000)}))
|
||||
permutation = Permutation.identity(table)
|
||||
|
||||
dataloader = torch.utils.data.DataLoader(
|
||||
permutation,
|
||||
batch_size=10,
|
||||
shuffle=True,
|
||||
num_workers=2,
|
||||
multiprocessing_context="spawn",
|
||||
)
|
||||
seen = 0
|
||||
for batch in dataloader:
|
||||
assert batch["a"].size(0) == 10
|
||||
seen += batch["a"].size(0)
|
||||
assert seen == 1000
|
||||
|
||||
|
||||
def test_permutation_pickle_with_connection_factory(tmp_path):
|
||||
"""When the user provides a connection_factory, pickling should round-trip
|
||||
through that factory rather than introspecting the connection URI. Useful
|
||||
for remote / cloud connections where the URI alone isn't reopenable."""
|
||||
db = lancedb.connect(tmp_path)
|
||||
db.create_table("test_table", pa.table({"a": range(50)}))
|
||||
|
||||
factory = functools.partial(_open_native_table, str(tmp_path))
|
||||
permutation = Permutation.identity(factory("test_table")).with_connection_factory(
|
||||
factory
|
||||
)
|
||||
|
||||
restored = pickle.loads(pickle.dumps(permutation))
|
||||
|
||||
assert len(restored) == 50
|
||||
# The factory survives pickling and is what powered base-table reopen.
|
||||
assert restored.connection_factory is not None
|
||||
assert restored.connection_factory.func is _open_native_table
|
||||
assert restored.__getitems__([0, 1, 2]) == [{"a": 0}, {"a": 1}, {"a": 2}]
|
||||
|
||||
|
||||
def test_permutation_with_builder_is_picklable(tmp_db):
|
||||
"""A Permutation built from a non-identity permutation table must round-trip
|
||||
through pickle while preserving the row order defined by the permutation."""
|
||||
table = tmp_db.create_table("test_table", pa.table({"a": range(100)}))
|
||||
perm_tbl = (
|
||||
permutation_builder(table)
|
||||
.split_random(ratios=[0.8, 0.2], seed=42, split_names=["train", "test"])
|
||||
.shuffle(seed=42)
|
||||
.execute()
|
||||
)
|
||||
permutations = Permutations(table, perm_tbl)
|
||||
permutation = permutations["train"]
|
||||
|
||||
indices = list(range(len(permutation)))
|
||||
expected = permutation.__getitems__(indices)
|
||||
|
||||
restored = pickle.loads(pickle.dumps(permutation))
|
||||
|
||||
assert len(restored) == len(permutation)
|
||||
assert restored.__getitems__(indices) == expected
|
||||
|
||||
|
||||
def _multiworker_dataloader_target(db_uri: str, result_queue):
|
||||
import lancedb
|
||||
from lancedb.permutation import Permutation
|
||||
|
||||
db = lancedb.connect(db_uri)
|
||||
table = db.open_table("test_table")
|
||||
permutation = Permutation.identity(table)
|
||||
|
||||
dataloader = torch.utils.data.DataLoader(
|
||||
permutation,
|
||||
batch_size=10,
|
||||
num_workers=2,
|
||||
multiprocessing_context="fork",
|
||||
)
|
||||
count = 0
|
||||
for batch in dataloader:
|
||||
assert batch["a"].size(0) == 10
|
||||
count += 1
|
||||
result_queue.put(count)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform != "linux",
|
||||
reason=(
|
||||
"fork() is unavailable on Windows and unsafe on macOS "
|
||||
"(Apple frameworks/TLS are not fork-safe)"
|
||||
),
|
||||
)
|
||||
def test_permutation_dataloader_fork_workers(tmp_path):
|
||||
"""A Permutation used by a fork-based DataLoader should not hang.
|
||||
|
||||
PyTorch's DataLoader uses fork-based multiprocessing by default on Linux.
|
||||
LanceDB drives async work through a background asyncio thread that does
|
||||
not survive a fork, so any LOOP.run() in a worker blocks forever.
|
||||
"""
|
||||
import lancedb
|
||||
|
||||
db_uri = str(tmp_path / "db")
|
||||
db = lancedb.connect(db_uri)
|
||||
db.create_table("test_table", pa.table({"a": list(range(1000))}))
|
||||
|
||||
ctx = mp.get_context("spawn")
|
||||
queue = ctx.Queue()
|
||||
proc = ctx.Process(target=_multiworker_dataloader_target, args=(db_uri, queue))
|
||||
proc.start()
|
||||
proc.join(timeout=30)
|
||||
|
||||
if proc.is_alive():
|
||||
proc.terminate()
|
||||
proc.join(timeout=5)
|
||||
if proc.is_alive():
|
||||
proc.kill()
|
||||
proc.join()
|
||||
pytest.fail("Permutation hung when iterated in a fork-based DataLoader worker")
|
||||
|
||||
assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
|
||||
assert not queue.empty(), "child produced no batches"
|
||||
assert queue.get() == 100
|
||||
|
||||
@@ -15,8 +15,10 @@ from lancedb.table import (
|
||||
_cast_to_target_schema,
|
||||
_handle_bad_vectors,
|
||||
_into_pyarrow_reader,
|
||||
_sanitize_data,
|
||||
_infer_target_schema,
|
||||
_merge_metadata,
|
||||
_sanitize_data,
|
||||
sanitize_create_table,
|
||||
)
|
||||
import pyarrow as pa
|
||||
import pandas as pd
|
||||
@@ -304,6 +306,117 @@ def test_handle_bad_vectors_noop():
|
||||
assert output["vector"] == vector
|
||||
|
||||
|
||||
def test_handle_bad_vectors_updates_reader_schema_for_target_schema():
|
||||
data = pa.table({"vector": [[1, 2, 3, 4]]})
|
||||
target_schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 4))])
|
||||
|
||||
output = _handle_bad_vectors(
|
||||
data.to_reader(),
|
||||
on_bad_vectors="drop",
|
||||
target_schema=target_schema,
|
||||
)
|
||||
|
||||
assert output.schema == pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||
assert output.read_all()["vector"].to_pylist() == [[1.0, 2.0, 3.0, 4.0]]
|
||||
|
||||
|
||||
def test_sanitize_data_keeps_target_field_metadata():
|
||||
source_field = pa.field(
|
||||
"vector",
|
||||
pa.list_(pa.float32(), 2),
|
||||
metadata={b"source": b"drop-me"},
|
||||
)
|
||||
target_field = pa.field(
|
||||
"vector",
|
||||
pa.list_(pa.float32(), 2),
|
||||
metadata={b"target": b"keep-me"},
|
||||
)
|
||||
data = pa.table(
|
||||
{"vector": pa.array([[1.0, 2.0]], type=pa.list_(pa.float32(), 2))},
|
||||
schema=pa.schema([source_field]),
|
||||
)
|
||||
|
||||
output = _sanitize_data(
|
||||
data,
|
||||
target_schema=pa.schema([target_field]),
|
||||
on_bad_vectors="drop",
|
||||
).read_all()
|
||||
|
||||
assert output.schema.field("vector").metadata == {b"target": b"keep-me"}
|
||||
|
||||
|
||||
def test_sanitize_data_uses_separate_embedding_metadata_for_bad_vectors():
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
conf = EmbeddingFunctionConfig(
|
||||
source_column="text",
|
||||
vector_column="custom_vector",
|
||||
function=MockTextEmbeddingFunction.create(),
|
||||
)
|
||||
metadata = registry.get_table_metadata([conf])
|
||||
schema = pa.schema(
|
||||
{
|
||||
"text": pa.string(),
|
||||
"custom_vector": pa.list_(pa.float32(), 10),
|
||||
},
|
||||
metadata={b"note": b"keep-me"},
|
||||
)
|
||||
data = pa.table(
|
||||
{
|
||||
"text": ["bad", "good"],
|
||||
"custom_vector": [[1.0] * 9, [2.0] * 10],
|
||||
}
|
||||
)
|
||||
|
||||
output = _sanitize_data(
|
||||
data,
|
||||
target_schema=schema,
|
||||
metadata=metadata,
|
||||
on_bad_vectors="drop",
|
||||
).read_all()
|
||||
|
||||
assert output["text"].to_pylist() == ["good"]
|
||||
assert output.schema.metadata[b"note"] == b"keep-me"
|
||||
assert b"embedding_functions" in output.schema.metadata
|
||||
|
||||
|
||||
def test_sanitize_create_table_merges_and_overrides_embedding_metadata():
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
old_conf = EmbeddingFunctionConfig(
|
||||
source_column="text",
|
||||
vector_column="old_vector",
|
||||
function=MockTextEmbeddingFunction.create(),
|
||||
)
|
||||
new_conf = EmbeddingFunctionConfig(
|
||||
source_column="text",
|
||||
vector_column="custom_vector",
|
||||
function=MockTextEmbeddingFunction.create(),
|
||||
)
|
||||
metadata = registry.get_table_metadata([new_conf])
|
||||
schema = pa.schema(
|
||||
{
|
||||
"text": pa.string(),
|
||||
"custom_vector": pa.list_(pa.float32(), 10),
|
||||
},
|
||||
metadata=_merge_metadata(
|
||||
{b"note": b"keep-me"},
|
||||
registry.get_table_metadata([old_conf]),
|
||||
),
|
||||
)
|
||||
|
||||
data, schema = sanitize_create_table(
|
||||
pa.table({"text": ["good"]}),
|
||||
schema,
|
||||
metadata=metadata,
|
||||
on_bad_vectors="drop",
|
||||
)
|
||||
|
||||
assert schema.metadata[b"note"] == b"keep-me"
|
||||
assert b"embedding_functions" in schema.metadata
|
||||
assert data.schema.metadata[b"note"] == b"keep-me"
|
||||
funcs = EmbeddingFunctionRegistry.get_instance().parse_functions(schema.metadata)
|
||||
assert set(funcs.keys()) == {"custom_vector"}
|
||||
|
||||
|
||||
class TestModel(lancedb.pydantic.LanceModel):
|
||||
a: Optional[int]
|
||||
b: Optional[int]
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::error::PythonErrorExt;
|
||||
use crate::runtime::future_into_py;
|
||||
use arrow::{
|
||||
datatypes::SchemaRef,
|
||||
pyarrow::{IntoPyArrow, ToPyArrow},
|
||||
@@ -12,9 +14,6 @@ use lancedb::arrow::SendableRecordBatchStream;
|
||||
use pyo3::{
|
||||
Bound, Py, PyAny, PyRef, PyResult, Python, exceptions::PyStopAsyncIteration, pyclass, pymethods,
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::error::PythonErrorExt;
|
||||
|
||||
#[pyclass]
|
||||
pub struct RecordBatchStream {
|
||||
|
||||
@@ -1,11 +1,23 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
error::PythonErrorExt,
|
||||
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
|
||||
runtime::future_into_py,
|
||||
table::Table,
|
||||
};
|
||||
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
|
||||
use lancedb::{
|
||||
connection::Connection as LanceConnection,
|
||||
connection::NamespaceClientPushdownOperation,
|
||||
database::namespace::LanceNamespaceDatabase,
|
||||
database::{CreateTableMode, Database, ReadConsistency},
|
||||
};
|
||||
use pyo3::{
|
||||
@@ -14,12 +26,6 @@ use pyo3::{
|
||||
pyclass, pyfunction, pymethods,
|
||||
types::{PyDict, PyDictMethods},
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
use crate::{
|
||||
error::PythonErrorExt, namespace::extract_namespace_arc,
|
||||
storage_options::py_object_to_storage_options_provider, table::Table,
|
||||
};
|
||||
|
||||
#[pyclass]
|
||||
pub struct Connection {
|
||||
@@ -38,6 +44,29 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_namespace_client_pushdown_operations(
|
||||
operations: Option<Vec<String>>,
|
||||
) -> PyResult<HashSet<NamespaceClientPushdownOperation>> {
|
||||
let mut parsed = HashSet::new();
|
||||
for operation in operations.unwrap_or_default() {
|
||||
match operation.as_str() {
|
||||
"QueryTable" => {
|
||||
parsed.insert(NamespaceClientPushdownOperation::QueryTable);
|
||||
}
|
||||
"CreateTable" => {
|
||||
parsed.insert(NamespaceClientPushdownOperation::CreateTable);
|
||||
}
|
||||
_ => {
|
||||
return Err(PyValueError::new_err(format!(
|
||||
"Invalid pushdown operation: {}",
|
||||
operation
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(parsed)
|
||||
}
|
||||
|
||||
impl Connection {
|
||||
fn parse_create_mode_str(mode: &str) -> PyResult<CreateTableMode> {
|
||||
match mode {
|
||||
@@ -87,16 +116,16 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (namespace=vec![], start_after=None, limit=None))]
|
||||
#[pyo3(signature = (namespace_path=None, start_after=None, limit=None))]
|
||||
pub fn table_names(
|
||||
self_: PyRef<'_, Self>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
start_after: Option<String>,
|
||||
limit: Option<u32>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let mut op = inner.table_names();
|
||||
op = op.namespace(namespace);
|
||||
op = op.namespace(namespace_path.unwrap_or_default());
|
||||
if let Some(start_after) = start_after {
|
||||
op = op.start_after(start_after);
|
||||
}
|
||||
@@ -107,34 +136,43 @@ impl Connection {
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[pyo3(signature = (name, mode, data, namespace=vec![], storage_options=None, storage_options_provider=None, location=None))]
|
||||
#[pyo3(signature = (name, mode, data, namespace_path=None, storage_options=None, location=None, namespace_client=None))]
|
||||
pub fn create_table<'a>(
|
||||
self_: PyRef<'a, Self>,
|
||||
name: String,
|
||||
mode: &str,
|
||||
data: Bound<'_, PyAny>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
storage_options_provider: Option<Py<PyAny>>,
|
||||
location: Option<String>,
|
||||
namespace_client: Option<Py<PyAny>>,
|
||||
) -> PyResult<Bound<'a, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let py = self_.py();
|
||||
|
||||
let mode = Self::parse_create_mode_str(mode)?;
|
||||
|
||||
let batches: Box<dyn arrow::array::RecordBatchReader + Send> =
|
||||
Box::new(ArrowArrayStreamReader::from_pyarrow_bound(&data)?);
|
||||
|
||||
let mut builder = inner.create_table(name, batches).mode(mode);
|
||||
let ns_path = namespace_path.clone().unwrap_or_default();
|
||||
let mut builder = inner.create_table(name.clone(), batches).mode(mode);
|
||||
|
||||
builder = builder.namespace(namespace);
|
||||
builder = builder.namespace(ns_path.clone());
|
||||
if let Some(storage_options) = storage_options {
|
||||
builder = builder.storage_options(storage_options);
|
||||
}
|
||||
if let Some(provider_obj) = storage_options_provider {
|
||||
let provider = py_object_to_storage_options_provider(provider_obj)?;
|
||||
|
||||
// Auto-create storage options provider from namespace_client
|
||||
if let Some(ns_obj) = namespace_client {
|
||||
let ns_client = extract_namespace_arc(py, ns_obj)?;
|
||||
// Create table_id by combining namespace_path with table name
|
||||
let mut table_id = ns_path;
|
||||
table_id.push(name);
|
||||
let provider = create_namespace_storage_options_provider(ns_client, table_id);
|
||||
builder = builder.storage_options_provider(provider);
|
||||
}
|
||||
|
||||
if let Some(location) = location {
|
||||
builder = builder.location(location);
|
||||
}
|
||||
@@ -146,33 +184,44 @@ impl Connection {
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[pyo3(signature = (name, mode, schema, namespace=vec![], storage_options=None, storage_options_provider=None, location=None))]
|
||||
#[pyo3(signature = (name, mode, schema, namespace_path=None, storage_options=None, location=None, namespace_client=None))]
|
||||
pub fn create_empty_table<'a>(
|
||||
self_: PyRef<'a, Self>,
|
||||
name: String,
|
||||
mode: &str,
|
||||
schema: Bound<'_, PyAny>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
storage_options_provider: Option<Py<PyAny>>,
|
||||
location: Option<String>,
|
||||
namespace_client: Option<Py<PyAny>>,
|
||||
) -> PyResult<Bound<'a, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let py = self_.py();
|
||||
|
||||
let mode = Self::parse_create_mode_str(mode)?;
|
||||
|
||||
let schema = Schema::from_pyarrow_bound(&schema)?;
|
||||
|
||||
let mut builder = inner.create_empty_table(name, Arc::new(schema)).mode(mode);
|
||||
let ns_path = namespace_path.clone().unwrap_or_default();
|
||||
let mut builder = inner
|
||||
.create_empty_table(name.clone(), Arc::new(schema))
|
||||
.mode(mode);
|
||||
|
||||
builder = builder.namespace(namespace);
|
||||
builder = builder.namespace(ns_path.clone());
|
||||
if let Some(storage_options) = storage_options {
|
||||
builder = builder.storage_options(storage_options);
|
||||
}
|
||||
if let Some(provider_obj) = storage_options_provider {
|
||||
let provider = py_object_to_storage_options_provider(provider_obj)?;
|
||||
|
||||
// Auto-create storage options provider from namespace_client
|
||||
if let Some(ns_obj) = namespace_client {
|
||||
let ns_client = extract_namespace_arc(py, ns_obj)?;
|
||||
// Create table_id by combining namespace_path with table name
|
||||
let mut table_id = ns_path;
|
||||
table_id.push(name);
|
||||
let provider = create_namespace_storage_options_provider(ns_client, table_id);
|
||||
builder = builder.storage_options_provider(provider);
|
||||
}
|
||||
|
||||
if let Some(location) = location {
|
||||
builder = builder.location(location);
|
||||
}
|
||||
@@ -184,45 +233,44 @@ impl Connection {
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[pyo3(signature = (name, namespace=vec![], storage_options = None, storage_options_provider=None, index_cache_size = None, location=None, namespace_client=None, managed_versioning=None))]
|
||||
#[pyo3(signature = (name, namespace_path=None, storage_options=None, index_cache_size=None, location=None, namespace_client=None, managed_versioning=None))]
|
||||
pub fn open_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
name: String,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
storage_options_provider: Option<Py<PyAny>>,
|
||||
index_cache_size: Option<u32>,
|
||||
location: Option<String>,
|
||||
namespace_client: Option<Py<PyAny>>,
|
||||
managed_versioning: Option<bool>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let py = self_.py();
|
||||
|
||||
let mut builder = inner.open_table(name);
|
||||
builder = builder.namespace(namespace.clone());
|
||||
let ns_path = namespace_path.clone().unwrap_or_default();
|
||||
let mut builder = inner.open_table(name.clone());
|
||||
builder = builder.namespace(ns_path.clone());
|
||||
if let Some(storage_options) = storage_options {
|
||||
builder = builder.storage_options(storage_options);
|
||||
}
|
||||
if let Some(provider_obj) = storage_options_provider {
|
||||
let provider = py_object_to_storage_options_provider(provider_obj)?;
|
||||
|
||||
// Auto-create storage options provider from namespace_client
|
||||
if let Some(ns_obj) = namespace_client {
|
||||
let ns_client = extract_namespace_arc(py, ns_obj)?;
|
||||
// Create table_id by combining namespace_path with table name
|
||||
let mut table_id = ns_path;
|
||||
table_id.push(name);
|
||||
let provider = create_namespace_storage_options_provider(ns_client.clone(), table_id);
|
||||
builder = builder.storage_options_provider(provider);
|
||||
builder = builder.namespace_client(ns_client);
|
||||
}
|
||||
|
||||
if let Some(index_cache_size) = index_cache_size {
|
||||
builder = builder.index_cache_size(index_cache_size);
|
||||
}
|
||||
if let Some(location) = location {
|
||||
builder = builder.location(location);
|
||||
}
|
||||
// Extract namespace client from Python object if provided
|
||||
let ns_client = if let Some(ns_obj) = namespace_client {
|
||||
let py = self_.py();
|
||||
Some(extract_namespace_arc(py, ns_obj)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
if let Some(ns_client) = ns_client {
|
||||
builder = builder.namespace_client(ns_client);
|
||||
}
|
||||
// Pass managed_versioning if provided to avoid redundant describe_table call
|
||||
if let Some(enabled) = managed_versioning {
|
||||
builder = builder.managed_versioning(enabled);
|
||||
@@ -234,12 +282,12 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (target_table_name, source_uri, target_namespace=vec![], source_version=None, source_tag=None, is_shallow=true))]
|
||||
#[pyo3(signature = (target_table_name, source_uri, target_namespace_path=None, source_version=None, source_tag=None, is_shallow=true))]
|
||||
pub fn clone_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
target_table_name: String,
|
||||
source_uri: String,
|
||||
target_namespace: Vec<String>,
|
||||
target_namespace_path: Option<Vec<String>>,
|
||||
source_version: Option<u64>,
|
||||
source_tag: Option<String>,
|
||||
is_shallow: bool,
|
||||
@@ -247,7 +295,7 @@ impl Connection {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
|
||||
let mut builder = inner.clone_table(target_table_name, source_uri);
|
||||
builder = builder.target_namespace(target_namespace);
|
||||
builder = builder.target_namespace(target_namespace_path.unwrap_or_default());
|
||||
if let Some(version) = source_version {
|
||||
builder = builder.source_version(version);
|
||||
}
|
||||
@@ -262,52 +310,56 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (cur_name, new_name, cur_namespace=vec![], new_namespace=vec![]))]
|
||||
#[pyo3(signature = (cur_name, new_name, cur_namespace_path=None, new_namespace_path=None))]
|
||||
pub fn rename_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
cur_name: String,
|
||||
new_name: String,
|
||||
cur_namespace: Vec<String>,
|
||||
new_namespace: Vec<String>,
|
||||
cur_namespace_path: Option<Vec<String>>,
|
||||
new_namespace_path: Option<Vec<String>>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let cur_ns_path = cur_namespace_path.unwrap_or_default();
|
||||
let new_ns_path = new_namespace_path.unwrap_or_default();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner
|
||||
.rename_table(cur_name, new_name, &cur_namespace, &new_namespace)
|
||||
.rename_table(cur_name, new_name, &cur_ns_path, &new_ns_path)
|
||||
.await
|
||||
.infer_error()
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (name, namespace=vec![]))]
|
||||
#[pyo3(signature = (name, namespace_path=None))]
|
||||
pub fn drop_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
name: String,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let ns_path = namespace_path.unwrap_or_default();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.drop_table(name, &namespace).await.infer_error()
|
||||
inner.drop_table(name, &ns_path).await.infer_error()
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (namespace=vec![],))]
|
||||
#[pyo3(signature = (namespace_path=None,))]
|
||||
pub fn drop_all_tables(
|
||||
self_: PyRef<'_, Self>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let ns_path = namespace_path.unwrap_or_default();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner.drop_all_tables(&namespace).await.infer_error()
|
||||
inner.drop_all_tables(&ns_path).await.infer_error()
|
||||
})
|
||||
}
|
||||
|
||||
// Namespace management methods
|
||||
|
||||
#[pyo3(signature = (namespace=vec![], page_token=None, limit=None))]
|
||||
#[pyo3(signature = (namespace_path=None, page_token=None, limit=None))]
|
||||
pub fn list_namespaces(
|
||||
self_: PyRef<'_, Self>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
page_token: Option<String>,
|
||||
limit: Option<u32>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
@@ -316,11 +368,7 @@ impl Connection {
|
||||
future_into_py(py, async move {
|
||||
use lance_namespace::models::ListNamespacesRequest;
|
||||
let request = ListNamespacesRequest {
|
||||
id: if namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(namespace)
|
||||
},
|
||||
id: namespace_path,
|
||||
page_token,
|
||||
limit: limit.map(|l| l as i32),
|
||||
..Default::default()
|
||||
@@ -335,10 +383,10 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (namespace, mode=None, properties=None))]
|
||||
#[pyo3(signature = (namespace_path, mode=None, properties=None))]
|
||||
pub fn create_namespace(
|
||||
self_: PyRef<'_, Self>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Vec<String>,
|
||||
mode: Option<String>,
|
||||
properties: Option<std::collections::HashMap<String, String>>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
@@ -354,11 +402,7 @@ impl Connection {
|
||||
_ => None,
|
||||
});
|
||||
let request = CreateNamespaceRequest {
|
||||
id: if namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(namespace)
|
||||
},
|
||||
id: Some(namespace_path),
|
||||
mode: mode_str,
|
||||
properties,
|
||||
..Default::default()
|
||||
@@ -372,10 +416,10 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (namespace, mode=None, behavior=None))]
|
||||
#[pyo3(signature = (namespace_path, mode=None, behavior=None))]
|
||||
pub fn drop_namespace(
|
||||
self_: PyRef<'_, Self>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Vec<String>,
|
||||
mode: Option<String>,
|
||||
behavior: Option<String>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
@@ -395,11 +439,7 @@ impl Connection {
|
||||
_ => None,
|
||||
});
|
||||
let request = DropNamespaceRequest {
|
||||
id: if namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(namespace)
|
||||
},
|
||||
id: Some(namespace_path),
|
||||
mode: mode_str,
|
||||
behavior: behavior_str,
|
||||
..Default::default()
|
||||
@@ -414,21 +454,17 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (namespace,))]
|
||||
#[pyo3(signature = (namespace_path,))]
|
||||
pub fn describe_namespace(
|
||||
self_: PyRef<'_, Self>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Vec<String>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let py = self_.py();
|
||||
future_into_py(py, async move {
|
||||
use lance_namespace::models::DescribeNamespaceRequest;
|
||||
let request = DescribeNamespaceRequest {
|
||||
id: if namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(namespace)
|
||||
},
|
||||
id: Some(namespace_path),
|
||||
..Default::default()
|
||||
};
|
||||
let response = inner.describe_namespace(request).await.infer_error()?;
|
||||
@@ -440,10 +476,10 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (namespace=vec![], page_token=None, limit=None))]
|
||||
#[pyo3(signature = (namespace_path=None, page_token=None, limit=None))]
|
||||
pub fn list_tables(
|
||||
self_: PyRef<'_, Self>,
|
||||
namespace: Vec<String>,
|
||||
namespace_path: Option<Vec<String>>,
|
||||
page_token: Option<String>,
|
||||
limit: Option<u32>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
@@ -452,11 +488,7 @@ impl Connection {
|
||||
future_into_py(py, async move {
|
||||
use lance_namespace::models::ListTablesRequest;
|
||||
let request = ListTablesRequest {
|
||||
id: if namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(namespace)
|
||||
},
|
||||
id: namespace_path,
|
||||
page_token,
|
||||
limit: limit.map(|l| l as i32),
|
||||
..Default::default()
|
||||
@@ -470,10 +502,29 @@ impl Connection {
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the configuration for constructing an equivalent namespace client.
|
||||
/// Returns a dict with:
|
||||
/// - "impl": "dir" for DirectoryNamespace, "rest" for RestNamespace
|
||||
/// - "properties": configuration properties for the namespace
|
||||
#[pyo3(signature = ())]
|
||||
pub fn namespace_client_config(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
let py = self_.py();
|
||||
future_into_py(py, async move {
|
||||
let (impl_type, properties) = inner.namespace_client_config().await.infer_error()?;
|
||||
Python::attach(|py| -> PyResult<Py<PyDict>> {
|
||||
let dict = PyDict::new(py);
|
||||
dict.set_item("impl", impl_type)?;
|
||||
dict.set_item("properties", properties)?;
|
||||
Ok(dict.unbind())
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None))]
|
||||
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None, manifest_enabled=false, namespace_client_properties=None))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn connect(
|
||||
py: Python<'_>,
|
||||
@@ -485,6 +536,8 @@ pub fn connect(
|
||||
client_config: Option<PyClientConfig>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
session: Option<crate::session::Session>,
|
||||
manifest_enabled: bool,
|
||||
namespace_client_properties: Option<HashMap<String, String>>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
future_into_py(py, async move {
|
||||
let mut builder = lancedb::connect(&uri);
|
||||
@@ -504,6 +557,12 @@ pub fn connect(
|
||||
if let Some(storage_options) = storage_options {
|
||||
builder = builder.storage_options(storage_options);
|
||||
}
|
||||
if manifest_enabled {
|
||||
builder = builder.manifest_enabled(true);
|
||||
}
|
||||
if let Some(namespace_client_properties) = namespace_client_properties {
|
||||
builder = builder.namespace_client_properties(namespace_client_properties);
|
||||
}
|
||||
#[cfg(feature = "remote")]
|
||||
if let Some(client_config) = client_config {
|
||||
builder = builder.client_config(client_config.into());
|
||||
@@ -515,6 +574,52 @@ pub fn connect(
|
||||
})
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (
|
||||
namespace_client,
|
||||
read_consistency_interval=None,
|
||||
storage_options=None,
|
||||
session=None,
|
||||
namespace_client_pushdown_operations=None,
|
||||
namespace_client_impl=None,
|
||||
namespace_client_properties=None,
|
||||
))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn connect_namespace_client(
|
||||
py: Python<'_>,
|
||||
namespace_client: Py<PyAny>,
|
||||
read_consistency_interval: Option<f64>,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
session: Option<crate::session::Session>,
|
||||
namespace_client_pushdown_operations: Option<Vec<String>>,
|
||||
namespace_client_impl: Option<String>,
|
||||
namespace_client_properties: Option<HashMap<String, String>>,
|
||||
) -> PyResult<Connection> {
|
||||
let namespace_client = extract_namespace_arc(py, namespace_client)?;
|
||||
let read_consistency_interval = read_consistency_interval.map(Duration::from_secs_f64);
|
||||
let namespace_client_pushdown_operations =
|
||||
parse_namespace_client_pushdown_operations(namespace_client_pushdown_operations)?;
|
||||
let ns_impl = namespace_client_impl.unwrap_or_else(|| "python".to_string());
|
||||
let ns_properties = namespace_client_properties.unwrap_or_default();
|
||||
let storage_options = storage_options.unwrap_or_default();
|
||||
let session = session.map(|s| s.inner.clone());
|
||||
|
||||
let database = LanceNamespaceDatabase::from_namespace_client(
|
||||
namespace_client,
|
||||
ns_impl,
|
||||
ns_properties,
|
||||
storage_options,
|
||||
read_consistency_interval,
|
||||
session,
|
||||
namespace_client_pushdown_operations,
|
||||
);
|
||||
|
||||
Ok(Connection::new(LanceConnection::new(
|
||||
Arc::new(database),
|
||||
Arc::new(lancedb::embeddings::MemoryRegistry::new()),
|
||||
)))
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
pub struct PyClientConfig {
|
||||
user_agent: String,
|
||||
@@ -524,6 +629,7 @@ pub struct PyClientConfig {
|
||||
id_delimiter: Option<String>,
|
||||
tls_config: Option<PyClientTlsConfig>,
|
||||
header_provider: Option<Py<PyAny>>,
|
||||
user_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
@@ -608,6 +714,7 @@ impl From<PyClientConfig> for lancedb::remote::ClientConfig {
|
||||
id_delimiter: value.id_delimiter,
|
||||
tls_config: value.tls_config.map(Into::into),
|
||||
header_provider,
|
||||
user_id: value.user_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use pyo3::{Bound, PyAny, PyResult, exceptions::PyValueError, prelude::*, pyfunct
|
||||
/// [`expr_lit`] and combined with the methods on this struct. On the Python
|
||||
/// side a thin wrapper class (`lancedb.expr.Expr`) delegates to these methods
|
||||
/// and adds Python operator overloads.
|
||||
#[pyclass(name = "PyExpr")]
|
||||
#[pyclass(name = "PyExpr", from_py_object)]
|
||||
#[derive(Clone)]
|
||||
pub struct PyExpr(pub DfExpr);
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ impl PyHeaderProvider {
|
||||
Ok(headers_py) => {
|
||||
// Convert Python dict to Rust HashMap
|
||||
let bound_headers = headers_py.bind(py);
|
||||
let dict: &Bound<PyDict> = bound_headers.downcast().map_err(|e| {
|
||||
let dict: &Bound<PyDict> = bound_headers.cast().map_err(|e| {
|
||||
format!("HeaderProvider.get_headers must return a dict: {}", e)
|
||||
})?;
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use lancedb::index::vector::{IvfFlatIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder};
|
||||
use lancedb::index::vector::{
|
||||
IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder,
|
||||
IvfPqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder,
|
||||
};
|
||||
use lancedb::index::{
|
||||
Index as LanceDbIndex,
|
||||
scalar::{BTreeIndexBuilder, FtsIndexBuilder},
|
||||
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
|
||||
};
|
||||
use pyo3::IntoPyObject;
|
||||
use pyo3::types::PyStringMethods;
|
||||
@@ -13,7 +15,7 @@ use pyo3::{
|
||||
Bound, FromPyObject, PyAny, PyResult, Python,
|
||||
exceptions::{PyKeyError, PyValueError},
|
||||
intern, pyclass, pymethods,
|
||||
types::PyAnyMethods,
|
||||
types::{PyAnyMethods, PyString},
|
||||
};
|
||||
|
||||
use crate::util::parse_distance_type;
|
||||
@@ -22,7 +24,7 @@ pub fn class_name(ob: &'_ Bound<'_, PyAny>) -> PyResult<String> {
|
||||
let full_name = ob
|
||||
.getattr(intern!(ob.py(), "__class__"))?
|
||||
.getattr(intern!(ob.py(), "__name__"))?;
|
||||
let full_name = full_name.downcast()?.to_string_lossy();
|
||||
let full_name = full_name.cast::<PyString>()?.to_string_lossy();
|
||||
|
||||
match full_name.rsplit_once('.') {
|
||||
Some((_, name)) => Ok(name.to_string()),
|
||||
@@ -162,8 +164,26 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
}
|
||||
Ok(LanceDbIndex::IvfHnswSq(hnsw_sq_builder))
|
||||
}
|
||||
"HnswFlat" => {
|
||||
let params = source.extract::<IvfHnswFlatParams>()?;
|
||||
let distance_type = parse_distance_type(params.distance_type)?;
|
||||
let mut hnsw_flat_builder = IvfHnswFlatIndexBuilder::default()
|
||||
.distance_type(distance_type)
|
||||
.max_iterations(params.max_iterations)
|
||||
.sample_rate(params.sample_rate)
|
||||
.num_edges(params.m)
|
||||
.ef_construction(params.ef_construction);
|
||||
if let Some(num_partitions) = params.num_partitions {
|
||||
hnsw_flat_builder = hnsw_flat_builder.num_partitions(num_partitions);
|
||||
}
|
||||
if let Some(target_partition_size) = params.target_partition_size {
|
||||
hnsw_flat_builder =
|
||||
hnsw_flat_builder.target_partition_size(target_partition_size);
|
||||
}
|
||||
Ok(LanceDbIndex::IvfHnswFlat(hnsw_flat_builder))
|
||||
}
|
||||
not_supported => Err(PyValueError::new_err(format!(
|
||||
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, or IvfHnswSq",
|
||||
"Invalid index type '{}'. Must be one of BTree, Bitmap, LabelList, FTS, IvfPq, IvfSq, IvfHnswPq, IvfHnswSq, or IvfHnswFlat",
|
||||
not_supported
|
||||
))),
|
||||
}
|
||||
@@ -250,6 +270,17 @@ struct IvfHnswSqParams {
|
||||
target_partition_size: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
struct IvfHnswFlatParams {
|
||||
distance_type: String,
|
||||
num_partitions: Option<u32>,
|
||||
max_iterations: u32,
|
||||
sample_rate: u32,
|
||||
m: u32,
|
||||
ef_construction: u32,
|
||||
target_partition_size: Option<u32>,
|
||||
}
|
||||
|
||||
#[pyclass(get_all)]
|
||||
/// A description of an index currently configured on a column
|
||||
pub struct IndexConfig {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use arrow::RecordBatchStream;
|
||||
use connection::{Connection, connect};
|
||||
use connection::{Connection, connect, connect_namespace_client};
|
||||
use env_logger::Env;
|
||||
use expr::{PyExpr, expr_col, expr_func, expr_lit};
|
||||
use index::IndexConfig;
|
||||
@@ -28,8 +28,8 @@ pub mod index;
|
||||
pub mod namespace;
|
||||
pub mod permutation;
|
||||
pub mod query;
|
||||
pub mod runtime;
|
||||
pub mod session;
|
||||
pub mod storage_options;
|
||||
pub mod table;
|
||||
pub mod util;
|
||||
|
||||
@@ -59,6 +59,7 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
m.add_class::<PyPermutationReader>()?;
|
||||
m.add_class::<PyExpr>()?;
|
||||
m.add_function(wrap_pyfunction!(connect, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(connect_namespace_client, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(query::fts_query_to_json, m)?)?;
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use lance_io::object_store::{LanceNamespaceStorageOptionsProvider, StorageOptionsProvider};
|
||||
use lance_namespace::LanceNamespace as LanceNamespaceTrait;
|
||||
use lance_namespace::models::*;
|
||||
use pyo3::prelude::*;
|
||||
@@ -182,7 +183,7 @@ async fn call_py_method_primitive<Req, Resp>(
|
||||
) -> lance_core::Result<Resp>
|
||||
where
|
||||
Req: serde::Serialize + Send + 'static,
|
||||
Resp: for<'py> pyo3::FromPyObject<'py> + Send + 'static,
|
||||
Resp: for<'a, 'py> pyo3::FromPyObject<'a, 'py> + Send + 'static,
|
||||
{
|
||||
let request_json = serde_json::to_string(&request).map_err(|e| {
|
||||
lance_core::Error::io(format!(
|
||||
@@ -202,7 +203,7 @@ where
|
||||
|
||||
// Call the Python method
|
||||
let result = py_namespace.call_method1(py, method_name, (request_arg,))?;
|
||||
let value: Resp = result.extract(py)?;
|
||||
let value: Resp = result.extract(py).map_err(Into::into)?;
|
||||
Ok::<_, PyErr>(value)
|
||||
})
|
||||
})
|
||||
@@ -694,3 +695,21 @@ pub fn extract_namespace_arc(
|
||||
let ns_ref = ns.bind(py);
|
||||
PyLanceNamespace::create_arc(py, ns_ref)
|
||||
}
|
||||
|
||||
/// Create a LanceNamespaceStorageOptionsProvider from a namespace client and table ID.
|
||||
///
|
||||
/// This creates a Rust storage options provider that fetches credentials from the
|
||||
/// namespace's describe_table() method, enabling automatic credential refresh.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `namespace_client` - The namespace client (wrapped PyLanceNamespace)
|
||||
/// * `table_id` - Full table identifier (namespace_path + table_name)
|
||||
pub fn create_namespace_storage_options_provider(
|
||||
namespace_client: Arc<dyn LanceNamespaceTrait>,
|
||||
table_id: Vec<String>,
|
||||
) -> Arc<dyn StorageOptionsProvider> {
|
||||
Arc::new(LanceNamespaceStorageOptionsProvider::new(
|
||||
namespace_client,
|
||||
table_id,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use crate::{
|
||||
arrow::RecordBatchStream, connection::Connection, error::PythonErrorExt, table::Table,
|
||||
arrow::RecordBatchStream, error::PythonErrorExt, runtime::future_into_py, table::Table,
|
||||
};
|
||||
use arrow::pyarrow::{PyArrowType, ToPyArrow};
|
||||
use lancedb::{
|
||||
@@ -21,16 +21,15 @@ use pyo3::{
|
||||
pyclass, pymethods,
|
||||
types::{PyAnyMethods, PyDict, PyDictMethods, PyType},
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
fn table_from_py<'a>(table: Bound<'a, PyAny>) -> PyResult<Bound<'a, Table>> {
|
||||
if table.hasattr("_inner")? {
|
||||
Ok(table.getattr("_inner")?.downcast_into::<Table>()?)
|
||||
Ok(table.getattr("_inner")?.cast_into::<Table>()?)
|
||||
} else if table.hasattr("_table")? {
|
||||
Ok(table
|
||||
.getattr("_table")?
|
||||
.getattr("_inner")?
|
||||
.downcast_into::<Table>()?)
|
||||
.cast_into::<Table>()?)
|
||||
} else {
|
||||
Err(PyRuntimeError::new_err(
|
||||
"Provided table does not appear to be a Table or RemoteTable instance",
|
||||
@@ -80,24 +79,6 @@ impl PyAsyncPermutationBuilder {
|
||||
|
||||
#[pymethods]
|
||||
impl PyAsyncPermutationBuilder {
|
||||
#[pyo3(signature = (database, table_name))]
|
||||
pub fn persist(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
database: Bound<'_, PyAny>,
|
||||
table_name: String,
|
||||
) -> PyResult<Self> {
|
||||
let conn = if database.hasattr("_conn")? {
|
||||
database
|
||||
.getattr("_conn")?
|
||||
.getattr("_inner")?
|
||||
.downcast_into::<Connection>()?
|
||||
} else {
|
||||
database.getattr("_inner")?.downcast_into::<Connection>()?
|
||||
};
|
||||
let database = conn.borrow().database()?;
|
||||
slf.modify(|builder| builder.persist(database, table_name))
|
||||
}
|
||||
|
||||
#[pyo3(signature = (*, ratios=None, counts=None, fixed=None, seed=None, split_names=None))]
|
||||
pub fn split_random(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
@@ -243,7 +224,7 @@ impl PyPermutationReader {
|
||||
let Some(selection) = selection else {
|
||||
return Ok(Select::All);
|
||||
};
|
||||
let selection = selection.downcast_into::<PyDict>()?;
|
||||
let selection = selection.cast_into::<PyDict>()?;
|
||||
let selection = selection
|
||||
.iter()
|
||||
.map(|(key, value)| {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user