mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-04 21:50:40 +00:00
Compare commits
86 Commits
python-v0.
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
87b831bcae | ||
|
|
59db036118 | ||
|
|
c091243d5b | ||
|
|
a2aea7b4e5 | ||
|
|
4a5341edb1 | ||
|
|
25dfe2cfd4 | ||
|
|
4dcd7f4314 | ||
|
|
2e36cd9dad | ||
|
|
f31e27768a | ||
|
|
b84150a53e | ||
|
|
d135c18db6 | ||
|
|
ef399de092 | ||
|
|
0d767abd0e | ||
|
|
a92ae0ded5 | ||
|
|
c54888a83a | ||
|
|
ba6c44abc9 | ||
|
|
75b0a8e0a3 | ||
|
|
2a886141f7 | ||
|
|
2a1df8edcf | ||
|
|
fd98b845ea | ||
|
|
be48ada352 | ||
|
|
9ad2dfe601 | ||
|
|
f909df3e87 | ||
|
|
d715bbb588 | ||
|
|
5ce3d8d141 | ||
|
|
5eaac178b1 | ||
|
|
11af763fcd | ||
|
|
2ed5452e1c | ||
|
|
b7c0b5987c | ||
|
|
97a4b38f19 | ||
|
|
10879d99b8 | ||
|
|
4e6a1d5dce | ||
|
|
13d2759356 | ||
|
|
7f52ec8c36 | ||
|
|
c6ae0de3ee | ||
|
|
231f0655ce | ||
|
|
8c52977c59 | ||
|
|
359710a0bf | ||
|
|
1f1726369d | ||
|
|
df354abae4 | ||
|
|
11bc674548 | ||
|
|
5593460823 | ||
|
|
2807ad6854 | ||
|
|
4761fa9bcb | ||
|
|
4c2939d66e | ||
|
|
a813ce2f71 | ||
|
|
a898dc81c2 | ||
|
|
de3f8097e7 | ||
|
|
0ac59de5f1 | ||
|
|
d082c2d2ac | ||
|
|
9d8699f99e | ||
|
|
aa2c7b3591 | ||
|
|
590c0c1e77 | ||
|
|
382ecd65e3 | ||
|
|
e26b22bcca | ||
|
|
3ba46135a5 | ||
|
|
f903d07887 | ||
|
|
5d550124bd | ||
|
|
c57cb310a2 | ||
|
|
97754f5123 | ||
|
|
7b1c063848 | ||
|
|
c7f189f27b | ||
|
|
a0a2942ad5 | ||
|
|
e3d53dd185 | ||
|
|
66804e99fc | ||
|
|
9f85d4c639 | ||
|
|
1ba19d728e | ||
|
|
4c44587af0 | ||
|
|
1d1cafb59c | ||
|
|
4714598155 | ||
|
|
74f457a0f2 | ||
|
|
cca6a7c989 | ||
|
|
ad96489114 | ||
|
|
76429730c0 | ||
|
|
874b74dd3c | ||
|
|
61de47f3a5 | ||
|
|
f4d613565e | ||
|
|
410ab9b6fe | ||
|
|
1d6e00b902 | ||
|
|
a0228036ae | ||
|
|
d8fc071a7d | ||
|
|
e6fd8d071e | ||
|
|
670dcca551 | ||
|
|
ed7e01a58b | ||
|
|
3450ccaf7f | ||
|
|
9b229f1e7c |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.27.0"
|
current_version = "0.28.0-beta.11"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
2
.github/ISSUE_TEMPLATE/documentation.yml
vendored
2
.github/ISSUE_TEMPLATE/documentation.yml
vendored
@@ -18,6 +18,6 @@ body:
|
|||||||
label: Link
|
label: Link
|
||||||
description: >
|
description: >
|
||||||
Provide a link to the existing documentation, if applicable.
|
Provide a link to the existing documentation, if applicable.
|
||||||
placeholder: ex. https://lancedb.com/docs/tables/...
|
placeholder: ex. https://docs.lancedb.com/tables/...
|
||||||
validations:
|
validations:
|
||||||
required: false
|
required: false
|
||||||
|
|||||||
18
.github/dependabot.yml
vendored
Normal file
18
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
version: 2
|
||||||
|
|
||||||
|
# Scope: the root Cargo workspace, which produces the Rust binaries we
|
||||||
|
# ship to users (the Node.js and Python native extensions). The
|
||||||
|
# `rust/lancedb` library crate shares the same lockfile; its consumers
|
||||||
|
# pick their own dependency versions, but bumping transitive deps here
|
||||||
|
# keeps the binaries we ship current.
|
||||||
|
updates:
|
||||||
|
- package-ecosystem: cargo
|
||||||
|
directory: /
|
||||||
|
schedule:
|
||||||
|
interval: weekly
|
||||||
|
open-pull-requests-limit: 10
|
||||||
|
groups:
|
||||||
|
rust-minor-patch:
|
||||||
|
update-types:
|
||||||
|
- minor
|
||||||
|
- patch
|
||||||
@@ -23,8 +23,10 @@ runs:
|
|||||||
steps:
|
steps:
|
||||||
- name: CONFIRM ARM BUILD
|
- name: CONFIRM ARM BUILD
|
||||||
shell: bash
|
shell: bash
|
||||||
|
env:
|
||||||
|
ARM_BUILD: ${{ inputs.arm-build }}
|
||||||
run: |
|
run: |
|
||||||
echo "ARM BUILD: ${{ inputs.arm-build }}"
|
echo "ARM BUILD: $ARM_BUILD"
|
||||||
- name: Build x86_64 Manylinux wheel
|
- name: Build x86_64 Manylinux wheel
|
||||||
if: ${{ inputs.arm-build == 'false' }}
|
if: ${{ inputs.arm-build == 'false' }}
|
||||||
uses: PyO3/maturin-action@v1
|
uses: PyO3/maturin-action@v1
|
||||||
|
|||||||
3
.github/workflows/dev.yml
vendored
3
.github/workflows/dev.yml
vendored
@@ -8,6 +8,9 @@ concurrency:
|
|||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
labeler:
|
labeler:
|
||||||
permissions:
|
permissions:
|
||||||
|
|||||||
3
.github/workflows/java-publish.yml
vendored
3
.github/workflows/java-publish.yml
vendored
@@ -19,6 +19,9 @@ on:
|
|||||||
paths:
|
paths:
|
||||||
- .github/workflows/java-publish.yml
|
- .github/workflows/java-publish.yml
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
publish:
|
publish:
|
||||||
name: Build and Publish
|
name: Build and Publish
|
||||||
|
|||||||
3
.github/workflows/java.yml
vendored
3
.github/workflows/java.yml
vendored
@@ -24,6 +24,9 @@ on:
|
|||||||
- java/**
|
- java/**
|
||||||
- .github/workflows/java.yml
|
- .github/workflows/java.yml
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-java:
|
build-java:
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
|
|||||||
4
.github/workflows/license-header-check.yml
vendored
4
.github/workflows/license-header-check.yml
vendored
@@ -10,6 +10,10 @@ on:
|
|||||||
- nodejs/**
|
- nodejs/**
|
||||||
- java/**
|
- java/**
|
||||||
- .github/workflows/license-header-check.yml
|
- .github/workflows/license-header-check.yml
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
check-licenses:
|
check-licenses:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|||||||
4
.github/workflows/nodejs.yml
vendored
4
.github/workflows/nodejs.yml
vendored
@@ -8,12 +8,16 @@ on:
|
|||||||
paths:
|
paths:
|
||||||
- Cargo.toml
|
- Cargo.toml
|
||||||
- Cargo.lock
|
- Cargo.lock
|
||||||
|
- rust-toolchain.toml
|
||||||
- nodejs/**
|
- nodejs/**
|
||||||
- rust/**
|
- rust/**
|
||||||
- docs/src/js/**
|
- docs/src/js/**
|
||||||
- .github/workflows/nodejs.yml
|
- .github/workflows/nodejs.yml
|
||||||
- docker-compose.yml
|
- docker-compose.yml
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|||||||
15
.github/workflows/pypi-publish.yml
vendored
15
.github/workflows/pypi-publish.yml
vendored
@@ -14,10 +14,16 @@ on:
|
|||||||
env:
|
env:
|
||||||
PIP_EXTRA_INDEX_URL: "https://pypi.fury.io/lance-format/ https://pypi.fury.io/lancedb/"
|
PIP_EXTRA_INDEX_URL: "https://pypi.fury.io/lance-format/ https://pypi.fury.io/lancedb/"
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
linux:
|
linux:
|
||||||
name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
|
name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
|
||||||
timeout-minutes: 60
|
timeout-minutes: 60
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
config:
|
config:
|
||||||
@@ -57,10 +63,12 @@ jobs:
|
|||||||
- uses: ./.github/workflows/upload_wheel
|
- uses: ./.github/workflows/upload_wheel
|
||||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||||
with:
|
with:
|
||||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
|
||||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||||
mac:
|
mac:
|
||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
runs-on: ${{ matrix.config.runner }}
|
runs-on: ${{ matrix.config.runner }}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
@@ -85,10 +93,12 @@ jobs:
|
|||||||
- uses: ./.github/workflows/upload_wheel
|
- uses: ./.github/workflows/upload_wheel
|
||||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||||
with:
|
with:
|
||||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
|
||||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||||
windows:
|
windows:
|
||||||
timeout-minutes: 60
|
timeout-minutes: 60
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
runs-on: windows-latest
|
runs-on: windows-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
@@ -107,7 +117,6 @@ jobs:
|
|||||||
- uses: ./.github/workflows/upload_wheel
|
- uses: ./.github/workflows/upload_wheel
|
||||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||||
with:
|
with:
|
||||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
|
||||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||||
gh-release:
|
gh-release:
|
||||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||||
|
|||||||
6
.github/workflows/python.yml
vendored
6
.github/workflows/python.yml
vendored
@@ -8,6 +8,7 @@ on:
|
|||||||
paths:
|
paths:
|
||||||
- Cargo.toml
|
- Cargo.toml
|
||||||
- Cargo.lock
|
- Cargo.lock
|
||||||
|
- rust-toolchain.toml
|
||||||
- python/**
|
- python/**
|
||||||
- rust/**
|
- rust/**
|
||||||
- .github/workflows/python.yml
|
- .github/workflows/python.yml
|
||||||
@@ -16,6 +17,9 @@ on:
|
|||||||
- .github/workflows/build_windows_wheel/**
|
- .github/workflows/build_windows_wheel/**
|
||||||
- .github/workflows/run_tests/**
|
- .github/workflows/run_tests/**
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
@@ -107,7 +111,6 @@ jobs:
|
|||||||
- name: Install
|
- name: Install
|
||||||
run: |
|
run: |
|
||||||
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests,dev,embeddings]
|
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests,dev,embeddings]
|
||||||
pip install tantivy
|
|
||||||
pip install mlx
|
pip install mlx
|
||||||
- name: Doctest
|
- name: Doctest
|
||||||
run: pytest --doctest-modules python/lancedb
|
run: pytest --doctest-modules python/lancedb
|
||||||
@@ -226,6 +229,5 @@ jobs:
|
|||||||
pip install "pydantic<2"
|
pip install "pydantic<2"
|
||||||
pip install pyarrow==16
|
pip install pyarrow==16
|
||||||
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
|
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
|
||||||
pip install tantivy
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/tests
|
run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/tests
|
||||||
|
|||||||
34
.github/workflows/rust.yml
vendored
34
.github/workflows/rust.yml
vendored
@@ -8,9 +8,16 @@ on:
|
|||||||
paths:
|
paths:
|
||||||
- Cargo.toml
|
- Cargo.toml
|
||||||
- Cargo.lock
|
- Cargo.lock
|
||||||
|
- rust-toolchain.toml
|
||||||
|
- deny.toml
|
||||||
- rust/**
|
- rust/**
|
||||||
|
- nodejs/Cargo.toml
|
||||||
|
- python/Cargo.toml
|
||||||
- .github/workflows/rust.yml
|
- .github/workflows/rust.yml
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
@@ -52,6 +59,17 @@ jobs:
|
|||||||
- name: Run clippy (without remote feature)
|
- name: Run clippy (without remote feature)
|
||||||
run: cargo clippy --profile ci --workspace --tests -- -D warnings
|
run: cargo clippy --profile ci --workspace --tests -- -D warnings
|
||||||
|
|
||||||
|
deny:
|
||||||
|
# Supply-chain checks: advisories, licenses, banned crates, and source
|
||||||
|
# restrictions. Configuration lives in `deny.toml` at the workspace root.
|
||||||
|
timeout-minutes: 10
|
||||||
|
runs-on: ubuntu-24.04
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: EmbarkStudios/cargo-deny-action@v2
|
||||||
|
with:
|
||||||
|
command: check advisories bans licenses sources
|
||||||
|
|
||||||
build-no-lock:
|
build-no-lock:
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
@@ -207,14 +225,14 @@ jobs:
|
|||||||
- name: Downgrade dependencies
|
- name: Downgrade dependencies
|
||||||
# These packages have newer requirements for MSRV
|
# These packages have newer requirements for MSRV
|
||||||
run: |
|
run: |
|
||||||
cargo update -p aws-sdk-bedrockruntime --precise 1.64.0
|
cargo update -p aws-sdk-bedrockruntime --precise 1.77.0
|
||||||
cargo update -p aws-sdk-dynamodb --precise 1.55.0
|
cargo update -p aws-sdk-dynamodb --precise 1.68.0
|
||||||
cargo update -p aws-config --precise 1.5.10
|
cargo update -p aws-config --precise 1.6.0
|
||||||
cargo update -p aws-sdk-kms --precise 1.51.0
|
cargo update -p aws-sdk-kms --precise 1.63.0
|
||||||
cargo update -p aws-sdk-s3 --precise 1.65.0
|
cargo update -p aws-sdk-s3 --precise 1.79.0
|
||||||
cargo update -p aws-sdk-sso --precise 1.50.0
|
cargo update -p aws-sdk-sso --precise 1.62.0
|
||||||
cargo update -p aws-sdk-ssooidc --precise 1.51.0
|
cargo update -p aws-sdk-ssooidc --precise 1.63.0
|
||||||
cargo update -p aws-sdk-sts --precise 1.51.0
|
cargo update -p aws-sdk-sts --precise 1.63.0
|
||||||
cargo update -p home --precise 0.5.9
|
cargo update -p home --precise 0.5.9
|
||||||
- name: cargo +${{ matrix.msrv }} check
|
- name: cargo +${{ matrix.msrv }} check
|
||||||
env:
|
env:
|
||||||
|
|||||||
@@ -3,6 +3,9 @@ name: Update package-lock.json
|
|||||||
on:
|
on:
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
publish:
|
publish:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|||||||
@@ -3,6 +3,9 @@ name: Update NodeJs package-lock.json
|
|||||||
on:
|
on:
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
publish:
|
publish:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|||||||
25
.github/workflows/upload_wheel/action.yml
vendored
25
.github/workflows/upload_wheel/action.yml
vendored
@@ -2,9 +2,6 @@ name: upload-wheel
|
|||||||
|
|
||||||
description: "Upload wheels to Pypi"
|
description: "Upload wheels to Pypi"
|
||||||
inputs:
|
inputs:
|
||||||
pypi_token:
|
|
||||||
required: true
|
|
||||||
description: "release token for the repo"
|
|
||||||
fury_token:
|
fury_token:
|
||||||
required: true
|
required: true
|
||||||
description: "release token for the fury repo"
|
description: "release token for the fury repo"
|
||||||
@@ -12,12 +9,6 @@ inputs:
|
|||||||
runs:
|
runs:
|
||||||
using: "composite"
|
using: "composite"
|
||||||
steps:
|
steps:
|
||||||
- name: Install dependencies
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install twine
|
|
||||||
python3 -m pip install --upgrade pkginfo
|
|
||||||
- name: Choose repo
|
- name: Choose repo
|
||||||
shell: bash
|
shell: bash
|
||||||
id: choose_repo
|
id: choose_repo
|
||||||
@@ -27,19 +18,17 @@ runs:
|
|||||||
else
|
else
|
||||||
echo "repo=pypi" >> $GITHUB_OUTPUT
|
echo "repo=pypi" >> $GITHUB_OUTPUT
|
||||||
fi
|
fi
|
||||||
- name: Publish to PyPI
|
- name: Publish to Fury
|
||||||
|
if: steps.choose_repo.outputs.repo == 'fury'
|
||||||
shell: bash
|
shell: bash
|
||||||
env:
|
env:
|
||||||
FURY_TOKEN: ${{ inputs.fury_token }}
|
FURY_TOKEN: ${{ inputs.fury_token }}
|
||||||
PYPI_TOKEN: ${{ inputs.pypi_token }}
|
|
||||||
run: |
|
run: |
|
||||||
if [[ ${{ steps.choose_repo.outputs.repo }} == fury ]]; then
|
|
||||||
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
|
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
|
||||||
echo "Uploading $WHEEL to Fury"
|
echo "Uploading $WHEEL to Fury"
|
||||||
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
|
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
|
||||||
else
|
- name: Publish to PyPI
|
||||||
twine upload --repository ${{ steps.choose_repo.outputs.repo }} \
|
if: steps.choose_repo.outputs.repo == 'pypi'
|
||||||
--username __token__ \
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
--password $PYPI_TOKEN \
|
with:
|
||||||
target/wheels/lancedb-*.whl
|
packages-dir: target/wheels/
|
||||||
fi
|
|
||||||
|
|||||||
2597
Cargo.lock
generated
2597
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
64
Cargo.toml
64
Cargo.toml
@@ -1,7 +1,5 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
members = ["rust/lancedb", "nodejs", "python"]
|
members = ["rust/lancedb", "nodejs", "python"]
|
||||||
# Python package needs to be built by maturin.
|
|
||||||
exclude = ["python"]
|
|
||||||
resolver = "2"
|
resolver = "2"
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
@@ -15,40 +13,40 @@ categories = ["database-implementations"]
|
|||||||
rust-version = "1.91.0"
|
rust-version = "1.91.0"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { version = "=3.0.1", default-features = false }
|
lance = { "version" = "=6.0.0-beta.7", default-features = false, "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-core = { version = "=3.0.1" }
|
lance-core = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-datagen = { version = "=3.0.1" }
|
lance-datagen = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-file = { version = "=3.0.1" }
|
lance-file = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-io = { version = "=3.0.1", default-features = false }
|
lance-io = { "version" = "=6.0.0-beta.7", default-features = false, "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-index = { version = "=3.0.1" }
|
lance-index = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-linalg = { version = "=3.0.1" }
|
lance-linalg = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-namespace = { version = "=3.0.1" }
|
lance-namespace = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-namespace-impls = { version = "=3.0.1", default-features = false }
|
lance-namespace-impls = { "version" = "=6.0.0-beta.7", default-features = false, "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-table = { version = "=3.0.1" }
|
lance-table = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-testing = { version = "=3.0.1" }
|
lance-testing = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-datafusion = { version = "=3.0.1" }
|
lance-datafusion = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-encoding = { version = "=3.0.1" }
|
lance-encoding = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
lance-arrow = { version = "=3.0.1" }
|
lance-arrow = { "version" = "=6.0.0-beta.7", "tag" = "v6.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||||
ahash = "0.8"
|
ahash = "0.8"
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "57.2", optional = false }
|
arrow = { version = "58.0.0", optional = false }
|
||||||
arrow-array = "57.2"
|
arrow-array = "58.0.0"
|
||||||
arrow-data = "57.2"
|
arrow-data = "58.0.0"
|
||||||
arrow-ipc = "57.2"
|
arrow-ipc = "58.0.0"
|
||||||
arrow-ord = "57.2"
|
arrow-ord = "58.0.0"
|
||||||
arrow-schema = "57.2"
|
arrow-schema = "58.0.0"
|
||||||
arrow-select = "57.2"
|
arrow-select = "58.0.0"
|
||||||
arrow-cast = "57.2"
|
arrow-cast = "58.0.0"
|
||||||
async-trait = "0"
|
async-trait = "0"
|
||||||
datafusion = { version = "52.1", default-features = false }
|
datafusion = { version = "53.0.0", default-features = false }
|
||||||
datafusion-catalog = "52.1"
|
datafusion-catalog = "53.0.0"
|
||||||
datafusion-common = { version = "52.1", default-features = false }
|
datafusion-common = { version = "53.0.0", default-features = false }
|
||||||
datafusion-execution = "52.1"
|
datafusion-execution = "53.0.0"
|
||||||
datafusion-expr = "52.1"
|
datafusion-expr = "53.0.0"
|
||||||
datafusion-functions = "52.1"
|
datafusion-functions = "53.0.0"
|
||||||
datafusion-physical-plan = "52.1"
|
datafusion-physical-plan = "53.0.0"
|
||||||
datafusion-physical-expr = "52.1"
|
datafusion-physical-expr = "53.0.0"
|
||||||
datafusion-sql = "52.1"
|
datafusion-sql = "53.0.0"
|
||||||
env_logger = "0.11"
|
env_logger = "0.11"
|
||||||
half = { "version" = "2.7.1", default-features = false, features = [
|
half = { "version" = "2.7.1", default-features = false, features = [
|
||||||
"num-traits",
|
"num-traits",
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
# **The Multimodal AI Lakehouse**
|
# **The Multimodal AI Lakehouse**
|
||||||
|
|
||||||
[**How to Install** ](#how-to-install) ✦ [**Detailed Documentation**](https://lancedb.com/docs) ✦ [**Tutorials and Recipes**](https://github.com/lancedb/vectordb-recipes/tree/main) ✦ [**Contributors**](#contributors)
|
[**How to Install** ](#how-to-install) ✦ [**Detailed Documentation**](https://docs.lancedb.com) ✦ [**Tutorials and Recipes**](https://github.com/lancedb/vectordb-recipes/tree/main) ✦ [**Contributors**](#contributors)
|
||||||
|
|
||||||
**The ultimate multimodal data platform for AI/ML applications.**
|
**The ultimate multimodal data platform for AI/ML applications.**
|
||||||
|
|
||||||
@@ -57,7 +57,7 @@ LanceDB is a central location where developers can build, train and analyze thei
|
|||||||
|
|
||||||
## **How to Install**:
|
## **How to Install**:
|
||||||
|
|
||||||
Follow the [Quickstart](https://lancedb.com/docs/quickstart/) doc to set up LanceDB locally.
|
Follow the [Quickstart](https://docs.lancedb.com/quickstart) doc to set up LanceDB locally.
|
||||||
|
|
||||||
**API & SDK:** We also support Python, TypeScript and Rust SDKs
|
**API & SDK:** We also support Python, TypeScript and Rust SDKs
|
||||||
|
|
||||||
|
|||||||
172
deny.toml
Normal file
172
deny.toml
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
# cargo-deny configuration for LanceDB.
|
||||||
|
#
|
||||||
|
# Run locally with `cargo deny check`. See
|
||||||
|
# https://embarkstudios.github.io/cargo-deny/ for the full reference.
|
||||||
|
|
||||||
|
# The set of target triples we care about. cargo-deny will only consider
|
||||||
|
# dependencies that are used on at least one of these targets. Keeping this
|
||||||
|
# explicit avoids noise from platform-specific crates (e.g. wasm, android,
|
||||||
|
# ios) that we never actually ship.
|
||||||
|
[graph]
|
||||||
|
targets = [
|
||||||
|
"x86_64-unknown-linux-gnu",
|
||||||
|
"aarch64-unknown-linux-gnu",
|
||||||
|
"x86_64-apple-darwin",
|
||||||
|
"aarch64-apple-darwin",
|
||||||
|
"x86_64-pc-windows-msvc",
|
||||||
|
"aarch64-pc-windows-msvc",
|
||||||
|
]
|
||||||
|
all-features = true
|
||||||
|
|
||||||
|
[output]
|
||||||
|
feature-depth = 1
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Advisories: security vulnerabilities and yanked crates.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
[advisories]
|
||||||
|
version = 2
|
||||||
|
# Fail the check if any crate in the lockfile has been yanked from crates.io.
|
||||||
|
# Yanked crates are a signal the author retracted the release (often due to
|
||||||
|
# bugs or security issues) and should not be depended on.
|
||||||
|
yanked = "deny"
|
||||||
|
# Advisory IDs we have explicitly reviewed and chosen to accept. Every
|
||||||
|
# entry must include a rationale and, where possible, an upstream issue
|
||||||
|
# pointing to a fix. Revisit this list whenever dependencies are updated.
|
||||||
|
ignore = [
|
||||||
|
# rsa: Marvin Attack timing side-channel in PKCS#1 v1.5 decryption.
|
||||||
|
# Reached only through opendal → reqsign → rsa. We do not use RSA
|
||||||
|
# decryption in LanceDB ourselves; this is dormant in the signing path.
|
||||||
|
# No fixed release exists upstream as of this writing.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2023-0071
|
||||||
|
{ id = "RUSTSEC-2023-0071", reason = "rsa crate via opendal/reqsign; no fixed upstream release" },
|
||||||
|
|
||||||
|
# instant: unmaintained. Pulled in via backoff → instant. Upstream
|
||||||
|
# recommends switching to `web-time`; fix has to come from backoff.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2024-0384
|
||||||
|
{ id = "RUSTSEC-2024-0384", reason = "transitive via backoff; waiting on backoff replacement" },
|
||||||
|
|
||||||
|
# paste: unmaintained (author archived the repo). Used transitively by
|
||||||
|
# datafusion and the arrow ecosystem; widespread, no drop-in replacement.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2024-0436
|
||||||
|
{ id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" },
|
||||||
|
|
||||||
|
# tantivy: segfault on malformed input due to missing bounds check.
|
||||||
|
# Pulled in via lance for full-text search. We only feed tantivy
|
||||||
|
# documents we construct ourselves, not attacker-controlled bytes.
|
||||||
|
# Tracked for a lance dependency bump.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2025-0003
|
||||||
|
{ id = "RUSTSEC-2025-0003", reason = "tantivy via lance; inputs are internally produced, not user-supplied bytes" },
|
||||||
|
|
||||||
|
# backoff: unmaintained. Reached only via async-openai. Replacement
|
||||||
|
# requires async-openai to migrate (or us to drop async-openai).
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2025-0012
|
||||||
|
{ id = "RUSTSEC-2025-0012", reason = "transitive via async-openai; waiting on upstream migration" },
|
||||||
|
|
||||||
|
# number_prefix: unmaintained. Transitive via indicatif → hf-hub.
|
||||||
|
# No security impact, just maintenance status.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2025-0119
|
||||||
|
{ id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" },
|
||||||
|
|
||||||
|
# rustls-pemfile: unmaintained. Reached from two separate chains:
|
||||||
|
# rustls-native-certs 0.6 (via hyper-rustls 0.24) and object_store 0.12.
|
||||||
|
# Both upstream dependencies need to move before we can drop it.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2025-0134
|
||||||
|
{ id = "RUSTSEC-2025-0134", reason = "transitive via rustls-native-certs/object_store; waiting on upstream migration" },
|
||||||
|
|
||||||
|
# rustls-webpki 0.101.7 (old major line): name-constraint checks for
|
||||||
|
# URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain
|
||||||
|
# from aws-smithy-http-client. The 0.103 line we actively use is patched.
|
||||||
|
# Clearing the 0.101 copy requires the aws-sdk chain to migrate off
|
||||||
|
# rustls 0.21.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2026-0098
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2026-0099
|
||||||
|
{ id = "RUSTSEC-2026-0098", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||||
|
{ id = "RUSTSEC-2026-0099", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||||
|
|
||||||
|
# rustls-webpki 0.101.7: reachable panic in CRL parsing. Same legacy
|
||||||
|
# rustls 0.21 chain from aws-smithy-http-client as above. The 0.103 line
|
||||||
|
# we actively use is upgraded to 0.103.13 which contains the fix.
|
||||||
|
# https://rustsec.org/advisories/RUSTSEC-2026-0104
|
||||||
|
{ id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||||
|
]
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Licenses: only allow licenses we've reviewed as compatible with Apache-2.0.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
[licenses]
|
||||||
|
version = 2
|
||||||
|
# SPDX identifiers for licenses that are compatible with our Apache-2.0
|
||||||
|
# distribution. Additions require legal review.
|
||||||
|
allow = [
|
||||||
|
"Apache-2.0",
|
||||||
|
"Apache-2.0 WITH LLVM-exception",
|
||||||
|
"MIT",
|
||||||
|
"BSD-2-Clause",
|
||||||
|
"BSD-3-Clause",
|
||||||
|
"ISC",
|
||||||
|
"Unicode-3.0",
|
||||||
|
"Unicode-DFS-2016",
|
||||||
|
"Zlib",
|
||||||
|
"CC0-1.0",
|
||||||
|
"MPL-2.0",
|
||||||
|
"BSL-1.0",
|
||||||
|
"OpenSSL",
|
||||||
|
# 0BSD ("BSD Zero Clause") is effectively public domain — no attribution
|
||||||
|
# required. Pulled in by `mock_instant`.
|
||||||
|
"0BSD",
|
||||||
|
# bzip2-1.0.6 is the permissive upstream bzip2 license (BSD-like). Pulled
|
||||||
|
# in by `libbz2-rs-sys`, the pure-Rust bzip2 implementation.
|
||||||
|
"bzip2-1.0.6",
|
||||||
|
# CDLA-Permissive-2.0 is a permissive data license used by `webpki-roots`
|
||||||
|
# for the Mozilla CA root bundle. Data-only, distribution-compatible.
|
||||||
|
"CDLA-Permissive-2.0",
|
||||||
|
]
|
||||||
|
confidence-threshold = 0.8
|
||||||
|
# Crates whose license cannot be determined from Cargo metadata but whose
|
||||||
|
# license we've manually confirmed from upstream. Keep this list minimal.
|
||||||
|
[[licenses.clarify]]
|
||||||
|
# polars-arrow-format omits the `license` field in its Cargo.toml, but the
|
||||||
|
# upstream repo (pola-rs/polars-arrow-format) is dual-licensed Apache-2.0 OR
|
||||||
|
# MIT. See https://github.com/pola-rs/polars-arrow-format/blob/main/LICENSE
|
||||||
|
crate = "polars-arrow-format"
|
||||||
|
expression = "Apache-2.0 OR MIT"
|
||||||
|
license-files = []
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Bans: disallow specific crates and flag dependency hygiene issues.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
[bans]
|
||||||
|
# Warn (not deny) on duplicate versions of the same crate. In a large
|
||||||
|
# workspace like this one, duplicates are common and often unavoidable
|
||||||
|
# transitively. We surface them to discourage growth, but don't fail CI.
|
||||||
|
multiple-versions = "warn"
|
||||||
|
# Wildcard version requirements (`foo = "*"`) are a footgun — they let any
|
||||||
|
# future release in without review. Ban them outright.
|
||||||
|
wildcards = "deny"
|
||||||
|
# Internal workspace crates reference each other via `path = "..."`, which
|
||||||
|
# cargo-deny sees as a wildcard version. That's fine for private workspace
|
||||||
|
# members (not published to crates.io), so allow it specifically for paths.
|
||||||
|
allow-wildcard-paths = true
|
||||||
|
# Crates that are explicitly banned; any match in the dependency graph fails the check.
|
||||||
|
deny = []
|
||||||
|
# Crates to skip when checking for duplicate versions.
|
||||||
|
skip = []
|
||||||
|
# Similar to `skip`, but also skips the entire transitive subtree.
|
||||||
|
skip-tree = []
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Sources: restrict where crates can come from.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
[sources]
|
||||||
|
# Deny any registry other than the ones explicitly listed below.
|
||||||
|
unknown-registry = "deny"
|
||||||
|
# Deny any git dependency whose repository URL isn't in the allow-list below. This
|
||||||
|
# prevents accidental pulls from arbitrary forks.
|
||||||
|
unknown-git = "deny"
|
||||||
|
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
|
||||||
|
# Lance is developed in a sibling repo and pulled as a git dependency until
|
||||||
|
# releases are cut to crates.io. Allow that specific repository.
|
||||||
|
allow-git = [
|
||||||
|
"https://github.com/lance-format/lance",
|
||||||
|
]
|
||||||
@@ -24,4 +24,4 @@ RUN python --version && \
|
|||||||
rustc --version && \
|
rustc --version && \
|
||||||
protoc --version
|
protoc --version
|
||||||
|
|
||||||
RUN pip install --no-cache-dir tantivy lancedb
|
RUN pip install --no-cache-dir lancedb
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# LanceDB Documentation
|
# LanceDB Documentation
|
||||||
|
|
||||||
LanceDB docs are available at [lancedb.com/docs](https://lancedb.com/docs).
|
LanceDB docs are available at [docs.lancedb.com](https://docs.lancedb.com).
|
||||||
|
|
||||||
The SDK docs are built and deployed automatically by [Github Actions](../.github/workflows/docs.yml)
|
The SDK docs are built and deployed automatically by [Github Actions](../.github/workflows/docs.yml)
|
||||||
whenever a commit is pushed to the `main` branch. So it is possible for the docs to show
|
whenever a commit is pushed to the `main` branch. So it is possible for the docs to show
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
mkdocs==1.5.3
|
mkdocs==1.6.1
|
||||||
mkdocs-jupyter==0.24.1
|
mkdocs-jupyter==0.24.1
|
||||||
mkdocs-material==9.5.3
|
mkdocs-material==9.6.23
|
||||||
mkdocs-autorefs>=0.5,<=1.0
|
mkdocs-autorefs>=0.5,<=1.0
|
||||||
mkdocstrings[python]==0.25.2
|
mkdocstrings[python]>=0.24,<1.0
|
||||||
griffe>=0.40,<1.0
|
griffe>=0.40,<1.0
|
||||||
mkdocs-render-swagger-plugin>=0.1.0
|
mkdocs-render-swagger-plugin>=0.1.0
|
||||||
pydantic>=2.0,<3.0
|
pydantic>=2.0,<3.0
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-core</artifactId>
|
<artifactId>lancedb-core</artifactId>
|
||||||
<version>0.27.0</version>
|
<version>0.28.0-beta.11</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -57,32 +57,32 @@ LanceNamespace namespaceClient = LanceDbNamespaceClientBuilder.newBuilder()
|
|||||||
|
|
||||||
## Metadata Operations
|
## Metadata Operations
|
||||||
|
|
||||||
### Creating a Namespace
|
### Creating a Namespace Path
|
||||||
|
|
||||||
Namespaces organize tables hierarchically. Create a namespace before creating tables within it:
|
Namespace paths organize tables hierarchically. Create the desired namespace path before creating tables within it:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import org.lance.namespace.model.CreateNamespaceRequest;
|
import org.lance.namespace.model.CreateNamespaceRequest;
|
||||||
import org.lance.namespace.model.CreateNamespaceResponse;
|
import org.lance.namespace.model.CreateNamespaceResponse;
|
||||||
|
|
||||||
// Create a child namespace
|
// Create a child namespace path
|
||||||
CreateNamespaceRequest request = new CreateNamespaceRequest();
|
CreateNamespaceRequest request = new CreateNamespaceRequest();
|
||||||
request.setId(Arrays.asList("my_namespace"));
|
request.setId(Arrays.asList("my_namespace"));
|
||||||
|
|
||||||
CreateNamespaceResponse response = namespaceClient.createNamespace(request);
|
CreateNamespaceResponse response = namespaceClient.createNamespace(request);
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also create nested namespaces:
|
You can also create nested namespace paths:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
// Create a nested namespace: parent/child
|
// Create a nested namespace path: parent/child
|
||||||
CreateNamespaceRequest request = new CreateNamespaceRequest();
|
CreateNamespaceRequest request = new CreateNamespaceRequest();
|
||||||
request.setId(Arrays.asList("parent_namespace", "child_namespace"));
|
request.setId(Arrays.asList("parent_namespace", "child_namespace"));
|
||||||
|
|
||||||
CreateNamespaceResponse response = namespaceClient.createNamespace(request);
|
CreateNamespaceResponse response = namespaceClient.createNamespace(request);
|
||||||
```
|
```
|
||||||
|
|
||||||
### Describing a Namespace
|
### Describing a Namespace Path
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import org.lance.namespace.model.DescribeNamespaceRequest;
|
import org.lance.namespace.model.DescribeNamespaceRequest;
|
||||||
@@ -95,22 +95,22 @@ DescribeNamespaceResponse response = namespaceClient.describeNamespace(request);
|
|||||||
System.out.println("Namespace properties: " + response.getProperties());
|
System.out.println("Namespace properties: " + response.getProperties());
|
||||||
```
|
```
|
||||||
|
|
||||||
### Listing Namespaces
|
### Listing Namespace Paths
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import org.lance.namespace.model.ListNamespacesRequest;
|
import org.lance.namespace.model.ListNamespacesRequest;
|
||||||
import org.lance.namespace.model.ListNamespacesResponse;
|
import org.lance.namespace.model.ListNamespacesResponse;
|
||||||
|
|
||||||
// List all namespaces at root level
|
// List all namespace paths at the root level
|
||||||
ListNamespacesRequest request = new ListNamespacesRequest();
|
ListNamespacesRequest request = new ListNamespacesRequest();
|
||||||
request.setId(Arrays.asList()); // Empty for root
|
request.setId(Arrays.asList()); // Empty for root
|
||||||
|
|
||||||
ListNamespacesResponse response = namespaceClient.listNamespaces(request);
|
ListNamespacesResponse response = namespaceClient.listNamespaces(request);
|
||||||
for (String ns : response.getNamespaces()) {
|
for (String ns : response.getNamespaces()) {
|
||||||
System.out.println("Namespace: " + ns);
|
System.out.println("Namespace path: " + ns);
|
||||||
}
|
}
|
||||||
|
|
||||||
// List child namespaces under a parent
|
// List child namespace paths under a parent path
|
||||||
ListNamespacesRequest childRequest = new ListNamespacesRequest();
|
ListNamespacesRequest childRequest = new ListNamespacesRequest();
|
||||||
childRequest.setId(Arrays.asList("parent_namespace"));
|
childRequest.setId(Arrays.asList("parent_namespace"));
|
||||||
|
|
||||||
@@ -123,7 +123,7 @@ ListNamespacesResponse childResponse = namespaceClient.listNamespaces(childReque
|
|||||||
import org.lance.namespace.model.ListTablesRequest;
|
import org.lance.namespace.model.ListTablesRequest;
|
||||||
import org.lance.namespace.model.ListTablesResponse;
|
import org.lance.namespace.model.ListTablesResponse;
|
||||||
|
|
||||||
// List tables in a namespace
|
// List tables in a namespace path
|
||||||
ListTablesRequest request = new ListTablesRequest();
|
ListTablesRequest request = new ListTablesRequest();
|
||||||
request.setId(Arrays.asList("my_namespace"));
|
request.setId(Arrays.asList("my_namespace"));
|
||||||
|
|
||||||
@@ -133,7 +133,7 @@ for (String table : response.getTables()) {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Dropping a Namespace
|
### Dropping a Namespace Path
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import org.lance.namespace.model.DropNamespaceRequest;
|
import org.lance.namespace.model.DropNamespaceRequest;
|
||||||
@@ -175,7 +175,7 @@ DropTableResponse response = namespaceClient.dropTable(request);
|
|||||||
|
|
||||||
### Creating a Table
|
### Creating a Table
|
||||||
|
|
||||||
Tables are created within a namespace by providing data in Apache Arrow IPC format:
|
Tables are created within a namespace path by providing data in Apache Arrow IPC format:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import org.lance.namespace.LanceNamespace;
|
import org.lance.namespace.LanceNamespace;
|
||||||
@@ -242,7 +242,7 @@ try (BufferAllocator allocator = new RootAllocator();
|
|||||||
}
|
}
|
||||||
byte[] tableData = out.toByteArray();
|
byte[] tableData = out.toByteArray();
|
||||||
|
|
||||||
// Create table in a namespace
|
// Create a table in a namespace path
|
||||||
CreateTableRequest request = new CreateTableRequest();
|
CreateTableRequest request = new CreateTableRequest();
|
||||||
request.setId(Arrays.asList("my_namespace", "my_table"));
|
request.setId(Arrays.asList("my_namespace", "my_table"));
|
||||||
CreateTableResponse response = namespaceClient.createTable(request, tableData);
|
CreateTableResponse response = namespaceClient.createTable(request, tableData);
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ const results = await table.vectorSearch([0.1, 0.3]).limit(20).toArray();
|
|||||||
console.log(results);
|
console.log(results);
|
||||||
```
|
```
|
||||||
|
|
||||||
The [quickstart](https://lancedb.com/docs/quickstart/basic-usage/) contains more complete examples.
|
The [quickstart](https://docs.lancedb.com/quickstart/) contains more complete examples.
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
|
|||||||
@@ -61,8 +61,8 @@ sharing the same data, deletion, and index files.
|
|||||||
* **options.sourceVersion?**: `number`
|
* **options.sourceVersion?**: `number`
|
||||||
The version of the source table to clone.
|
The version of the source table to clone.
|
||||||
|
|
||||||
* **options.targetNamespace?**: `string`[]
|
* **options.targetNamespacePath?**: `string`[]
|
||||||
The namespace for the target table (defaults to root namespace).
|
The namespace path for the target table (defaults to root namespace).
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
@@ -116,13 +116,13 @@ Creates a new empty Table
|
|||||||
|
|
||||||
`Promise`<[`Table`](Table.md)>
|
`Promise`<[`Table`](Table.md)>
|
||||||
|
|
||||||
#### createEmptyTable(name, schema, namespace, options)
|
#### createEmptyTable(name, schema, namespacePath, options)
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
abstract createEmptyTable(
|
abstract createEmptyTable(
|
||||||
name,
|
name,
|
||||||
schema,
|
schema,
|
||||||
namespace?,
|
namespacePath?,
|
||||||
options?): Promise<Table>
|
options?): Promise<Table>
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -136,8 +136,8 @@ Creates a new empty Table
|
|||||||
* **schema**: [`SchemaLike`](../type-aliases/SchemaLike.md)
|
* **schema**: [`SchemaLike`](../type-aliases/SchemaLike.md)
|
||||||
The schema of the table
|
The schema of the table
|
||||||
|
|
||||||
* **namespace?**: `string`[]
|
* **namespacePath?**: `string`[]
|
||||||
The namespace to create the table in (defaults to root namespace)
|
The namespace path to create the table in (defaults to root namespace)
|
||||||
|
|
||||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||||
Additional options
|
Additional options
|
||||||
@@ -150,10 +150,10 @@ Creates a new empty Table
|
|||||||
|
|
||||||
### createTable()
|
### createTable()
|
||||||
|
|
||||||
#### createTable(options, namespace)
|
#### createTable(options, namespacePath)
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
abstract createTable(options, namespace?): Promise<Table>
|
abstract createTable(options, namespacePath?): Promise<Table>
|
||||||
```
|
```
|
||||||
|
|
||||||
Creates a new Table and initialize it with new data.
|
Creates a new Table and initialize it with new data.
|
||||||
@@ -163,8 +163,8 @@ Creates a new Table and initialize it with new data.
|
|||||||
* **options**: `object` & `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
* **options**: `object` & `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||||
The options object.
|
The options object.
|
||||||
|
|
||||||
* **namespace?**: `string`[]
|
* **namespacePath?**: `string`[]
|
||||||
The namespace to create the table in (defaults to root namespace)
|
The namespace path to create the table in (defaults to root namespace)
|
||||||
|
|
||||||
##### Returns
|
##### Returns
|
||||||
|
|
||||||
@@ -197,13 +197,13 @@ Creates a new Table and initialize it with new data.
|
|||||||
|
|
||||||
`Promise`<[`Table`](Table.md)>
|
`Promise`<[`Table`](Table.md)>
|
||||||
|
|
||||||
#### createTable(name, data, namespace, options)
|
#### createTable(name, data, namespacePath, options)
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
abstract createTable(
|
abstract createTable(
|
||||||
name,
|
name,
|
||||||
data,
|
data,
|
||||||
namespace?,
|
namespacePath?,
|
||||||
options?): Promise<Table>
|
options?): Promise<Table>
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -218,8 +218,8 @@ Creates a new Table and initialize it with new data.
|
|||||||
Non-empty Array of Records
|
Non-empty Array of Records
|
||||||
to be inserted into the table
|
to be inserted into the table
|
||||||
|
|
||||||
* **namespace?**: `string`[]
|
* **namespacePath?**: `string`[]
|
||||||
The namespace to create the table in (defaults to root namespace)
|
The namespace path to create the table in (defaults to root namespace)
|
||||||
|
|
||||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||||
Additional options
|
Additional options
|
||||||
@@ -247,15 +247,15 @@ Return a brief description of the connection
|
|||||||
### dropAllTables()
|
### dropAllTables()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
abstract dropAllTables(namespace?): Promise<void>
|
abstract dropAllTables(namespacePath?): Promise<void>
|
||||||
```
|
```
|
||||||
|
|
||||||
Drop all tables in the database.
|
Drop all tables in the database.
|
||||||
|
|
||||||
#### Parameters
|
#### Parameters
|
||||||
|
|
||||||
* **namespace?**: `string`[]
|
* **namespacePath?**: `string`[]
|
||||||
The namespace to drop tables from (defaults to root namespace).
|
The namespace path to drop tables from (defaults to root namespace).
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
@@ -266,7 +266,7 @@ Drop all tables in the database.
|
|||||||
### dropTable()
|
### dropTable()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
abstract dropTable(name, namespace?): Promise<void>
|
abstract dropTable(name, namespacePath?): Promise<void>
|
||||||
```
|
```
|
||||||
|
|
||||||
Drop an existing table.
|
Drop an existing table.
|
||||||
@@ -276,8 +276,8 @@ Drop an existing table.
|
|||||||
* **name**: `string`
|
* **name**: `string`
|
||||||
The name of the table to drop.
|
The name of the table to drop.
|
||||||
|
|
||||||
* **namespace?**: `string`[]
|
* **namespacePath?**: `string`[]
|
||||||
The namespace of the table (defaults to root namespace).
|
The namespace path of the table (defaults to root namespace).
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
@@ -304,7 +304,7 @@ Return true if the connection has not been closed
|
|||||||
```ts
|
```ts
|
||||||
abstract openTable(
|
abstract openTable(
|
||||||
name,
|
name,
|
||||||
namespace?,
|
namespacePath?,
|
||||||
options?): Promise<Table>
|
options?): Promise<Table>
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -315,8 +315,8 @@ Open a table in the database.
|
|||||||
* **name**: `string`
|
* **name**: `string`
|
||||||
The name of the table
|
The name of the table
|
||||||
|
|
||||||
* **namespace?**: `string`[]
|
* **namespacePath?**: `string`[]
|
||||||
The namespace of the table (defaults to root namespace)
|
The namespace path of the table (defaults to root namespace)
|
||||||
|
|
||||||
* **options?**: `Partial`<[`OpenTableOptions`](../interfaces/OpenTableOptions.md)>
|
* **options?**: `Partial`<[`OpenTableOptions`](../interfaces/OpenTableOptions.md)>
|
||||||
Additional options
|
Additional options
|
||||||
@@ -349,10 +349,10 @@ Tables will be returned in lexicographical order.
|
|||||||
|
|
||||||
`Promise`<`string`[]>
|
`Promise`<`string`[]>
|
||||||
|
|
||||||
#### tableNames(namespace, options)
|
#### tableNames(namespacePath, options)
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
abstract tableNames(namespace?, options?): Promise<string[]>
|
abstract tableNames(namespacePath?, options?): Promise<string[]>
|
||||||
```
|
```
|
||||||
|
|
||||||
List all the table names in this database.
|
List all the table names in this database.
|
||||||
@@ -361,8 +361,8 @@ Tables will be returned in lexicographical order.
|
|||||||
|
|
||||||
##### Parameters
|
##### Parameters
|
||||||
|
|
||||||
* **namespace?**: `string`[]
|
* **namespacePath?**: `string`[]
|
||||||
The namespace to list tables from (defaults to root namespace)
|
The namespace path to list tables from (defaults to root namespace)
|
||||||
|
|
||||||
* **options?**: `Partial`<[`TableNamesOptions`](../interfaces/TableNamesOptions.md)>
|
* **options?**: `Partial`<[`TableNamesOptions`](../interfaces/TableNamesOptions.md)>
|
||||||
options to control the
|
options to control the
|
||||||
|
|||||||
@@ -53,3 +53,18 @@ optional tlsConfig: TlsConfig;
|
|||||||
```ts
|
```ts
|
||||||
optional userAgent: string;
|
optional userAgent: string;
|
||||||
```
|
```
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
### userId?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional userId: string;
|
||||||
|
```
|
||||||
|
|
||||||
|
User identifier for tracking purposes.
|
||||||
|
|
||||||
|
This is sent as the `x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
|
||||||
|
It can be set directly, or via the `LANCEDB_USER_ID` environment variable.
|
||||||
|
Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another environment
|
||||||
|
variable that contains the user ID value.
|
||||||
|
|||||||
@@ -41,6 +41,29 @@ for testing purposes.
|
|||||||
|
|
||||||
***
|
***
|
||||||
|
|
||||||
|
### manifestEnabled?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional manifestEnabled: boolean;
|
||||||
|
```
|
||||||
|
|
||||||
|
(For LanceDB OSS only): use directory namespace manifests as the source
|
||||||
|
of truth for table metadata. Existing directory-listed root tables are
|
||||||
|
migrated into the manifest on access.
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
### namespaceClientProperties?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional namespaceClientProperties: Record<string, string>;
|
||||||
|
```
|
||||||
|
|
||||||
|
(For LanceDB OSS only): extra properties for the backing namespace
|
||||||
|
client used by manifest-enabled native connections.
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
### readConsistencyInterval?
|
### readConsistencyInterval?
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
@@ -89,4 +112,4 @@ optional storageOptions: Record<string, string>;
|
|||||||
|
|
||||||
(For LanceDB OSS only): configuration for object storage.
|
(For LanceDB OSS only): configuration for object storage.
|
||||||
|
|
||||||
The available options are described at https://lancedb.com/docs/storage/
|
The available options are described at https://docs.lancedb.com/storage/
|
||||||
|
|||||||
@@ -97,4 +97,4 @@ Configuration for object storage.
|
|||||||
Options already set on the connection will be inherited by the table,
|
Options already set on the connection will be inherited by the table,
|
||||||
but can be overridden here.
|
but can be overridden here.
|
||||||
|
|
||||||
The available options are described at https://lancedb.com/docs/storage/
|
The available options are described at https://docs.lancedb.com/storage/
|
||||||
|
|||||||
@@ -42,4 +42,4 @@ Configuration for object storage.
|
|||||||
Options already set on the connection will be inherited by the table,
|
Options already set on the connection will be inherited by the table,
|
||||||
but can be overridden here.
|
but can be overridden here.
|
||||||
|
|
||||||
The available options are described at https://lancedb.com/docs/storage/
|
The available options are described at https://docs.lancedb.com/storage/
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ new EmbeddingFunction<T, M>(): EmbeddingFunction<T, M>
|
|||||||
### computeQueryEmbeddings()
|
### computeQueryEmbeddings()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
computeQueryEmbeddings(data): Promise<number[] | Float32Array | Float64Array>
|
computeQueryEmbeddings(data): Promise<number[] | Uint8Array | Float32Array | Float64Array>
|
||||||
```
|
```
|
||||||
|
|
||||||
Compute the embeddings for a single query
|
Compute the embeddings for a single query
|
||||||
@@ -63,7 +63,7 @@ Compute the embeddings for a single query
|
|||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
`Promise`<`number`[] \| `Float32Array` \| `Float64Array`>
|
`Promise`<`number`[] \| `Uint8Array` \| `Float32Array` \| `Float64Array`>
|
||||||
|
|
||||||
***
|
***
|
||||||
|
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ new TextEmbeddingFunction<M>(): TextEmbeddingFunction<M>
|
|||||||
### computeQueryEmbeddings()
|
### computeQueryEmbeddings()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
computeQueryEmbeddings(data): Promise<number[] | Float32Array | Float64Array>
|
computeQueryEmbeddings(data): Promise<number[] | Uint8Array | Float32Array | Float64Array>
|
||||||
```
|
```
|
||||||
|
|
||||||
Compute the embeddings for a single query
|
Compute the embeddings for a single query
|
||||||
@@ -48,7 +48,7 @@ Compute the embeddings for a single query
|
|||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
`Promise`<`number`[] \| `Float32Array` \| `Float64Array`>
|
`Promise`<`number`[] \| `Uint8Array` \| `Float32Array` \| `Float64Array`>
|
||||||
|
|
||||||
#### Overrides
|
#### Overrides
|
||||||
|
|
||||||
|
|||||||
@@ -7,5 +7,10 @@
|
|||||||
# Type Alias: IntoVector
|
# Type Alias: IntoVector
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
type IntoVector: Float32Array | Float64Array | number[] | Promise<Float32Array | Float64Array | number[]>;
|
type IntoVector:
|
||||||
|
| Float32Array
|
||||||
|
| Float64Array
|
||||||
|
| Uint8Array
|
||||||
|
| number[]
|
||||||
|
| Promise<Float32Array | Float64Array | Uint8Array | number[]>;
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -36,6 +36,20 @@ is also an [asynchronous API client](#connections-asynchronous).
|
|||||||
|
|
||||||
::: lancedb.table.Tags
|
::: lancedb.table.Tags
|
||||||
|
|
||||||
|
## Expressions
|
||||||
|
|
||||||
|
Type-safe expression builder for filters and projections. Use these instead
|
||||||
|
of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and
|
||||||
|
[select][lancedb.query.LanceQueryBuilder.select].
|
||||||
|
|
||||||
|
::: lancedb.expr.Expr
|
||||||
|
|
||||||
|
::: lancedb.expr.col
|
||||||
|
|
||||||
|
::: lancedb.expr.lit
|
||||||
|
|
||||||
|
::: lancedb.expr.func
|
||||||
|
|
||||||
## Querying (Synchronous)
|
## Querying (Synchronous)
|
||||||
|
|
||||||
::: lancedb.query.Query
|
::: lancedb.query.Query
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.27.0-final.0</version>
|
<version>0.28.0-beta.11</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.27.0-final.0</version>
|
<version>0.28.0-beta.11</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<name>${project.artifactId}</name>
|
<name>${project.artifactId}</name>
|
||||||
<description>LanceDB Java SDK Parent POM</description>
|
<description>LanceDB Java SDK Parent POM</description>
|
||||||
@@ -28,7 +28,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<arrow.version>15.0.0</arrow.version>
|
<arrow.version>15.0.0</arrow.version>
|
||||||
<lance-core.version>3.1.0-beta.2</lance-core.version>
|
<lance-core.version>6.0.0-beta.7</lance-core.version>
|
||||||
<spotless.skip>false</spotless.skip>
|
<spotless.skip>false</spotless.skip>
|
||||||
<spotless.version>2.30.0</spotless.version>
|
<spotless.version>2.30.0</spotless.version>
|
||||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.27.0"
|
version = "0.28.0-beta.11"
|
||||||
|
publish = false
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
@@ -15,6 +16,8 @@ crate-type = ["cdylib"]
|
|||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
arrow-ipc.workspace = true
|
arrow-ipc.workspace = true
|
||||||
arrow-array.workspace = true
|
arrow-array.workspace = true
|
||||||
|
arrow-buffer = "58.0.0"
|
||||||
|
half.workspace = true
|
||||||
arrow-schema.workspace = true
|
arrow-schema.workspace = true
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
@@ -29,8 +32,8 @@ lzma-sys = { version = "0.1", features = ["static"] }
|
|||||||
log.workspace = true
|
log.workspace = true
|
||||||
|
|
||||||
# Pin to resolve build failures; update periodically for security patches.
|
# Pin to resolve build failures; update periodically for security patches.
|
||||||
aws-lc-sys = "=0.38.0"
|
aws-lc-sys = "=0.40.0"
|
||||||
aws-lc-rs = "=1.16.1"
|
aws-lc-rs = "=1.16.3"
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
napi-build = "2.3.1"
|
napi-build = "2.3.1"
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ const results = await table.vectorSearch([0.1, 0.3]).limit(20).toArray();
|
|||||||
console.log(results);
|
console.log(results);
|
||||||
```
|
```
|
||||||
|
|
||||||
The [quickstart](https://lancedb.com/docs/quickstart/basic-usage/) contains more complete examples.
|
The [quickstart](https://docs.lancedb.com/quickstart/) contains more complete examples.
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
// SPDX-License-Identifier: Apache-2.0
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
import { spawn } from "node:child_process";
|
||||||
|
import * as path from "node:path";
|
||||||
import { RecordBatch } from "apache-arrow";
|
import { RecordBatch } from "apache-arrow";
|
||||||
import * as tmp from "tmp";
|
import * as tmp from "tmp";
|
||||||
import { Connection, Index, Table, connect, makeArrowTable } from "../lancedb";
|
import { Connection, Index, Table, connect, makeArrowTable } from "../lancedb";
|
||||||
@@ -76,4 +78,91 @@ describe("rerankers", function () {
|
|||||||
|
|
||||||
expect(result).toHaveLength(2);
|
expect(result).toHaveLength(2);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("does not keep process alive after rerank query", async function () {
|
||||||
|
const script = `
|
||||||
|
import * as lancedb from "./dist/index.js";
|
||||||
|
import * as os from "node:os";
|
||||||
|
import * as path from "node:path";
|
||||||
|
import * as fs from "node:fs/promises";
|
||||||
|
|
||||||
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "lancedb-rerank-exit-"));
|
||||||
|
const db = await lancedb.connect(dir);
|
||||||
|
const table = await db.createTable("test", [{ text: "hello", vector: [1, 2, 3] }], {
|
||||||
|
mode: "overwrite",
|
||||||
|
});
|
||||||
|
await table.createIndex("text", { config: lancedb.Index.fts() });
|
||||||
|
await table.waitForIndex(["text_idx"], 30);
|
||||||
|
|
||||||
|
const reranker = await lancedb.rerankers.RRFReranker.create();
|
||||||
|
await table
|
||||||
|
.query()
|
||||||
|
.nearestTo([1, 2, 3])
|
||||||
|
.fullTextSearch("hello")
|
||||||
|
.rerank(reranker)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
table.close();
|
||||||
|
db.close();
|
||||||
|
`;
|
||||||
|
|
||||||
|
await new Promise<void>((resolve, reject) => {
|
||||||
|
const child = spawn(
|
||||||
|
process.execPath,
|
||||||
|
["--input-type=module", "-e", script],
|
||||||
|
{
|
||||||
|
cwd: path.resolve(__dirname, ".."),
|
||||||
|
stdio: ["ignore", "pipe", "pipe"],
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let stdout = "";
|
||||||
|
let stderr = "";
|
||||||
|
|
||||||
|
child.stdout.on("data", (chunk) => {
|
||||||
|
stdout += chunk.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
child.stderr.on("data", (chunk) => {
|
||||||
|
stderr += chunk.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
const timeout = setTimeout(() => {
|
||||||
|
child.kill();
|
||||||
|
reject(
|
||||||
|
new Error(
|
||||||
|
`child process did not exit in time\nstdout:\n${stdout}\nstderr:\n${stderr}`,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
}, 20_000);
|
||||||
|
|
||||||
|
child.on("error", (err) => {
|
||||||
|
clearTimeout(timeout);
|
||||||
|
reject(err);
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on("exit", (code, signal) => {
|
||||||
|
clearTimeout(timeout);
|
||||||
|
if (signal !== null) {
|
||||||
|
reject(
|
||||||
|
new Error(
|
||||||
|
`child process exited with signal ${signal}\nstdout:\n${stdout}\nstderr:\n${stderr}`,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (code !== 0) {
|
||||||
|
reject(
|
||||||
|
new Error(
|
||||||
|
`child process exited with code ${code}\nstdout:\n${stdout}\nstderr:\n${stderr}`,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
resolve();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
},
|
},
|
||||||
numIndices: 0,
|
numIndices: 0,
|
||||||
numRows: 3,
|
numRows: 3,
|
||||||
totalBytes: 24,
|
totalBytes: 44,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
110
nodejs/__test__/vector_types.test.ts
Normal file
110
nodejs/__test__/vector_types.test.ts
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
import * as tmp from "tmp";
|
||||||
|
|
||||||
|
import { type Table, connect } from "../lancedb";
|
||||||
|
import {
|
||||||
|
Field,
|
||||||
|
FixedSizeList,
|
||||||
|
Float32,
|
||||||
|
Int64,
|
||||||
|
Schema,
|
||||||
|
makeArrowTable,
|
||||||
|
} from "../lancedb/arrow";
|
||||||
|
|
||||||
|
describe("Vector query with different typed arrays", () => {
|
||||||
|
let tmpDir: tmp.DirResult;
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
tmpDir?.removeCallback();
|
||||||
|
});
|
||||||
|
|
||||||
|
async function createFloat32Table(): Promise<Table> {
|
||||||
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const schema = new Schema([
|
||||||
|
new Field("id", new Int64(), true),
|
||||||
|
new Field(
|
||||||
|
"vec",
|
||||||
|
new FixedSizeList(2, new Field("item", new Float32())),
|
||||||
|
true,
|
||||||
|
),
|
||||||
|
]);
|
||||||
|
const data = makeArrowTable(
|
||||||
|
[
|
||||||
|
{ id: 1n, vec: [1.0, 0.0] },
|
||||||
|
{ id: 2n, vec: [0.0, 1.0] },
|
||||||
|
{ id: 3n, vec: [1.0, 1.0] },
|
||||||
|
],
|
||||||
|
{ schema },
|
||||||
|
);
|
||||||
|
return db.createTable("test_f32", data);
|
||||||
|
}
|
||||||
|
|
||||||
|
it("should search with Float32Array (baseline)", async () => {
|
||||||
|
const table = await createFloat32Table();
|
||||||
|
const results = await table
|
||||||
|
.query()
|
||||||
|
.nearestTo(new Float32Array([1.0, 0.0]))
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
expect(results.length).toBe(1);
|
||||||
|
expect(Number(results[0].id)).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should search with number[] (backward compat)", async () => {
|
||||||
|
const table = await createFloat32Table();
|
||||||
|
const results = await table
|
||||||
|
.query()
|
||||||
|
.nearestTo([1.0, 0.0])
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
expect(results.length).toBe(1);
|
||||||
|
expect(Number(results[0].id)).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should search with Float64Array via raw path", async () => {
|
||||||
|
const table = await createFloat32Table();
|
||||||
|
const results = await table
|
||||||
|
.query()
|
||||||
|
.nearestTo(new Float64Array([1.0, 0.0]))
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
expect(results.length).toBe(1);
|
||||||
|
expect(Number(results[0].id)).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should add multiple query vectors with Float64Array", async () => {
|
||||||
|
const table = await createFloat32Table();
|
||||||
|
const results = await table
|
||||||
|
.query()
|
||||||
|
.nearestTo(new Float64Array([1.0, 0.0]))
|
||||||
|
.addQueryVector(new Float64Array([0.0, 1.0]))
|
||||||
|
.limit(2)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
expect(results.length).toBeGreaterThanOrEqual(2);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Float16Array is only available in Node 22+; not in TypeScript's standard lib yet
|
||||||
|
const float16ArrayCtor = (globalThis as unknown as Record<string, unknown>)
|
||||||
|
.Float16Array as (new (values: number[]) => unknown) | undefined;
|
||||||
|
const hasFloat16 = float16ArrayCtor !== undefined;
|
||||||
|
const f16it = hasFloat16 ? it : it.skip;
|
||||||
|
|
||||||
|
f16it("should search with Float16Array via raw path", async () => {
|
||||||
|
const table = await createFloat32Table();
|
||||||
|
const results = await table
|
||||||
|
.query()
|
||||||
|
.nearestTo(new float16ArrayCtor!([1.0, 0.0]) as Float32Array)
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
expect(results.length).toBe(1);
|
||||||
|
expect(Number(results[0].id)).toBe(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -117,8 +117,9 @@ export type TableLike =
|
|||||||
export type IntoVector =
|
export type IntoVector =
|
||||||
| Float32Array
|
| Float32Array
|
||||||
| Float64Array
|
| Float64Array
|
||||||
|
| Uint8Array
|
||||||
| number[]
|
| number[]
|
||||||
| Promise<Float32Array | Float64Array | number[]>;
|
| Promise<Float32Array | Float64Array | Uint8Array | number[]>;
|
||||||
|
|
||||||
export type MultiVector = IntoVector[];
|
export type MultiVector = IntoVector[];
|
||||||
|
|
||||||
@@ -126,14 +127,48 @@ export function isMultiVector(value: unknown): value is MultiVector {
|
|||||||
return Array.isArray(value) && isIntoVector(value[0]);
|
return Array.isArray(value) && isIntoVector(value[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Float16Array is not in TypeScript's standard lib yet; access dynamically
|
||||||
|
type Float16ArrayCtor = new (
|
||||||
|
...args: unknown[]
|
||||||
|
) => { buffer: ArrayBuffer; byteOffset: number; byteLength: number };
|
||||||
|
const float16ArrayCtor = (globalThis as unknown as Record<string, unknown>)
|
||||||
|
.Float16Array as Float16ArrayCtor | undefined;
|
||||||
|
|
||||||
export function isIntoVector(value: unknown): value is IntoVector {
|
export function isIntoVector(value: unknown): value is IntoVector {
|
||||||
return (
|
return (
|
||||||
value instanceof Float32Array ||
|
value instanceof Float32Array ||
|
||||||
value instanceof Float64Array ||
|
value instanceof Float64Array ||
|
||||||
|
value instanceof Uint8Array ||
|
||||||
|
(float16ArrayCtor !== undefined && value instanceof float16ArrayCtor) ||
|
||||||
(Array.isArray(value) && !Array.isArray(value[0]))
|
(Array.isArray(value) && !Array.isArray(value[0]))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract the underlying byte buffer and data type from a typed array
|
||||||
|
* for passing to the Rust NAPI layer without precision loss.
|
||||||
|
*/
|
||||||
|
export function extractVectorBuffer(
|
||||||
|
vector: Float32Array | Float64Array | Uint8Array,
|
||||||
|
): { data: Uint8Array; dtype: string } | null {
|
||||||
|
if (float16ArrayCtor !== undefined && vector instanceof float16ArrayCtor) {
|
||||||
|
return {
|
||||||
|
data: new Uint8Array(vector.buffer, vector.byteOffset, vector.byteLength),
|
||||||
|
dtype: "float16",
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (vector instanceof Float64Array) {
|
||||||
|
return {
|
||||||
|
data: new Uint8Array(vector.buffer, vector.byteOffset, vector.byteLength),
|
||||||
|
dtype: "float64",
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (vector instanceof Uint8Array && !(vector instanceof Float32Array)) {
|
||||||
|
return { data: vector, dtype: "uint8" };
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
export function isArrowTable(value: object): value is TableLike {
|
export function isArrowTable(value: object): value is TableLike {
|
||||||
if (value instanceof ArrowTable) return true;
|
if (value instanceof ArrowTable) return true;
|
||||||
return "schema" in value && "batches" in value;
|
return "schema" in value && "batches" in value;
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ export interface CreateTableOptions {
|
|||||||
* Options already set on the connection will be inherited by the table,
|
* Options already set on the connection will be inherited by the table,
|
||||||
* but can be overridden here.
|
* but can be overridden here.
|
||||||
*
|
*
|
||||||
* The available options are described at https://lancedb.com/docs/storage/
|
* The available options are described at https://docs.lancedb.com/storage/
|
||||||
*/
|
*/
|
||||||
storageOptions?: Record<string, string>;
|
storageOptions?: Record<string, string>;
|
||||||
|
|
||||||
@@ -78,7 +78,7 @@ export interface OpenTableOptions {
|
|||||||
* Options already set on the connection will be inherited by the table,
|
* Options already set on the connection will be inherited by the table,
|
||||||
* but can be overridden here.
|
* but can be overridden here.
|
||||||
*
|
*
|
||||||
* The available options are described at https://lancedb.com/docs/storage/
|
* The available options are described at https://docs.lancedb.com/storage/
|
||||||
*/
|
*/
|
||||||
storageOptions?: Record<string, string>;
|
storageOptions?: Record<string, string>;
|
||||||
/**
|
/**
|
||||||
@@ -166,25 +166,25 @@ export abstract class Connection {
|
|||||||
* List all the table names in this database.
|
* List all the table names in this database.
|
||||||
*
|
*
|
||||||
* Tables will be returned in lexicographical order.
|
* Tables will be returned in lexicographical order.
|
||||||
* @param {string[]} namespace - The namespace to list tables from (defaults to root namespace)
|
* @param {string[]} namespacePath - The namespace path to list tables from (defaults to root namespace)
|
||||||
* @param {Partial<TableNamesOptions>} options - options to control the
|
* @param {Partial<TableNamesOptions>} options - options to control the
|
||||||
* paging / start point
|
* paging / start point
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
abstract tableNames(
|
abstract tableNames(
|
||||||
namespace?: string[],
|
namespacePath?: string[],
|
||||||
options?: Partial<TableNamesOptions>,
|
options?: Partial<TableNamesOptions>,
|
||||||
): Promise<string[]>;
|
): Promise<string[]>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Open a table in the database.
|
* Open a table in the database.
|
||||||
* @param {string} name - The name of the table
|
* @param {string} name - The name of the table
|
||||||
* @param {string[]} namespace - The namespace of the table (defaults to root namespace)
|
* @param {string[]} namespacePath - The namespace path of the table (defaults to root namespace)
|
||||||
* @param {Partial<OpenTableOptions>} options - Additional options
|
* @param {Partial<OpenTableOptions>} options - Additional options
|
||||||
*/
|
*/
|
||||||
abstract openTable(
|
abstract openTable(
|
||||||
name: string,
|
name: string,
|
||||||
namespace?: string[],
|
namespacePath?: string[],
|
||||||
options?: Partial<OpenTableOptions>,
|
options?: Partial<OpenTableOptions>,
|
||||||
): Promise<Table>;
|
): Promise<Table>;
|
||||||
|
|
||||||
@@ -193,7 +193,7 @@ export abstract class Connection {
|
|||||||
* @param {object} options - The options object.
|
* @param {object} options - The options object.
|
||||||
* @param {string} options.name - The name of the table.
|
* @param {string} options.name - The name of the table.
|
||||||
* @param {Data} options.data - Non-empty Array of Records to be inserted into the table
|
* @param {Data} options.data - Non-empty Array of Records to be inserted into the table
|
||||||
* @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
|
* @param {string[]} namespacePath - The namespace path to create the table in (defaults to root namespace)
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
abstract createTable(
|
abstract createTable(
|
||||||
@@ -201,7 +201,7 @@ export abstract class Connection {
|
|||||||
name: string;
|
name: string;
|
||||||
data: Data;
|
data: Data;
|
||||||
} & Partial<CreateTableOptions>,
|
} & Partial<CreateTableOptions>,
|
||||||
namespace?: string[],
|
namespacePath?: string[],
|
||||||
): Promise<Table>;
|
): Promise<Table>;
|
||||||
/**
|
/**
|
||||||
* Creates a new Table and initialize it with new data.
|
* Creates a new Table and initialize it with new data.
|
||||||
@@ -220,13 +220,13 @@ export abstract class Connection {
|
|||||||
* @param {string} name - The name of the table.
|
* @param {string} name - The name of the table.
|
||||||
* @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
|
* @param {Record<string, unknown>[] | TableLike} data - Non-empty Array of Records
|
||||||
* to be inserted into the table
|
* to be inserted into the table
|
||||||
* @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
|
* @param {string[]} namespacePath - The namespace path to create the table in (defaults to root namespace)
|
||||||
* @param {Partial<CreateTableOptions>} options - Additional options
|
* @param {Partial<CreateTableOptions>} options - Additional options
|
||||||
*/
|
*/
|
||||||
abstract createTable(
|
abstract createTable(
|
||||||
name: string,
|
name: string,
|
||||||
data: Record<string, unknown>[] | TableLike,
|
data: Record<string, unknown>[] | TableLike,
|
||||||
namespace?: string[],
|
namespacePath?: string[],
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table>;
|
): Promise<Table>;
|
||||||
|
|
||||||
@@ -245,28 +245,28 @@ export abstract class Connection {
|
|||||||
* Creates a new empty Table
|
* Creates a new empty Table
|
||||||
* @param {string} name - The name of the table.
|
* @param {string} name - The name of the table.
|
||||||
* @param {Schema} schema - The schema of the table
|
* @param {Schema} schema - The schema of the table
|
||||||
* @param {string[]} namespace - The namespace to create the table in (defaults to root namespace)
|
* @param {string[]} namespacePath - The namespace path to create the table in (defaults to root namespace)
|
||||||
* @param {Partial<CreateTableOptions>} options - Additional options
|
* @param {Partial<CreateTableOptions>} options - Additional options
|
||||||
*/
|
*/
|
||||||
abstract createEmptyTable(
|
abstract createEmptyTable(
|
||||||
name: string,
|
name: string,
|
||||||
schema: import("./arrow").SchemaLike,
|
schema: import("./arrow").SchemaLike,
|
||||||
namespace?: string[],
|
namespacePath?: string[],
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table>;
|
): Promise<Table>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Drop an existing table.
|
* Drop an existing table.
|
||||||
* @param {string} name The name of the table to drop.
|
* @param {string} name The name of the table to drop.
|
||||||
* @param {string[]} namespace The namespace of the table (defaults to root namespace).
|
* @param {string[]} namespacePath The namespace path of the table (defaults to root namespace).
|
||||||
*/
|
*/
|
||||||
abstract dropTable(name: string, namespace?: string[]): Promise<void>;
|
abstract dropTable(name: string, namespacePath?: string[]): Promise<void>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Drop all tables in the database.
|
* Drop all tables in the database.
|
||||||
* @param {string[]} namespace The namespace to drop tables from (defaults to root namespace).
|
* @param {string[]} namespacePath The namespace path to drop tables from (defaults to root namespace).
|
||||||
*/
|
*/
|
||||||
abstract dropAllTables(namespace?: string[]): Promise<void>;
|
abstract dropAllTables(namespacePath?: string[]): Promise<void>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clone a table from a source table.
|
* Clone a table from a source table.
|
||||||
@@ -279,7 +279,7 @@ export abstract class Connection {
|
|||||||
* @param {string} targetTableName - The name of the target table to create.
|
* @param {string} targetTableName - The name of the target table to create.
|
||||||
* @param {string} sourceUri - The URI of the source table to clone from.
|
* @param {string} sourceUri - The URI of the source table to clone from.
|
||||||
* @param {object} options - Clone options.
|
* @param {object} options - Clone options.
|
||||||
* @param {string[]} options.targetNamespace - The namespace for the target table (defaults to root namespace).
|
* @param {string[]} options.targetNamespacePath - The namespace path for the target table (defaults to root namespace).
|
||||||
* @param {number} options.sourceVersion - The version of the source table to clone.
|
* @param {number} options.sourceVersion - The version of the source table to clone.
|
||||||
* @param {string} options.sourceTag - The tag of the source table to clone.
|
* @param {string} options.sourceTag - The tag of the source table to clone.
|
||||||
* @param {boolean} options.isShallow - Whether to perform a shallow clone (defaults to true).
|
* @param {boolean} options.isShallow - Whether to perform a shallow clone (defaults to true).
|
||||||
@@ -288,7 +288,7 @@ export abstract class Connection {
|
|||||||
targetTableName: string,
|
targetTableName: string,
|
||||||
sourceUri: string,
|
sourceUri: string,
|
||||||
options?: {
|
options?: {
|
||||||
targetNamespace?: string[];
|
targetNamespacePath?: string[];
|
||||||
sourceVersion?: number;
|
sourceVersion?: number;
|
||||||
sourceTag?: string;
|
sourceTag?: string;
|
||||||
isShallow?: boolean;
|
isShallow?: boolean;
|
||||||
@@ -319,25 +319,25 @@ export class LocalConnection extends Connection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async tableNames(
|
async tableNames(
|
||||||
namespaceOrOptions?: string[] | Partial<TableNamesOptions>,
|
namespacePathOrOptions?: string[] | Partial<TableNamesOptions>,
|
||||||
options?: Partial<TableNamesOptions>,
|
options?: Partial<TableNamesOptions>,
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
// Detect if first argument is namespace array or options object
|
// Detect if first argument is namespacePath array or options object
|
||||||
let namespace: string[] | undefined;
|
let namespacePath: string[] | undefined;
|
||||||
let tableNamesOptions: Partial<TableNamesOptions> | undefined;
|
let tableNamesOptions: Partial<TableNamesOptions> | undefined;
|
||||||
|
|
||||||
if (Array.isArray(namespaceOrOptions)) {
|
if (Array.isArray(namespacePathOrOptions)) {
|
||||||
// First argument is namespace array
|
// First argument is namespacePath array
|
||||||
namespace = namespaceOrOptions;
|
namespacePath = namespacePathOrOptions;
|
||||||
tableNamesOptions = options;
|
tableNamesOptions = options;
|
||||||
} else {
|
} else {
|
||||||
// First argument is options object (backwards compatibility)
|
// First argument is options object (backwards compatibility)
|
||||||
namespace = undefined;
|
namespacePath = undefined;
|
||||||
tableNamesOptions = namespaceOrOptions;
|
tableNamesOptions = namespacePathOrOptions;
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.inner.tableNames(
|
return this.inner.tableNames(
|
||||||
namespace ?? [],
|
namespacePath ?? [],
|
||||||
tableNamesOptions?.startAfter,
|
tableNamesOptions?.startAfter,
|
||||||
tableNamesOptions?.limit,
|
tableNamesOptions?.limit,
|
||||||
);
|
);
|
||||||
@@ -345,12 +345,12 @@ export class LocalConnection extends Connection {
|
|||||||
|
|
||||||
async openTable(
|
async openTable(
|
||||||
name: string,
|
name: string,
|
||||||
namespace?: string[],
|
namespacePath?: string[],
|
||||||
options?: Partial<OpenTableOptions>,
|
options?: Partial<OpenTableOptions>,
|
||||||
): Promise<Table> {
|
): Promise<Table> {
|
||||||
const innerTable = await this.inner.openTable(
|
const innerTable = await this.inner.openTable(
|
||||||
name,
|
name,
|
||||||
namespace ?? [],
|
namespacePath ?? [],
|
||||||
cleanseStorageOptions(options?.storageOptions),
|
cleanseStorageOptions(options?.storageOptions),
|
||||||
options?.indexCacheSize,
|
options?.indexCacheSize,
|
||||||
);
|
);
|
||||||
@@ -362,7 +362,7 @@ export class LocalConnection extends Connection {
|
|||||||
targetTableName: string,
|
targetTableName: string,
|
||||||
sourceUri: string,
|
sourceUri: string,
|
||||||
options?: {
|
options?: {
|
||||||
targetNamespace?: string[];
|
targetNamespacePath?: string[];
|
||||||
sourceVersion?: number;
|
sourceVersion?: number;
|
||||||
sourceTag?: string;
|
sourceTag?: string;
|
||||||
isShallow?: boolean;
|
isShallow?: boolean;
|
||||||
@@ -371,7 +371,7 @@ export class LocalConnection extends Connection {
|
|||||||
const innerTable = await this.inner.cloneTable(
|
const innerTable = await this.inner.cloneTable(
|
||||||
targetTableName,
|
targetTableName,
|
||||||
sourceUri,
|
sourceUri,
|
||||||
options?.targetNamespace ?? [],
|
options?.targetNamespacePath ?? [],
|
||||||
options?.sourceVersion ?? null,
|
options?.sourceVersion ?? null,
|
||||||
options?.sourceTag ?? null,
|
options?.sourceTag ?? null,
|
||||||
options?.isShallow ?? true,
|
options?.isShallow ?? true,
|
||||||
@@ -406,42 +406,42 @@ export class LocalConnection extends Connection {
|
|||||||
nameOrOptions:
|
nameOrOptions:
|
||||||
| string
|
| string
|
||||||
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
|
| ({ name: string; data: Data } & Partial<CreateTableOptions>),
|
||||||
dataOrNamespace?: Record<string, unknown>[] | TableLike | string[],
|
dataOrNamespacePath?: Record<string, unknown>[] | TableLike | string[],
|
||||||
namespaceOrOptions?: string[] | Partial<CreateTableOptions>,
|
namespacePathOrOptions?: string[] | Partial<CreateTableOptions>,
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table> {
|
): Promise<Table> {
|
||||||
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
|
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
|
||||||
// First overload: createTable(options, namespace?)
|
// First overload: createTable(options, namespacePath?)
|
||||||
const { name, data, ...createOptions } = nameOrOptions;
|
const { name, data, ...createOptions } = nameOrOptions;
|
||||||
const namespace = dataOrNamespace as string[] | undefined;
|
const namespacePath = dataOrNamespacePath as string[] | undefined;
|
||||||
return this._createTableImpl(name, data, namespace, createOptions);
|
return this._createTableImpl(name, data, namespacePath, createOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Second overload: createTable(name, data, namespace?, options?)
|
// Second overload: createTable(name, data, namespacePath?, options?)
|
||||||
const name = nameOrOptions;
|
const name = nameOrOptions;
|
||||||
const data = dataOrNamespace as Record<string, unknown>[] | TableLike;
|
const data = dataOrNamespacePath as Record<string, unknown>[] | TableLike;
|
||||||
|
|
||||||
// Detect if third argument is namespace array or options object
|
// Detect if third argument is namespacePath array or options object
|
||||||
let namespace: string[] | undefined;
|
let namespacePath: string[] | undefined;
|
||||||
let createOptions: Partial<CreateTableOptions> | undefined;
|
let createOptions: Partial<CreateTableOptions> | undefined;
|
||||||
|
|
||||||
if (Array.isArray(namespaceOrOptions)) {
|
if (Array.isArray(namespacePathOrOptions)) {
|
||||||
// Third argument is namespace array
|
// Third argument is namespacePath array
|
||||||
namespace = namespaceOrOptions;
|
namespacePath = namespacePathOrOptions;
|
||||||
createOptions = options;
|
createOptions = options;
|
||||||
} else {
|
} else {
|
||||||
// Third argument is options object (backwards compatibility)
|
// Third argument is options object (backwards compatibility)
|
||||||
namespace = undefined;
|
namespacePath = undefined;
|
||||||
createOptions = namespaceOrOptions;
|
createOptions = namespacePathOrOptions;
|
||||||
}
|
}
|
||||||
|
|
||||||
return this._createTableImpl(name, data, namespace, createOptions);
|
return this._createTableImpl(name, data, namespacePath, createOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
private async _createTableImpl(
|
private async _createTableImpl(
|
||||||
name: string,
|
name: string,
|
||||||
data: Data,
|
data: Data,
|
||||||
namespace?: string[],
|
namespacePath?: string[],
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table> {
|
): Promise<Table> {
|
||||||
if (data === undefined) {
|
if (data === undefined) {
|
||||||
@@ -455,7 +455,7 @@ export class LocalConnection extends Connection {
|
|||||||
name,
|
name,
|
||||||
buf,
|
buf,
|
||||||
mode,
|
mode,
|
||||||
namespace ?? [],
|
namespacePath ?? [],
|
||||||
storageOptions,
|
storageOptions,
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -465,21 +465,21 @@ export class LocalConnection extends Connection {
|
|||||||
async createEmptyTable(
|
async createEmptyTable(
|
||||||
name: string,
|
name: string,
|
||||||
schema: import("./arrow").SchemaLike,
|
schema: import("./arrow").SchemaLike,
|
||||||
namespaceOrOptions?: string[] | Partial<CreateTableOptions>,
|
namespacePathOrOptions?: string[] | Partial<CreateTableOptions>,
|
||||||
options?: Partial<CreateTableOptions>,
|
options?: Partial<CreateTableOptions>,
|
||||||
): Promise<Table> {
|
): Promise<Table> {
|
||||||
// Detect if third argument is namespace array or options object
|
// Detect if third argument is namespacePath array or options object
|
||||||
let namespace: string[] | undefined;
|
let namespacePath: string[] | undefined;
|
||||||
let createOptions: Partial<CreateTableOptions> | undefined;
|
let createOptions: Partial<CreateTableOptions> | undefined;
|
||||||
|
|
||||||
if (Array.isArray(namespaceOrOptions)) {
|
if (Array.isArray(namespacePathOrOptions)) {
|
||||||
// Third argument is namespace array
|
// Third argument is namespacePath array
|
||||||
namespace = namespaceOrOptions;
|
namespacePath = namespacePathOrOptions;
|
||||||
createOptions = options;
|
createOptions = options;
|
||||||
} else {
|
} else {
|
||||||
// Third argument is options object (backwards compatibility)
|
// Third argument is options object (backwards compatibility)
|
||||||
namespace = undefined;
|
namespacePath = undefined;
|
||||||
createOptions = namespaceOrOptions;
|
createOptions = namespacePathOrOptions;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mode: string = createOptions?.mode ?? "create";
|
let mode: string = createOptions?.mode ?? "create";
|
||||||
@@ -502,18 +502,18 @@ export class LocalConnection extends Connection {
|
|||||||
name,
|
name,
|
||||||
buf,
|
buf,
|
||||||
mode,
|
mode,
|
||||||
namespace ?? [],
|
namespacePath ?? [],
|
||||||
storageOptions,
|
storageOptions,
|
||||||
);
|
);
|
||||||
return new LocalTable(innerTable);
|
return new LocalTable(innerTable);
|
||||||
}
|
}
|
||||||
|
|
||||||
async dropTable(name: string, namespace?: string[]): Promise<void> {
|
async dropTable(name: string, namespacePath?: string[]): Promise<void> {
|
||||||
return this.inner.dropTable(name, namespace ?? []);
|
return this.inner.dropTable(name, namespacePath ?? []);
|
||||||
}
|
}
|
||||||
|
|
||||||
async dropAllTables(namespace?: string[]): Promise<void> {
|
async dropAllTables(namespacePath?: string[]): Promise<void> {
|
||||||
return this.inner.dropAllTables(namespace ?? []);
|
return this.inner.dropAllTables(namespacePath ?? []);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import {
|
|||||||
Table as ArrowTable,
|
Table as ArrowTable,
|
||||||
type IntoVector,
|
type IntoVector,
|
||||||
RecordBatch,
|
RecordBatch,
|
||||||
|
extractVectorBuffer,
|
||||||
fromBufferToRecordBatch,
|
fromBufferToRecordBatch,
|
||||||
fromRecordBatchToBuffer,
|
fromRecordBatchToBuffer,
|
||||||
tableFromIPC,
|
tableFromIPC,
|
||||||
@@ -661,10 +662,8 @@ export class VectorQuery extends StandardQueryBase<NativeVectorQuery> {
|
|||||||
const res = (async () => {
|
const res = (async () => {
|
||||||
try {
|
try {
|
||||||
const v = await vector;
|
const v = await vector;
|
||||||
const arr = Float32Array.from(v);
|
|
||||||
//
|
|
||||||
// biome-ignore lint/suspicious/noExplicitAny: we need to get the `inner`, but js has no package scoping
|
// biome-ignore lint/suspicious/noExplicitAny: we need to get the `inner`, but js has no package scoping
|
||||||
const value: any = this.addQueryVector(arr);
|
const value: any = this.addQueryVector(v);
|
||||||
const inner = value.inner as
|
const inner = value.inner as
|
||||||
| NativeVectorQuery
|
| NativeVectorQuery
|
||||||
| Promise<NativeVectorQuery>;
|
| Promise<NativeVectorQuery>;
|
||||||
@@ -676,7 +675,12 @@ export class VectorQuery extends StandardQueryBase<NativeVectorQuery> {
|
|||||||
return new VectorQuery(res);
|
return new VectorQuery(res);
|
||||||
} else {
|
} else {
|
||||||
super.doCall((inner) => {
|
super.doCall((inner) => {
|
||||||
inner.addQueryVector(Float32Array.from(vector));
|
const raw = Array.isArray(vector) ? null : extractVectorBuffer(vector);
|
||||||
|
if (raw) {
|
||||||
|
inner.addQueryVectorRaw(raw.data, raw.dtype);
|
||||||
|
} else {
|
||||||
|
inner.addQueryVector(Float32Array.from(vector as number[]));
|
||||||
|
}
|
||||||
});
|
});
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
@@ -765,14 +769,23 @@ export class Query extends StandardQueryBase<NativeQuery> {
|
|||||||
* a default `limit` of 10 will be used. @see {@link Query#limit}
|
* a default `limit` of 10 will be used. @see {@link Query#limit}
|
||||||
*/
|
*/
|
||||||
nearestTo(vector: IntoVector): VectorQuery {
|
nearestTo(vector: IntoVector): VectorQuery {
|
||||||
|
const callNearestTo = (
|
||||||
|
inner: NativeQuery,
|
||||||
|
resolved: Float32Array | Float64Array | Uint8Array | number[],
|
||||||
|
): NativeVectorQuery => {
|
||||||
|
const raw = Array.isArray(resolved)
|
||||||
|
? null
|
||||||
|
: extractVectorBuffer(resolved);
|
||||||
|
if (raw) {
|
||||||
|
return inner.nearestToRaw(raw.data, raw.dtype);
|
||||||
|
}
|
||||||
|
return inner.nearestTo(Float32Array.from(resolved as number[]));
|
||||||
|
};
|
||||||
|
|
||||||
if (this.inner instanceof Promise) {
|
if (this.inner instanceof Promise) {
|
||||||
const nativeQuery = this.inner.then(async (inner) => {
|
const nativeQuery = this.inner.then(async (inner) => {
|
||||||
if (vector instanceof Promise) {
|
const resolved = vector instanceof Promise ? await vector : vector;
|
||||||
const arr = await vector.then((v) => Float32Array.from(v));
|
return callNearestTo(inner, resolved);
|
||||||
return inner.nearestTo(arr);
|
|
||||||
} else {
|
|
||||||
return inner.nearestTo(Float32Array.from(vector));
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
return new VectorQuery(nativeQuery);
|
return new VectorQuery(nativeQuery);
|
||||||
}
|
}
|
||||||
@@ -780,10 +793,8 @@ export class Query extends StandardQueryBase<NativeQuery> {
|
|||||||
const res = (async () => {
|
const res = (async () => {
|
||||||
try {
|
try {
|
||||||
const v = await vector;
|
const v = await vector;
|
||||||
const arr = Float32Array.from(v);
|
|
||||||
//
|
|
||||||
// biome-ignore lint/suspicious/noExplicitAny: we need to get the `inner`, but js has no package scoping
|
// biome-ignore lint/suspicious/noExplicitAny: we need to get the `inner`, but js has no package scoping
|
||||||
const value: any = this.nearestTo(arr);
|
const value: any = this.nearestTo(v);
|
||||||
const inner = value.inner as
|
const inner = value.inner as
|
||||||
| NativeVectorQuery
|
| NativeVectorQuery
|
||||||
| Promise<NativeVectorQuery>;
|
| Promise<NativeVectorQuery>;
|
||||||
@@ -794,7 +805,7 @@ export class Query extends StandardQueryBase<NativeQuery> {
|
|||||||
})();
|
})();
|
||||||
return new VectorQuery(res);
|
return new VectorQuery(res);
|
||||||
} else {
|
} else {
|
||||||
const vectorQuery = this.inner.nearestTo(Float32Array.from(vector));
|
const vectorQuery = callNearestTo(this.inner, vector);
|
||||||
return new VectorQuery(vectorQuery);
|
return new VectorQuery(vectorQuery);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-musl.node",
|
"main": "lancedb.linux-arm64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-musl.node",
|
"main": "lancedb.linux-x64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"private": false,
|
"private": false,
|
||||||
"version": "0.27.0",
|
"version": "0.28.0-beta.11",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
@@ -75,7 +75,6 @@
|
|||||||
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
|
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
|
||||||
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
||||||
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
|
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
|
||||||
"postbuild:release": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
|
||||||
"build": "npm run build:debug && npm run tsc",
|
"build": "npm run build:debug && npm run tsc",
|
||||||
"build-release": "npm run build:release && npm run tsc",
|
"build-release": "npm run build:release && npm run tsc",
|
||||||
"tsc": "tsc -b",
|
"tsc": "tsc -b",
|
||||||
|
|||||||
@@ -67,6 +67,12 @@ impl Connection {
|
|||||||
builder = builder.storage_option(key, value);
|
builder = builder.storage_option(key, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if let Some(manifest_enabled) = options.manifest_enabled {
|
||||||
|
builder = builder.manifest_enabled(manifest_enabled);
|
||||||
|
}
|
||||||
|
if let Some(namespace_client_properties) = options.namespace_client_properties {
|
||||||
|
builder = builder.namespace_client_properties(namespace_client_properties);
|
||||||
|
}
|
||||||
|
|
||||||
// Create client config, optionally with header provider
|
// Create client config, optionally with header provider
|
||||||
let client_config = options.client_config.unwrap_or_default();
|
let client_config = options.client_config.unwrap_or_default();
|
||||||
@@ -119,12 +125,12 @@ impl Connection {
|
|||||||
#[napi(catch_unwind)]
|
#[napi(catch_unwind)]
|
||||||
pub async fn table_names(
|
pub async fn table_names(
|
||||||
&self,
|
&self,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
start_after: Option<String>,
|
start_after: Option<String>,
|
||||||
limit: Option<u32>,
|
limit: Option<u32>,
|
||||||
) -> napi::Result<Vec<String>> {
|
) -> napi::Result<Vec<String>> {
|
||||||
let mut op = self.get_inner()?.table_names();
|
let mut op = self.get_inner()?.table_names();
|
||||||
op = op.namespace(namespace);
|
op = op.namespace(namespace_path.unwrap_or_default());
|
||||||
if let Some(start_after) = start_after {
|
if let Some(start_after) = start_after {
|
||||||
op = op.start_after(start_after);
|
op = op.start_after(start_after);
|
||||||
}
|
}
|
||||||
@@ -146,7 +152,7 @@ impl Connection {
|
|||||||
name: String,
|
name: String,
|
||||||
buf: Buffer,
|
buf: Buffer,
|
||||||
mode: String,
|
mode: String,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
storage_options: Option<HashMap<String, String>>,
|
storage_options: Option<HashMap<String, String>>,
|
||||||
) -> napi::Result<Table> {
|
) -> napi::Result<Table> {
|
||||||
let batches = ipc_file_to_batches(buf.to_vec())
|
let batches = ipc_file_to_batches(buf.to_vec())
|
||||||
@@ -154,7 +160,7 @@ impl Connection {
|
|||||||
let mode = Self::parse_create_mode_str(&mode)?;
|
let mode = Self::parse_create_mode_str(&mode)?;
|
||||||
let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);
|
let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);
|
||||||
|
|
||||||
builder = builder.namespace(namespace);
|
builder = builder.namespace(namespace_path.unwrap_or_default());
|
||||||
|
|
||||||
if let Some(storage_options) = storage_options {
|
if let Some(storage_options) = storage_options {
|
||||||
for (key, value) in storage_options {
|
for (key, value) in storage_options {
|
||||||
@@ -171,7 +177,7 @@ impl Connection {
|
|||||||
name: String,
|
name: String,
|
||||||
schema_buf: Buffer,
|
schema_buf: Buffer,
|
||||||
mode: String,
|
mode: String,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
storage_options: Option<HashMap<String, String>>,
|
storage_options: Option<HashMap<String, String>>,
|
||||||
) -> napi::Result<Table> {
|
) -> napi::Result<Table> {
|
||||||
let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
|
let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
|
||||||
@@ -183,7 +189,7 @@ impl Connection {
|
|||||||
.create_empty_table(&name, schema)
|
.create_empty_table(&name, schema)
|
||||||
.mode(mode);
|
.mode(mode);
|
||||||
|
|
||||||
builder = builder.namespace(namespace);
|
builder = builder.namespace(namespace_path.unwrap_or_default());
|
||||||
|
|
||||||
if let Some(storage_options) = storage_options {
|
if let Some(storage_options) = storage_options {
|
||||||
for (key, value) in storage_options {
|
for (key, value) in storage_options {
|
||||||
@@ -198,13 +204,13 @@ impl Connection {
|
|||||||
pub async fn open_table(
|
pub async fn open_table(
|
||||||
&self,
|
&self,
|
||||||
name: String,
|
name: String,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
storage_options: Option<HashMap<String, String>>,
|
storage_options: Option<HashMap<String, String>>,
|
||||||
index_cache_size: Option<u32>,
|
index_cache_size: Option<u32>,
|
||||||
) -> napi::Result<Table> {
|
) -> napi::Result<Table> {
|
||||||
let mut builder = self.get_inner()?.open_table(&name);
|
let mut builder = self.get_inner()?.open_table(&name);
|
||||||
|
|
||||||
builder = builder.namespace(namespace);
|
builder = builder.namespace(namespace_path.unwrap_or_default());
|
||||||
|
|
||||||
if let Some(storage_options) = storage_options {
|
if let Some(storage_options) = storage_options {
|
||||||
for (key, value) in storage_options {
|
for (key, value) in storage_options {
|
||||||
@@ -223,7 +229,7 @@ impl Connection {
|
|||||||
&self,
|
&self,
|
||||||
target_table_name: String,
|
target_table_name: String,
|
||||||
source_uri: String,
|
source_uri: String,
|
||||||
target_namespace: Vec<String>,
|
target_namespace_path: Option<Vec<String>>,
|
||||||
source_version: Option<i64>,
|
source_version: Option<i64>,
|
||||||
source_tag: Option<String>,
|
source_tag: Option<String>,
|
||||||
is_shallow: bool,
|
is_shallow: bool,
|
||||||
@@ -232,7 +238,7 @@ impl Connection {
|
|||||||
.get_inner()?
|
.get_inner()?
|
||||||
.clone_table(&target_table_name, &source_uri);
|
.clone_table(&target_table_name, &source_uri);
|
||||||
|
|
||||||
builder = builder.target_namespace(target_namespace);
|
builder = builder.target_namespace(target_namespace_path.unwrap_or_default());
|
||||||
|
|
||||||
if let Some(version) = source_version {
|
if let Some(version) = source_version {
|
||||||
builder = builder.source_version(version as u64);
|
builder = builder.source_version(version as u64);
|
||||||
@@ -250,18 +256,21 @@ impl Connection {
|
|||||||
|
|
||||||
/// Drop table with the name. Or raise an error if the table does not exist.
|
/// Drop table with the name. Or raise an error if the table does not exist.
|
||||||
#[napi(catch_unwind)]
|
#[napi(catch_unwind)]
|
||||||
pub async fn drop_table(&self, name: String, namespace: Vec<String>) -> napi::Result<()> {
|
pub async fn drop_table(
|
||||||
|
&self,
|
||||||
|
name: String,
|
||||||
|
namespace_path: Option<Vec<String>>,
|
||||||
|
) -> napi::Result<()> {
|
||||||
|
let ns = namespace_path.unwrap_or_default();
|
||||||
self.get_inner()?
|
self.get_inner()?
|
||||||
.drop_table(&name, &namespace)
|
.drop_table(&name, &ns)
|
||||||
.await
|
.await
|
||||||
.default_error()
|
.default_error()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[napi(catch_unwind)]
|
#[napi(catch_unwind)]
|
||||||
pub async fn drop_all_tables(&self, namespace: Vec<String>) -> napi::Result<()> {
|
pub async fn drop_all_tables(&self, namespace_path: Option<Vec<String>>) -> napi::Result<()> {
|
||||||
self.get_inner()?
|
let ns = namespace_path.unwrap_or_default();
|
||||||
.drop_all_tables(&namespace)
|
self.get_inner()?.drop_all_tables(&ns).await.default_error()
|
||||||
.await
|
|
||||||
.default_error()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,8 +35,15 @@ pub struct ConnectionOptions {
|
|||||||
pub read_consistency_interval: Option<f64>,
|
pub read_consistency_interval: Option<f64>,
|
||||||
/// (For LanceDB OSS only): configuration for object storage.
|
/// (For LanceDB OSS only): configuration for object storage.
|
||||||
///
|
///
|
||||||
/// The available options are described at https://lancedb.com/docs/storage/
|
/// The available options are described at https://docs.lancedb.com/storage/
|
||||||
pub storage_options: Option<HashMap<String, String>>,
|
pub storage_options: Option<HashMap<String, String>>,
|
||||||
|
/// (For LanceDB OSS only): use directory namespace manifests as the source
|
||||||
|
/// of truth for table metadata. Existing directory-listed root tables are
|
||||||
|
/// migrated into the manifest on access.
|
||||||
|
pub manifest_enabled: Option<bool>,
|
||||||
|
/// (For LanceDB OSS only): extra properties for the backing namespace
|
||||||
|
/// client used by manifest-enabled native connections.
|
||||||
|
pub namespace_client_properties: Option<HashMap<String, String>>,
|
||||||
/// (For LanceDB OSS only): the session to use for this connection. Holds
|
/// (For LanceDB OSS only): the session to use for this connection. Holds
|
||||||
/// shared caches and other session-specific state.
|
/// shared caches and other session-specific state.
|
||||||
pub session: Option<session::Session>,
|
pub session: Option<session::Session>,
|
||||||
|
|||||||
@@ -3,6 +3,12 @@
|
|||||||
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use arrow_array::{
|
||||||
|
Array, Float16Array as ArrowFloat16Array, Float32Array as ArrowFloat32Array,
|
||||||
|
Float64Array as ArrowFloat64Array, UInt8Array as ArrowUInt8Array,
|
||||||
|
};
|
||||||
|
use arrow_buffer::ScalarBuffer;
|
||||||
|
use half::f16;
|
||||||
use lancedb::index::scalar::{
|
use lancedb::index::scalar::{
|
||||||
BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
|
BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
|
||||||
Operator, PhraseQuery,
|
Operator, PhraseQuery,
|
||||||
@@ -24,6 +30,33 @@ use crate::rerankers::RerankHybridCallbackArgs;
|
|||||||
use crate::rerankers::Reranker;
|
use crate::rerankers::Reranker;
|
||||||
use crate::util::{parse_distance_type, schema_to_buffer};
|
use crate::util::{parse_distance_type, schema_to_buffer};
|
||||||
|
|
||||||
|
fn bytes_to_arrow_array(data: Uint8Array, dtype: String) -> napi::Result<Arc<dyn Array>> {
|
||||||
|
let buf = arrow_buffer::Buffer::from(data.to_vec());
|
||||||
|
let num_bytes = buf.len();
|
||||||
|
match dtype.as_str() {
|
||||||
|
"float16" => {
|
||||||
|
let scalar_buf = ScalarBuffer::<f16>::new(buf, 0, num_bytes / 2);
|
||||||
|
Ok(Arc::new(ArrowFloat16Array::new(scalar_buf, None)))
|
||||||
|
}
|
||||||
|
"float32" => {
|
||||||
|
let scalar_buf = ScalarBuffer::<f32>::new(buf, 0, num_bytes / 4);
|
||||||
|
Ok(Arc::new(ArrowFloat32Array::new(scalar_buf, None)))
|
||||||
|
}
|
||||||
|
"float64" => {
|
||||||
|
let scalar_buf = ScalarBuffer::<f64>::new(buf, 0, num_bytes / 8);
|
||||||
|
Ok(Arc::new(ArrowFloat64Array::new(scalar_buf, None)))
|
||||||
|
}
|
||||||
|
"uint8" => {
|
||||||
|
let scalar_buf = ScalarBuffer::<u8>::new(buf, 0, num_bytes);
|
||||||
|
Ok(Arc::new(ArrowUInt8Array::new(scalar_buf, None)))
|
||||||
|
}
|
||||||
|
_ => Err(napi::Error::from_reason(format!(
|
||||||
|
"Unsupported vector dtype: {}. Expected one of: float16, float32, float64, uint8",
|
||||||
|
dtype
|
||||||
|
))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[napi]
|
#[napi]
|
||||||
pub struct Query {
|
pub struct Query {
|
||||||
inner: LanceDbQuery,
|
inner: LanceDbQuery,
|
||||||
@@ -78,6 +111,13 @@ impl Query {
|
|||||||
Ok(VectorQuery { inner })
|
Ok(VectorQuery { inner })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi]
|
||||||
|
pub fn nearest_to_raw(&mut self, data: Uint8Array, dtype: String) -> Result<VectorQuery> {
|
||||||
|
let array = bytes_to_arrow_array(data, dtype)?;
|
||||||
|
let inner = self.inner.clone().nearest_to(array).default_error()?;
|
||||||
|
Ok(VectorQuery { inner })
|
||||||
|
}
|
||||||
|
|
||||||
#[napi]
|
#[napi]
|
||||||
pub fn fast_search(&mut self) {
|
pub fn fast_search(&mut self) {
|
||||||
self.inner = self.inner.clone().fast_search();
|
self.inner = self.inner.clone().fast_search();
|
||||||
@@ -163,6 +203,13 @@ impl VectorQuery {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi]
|
||||||
|
pub fn add_query_vector_raw(&mut self, data: Uint8Array, dtype: String) -> Result<()> {
|
||||||
|
let array = bytes_to_arrow_array(data, dtype)?;
|
||||||
|
self.inner = self.inner.clone().add_query_vector(array).default_error()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[napi]
|
#[napi]
|
||||||
pub fn distance_type(&mut self, distance_type: String) -> napi::Result<()> {
|
pub fn distance_type(&mut self, distance_type: String) -> napi::Result<()> {
|
||||||
let distance_type = parse_distance_type(distance_type)?;
|
let distance_type = parse_distance_type(distance_type)?;
|
||||||
|
|||||||
@@ -92,6 +92,13 @@ pub struct ClientConfig {
|
|||||||
pub extra_headers: Option<HashMap<String, String>>,
|
pub extra_headers: Option<HashMap<String, String>>,
|
||||||
pub id_delimiter: Option<String>,
|
pub id_delimiter: Option<String>,
|
||||||
pub tls_config: Option<TlsConfig>,
|
pub tls_config: Option<TlsConfig>,
|
||||||
|
/// User identifier for tracking purposes.
|
||||||
|
///
|
||||||
|
/// This is sent as the `x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
|
||||||
|
/// It can be set directly, or via the `LANCEDB_USER_ID` environment variable.
|
||||||
|
/// Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another environment
|
||||||
|
/// variable that contains the user ID value.
|
||||||
|
pub user_id: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<TimeoutConfig> for lancedb::remote::TimeoutConfig {
|
impl From<TimeoutConfig> for lancedb::remote::TimeoutConfig {
|
||||||
@@ -145,6 +152,7 @@ impl From<ClientConfig> for lancedb::remote::ClientConfig {
|
|||||||
id_delimiter: config.id_delimiter,
|
id_delimiter: config.id_delimiter,
|
||||||
tls_config: config.tls_config.map(Into::into),
|
tls_config: config.tls_config.map(Into::into),
|
||||||
header_provider: None, // the header provider is set separately later
|
header_provider: None, // the header provider is set separately later
|
||||||
|
user_id: config.user_id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ type RerankHybridFn = ThreadsafeFunction<
|
|||||||
RerankHybridCallbackArgs,
|
RerankHybridCallbackArgs,
|
||||||
Status,
|
Status,
|
||||||
false,
|
false,
|
||||||
|
true,
|
||||||
>;
|
>;
|
||||||
|
|
||||||
/// Reranker implementation that "wraps" a NodeJS Reranker implementation.
|
/// Reranker implementation that "wraps" a NodeJS Reranker implementation.
|
||||||
@@ -32,7 +33,10 @@ impl Reranker {
|
|||||||
pub fn new(
|
pub fn new(
|
||||||
rerank_hybrid: Function<RerankHybridCallbackArgs, Promise<Buffer>>,
|
rerank_hybrid: Function<RerankHybridCallbackArgs, Promise<Buffer>>,
|
||||||
) -> napi::Result<Self> {
|
) -> napi::Result<Self> {
|
||||||
let rerank_hybrid = rerank_hybrid.build_threadsafe_function().build()?;
|
let rerank_hybrid = rerank_hybrid
|
||||||
|
.build_threadsafe_function()
|
||||||
|
.weak::<true>()
|
||||||
|
.build()?;
|
||||||
Ok(Self { rerank_hybrid })
|
Ok(Self { rerank_hybrid })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.30.1"
|
current_version = "0.31.0-beta.11"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
2
python/.gitignore
vendored
2
python/.gitignore
vendored
@@ -1,3 +1,5 @@
|
|||||||
# Test data created by some example tests
|
# Test data created by some example tests
|
||||||
data/
|
data/
|
||||||
_lancedb.pyd
|
_lancedb.pyd
|
||||||
|
# macOS debug symbols bundle generated during build
|
||||||
|
*.dSYM/
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.30.1"
|
version = "0.31.0-beta.11"
|
||||||
|
publish = false
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -14,7 +15,7 @@ name = "_lancedb"
|
|||||||
crate-type = ["cdylib"]
|
crate-type = ["cdylib"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
arrow = { version = "57.2", features = ["pyarrow"] }
|
arrow = { version = "58.0.0", features = ["pyarrow"] }
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
bytes = "1"
|
bytes = "1"
|
||||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||||
@@ -23,8 +24,9 @@ lance-namespace.workspace = true
|
|||||||
lance-namespace-impls.workspace = true
|
lance-namespace-impls.workspace = true
|
||||||
lance-io.workspace = true
|
lance-io.workspace = true
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
pyo3 = { version = "0.26", features = ["extension-module", "abi3-py39"] }
|
log.workspace = true
|
||||||
pyo3-async-runtimes = { version = "0.26", features = [
|
pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
|
||||||
|
pyo3-async-runtimes = { version = "0.28", features = [
|
||||||
"attributes",
|
"attributes",
|
||||||
"tokio-runtime",
|
"tokio-runtime",
|
||||||
] }
|
] }
|
||||||
@@ -36,7 +38,7 @@ snafu.workspace = true
|
|||||||
tokio = { version = "1.40", features = ["sync"] }
|
tokio = { version = "1.40", features = ["sync"] }
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
pyo3-build-config = { version = "0.26", features = [
|
pyo3-build-config = { version = "0.28", features = [
|
||||||
"extension-module",
|
"extension-module",
|
||||||
"abi3-py39",
|
"abi3-py39",
|
||||||
] }
|
] }
|
||||||
|
|||||||
@@ -183,7 +183,6 @@
|
|||||||
| stack-data | 0.6.3 | MIT License | http://github.com/alexmojaki/stack_data |
|
| stack-data | 0.6.3 | MIT License | http://github.com/alexmojaki/stack_data |
|
||||||
| sympy | 1.14.0 | BSD License | https://sympy.org |
|
| sympy | 1.14.0 | BSD License | https://sympy.org |
|
||||||
| tabulate | 0.9.0 | MIT License | https://github.com/astanin/python-tabulate |
|
| tabulate | 0.9.0 | MIT License | https://github.com/astanin/python-tabulate |
|
||||||
| tantivy | 0.25.1 | UNKNOWN | UNKNOWN |
|
|
||||||
| threadpoolctl | 3.6.0 | BSD License | https://github.com/joblib/threadpoolctl |
|
| threadpoolctl | 3.6.0 | BSD License | https://github.com/joblib/threadpoolctl |
|
||||||
| timm | 1.0.24 | Apache Software License | https://github.com/huggingface/pytorch-image-models |
|
| timm | 1.0.24 | Apache Software License | https://github.com/huggingface/pytorch-image-models |
|
||||||
| tinycss2 | 1.4.0 | BSD License | https://www.courtbouillon.org/tinycss2 |
|
| tinycss2 | 1.4.0 | BSD License | https://www.courtbouillon.org/tinycss2 |
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ repository = "https://github.com/lancedb/lancedb"
|
|||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
pylance = [
|
pylance = [
|
||||||
"pylance>=4.0.0b7",
|
"pylance>=5.0.0b5",
|
||||||
]
|
]
|
||||||
tests = [
|
tests = [
|
||||||
"aiohttp>=3.9.0",
|
"aiohttp>=3.9.0",
|
||||||
@@ -57,9 +57,8 @@ tests = [
|
|||||||
"duckdb>=0.9.0",
|
"duckdb>=0.9.0",
|
||||||
"pytz>=2023.3",
|
"pytz>=2023.3",
|
||||||
"polars>=0.19, <=1.3.0",
|
"polars>=0.19, <=1.3.0",
|
||||||
"tantivy>=0.20.0",
|
|
||||||
"pyarrow-stubs>=16.0",
|
"pyarrow-stubs>=16.0",
|
||||||
"pylance>=4.0.0b7",
|
"pylance>=5.0.0b5",
|
||||||
"requests>=2.31.0",
|
"requests>=2.31.0",
|
||||||
"datafusion>=52,<53",
|
"datafusion>=52,<53",
|
||||||
]
|
]
|
||||||
@@ -83,7 +82,7 @@ embeddings = [
|
|||||||
"colpali-engine>=0.3.10",
|
"colpali-engine>=0.3.10",
|
||||||
"huggingface_hub>=0.19.0",
|
"huggingface_hub>=0.19.0",
|
||||||
"InstructorEmbedding>=1.0.1",
|
"InstructorEmbedding>=1.0.1",
|
||||||
"google.generativeai>=0.3.0",
|
"google-genai>=1.0.0",
|
||||||
"boto3>=1.28.57",
|
"boto3>=1.28.57",
|
||||||
"awscli>=1.44.38",
|
"awscli>=1.44.38",
|
||||||
"botocore>=1.31.57",
|
"botocore>=1.31.57",
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import importlib.metadata
|
|||||||
import os
|
import os
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import Dict, Optional, Union, Any
|
from typing import Dict, Optional, Union, Any, List
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
__version__ = importlib.metadata.version("lancedb")
|
__version__ = importlib.metadata.version("lancedb")
|
||||||
@@ -15,9 +15,9 @@ from ._lancedb import connect as lancedb_connect
|
|||||||
from .common import URI, sanitize_uri
|
from .common import URI, sanitize_uri
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from .db import AsyncConnection, DBConnection, LanceDBConnection
|
from .db import AsyncConnection, DBConnection, LanceDBConnection
|
||||||
from .io import StorageOptionsProvider
|
|
||||||
from .remote import ClientConfig
|
from .remote import ClientConfig
|
||||||
from .remote.db import RemoteDBConnection
|
from .remote.db import RemoteDBConnection
|
||||||
|
from .expr import Expr, col, lit, func
|
||||||
from .schema import vector
|
from .schema import vector
|
||||||
from .table import AsyncTable, Table
|
from .table import AsyncTable, Table
|
||||||
from ._lancedb import Session
|
from ._lancedb import Session
|
||||||
@@ -63,7 +63,7 @@ def _check_s3_bucket_with_dots(
|
|||||||
|
|
||||||
|
|
||||||
def connect(
|
def connect(
|
||||||
uri: URI,
|
uri: Optional[URI] = None,
|
||||||
*,
|
*,
|
||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
region: str = "us-east-1",
|
region: str = "us-east-1",
|
||||||
@@ -73,14 +73,19 @@ def connect(
|
|||||||
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
|
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
session: Optional[Session] = None,
|
session: Optional[Session] = None,
|
||||||
|
manifest_enabled: bool = False,
|
||||||
|
namespace_client_impl: Optional[str] = None,
|
||||||
|
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||||
|
namespace_client_pushdown_operations: Optional[List[str]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> DBConnection:
|
) -> DBConnection:
|
||||||
"""Connect to a LanceDB database.
|
"""Connect to a LanceDB database.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
uri: str or Path
|
uri: str or Path, optional
|
||||||
The uri of the database.
|
The uri of the database. When ``namespace_client_impl`` is provided you may
|
||||||
|
omit ``uri`` and connect through a namespace client instead.
|
||||||
api_key: str, optional
|
api_key: str, optional
|
||||||
If presented, connect to LanceDB cloud.
|
If presented, connect to LanceDB cloud.
|
||||||
Otherwise, connect to a database on file system or cloud storage.
|
Otherwise, connect to a database on file system or cloud storage.
|
||||||
@@ -106,13 +111,29 @@ def connect(
|
|||||||
default configuration is used.
|
default configuration is used.
|
||||||
storage_options: dict, optional
|
storage_options: dict, optional
|
||||||
Additional options for the storage backend. See available options at
|
Additional options for the storage backend. See available options at
|
||||||
<https://lancedb.com/docs/storage/>
|
<https://docs.lancedb.com/storage/>
|
||||||
|
manifest_enabled : bool, default False
|
||||||
|
When true for local/native connections, use directory namespace
|
||||||
|
manifests as the source of truth for table metadata. Existing
|
||||||
|
directory-listed root tables are migrated into the manifest on access.
|
||||||
session: Session, optional
|
session: Session, optional
|
||||||
(For LanceDB OSS only)
|
(For LanceDB OSS only)
|
||||||
A session to use for this connection. Sessions allow you to configure
|
A session to use for this connection. Sessions allow you to configure
|
||||||
cache sizes for index and metadata caches, which can significantly
|
cache sizes for index and metadata caches, which can significantly
|
||||||
impact memory use and performance. They can also be re-used across
|
impact memory use and performance. They can also be re-used across
|
||||||
multiple connections to share the same cache state.
|
multiple connections to share the same cache state.
|
||||||
|
namespace_client_impl : str, optional
|
||||||
|
When provided along with ``namespace_client_properties``, ``connect``
|
||||||
|
returns a namespace-backed connection by delegating to
|
||||||
|
:func:`connect_namespace`. The value identifies which namespace
|
||||||
|
implementation to load (e.g., ``"dir"`` or ``"rest"``).
|
||||||
|
namespace_client_properties : dict, optional
|
||||||
|
Configuration to pass to the namespace client implementation. Required
|
||||||
|
when ``namespace_client_impl`` is set.
|
||||||
|
namespace_client_pushdown_operations : list[str], optional
|
||||||
|
Only used when ``namespace_client_properties`` is provided. Forwards to
|
||||||
|
:func:`connect_namespace` to control which operations are executed on the
|
||||||
|
namespace service (e.g., ``["QueryTable", "CreateTable"]``).
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
@@ -132,11 +153,48 @@ def connect(
|
|||||||
>>> db = lancedb.connect("db://my_database", api_key="ldb_...",
|
>>> db = lancedb.connect("db://my_database", api_key="ldb_...",
|
||||||
... client_config={"retry_config": {"retries": 5}})
|
... client_config={"retry_config": {"retries": 5}})
|
||||||
|
|
||||||
|
Connect to a namespace-backed database:
|
||||||
|
|
||||||
|
>>> db = lancedb.connect(namespace_client_impl="dir",
|
||||||
|
... namespace_client_properties={"root": "/tmp/ns"})
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
conn : DBConnection
|
conn : DBConnection
|
||||||
A connection to a LanceDB database.
|
A connection to a LanceDB database.
|
||||||
"""
|
"""
|
||||||
|
if namespace_client_impl is not None:
|
||||||
|
if namespace_client_properties is None:
|
||||||
|
raise ValueError(
|
||||||
|
"namespace_client_properties must be provided when "
|
||||||
|
"namespace_client_impl is set"
|
||||||
|
)
|
||||||
|
if kwargs:
|
||||||
|
raise ValueError(f"Unknown keyword arguments: {kwargs}")
|
||||||
|
return connect_namespace(
|
||||||
|
namespace_client_impl,
|
||||||
|
namespace_client_properties,
|
||||||
|
read_consistency_interval=read_consistency_interval,
|
||||||
|
storage_options=storage_options,
|
||||||
|
session=session,
|
||||||
|
namespace_client_pushdown_operations=namespace_client_pushdown_operations,
|
||||||
|
)
|
||||||
|
|
||||||
|
if namespace_client_properties is not None and not manifest_enabled:
|
||||||
|
raise ValueError(
|
||||||
|
"namespace_client_impl must be provided when using "
|
||||||
|
"namespace_client_properties unless manifest_enabled=True"
|
||||||
|
)
|
||||||
|
|
||||||
|
if namespace_client_pushdown_operations is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"namespace_client_pushdown_operations is only valid when "
|
||||||
|
"connecting through a namespace"
|
||||||
|
)
|
||||||
|
if uri is None:
|
||||||
|
raise ValueError(
|
||||||
|
"uri is required when not connecting through a namespace client"
|
||||||
|
)
|
||||||
if isinstance(uri, str) and uri.startswith("db://"):
|
if isinstance(uri, str) and uri.startswith("db://"):
|
||||||
if api_key is None:
|
if api_key is None:
|
||||||
api_key = os.environ.get("LANCEDB_API_KEY")
|
api_key = os.environ.get("LANCEDB_API_KEY")
|
||||||
@@ -165,9 +223,92 @@ def connect(
|
|||||||
read_consistency_interval=read_consistency_interval,
|
read_consistency_interval=read_consistency_interval,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
session=session,
|
session=session,
|
||||||
|
manifest_enabled=manifest_enabled,
|
||||||
|
namespace_client_properties=namespace_client_properties,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
WORKER_PROPERTY_PREFIX = "_lancedb_worker_"
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_worker_overrides(props: dict[str, str]) -> dict[str, str]:
|
||||||
|
"""Apply worker property overrides.
|
||||||
|
|
||||||
|
Any key starting with ``_lancedb_worker_`` is extracted, the prefix
|
||||||
|
is stripped, and the resulting key-value pair is put back into the
|
||||||
|
map (overriding the existing value if present). The original
|
||||||
|
prefixed key is removed.
|
||||||
|
"""
|
||||||
|
worker_keys = [k for k in props if k.startswith(WORKER_PROPERTY_PREFIX)]
|
||||||
|
if not worker_keys:
|
||||||
|
return props
|
||||||
|
result = dict(props)
|
||||||
|
for key in worker_keys:
|
||||||
|
value = result.pop(key)
|
||||||
|
real_key = key[len(WORKER_PROPERTY_PREFIX) :]
|
||||||
|
result[real_key] = value
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def deserialize_conn(
|
||||||
|
data: str,
|
||||||
|
*,
|
||||||
|
for_worker: bool = False,
|
||||||
|
) -> DBConnection:
|
||||||
|
"""Reconstruct a DBConnection from a serialized string.
|
||||||
|
|
||||||
|
The string must have been produced by
|
||||||
|
:meth:`DBConnection.serialize`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : str
|
||||||
|
String produced by ``serialize()``.
|
||||||
|
for_worker : bool, default False
|
||||||
|
When ``True``, any namespace client property whose key starts
|
||||||
|
with ``_lancedb_worker_`` has that prefix stripped and the
|
||||||
|
value overrides the corresponding property. For example,
|
||||||
|
``_lancedb_worker_uri`` replaces ``uri``.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
DBConnection
|
||||||
|
A new connection matching the serialized state.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
|
||||||
|
parsed = json.loads(data)
|
||||||
|
connection_type = parsed.get("connection_type")
|
||||||
|
|
||||||
|
rci_secs = parsed.get("read_consistency_interval_seconds")
|
||||||
|
rci = timedelta(seconds=rci_secs) if rci_secs is not None else None
|
||||||
|
storage_options = parsed.get("storage_options")
|
||||||
|
|
||||||
|
if connection_type == "namespace":
|
||||||
|
props = dict(parsed.get("namespace_client_properties") or {})
|
||||||
|
if for_worker:
|
||||||
|
props = _apply_worker_overrides(props)
|
||||||
|
return connect_namespace(
|
||||||
|
namespace_client_impl=parsed["namespace_client_impl"],
|
||||||
|
namespace_client_properties=props,
|
||||||
|
read_consistency_interval=rci,
|
||||||
|
storage_options=storage_options,
|
||||||
|
namespace_client_pushdown_operations=parsed.get(
|
||||||
|
"namespace_client_pushdown_operations"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
elif connection_type == "local":
|
||||||
|
return LanceDBConnection(
|
||||||
|
parsed["uri"],
|
||||||
|
read_consistency_interval=rci,
|
||||||
|
storage_options=storage_options,
|
||||||
|
manifest_enabled=parsed.get("manifest_enabled", False),
|
||||||
|
namespace_client_properties=parsed.get("namespace_client_properties"),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown connection_type: {connection_type}")
|
||||||
|
|
||||||
|
|
||||||
async def connect_async(
|
async def connect_async(
|
||||||
uri: URI,
|
uri: URI,
|
||||||
*,
|
*,
|
||||||
@@ -178,6 +319,8 @@ async def connect_async(
|
|||||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
|
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
session: Optional[Session] = None,
|
session: Optional[Session] = None,
|
||||||
|
manifest_enabled: bool = False,
|
||||||
|
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||||
) -> AsyncConnection:
|
) -> AsyncConnection:
|
||||||
"""Connect to a LanceDB database.
|
"""Connect to a LanceDB database.
|
||||||
|
|
||||||
@@ -210,13 +353,20 @@ async def connect_async(
|
|||||||
default configuration is used.
|
default configuration is used.
|
||||||
storage_options: dict, optional
|
storage_options: dict, optional
|
||||||
Additional options for the storage backend. See available options at
|
Additional options for the storage backend. See available options at
|
||||||
<https://lancedb.com/docs/storage/>
|
<https://docs.lancedb.com/storage/>
|
||||||
session: Session, optional
|
session: Session, optional
|
||||||
(For LanceDB OSS only)
|
(For LanceDB OSS only)
|
||||||
A session to use for this connection. Sessions allow you to configure
|
A session to use for this connection. Sessions allow you to configure
|
||||||
cache sizes for index and metadata caches, which can significantly
|
cache sizes for index and metadata caches, which can significantly
|
||||||
impact memory use and performance. They can also be re-used across
|
impact memory use and performance. They can also be re-used across
|
||||||
multiple connections to share the same cache state.
|
multiple connections to share the same cache state.
|
||||||
|
manifest_enabled : bool, default False
|
||||||
|
When true for local/native connections, use directory namespace
|
||||||
|
manifests as the source of truth for table metadata. Existing
|
||||||
|
directory-listed root tables are migrated into the manifest on access.
|
||||||
|
namespace_client_properties : dict, optional
|
||||||
|
Additional directory namespace client properties to use with
|
||||||
|
``manifest_enabled=True``.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
@@ -259,6 +409,8 @@ async def connect_async(
|
|||||||
client_config,
|
client_config,
|
||||||
storage_options,
|
storage_options,
|
||||||
session,
|
session,
|
||||||
|
manifest_enabled,
|
||||||
|
namespace_client_properties,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -271,6 +423,10 @@ __all__ = [
|
|||||||
"AsyncConnection",
|
"AsyncConnection",
|
||||||
"AsyncLanceNamespaceDBConnection",
|
"AsyncLanceNamespaceDBConnection",
|
||||||
"AsyncTable",
|
"AsyncTable",
|
||||||
|
"col",
|
||||||
|
"Expr",
|
||||||
|
"func",
|
||||||
|
"lit",
|
||||||
"URI",
|
"URI",
|
||||||
"sanitize_uri",
|
"sanitize_uri",
|
||||||
"vector",
|
"vector",
|
||||||
@@ -279,7 +435,6 @@ __all__ = [
|
|||||||
"LanceNamespaceDBConnection",
|
"LanceNamespaceDBConnection",
|
||||||
"RemoteDBConnection",
|
"RemoteDBConnection",
|
||||||
"Session",
|
"Session",
|
||||||
"StorageOptionsProvider",
|
|
||||||
"Table",
|
"Table",
|
||||||
"__version__",
|
"__version__",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -14,7 +14,6 @@ from .index import (
|
|||||||
HnswSq,
|
HnswSq,
|
||||||
FTS,
|
FTS,
|
||||||
)
|
)
|
||||||
from .io import StorageOptionsProvider
|
|
||||||
from lance_namespace import (
|
from lance_namespace import (
|
||||||
ListNamespacesResponse,
|
ListNamespacesResponse,
|
||||||
CreateNamespaceResponse,
|
CreateNamespaceResponse,
|
||||||
@@ -27,6 +26,32 @@ from .remote import ClientConfig
|
|||||||
IvfHnswPq: type[HnswPq] = HnswPq
|
IvfHnswPq: type[HnswPq] = HnswPq
|
||||||
IvfHnswSq: type[HnswSq] = HnswSq
|
IvfHnswSq: type[HnswSq] = HnswSq
|
||||||
|
|
||||||
|
class PyExpr:
|
||||||
|
"""A type-safe DataFusion expression node (Rust-side handle)."""
|
||||||
|
|
||||||
|
def eq(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def ne(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def lt(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def lte(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def gt(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def gte(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def and_(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def or_(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def not_(self) -> "PyExpr": ...
|
||||||
|
def add(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def sub(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def mul(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def div(self, other: "PyExpr") -> "PyExpr": ...
|
||||||
|
def lower(self) -> "PyExpr": ...
|
||||||
|
def upper(self) -> "PyExpr": ...
|
||||||
|
def contains(self, substr: "PyExpr") -> "PyExpr": ...
|
||||||
|
def cast(self, data_type: pa.DataType) -> "PyExpr": ...
|
||||||
|
def to_sql(self) -> str: ...
|
||||||
|
|
||||||
|
def expr_col(name: str) -> PyExpr: ...
|
||||||
|
def expr_lit(value: Union[bool, int, float, str]) -> PyExpr: ...
|
||||||
|
def expr_func(name: str, args: List[PyExpr]) -> PyExpr: ...
|
||||||
|
|
||||||
class Session:
|
class Session:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -46,35 +71,35 @@ class Connection(object):
|
|||||||
async def close(self): ...
|
async def close(self): ...
|
||||||
async def list_namespaces(
|
async def list_namespaces(
|
||||||
self,
|
self,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
page_token: Optional[str] = None,
|
page_token: Optional[str] = None,
|
||||||
limit: Optional[int] = None,
|
limit: Optional[int] = None,
|
||||||
) -> ListNamespacesResponse: ...
|
) -> ListNamespacesResponse: ...
|
||||||
async def create_namespace(
|
async def create_namespace(
|
||||||
self,
|
self,
|
||||||
namespace: List[str],
|
namespace_path: List[str],
|
||||||
mode: Optional[str] = None,
|
mode: Optional[str] = None,
|
||||||
properties: Optional[Dict[str, str]] = None,
|
properties: Optional[Dict[str, str]] = None,
|
||||||
) -> CreateNamespaceResponse: ...
|
) -> CreateNamespaceResponse: ...
|
||||||
async def drop_namespace(
|
async def drop_namespace(
|
||||||
self,
|
self,
|
||||||
namespace: List[str],
|
namespace_path: List[str],
|
||||||
mode: Optional[str] = None,
|
mode: Optional[str] = None,
|
||||||
behavior: Optional[str] = None,
|
behavior: Optional[str] = None,
|
||||||
) -> DropNamespaceResponse: ...
|
) -> DropNamespaceResponse: ...
|
||||||
async def describe_namespace(
|
async def describe_namespace(
|
||||||
self,
|
self,
|
||||||
namespace: List[str],
|
namespace_path: List[str],
|
||||||
) -> DescribeNamespaceResponse: ...
|
) -> DescribeNamespaceResponse: ...
|
||||||
async def list_tables(
|
async def list_tables(
|
||||||
self,
|
self,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
page_token: Optional[str] = None,
|
page_token: Optional[str] = None,
|
||||||
limit: Optional[int] = None,
|
limit: Optional[int] = None,
|
||||||
) -> ListTablesResponse: ...
|
) -> ListTablesResponse: ...
|
||||||
async def table_names(
|
async def table_names(
|
||||||
self,
|
self,
|
||||||
namespace: Optional[List[str]],
|
namespace_path: Optional[List[str]],
|
||||||
start_after: Optional[str],
|
start_after: Optional[str],
|
||||||
limit: Optional[int],
|
limit: Optional[int],
|
||||||
) -> list[str]: ... # Deprecated: Use list_tables instead
|
) -> list[str]: ... # Deprecated: Use list_tables instead
|
||||||
@@ -83,9 +108,8 @@ class Connection(object):
|
|||||||
name: str,
|
name: str,
|
||||||
mode: str,
|
mode: str,
|
||||||
data: pa.RecordBatchReader,
|
data: pa.RecordBatchReader,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
storage_options_provider: Optional[StorageOptionsProvider] = None,
|
|
||||||
location: Optional[str] = None,
|
location: Optional[str] = None,
|
||||||
) -> Table: ...
|
) -> Table: ...
|
||||||
async def create_empty_table(
|
async def create_empty_table(
|
||||||
@@ -93,17 +117,15 @@ class Connection(object):
|
|||||||
name: str,
|
name: str,
|
||||||
mode: str,
|
mode: str,
|
||||||
schema: pa.Schema,
|
schema: pa.Schema,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
storage_options_provider: Optional[StorageOptionsProvider] = None,
|
|
||||||
location: Optional[str] = None,
|
location: Optional[str] = None,
|
||||||
) -> Table: ...
|
) -> Table: ...
|
||||||
async def open_table(
|
async def open_table(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
storage_options_provider: Optional[StorageOptionsProvider] = None,
|
|
||||||
index_cache_size: Optional[int] = None,
|
index_cache_size: Optional[int] = None,
|
||||||
location: Optional[str] = None,
|
location: Optional[str] = None,
|
||||||
) -> Table: ...
|
) -> Table: ...
|
||||||
@@ -111,7 +133,7 @@ class Connection(object):
|
|||||||
self,
|
self,
|
||||||
target_table_name: str,
|
target_table_name: str,
|
||||||
source_uri: str,
|
source_uri: str,
|
||||||
target_namespace: Optional[List[str]] = None,
|
target_namespace_path: Optional[List[str]] = None,
|
||||||
source_version: Optional[int] = None,
|
source_version: Optional[int] = None,
|
||||||
source_tag: Optional[str] = None,
|
source_tag: Optional[str] = None,
|
||||||
is_shallow: bool = True,
|
is_shallow: bool = True,
|
||||||
@@ -120,13 +142,18 @@ class Connection(object):
|
|||||||
self,
|
self,
|
||||||
cur_name: str,
|
cur_name: str,
|
||||||
new_name: str,
|
new_name: str,
|
||||||
cur_namespace: Optional[List[str]] = None,
|
cur_namespace_path: Optional[List[str]] = None,
|
||||||
new_namespace: Optional[List[str]] = None,
|
new_namespace_path: Optional[List[str]] = None,
|
||||||
) -> None: ...
|
) -> None: ...
|
||||||
async def drop_table(
|
async def drop_table(
|
||||||
self, name: str, namespace: Optional[List[str]] = None
|
self, name: str, namespace_path: Optional[List[str]] = None
|
||||||
) -> None: ...
|
) -> None: ...
|
||||||
async def drop_all_tables(self, namespace: Optional[List[str]] = None) -> None: ...
|
async def drop_all_tables(
|
||||||
|
self, namespace_path: Optional[List[str]] = None
|
||||||
|
) -> None: ...
|
||||||
|
async def namespace_client_config(
|
||||||
|
self,
|
||||||
|
) -> Dict[str, Any]: ...
|
||||||
|
|
||||||
class Table:
|
class Table:
|
||||||
def name(self) -> str: ...
|
def name(self) -> str: ...
|
||||||
@@ -135,7 +162,10 @@ class Table:
|
|||||||
def close(self) -> None: ...
|
def close(self) -> None: ...
|
||||||
async def schema(self) -> pa.Schema: ...
|
async def schema(self) -> pa.Schema: ...
|
||||||
async def add(
|
async def add(
|
||||||
self, data: pa.RecordBatchReader, mode: Literal["append", "overwrite"]
|
self,
|
||||||
|
data: pa.RecordBatchReader,
|
||||||
|
mode: Literal["append", "overwrite"],
|
||||||
|
progress: Optional[Any] = None,
|
||||||
) -> AddResult: ...
|
) -> AddResult: ...
|
||||||
async def update(
|
async def update(
|
||||||
self, updates: Dict[str, str], where: Optional[str]
|
self, updates: Dict[str, str], where: Optional[str]
|
||||||
@@ -212,6 +242,8 @@ async def connect(
|
|||||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
|
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
|
||||||
storage_options: Optional[Dict[str, str]],
|
storage_options: Optional[Dict[str, str]],
|
||||||
session: Optional[Session],
|
session: Optional[Session],
|
||||||
|
manifest_enabled: bool = False,
|
||||||
|
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||||
) -> Connection: ...
|
) -> Connection: ...
|
||||||
|
|
||||||
class RecordBatchStream:
|
class RecordBatchStream:
|
||||||
@@ -222,7 +254,9 @@ class RecordBatchStream:
|
|||||||
|
|
||||||
class Query:
|
class Query:
|
||||||
def where(self, filter: str): ...
|
def where(self, filter: str): ...
|
||||||
def select(self, columns: Tuple[str, str]): ...
|
def where_expr(self, expr: PyExpr): ...
|
||||||
|
def select(self, columns: List[Tuple[str, str]]): ...
|
||||||
|
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
|
||||||
def select_columns(self, columns: List[str]): ...
|
def select_columns(self, columns: List[str]): ...
|
||||||
def limit(self, limit: int): ...
|
def limit(self, limit: int): ...
|
||||||
def offset(self, offset: int): ...
|
def offset(self, offset: int): ...
|
||||||
@@ -248,7 +282,9 @@ class TakeQuery:
|
|||||||
|
|
||||||
class FTSQuery:
|
class FTSQuery:
|
||||||
def where(self, filter: str): ...
|
def where(self, filter: str): ...
|
||||||
def select(self, columns: List[str]): ...
|
def where_expr(self, expr: PyExpr): ...
|
||||||
|
def select(self, columns: List[Tuple[str, str]]): ...
|
||||||
|
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
|
||||||
def limit(self, limit: int): ...
|
def limit(self, limit: int): ...
|
||||||
def offset(self, offset: int): ...
|
def offset(self, offset: int): ...
|
||||||
def fast_search(self): ...
|
def fast_search(self): ...
|
||||||
@@ -267,7 +303,9 @@ class VectorQuery:
|
|||||||
async def output_schema(self) -> pa.Schema: ...
|
async def output_schema(self) -> pa.Schema: ...
|
||||||
async def execute(self) -> RecordBatchStream: ...
|
async def execute(self) -> RecordBatchStream: ...
|
||||||
def where(self, filter: str): ...
|
def where(self, filter: str): ...
|
||||||
def select(self, columns: List[str]): ...
|
def where_expr(self, expr: PyExpr): ...
|
||||||
|
def select(self, columns: List[Tuple[str, str]]): ...
|
||||||
|
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
|
||||||
def select_with_projection(self, columns: Tuple[str, str]): ...
|
def select_with_projection(self, columns: Tuple[str, str]): ...
|
||||||
def limit(self, limit: int): ...
|
def limit(self, limit: int): ...
|
||||||
def offset(self, offset: int): ...
|
def offset(self, offset: int): ...
|
||||||
@@ -284,7 +322,9 @@ class VectorQuery:
|
|||||||
|
|
||||||
class HybridQuery:
|
class HybridQuery:
|
||||||
def where(self, filter: str): ...
|
def where(self, filter: str): ...
|
||||||
def select(self, columns: List[str]): ...
|
def where_expr(self, expr: PyExpr): ...
|
||||||
|
def select(self, columns: List[Tuple[str, str]]): ...
|
||||||
|
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
|
||||||
def limit(self, limit: int): ...
|
def limit(self, limit: int): ...
|
||||||
def offset(self, offset: int): ...
|
def offset(self, offset: int): ...
|
||||||
def fast_search(self): ...
|
def fast_search(self): ...
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ def data_to_reader(
|
|||||||
f"Unknown data type {type(data)}. "
|
f"Unknown data type {type(data)}. "
|
||||||
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
||||||
"pyarrow Table/RecordBatch, or Pydantic models. "
|
"pyarrow Table/RecordBatch, or Pydantic models. "
|
||||||
"See https://lancedb.com/docs/tables/ for examples."
|
"See https://docs.lancedb.com/tables/ for examples."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -19,10 +19,10 @@ from .utils import TEXT, api_key_not_found_help
|
|||||||
@register("gemini-text")
|
@register("gemini-text")
|
||||||
class GeminiText(TextEmbeddingFunction):
|
class GeminiText(TextEmbeddingFunction):
|
||||||
"""
|
"""
|
||||||
An embedding function that uses the Google's Gemini API. Requires GOOGLE_API_KEY to
|
An embedding function that uses Google's Gemini API. Requires GOOGLE_API_KEY to
|
||||||
be set.
|
be set.
|
||||||
|
|
||||||
https://ai.google.dev/docs/embeddings_guide
|
https://ai.google.dev/gemini-api/docs/embeddings
|
||||||
|
|
||||||
Supports various tasks types:
|
Supports various tasks types:
|
||||||
| Task Type | Description |
|
| Task Type | Description |
|
||||||
@@ -46,9 +46,12 @@ class GeminiText(TextEmbeddingFunction):
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
name: str, default "models/embedding-001"
|
name: str, default "gemini-embedding-001"
|
||||||
The name of the model to use. See the Gemini documentation for a list of
|
The name of the model to use. Supported models include:
|
||||||
available models.
|
- "gemini-embedding-001" (768 dimensions)
|
||||||
|
|
||||||
|
Note: The legacy "models/embedding-001" format is also supported but
|
||||||
|
"gemini-embedding-001" is recommended.
|
||||||
|
|
||||||
query_task_type: str, default "retrieval_query"
|
query_task_type: str, default "retrieval_query"
|
||||||
Sets the task type for the queries.
|
Sets the task type for the queries.
|
||||||
@@ -77,7 +80,7 @@ class GeminiText(TextEmbeddingFunction):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
name: str = "models/embedding-001"
|
name: str = "gemini-embedding-001"
|
||||||
query_task_type: str = "retrieval_query"
|
query_task_type: str = "retrieval_query"
|
||||||
source_task_type: str = "retrieval_document"
|
source_task_type: str = "retrieval_document"
|
||||||
|
|
||||||
@@ -114,23 +117,48 @@ class GeminiText(TextEmbeddingFunction):
|
|||||||
texts: list[str] or np.ndarray (of str)
|
texts: list[str] or np.ndarray (of str)
|
||||||
The texts to embed
|
The texts to embed
|
||||||
"""
|
"""
|
||||||
if (
|
from google.genai import types
|
||||||
kwargs.get("task_type") == "retrieval_document"
|
|
||||||
): # Provide a title to use existing API design
|
|
||||||
title = "Embedding of a document"
|
|
||||||
kwargs["title"] = title
|
|
||||||
|
|
||||||
return [
|
task_type = kwargs.get("task_type")
|
||||||
self.client.embed_content(model=self.name, content=text, **kwargs)[
|
|
||||||
"embedding"
|
# Build content objects for embed_content
|
||||||
]
|
contents = []
|
||||||
for text in texts
|
for text in texts:
|
||||||
]
|
if task_type == "retrieval_document":
|
||||||
|
# Provide a title for retrieval_document task
|
||||||
|
contents.append(
|
||||||
|
{"parts": [{"text": "Embedding of a document"}, {"text": text}]}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
contents.append({"parts": [{"text": text}]})
|
||||||
|
|
||||||
|
# Build config
|
||||||
|
config_kwargs = {}
|
||||||
|
if task_type:
|
||||||
|
config_kwargs["task_type"] = task_type.upper() # API expects uppercase
|
||||||
|
|
||||||
|
# Call embed_content for each content
|
||||||
|
embeddings = []
|
||||||
|
for content in contents:
|
||||||
|
config = (
|
||||||
|
types.EmbedContentConfig(**config_kwargs) if config_kwargs else None
|
||||||
|
)
|
||||||
|
response = self.client.models.embed_content(
|
||||||
|
model=self.name,
|
||||||
|
contents=content,
|
||||||
|
config=config,
|
||||||
|
)
|
||||||
|
embeddings.append(response.embeddings[0].values)
|
||||||
|
|
||||||
|
return embeddings
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def client(self):
|
def client(self):
|
||||||
genai = attempt_import_or_raise("google.generativeai", "google.generativeai")
|
attempt_import_or_raise("google.genai", "google-genai")
|
||||||
|
|
||||||
if not os.environ.get("GOOGLE_API_KEY"):
|
if not os.environ.get("GOOGLE_API_KEY"):
|
||||||
api_key_not_found_help("google")
|
api_key_not_found_help("google")
|
||||||
return genai
|
|
||||||
|
from google import genai as genai_module
|
||||||
|
|
||||||
|
return genai_module.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import sys
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import urllib.error
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
import weakref
|
import weakref
|
||||||
import logging
|
import logging
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
|
|||||||
298
python/python/lancedb/expr.py
Normal file
298
python/python/lancedb/expr.py
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
"""Type-safe expression builder for filters and projections.
|
||||||
|
|
||||||
|
Instead of writing raw SQL strings you can build expressions with Python
|
||||||
|
operators::
|
||||||
|
|
||||||
|
from lancedb.expr import col, lit
|
||||||
|
|
||||||
|
# filter: age > 18 AND status = 'active'
|
||||||
|
filt = (col("age") > lit(18)) & (col("status") == lit("active"))
|
||||||
|
|
||||||
|
# projection: compute a derived column
|
||||||
|
proj = {"score": col("raw_score") * lit(1.5)}
|
||||||
|
|
||||||
|
table.search().where(filt).select(proj).to_list()
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
|
||||||
|
from lancedb._lancedb import PyExpr, expr_col, expr_lit, expr_func
|
||||||
|
|
||||||
|
__all__ = ["Expr", "col", "lit", "func"]
|
||||||
|
|
||||||
|
_STR_TO_PA_TYPE: dict = {
|
||||||
|
"bool": pa.bool_(),
|
||||||
|
"boolean": pa.bool_(),
|
||||||
|
"int8": pa.int8(),
|
||||||
|
"int16": pa.int16(),
|
||||||
|
"int32": pa.int32(),
|
||||||
|
"int64": pa.int64(),
|
||||||
|
"uint8": pa.uint8(),
|
||||||
|
"uint16": pa.uint16(),
|
||||||
|
"uint32": pa.uint32(),
|
||||||
|
"uint64": pa.uint64(),
|
||||||
|
"float16": pa.float16(),
|
||||||
|
"float32": pa.float32(),
|
||||||
|
"float": pa.float32(),
|
||||||
|
"float64": pa.float64(),
|
||||||
|
"double": pa.float64(),
|
||||||
|
"string": pa.string(),
|
||||||
|
"utf8": pa.string(),
|
||||||
|
"str": pa.string(),
|
||||||
|
"large_string": pa.large_utf8(),
|
||||||
|
"large_utf8": pa.large_utf8(),
|
||||||
|
"date32": pa.date32(),
|
||||||
|
"date": pa.date32(),
|
||||||
|
"date64": pa.date64(),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _coerce(value: "ExprLike") -> "Expr":
|
||||||
|
"""Return *value* as an :class:`Expr`, wrapping plain Python values via
|
||||||
|
:func:`lit` if needed."""
|
||||||
|
if isinstance(value, Expr):
|
||||||
|
return value
|
||||||
|
return lit(value)
|
||||||
|
|
||||||
|
|
||||||
|
# Type alias used in annotations.
|
||||||
|
ExprLike = Union["Expr", bool, int, float, str]
|
||||||
|
|
||||||
|
|
||||||
|
class Expr:
|
||||||
|
"""A type-safe expression node.
|
||||||
|
|
||||||
|
Construct instances with :func:`col` and :func:`lit`, then combine them
|
||||||
|
using Python operators or the named methods below.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> from lancedb.expr import col, lit
|
||||||
|
>>> filt = (col("age") > lit(18)) & (col("name").lower() == lit("alice"))
|
||||||
|
>>> proj = {"double": col("x") * lit(2)}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Make Expr unhashable so that == returns an Expr rather than being used
|
||||||
|
# for dict keys / set membership.
|
||||||
|
__hash__ = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
def __init__(self, inner: PyExpr) -> None:
|
||||||
|
self._inner = inner
|
||||||
|
|
||||||
|
# ── comparisons ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def __eq__(self, other: ExprLike) -> "Expr": # type: ignore[override]
|
||||||
|
"""Equal to (``col("x") == 1``)."""
|
||||||
|
return Expr(self._inner.eq(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __ne__(self, other: ExprLike) -> "Expr": # type: ignore[override]
|
||||||
|
"""Not equal to (``col("x") != 1``)."""
|
||||||
|
return Expr(self._inner.ne(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __lt__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Less than (``col("x") < 1``)."""
|
||||||
|
return Expr(self._inner.lt(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __le__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Less than or equal to (``col("x") <= 1``)."""
|
||||||
|
return Expr(self._inner.lte(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __gt__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Greater than (``col("x") > 1``)."""
|
||||||
|
return Expr(self._inner.gt(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __ge__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Greater than or equal to (``col("x") >= 1``)."""
|
||||||
|
return Expr(self._inner.gte(_coerce(other)._inner))
|
||||||
|
|
||||||
|
# ── logical ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def __and__(self, other: "Expr") -> "Expr":
|
||||||
|
"""Logical AND (``expr_a & expr_b``)."""
|
||||||
|
return Expr(self._inner.and_(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __or__(self, other: "Expr") -> "Expr":
|
||||||
|
"""Logical OR (``expr_a | expr_b``)."""
|
||||||
|
return Expr(self._inner.or_(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __invert__(self) -> "Expr":
|
||||||
|
"""Logical NOT (``~expr``)."""
|
||||||
|
return Expr(self._inner.not_())
|
||||||
|
|
||||||
|
# ── arithmetic ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def __add__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Add (``col("x") + 1``)."""
|
||||||
|
return Expr(self._inner.add(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __radd__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Right-hand add (``1 + col("x")``)."""
|
||||||
|
return Expr(_coerce(other)._inner.add(self._inner))
|
||||||
|
|
||||||
|
def __sub__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Subtract (``col("x") - 1``)."""
|
||||||
|
return Expr(self._inner.sub(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __rsub__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Right-hand subtract (``1 - col("x")``)."""
|
||||||
|
return Expr(_coerce(other)._inner.sub(self._inner))
|
||||||
|
|
||||||
|
def __mul__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Multiply (``col("x") * 2``)."""
|
||||||
|
return Expr(self._inner.mul(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __rmul__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Right-hand multiply (``2 * col("x")``)."""
|
||||||
|
return Expr(_coerce(other)._inner.mul(self._inner))
|
||||||
|
|
||||||
|
def __truediv__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Divide (``col("x") / 2``)."""
|
||||||
|
return Expr(self._inner.div(_coerce(other)._inner))
|
||||||
|
|
||||||
|
def __rtruediv__(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Right-hand divide (``1 / col("x")``)."""
|
||||||
|
return Expr(_coerce(other)._inner.div(self._inner))
|
||||||
|
|
||||||
|
# ── string methods ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def lower(self) -> "Expr":
|
||||||
|
"""Convert string column values to lowercase."""
|
||||||
|
return Expr(self._inner.lower())
|
||||||
|
|
||||||
|
def upper(self) -> "Expr":
|
||||||
|
"""Convert string column values to uppercase."""
|
||||||
|
return Expr(self._inner.upper())
|
||||||
|
|
||||||
|
def contains(self, substr: "ExprLike") -> "Expr":
|
||||||
|
"""Return True where the string contains *substr*."""
|
||||||
|
return Expr(self._inner.contains(_coerce(substr)._inner))
|
||||||
|
|
||||||
|
# ── type cast ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def cast(self, data_type: Union[str, "pa.DataType"]) -> "Expr":
|
||||||
|
"""Cast values to *data_type*.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data_type:
|
||||||
|
A PyArrow ``DataType`` (e.g. ``pa.int32()``) or one of the type
|
||||||
|
name strings: ``"bool"``, ``"int8"``, ``"int16"``, ``"int32"``,
|
||||||
|
``"int64"``, ``"uint8"``–``"uint64"``, ``"float32"``,
|
||||||
|
``"float64"``, ``"string"``, ``"date32"``, ``"date64"``.
|
||||||
|
"""
|
||||||
|
if isinstance(data_type, str):
|
||||||
|
try:
|
||||||
|
data_type = _STR_TO_PA_TYPE[data_type]
|
||||||
|
except KeyError:
|
||||||
|
raise ValueError(
|
||||||
|
f"unsupported data type: '{data_type}'. Supported: "
|
||||||
|
f"{', '.join(_STR_TO_PA_TYPE)}"
|
||||||
|
)
|
||||||
|
return Expr(self._inner.cast(data_type))
|
||||||
|
|
||||||
|
# ── named comparison helpers (alternative to operators) ──────────────────
|
||||||
|
|
||||||
|
def eq(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Equal to."""
|
||||||
|
return self.__eq__(other)
|
||||||
|
|
||||||
|
def ne(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Not equal to."""
|
||||||
|
return self.__ne__(other)
|
||||||
|
|
||||||
|
def lt(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Less than."""
|
||||||
|
return self.__lt__(other)
|
||||||
|
|
||||||
|
def lte(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Less than or equal to."""
|
||||||
|
return self.__le__(other)
|
||||||
|
|
||||||
|
def gt(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Greater than."""
|
||||||
|
return self.__gt__(other)
|
||||||
|
|
||||||
|
def gte(self, other: ExprLike) -> "Expr":
|
||||||
|
"""Greater than or equal to."""
|
||||||
|
return self.__ge__(other)
|
||||||
|
|
||||||
|
def and_(self, other: "Expr") -> "Expr":
|
||||||
|
"""Logical AND."""
|
||||||
|
return self.__and__(other)
|
||||||
|
|
||||||
|
def or_(self, other: "Expr") -> "Expr":
|
||||||
|
"""Logical OR."""
|
||||||
|
return self.__or__(other)
|
||||||
|
|
||||||
|
# ── utilities ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def to_sql(self) -> str:
|
||||||
|
"""Render the expression as a SQL string (useful for debugging)."""
|
||||||
|
return self._inner.to_sql()
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return f"Expr({self._inner.to_sql()})"
|
||||||
|
|
||||||
|
|
||||||
|
# ── free functions ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def col(name: str) -> Expr:
|
||||||
|
"""Reference a table column by name.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
name:
|
||||||
|
The column name.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> from lancedb.expr import col, lit
|
||||||
|
>>> col("age") > lit(18)
|
||||||
|
Expr((age > 18))
|
||||||
|
"""
|
||||||
|
return Expr(expr_col(name))
|
||||||
|
|
||||||
|
|
||||||
|
def lit(value: Union[bool, int, float, str]) -> Expr:
    """Build an expression holding a constant value.

    Parameters
    ----------
    value:
        A Python ``bool``, ``int``, ``float``, or ``str``.

    Returns
    -------
    Expr
        An :class:`Expr` wrapping the result of ``expr_lit(value)``.

    Examples
    --------
    >>> from lancedb.expr import col, lit
    >>> col("price") * lit(1.1)
    Expr((price * 1.1))
    """
    literal = expr_lit(value)
    return Expr(literal)
|
||||||
|
|
||||||
|
|
||||||
|
def func(name: str, *args: ExprLike) -> Expr:
    """Build a call to a SQL function identified by name.

    Parameters
    ----------
    name:
        The SQL function name (e.g. ``"lower"``, ``"upper"``).
    *args:
        The function arguments as :class:`Expr` or plain Python literals.
        Each one is passed through ``_coerce`` before use.

    Returns
    -------
    Expr
        An :class:`Expr` wrapping the result of ``expr_func``.

    Examples
    --------
    >>> from lancedb.expr import col, func
    >>> func("lower", col("name"))
    Expr(lower(name))
    """
    # Unwrap each coerced argument to the inner object expr_func expects.
    inner_args = []
    for argument in args:
        inner_args.append(_coerce(argument)._inner)
    call = expr_func(name, inner_args)
    return Expr(call)
|
||||||
@@ -1,201 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
||||||
|
|
||||||
"""Full text search index using tantivy-py"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import List, Tuple, Optional
|
|
||||||
|
|
||||||
import pyarrow as pa
|
|
||||||
|
|
||||||
try:
|
|
||||||
import tantivy
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError(
|
|
||||||
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
|
|
||||||
)
|
|
||||||
|
|
||||||
from .table import LanceTable
|
|
||||||
|
|
||||||
|
|
||||||
def create_index(
    index_path: str,
    text_fields: List[str],
    ordering_fields: Optional[List[str]] = None,
    tokenizer_name: str = "default",
) -> tantivy.Index:
    """
    Build an empty tantivy index on disk.

    Parameters
    ----------
    index_path : str
        Directory that will hold the index; created if it does not exist.
    text_fields : List[str]
        Names of the text fields to index.
    ordering_fields : List[str], optional
        Names of unsigned fields that can be used for ordering at search
        time. ``None`` means no ordering fields.
    tokenizer_name : str, default "default"
        Tokenizer applied to every text field.

    Returns
    -------
    index : tantivy.Index
        The index object (not yet populated).
    """
    sortable = [] if ordering_fields is None else ordering_fields

    builder = tantivy.SchemaBuilder()
    # Reserved field that is populated with the row id of each document.
    builder.add_integer_field("doc_id", stored=True)
    for text_field in text_fields:
        builder.add_text_field(text_field, stored=True, tokenizer_name=tokenizer_name)
    # Looping over an empty list does nothing, so no emptiness check is needed.
    for sort_field in sortable:
        builder.add_unsigned_field(sort_field, fast=True)
    schema = builder.build()

    os.makedirs(index_path, exist_ok=True)
    return tantivy.Index(schema, path=index_path)
|
|
||||||
|
|
||||||
|
|
||||||
def populate_index(
    index: tantivy.Index,
    table: LanceTable,
    fields: List[str],
    writer_heap_size: Optional[int] = None,
    ordering_fields: Optional[List[str]] = None,
) -> int:
    """
    Write the rows of a LanceTable into a tantivy index.

    Parameters
    ----------
    index : tantivy.Index
        The index to write into.
    table : LanceTable
        The table whose rows are indexed.
    fields : List[str]
        Text fields to index. A name that is not a top-level column is
        resolved as a dotted path into struct columns.
    writer_heap_size : int, optional
        The writer heap size in bytes; a falsy value selects 1GB.
    ordering_fields : List[str], optional
        Unsigned fields stored alongside the text for ordering.

    Returns
    -------
    int
        The number of rows visited. Rows whose document ends up empty are
        counted but not written, so ``doc_id`` stays aligned with the row
        position.

    Raises
    ------
    TypeError
        If any entry of ``fields`` is not a string or large string column.
    """
    if ordering_fields is None:
        ordering_fields = []
    if not writer_heap_size:
        writer_heap_size = 1024 * 1024 * 1024

    # Validate the requested fields and remember which ones are nested paths.
    nested = []
    for name in fields:
        try:
            arrow_field = table.schema.field(name)  # raises KeyError if not found
        except KeyError:
            arrow_field = resolve_path(table.schema, name)
            nested.append(name)

        is_text = pa.types.is_string(arrow_field.type) or pa.types.is_large_string(
            arrow_field.type
        )
        if not is_text:
            raise TypeError(f"Field {name} is not a string type")

    writer = index.writer(heap_size=writer_heap_size)
    dataset = table.to_lance()

    # Deepest dotted path among the nested fields; 0 when there are none.
    max_nested_level = max((len(path.split(".")) for path in nested), default=0)

    row_id = 0
    for batch in dataset.to_batches(columns=fields + ordering_fields):
        if max_nested_level > 0:
            # Flatten struct columns so dotted names become plain columns.
            batch = pa.Table.from_batches([batch])
            for _ in range(max_nested_level - 1):
                batch = batch.flatten()
        for row in range(batch.num_rows):
            doc = tantivy.Document()
            for text_name in fields:
                text_value = batch[text_name][row].as_py()
                if text_value is not None:
                    doc.add_text(text_name, text_value)
            for order_name in ordering_fields:
                order_value = batch[order_name][row].as_py()
                if order_value is not None:
                    doc.add_unsigned(order_name, order_value)
            if not doc.is_empty:
                doc.add_integer("doc_id", row_id)
                writer.add_document(doc)
            # Advance for every row, including rows that were not written.
            row_id += 1

    writer.commit()
    return row_id
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_path(schema, field_name: str) -> pa.Field:
    """
    Resolve a (possibly dotted) field path to the Arrow field it names.

    Parameters
    ----------
    schema
        Schema to look the path up in; only its ``field(name)`` method is
        used here.
    field_name : str
        The field name to resolve. Dots separate the segments of a path
        into nested struct fields.

    Returns
    -------
    pa.Field
        The field found at the end of the path.

    Raises
    ------
    KeyError
        If the path continues below a field whose type is not a struct.
    """
    head, *rest = field_name.split(".")
    field = schema.field(head)
    for segment in rest:
        # Only struct-typed fields have children to descend into.
        if not pa.types.is_struct(field.type):
            raise KeyError(f"field {field_name} not found in schema {schema}")
        field = field.type.field(segment)
    return field
|
|
||||||
|
|
||||||
|
|
||||||
def search_index(
    index: tantivy.Index, query: str, limit: int = 10, ordering_field=None
) -> Tuple[Tuple[int], Tuple[float]]:
    """
    Search an index for a query

    Parameters
    ----------
    index : tantivy.Index
        The index object
    query : str
        The query string
    limit : int
        The maximum number of results to return
    ordering_field : str, optional
        When truthy, passed to the searcher as ``order_by_field``.

    Returns
    -------
    ids_and_score: tuple[tuple[int], tuple[float]]
        A tuple of two tuples, the first containing the document ids
        and the second containing the scores. Both are empty when the
        search returns no hits.
    """
    searcher = index.searcher()
    parsed = index.parse_query(query)
    if ordering_field:
        results = searcher.search(parsed, limit, order_by_field=ordering_field)
    else:
        results = searcher.search(parsed, limit)

    hits = results.hits
    # Guard on the hits actually returned, not the total match count:
    # zip(*[]) collapses to a single empty tuple, which callers cannot
    # unpack into (ids, scores).
    if not hits:
        return tuple(), tuple()

    doc_ids = tuple(searcher.doc(address)["doc_id"][0] for _, address in hits)
    scores = tuple(score for score, _ in hits)
    return doc_ids, scores
|
|
||||||
@@ -2,70 +2,3 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
"""I/O utilities and interfaces for LanceDB."""
|
"""I/O utilities and interfaces for LanceDB."""
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
|
|
||||||
class StorageOptionsProvider(ABC):
    """Base class for objects that supply storage options to LanceDB tables.

    A provider lets LanceDB refresh credentials for cloud storage backends
    (e.g., AWS S3, Azure Blob Storage, GCS) automatically. When the
    credentials carry an expiration time, ``fetch_storage_options()`` is
    called periodically to obtain fresh ones before they expire.

    Example
    -------
    >>> class MyProvider(StorageOptionsProvider):
    ...     def fetch_storage_options(self) -> Dict[str, str]:
    ...         # Fetch fresh credentials from your credential manager
    ...         return {
    ...             "aws_access_key_id": "...",
    ...             "aws_secret_access_key": "...",
    ...             "expires_at_millis": "1234567890000"  # Optional
    ...         }
    """

    @abstractmethod
    def fetch_storage_options(self) -> Dict[str, str]:
        """Return a fresh set of storage credentials.

        LanceDB calls this whenever credentials need to be refreshed. If the
        result has an "expires_at_millis" entry (a Unix timestamp in
        milliseconds), LanceDB refreshes the credentials before that time;
        without it the credentials are assumed never to expire.

        Returns
        -------
        Dict[str, str]
            Cloud storage credentials and, optionally, an expiration time:
            - "expires_at_millis" (optional): Unix timestamp in milliseconds
              when the credentials expire
            - Provider-specific credential keys (e.g., aws_access_key_id,
              aws_secret_access_key, etc.)

        Raises
        ------
        RuntimeError
            If credentials cannot be fetched or are invalid
        """

    def provider_id(self) -> str:
        """Return a human-readable identifier for this provider instance.

        The identifier is used for caching and equality comparison: two
        providers with the same ID share one cached object store connection.

        This default combines the class name with ``str(self)``. Override it
        when custom identification logic is needed.

        Returns
        -------
        str
            A unique identifier for this provider instance
        """
        class_name = self.__class__.__name__
        text = str(self)
        return f"{class_name} {{ repr: {text!r} }}"
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -284,9 +284,8 @@ class Permutations:
|
|||||||
self.permutation_table = permutation_table
|
self.permutation_table = permutation_table
|
||||||
|
|
||||||
if permutation_table.schema.metadata is not None:
|
if permutation_table.schema.metadata is not None:
|
||||||
split_names = permutation_table.schema.metadata.get(
|
raw = permutation_table.schema.metadata.get(b"split_names")
|
||||||
b"split_names", None
|
split_names = raw.decode("utf-8") if raw is not None else None
|
||||||
).decode("utf-8")
|
|
||||||
if split_names is not None:
|
if split_names is not None:
|
||||||
self.split_names = json.loads(split_names)
|
self.split_names = json.loads(split_names)
|
||||||
self.split_dict = {
|
self.split_dict = {
|
||||||
@@ -460,9 +459,8 @@ class Permutation:
|
|||||||
f"Cannot create a permutation on split `{split}`"
|
f"Cannot create a permutation on split `{split}`"
|
||||||
" because no split names are defined in the permutation table"
|
" because no split names are defined in the permutation table"
|
||||||
)
|
)
|
||||||
split_names = permutation_table.schema.metadata.get(
|
raw = permutation_table.schema.metadata.get(b"split_names")
|
||||||
b"split_names", None
|
split_names = raw.decode("utf-8") if raw is not None else None
|
||||||
).decode("utf-8")
|
|
||||||
if split_names is None:
|
if split_names is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Cannot create a permutation on split `{split}`"
|
f"Cannot create a permutation on split `{split}`"
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import sys
|
|||||||
import types
|
import types
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from datetime import date, datetime
|
from datetime import date, datetime
|
||||||
|
from enum import Enum
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
Any,
|
Any,
|
||||||
@@ -314,6 +315,19 @@ def _pydantic_type_to_arrow_type(tp: Any, field: FieldInfo) -> pa.DataType:
|
|||||||
return pa.list_(pa.list_(tp.value_arrow_type(), tp.dim()))
|
return pa.list_(pa.list_(tp.value_arrow_type(), tp.dim()))
|
||||||
# For regular Vector
|
# For regular Vector
|
||||||
return pa.list_(tp.value_arrow_type(), tp.dim())
|
return pa.list_(tp.value_arrow_type(), tp.dim())
|
||||||
|
if _safe_issubclass(tp, Enum):
|
||||||
|
# Map Enum to the Arrow type of its value.
|
||||||
|
# For string-valued enums, use dictionary encoding for efficiency.
|
||||||
|
# For integer enums, use the native type.
|
||||||
|
# Fall back to utf8 for mixed-type or empty enums.
|
||||||
|
value_types = {type(m.value) for m in tp}
|
||||||
|
if len(value_types) == 1:
|
||||||
|
value_type = value_types.pop()
|
||||||
|
if value_type is str:
|
||||||
|
# Use dictionary encoding for string enums
|
||||||
|
return pa.dictionary(pa.int32(), pa.utf8())
|
||||||
|
return _py_type_to_arrow_type(value_type, field)
|
||||||
|
return pa.utf8()
|
||||||
return _py_type_to_arrow_type(tp, field)
|
return _py_type_to_arrow_type(tp, field)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -25,7 +25,6 @@ import deprecation
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pyarrow.compute as pc
|
import pyarrow.compute as pc
|
||||||
import pyarrow.fs as pa_fs
|
|
||||||
import pydantic
|
import pydantic
|
||||||
|
|
||||||
from lancedb.pydantic import PYDANTIC_VERSION
|
from lancedb.pydantic import PYDANTIC_VERSION
|
||||||
@@ -38,6 +37,7 @@ from .rerankers.base import Reranker
|
|||||||
from .rerankers.rrf import RRFReranker
|
from .rerankers.rrf import RRFReranker
|
||||||
from .rerankers.util import check_reranker_result
|
from .rerankers.util import check_reranker_result
|
||||||
from .util import flatten_columns
|
from .util import flatten_columns
|
||||||
|
from .expr import Expr
|
||||||
from lancedb._lancedb import fts_query_to_json
|
from lancedb._lancedb import fts_query_to_json
|
||||||
from typing_extensions import Annotated
|
from typing_extensions import Annotated
|
||||||
|
|
||||||
@@ -70,7 +70,7 @@ def ensure_vector_query(
|
|||||||
) -> Union[List[float], List[List[float]], pa.Array, List[pa.Array]]:
|
) -> Union[List[float], List[List[float]], pa.Array, List[pa.Array]]:
|
||||||
if isinstance(val, list):
|
if isinstance(val, list):
|
||||||
if len(val) == 0:
|
if len(val) == 0:
|
||||||
return ValueError("Vector query must be a non-empty list")
|
raise ValueError("Vector query must be a non-empty list")
|
||||||
sample = val[0]
|
sample = val[0]
|
||||||
else:
|
else:
|
||||||
if isinstance(val, float):
|
if isinstance(val, float):
|
||||||
@@ -83,7 +83,7 @@ def ensure_vector_query(
|
|||||||
return val
|
return val
|
||||||
if isinstance(sample, list):
|
if isinstance(sample, list):
|
||||||
if len(sample) == 0:
|
if len(sample) == 0:
|
||||||
return ValueError("Vector query must be a non-empty list")
|
raise ValueError("Vector query must be a non-empty list")
|
||||||
if isinstance(sample[0], float):
|
if isinstance(sample[0], float):
|
||||||
# val is list of list of floats
|
# val is list of list of floats
|
||||||
return val
|
return val
|
||||||
@@ -449,8 +449,8 @@ class Query(pydantic.BaseModel):
|
|||||||
ensure_vector_query,
|
ensure_vector_query,
|
||||||
] = None
|
] = None
|
||||||
|
|
||||||
# sql filter to refine the query with
|
# sql filter or type-safe Expr to refine the query with
|
||||||
filter: Optional[str] = None
|
filter: Optional[Union[str, Expr]] = None
|
||||||
|
|
||||||
# if True then apply the filter after vector search
|
# if True then apply the filter after vector search
|
||||||
postfilter: Optional[bool] = None
|
postfilter: Optional[bool] = None
|
||||||
@@ -464,8 +464,8 @@ class Query(pydantic.BaseModel):
|
|||||||
# distance type to use for vector search
|
# distance type to use for vector search
|
||||||
distance_type: Optional[str] = None
|
distance_type: Optional[str] = None
|
||||||
|
|
||||||
# which columns to return in the results
|
# which columns to return in the results (dict values may be str or Expr)
|
||||||
columns: Optional[Union[List[str], Dict[str, str]]] = None
|
columns: Optional[Union[List[str], Dict[str, Union[str, Expr]]]] = None
|
||||||
|
|
||||||
# minimum number of IVF partitions to search
|
# minimum number of IVF partitions to search
|
||||||
#
|
#
|
||||||
@@ -856,14 +856,15 @@ class LanceQueryBuilder(ABC):
|
|||||||
self._offset = offset
|
self._offset = offset
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def select(self, columns: Union[list[str], dict[str, str]]) -> Self:
|
def select(self, columns: Union[list[str], dict[str, Union[str, Expr]]]) -> Self:
|
||||||
"""Set the columns to return.
|
"""Set the columns to return.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
columns: list of str, or dict of str to str default None
|
columns: list of str, or dict of str to str or Expr
|
||||||
List of column names to be fetched.
|
List of column names to be fetched.
|
||||||
Or a dictionary of column names to SQL expressions.
|
Or a dictionary of column names to SQL expressions or
|
||||||
|
:class:`~lancedb.expr.Expr` objects.
|
||||||
All columns are fetched if None or unspecified.
|
All columns are fetched if None or unspecified.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -877,15 +878,15 @@ class LanceQueryBuilder(ABC):
|
|||||||
raise ValueError("columns must be a list or a dictionary")
|
raise ValueError("columns must be a list or a dictionary")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def where(self, where: str, prefilter: bool = True) -> Self:
|
def where(self, where: Union[str, Expr], prefilter: bool = True) -> Self:
|
||||||
"""Set the where clause.
|
"""Set the where clause.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
where: str
|
where: str or :class:`~lancedb.expr.Expr`
|
||||||
The where clause which is a valid SQL where clause. See
|
The filter condition. Can be a SQL string or a type-safe
|
||||||
`Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
|
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
|
||||||
for valid SQL expressions.
|
and :func:`~lancedb.expr.lit`.
|
||||||
prefilter: bool, default True
|
prefilter: bool, default True
|
||||||
If True, apply the filter before vector search, otherwise the
|
If True, apply the filter before vector search, otherwise the
|
||||||
filter is applied on the result of vector search.
|
filter is applied on the result of vector search.
|
||||||
@@ -1355,15 +1356,17 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
|||||||
|
|
||||||
return result_set
|
return result_set
|
||||||
|
|
||||||
def where(self, where: str, prefilter: bool = None) -> LanceVectorQueryBuilder:
|
def where(
|
||||||
|
self, where: Union[str, Expr], prefilter: bool = None
|
||||||
|
) -> LanceVectorQueryBuilder:
|
||||||
"""Set the where clause.
|
"""Set the where clause.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
where: str
|
where: str or :class:`~lancedb.expr.Expr`
|
||||||
The where clause which is a valid SQL where clause. See
|
The filter condition. Can be a SQL string or a type-safe
|
||||||
`Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
|
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
|
||||||
for valid SQL expressions.
|
and :func:`~lancedb.expr.lit`.
|
||||||
prefilter: bool, default True
|
prefilter: bool, default True
|
||||||
If True, apply the filter before vector search, otherwise the
|
If True, apply the filter before vector search, otherwise the
|
||||||
filter is applied on the result of vector search.
|
filter is applied on the result of vector search.
|
||||||
@@ -1522,9 +1525,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
|||||||
return self._table._output_schema(self.to_query_object())
|
return self._table._output_schema(self.to_query_object())
|
||||||
|
|
||||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||||
path, fs, exist = self._table._get_fts_index_path()
|
self._table._ensure_no_legacy_fts_index()
|
||||||
if exist:
|
|
||||||
return self.tantivy_to_arrow()
|
|
||||||
|
|
||||||
query = self._query
|
query = self._query
|
||||||
if self._phrase_query:
|
if self._phrase_query:
|
||||||
@@ -1548,90 +1549,6 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
|||||||
):
|
):
|
||||||
raise NotImplementedError("to_batches on an FTS query")
|
raise NotImplementedError("to_batches on an FTS query")
|
||||||
|
|
||||||
def tantivy_to_arrow(self) -> pa.Table:
    """Run this full-text query against the tantivy index and return Arrow data.

    The index location comes from ``self._table._get_fts_index_path()``.
    Matching row ids are looked up with ``search_index`` and the rows are
    taken from the Lance dataset, then optionally filtered by
    ``self._where`` and reranked.

    Returns
    -------
    pa.Table
        The matching rows with a ``_score`` column appended, plus
        ``_rowid`` when ``self._with_row_id`` is set. With no hits, an
        empty table containing only ``_score`` is returned.

    Raises
    ------
    ImportError
        If tantivy-py is not installed.
    FileNotFoundError
        If the index does not exist.
    NotImplementedError
        If the index is not on the local filesystem.
    """
    try:
        import tantivy
    except ImportError:
        raise ImportError(
            "Please install tantivy-py `pip install tantivy` to use the full text search feature."  # noqa: E501
        )

    from .fts import search_index

    index_path, filesystem, index_exists = self._table._get_fts_index_path()
    if not index_exists:
        raise FileNotFoundError(
            "Fts index does not exist. "
            "Please first call table.create_fts_index(['<field_names>']) to "
            "create the fts index."
        )
    if not isinstance(filesystem, pa_fs.LocalFileSystem):
        raise NotImplementedError(
            "Tantivy-based full text search "
            "is only supported on the local filesystem"
        )

    index = tantivy.Index.open(index_path)

    query = self._query
    if self._phrase_query:
        # Swap double quotes for single ones, then wrap the query in double quotes.
        query = query.replace('"', "'")
        query = f'"{query}"'
    limit = 10 if self._limit is None else self._limit

    row_ids, scores = search_index(
        index, query, limit, ordering_field=self.ordering_field_name
    )
    if len(row_ids) == 0:
        score_only = pa.schema([pa.field("_score", pa.float32())])
        return pa.Table.from_batches([], schema=score_only)

    score_column = pa.array(scores)
    output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
    output_tbl = output_tbl.append_column("_score", score_column)
    # this needs to match vector search results which are uint64
    row_ids = pa.array(row_ids, type=pa.uint64())

    if self._where is not None:
        # Temporary positional column used to map surviving rows back to row_ids.
        tmp_name = "__lancedb__duckdb__indexer__"
        positions = pa.array(range(len(output_tbl)))
        output_tbl = output_tbl.append_column(tmp_name, positions)
        try:
            # TODO would be great to have Substrait generate pyarrow compute
            # expressions or conversely have pyarrow support SQL expressions
            # using Substrait
            import duckdb

            # The SQL text names the local variable `output_tbl`
            # (presumably via DuckDB's replacement scan), so do not rename it.
            indexer = duckdb.sql(
                f"SELECT {tmp_name} FROM output_tbl WHERE {self._where}"
            ).to_arrow_table()[tmp_name]
            output_tbl = output_tbl.take(indexer).drop([tmp_name])
            row_ids = row_ids.take(indexer)
        except ImportError:
            import tempfile

            import lance

            # TODO Use "memory://" instead once that's supported
            with tempfile.TemporaryDirectory() as tmp:
                ds = lance.write_dataset(output_tbl, tmp)
                output_tbl = ds.to_table(filter=self._where)
                indexer = output_tbl[tmp_name]
                row_ids = row_ids.take(indexer)
                output_tbl = output_tbl.drop([tmp_name])

    if self._with_row_id:
        output_tbl = output_tbl.append_column("_rowid", row_ids)

    if self._reranker is not None:
        output_tbl = self._reranker.rerank_fts(self._query, output_tbl)
    return output_tbl
|
|
||||||
|
|
||||||
def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
|
def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
|
||||||
"""Rerank the results using the specified reranker.
|
"""Rerank the results using the specified reranker.
|
||||||
|
|
||||||
@@ -1726,7 +1643,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
|||||||
def _validate_query(self, query, vector=None, text=None):
|
def _validate_query(self, query, vector=None, text=None):
|
||||||
if query is not None and (vector is not None or text is not None):
|
if query is not None and (vector is not None or text is not None):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"You can either provide a string query in search() method"
|
"You can either provide a string query in search() method "
|
||||||
"or set `vector()` and `text()` explicitly for hybrid search."
|
"or set `vector()` and `text()` explicitly for hybrid search."
|
||||||
"But not both."
|
"But not both."
|
||||||
)
|
)
|
||||||
@@ -2286,10 +2203,20 @@ class AsyncQueryBase(object):
|
|||||||
"""
|
"""
|
||||||
if isinstance(columns, list) and all(isinstance(c, str) for c in columns):
|
if isinstance(columns, list) and all(isinstance(c, str) for c in columns):
|
||||||
self._inner.select_columns(columns)
|
self._inner.select_columns(columns)
|
||||||
elif isinstance(columns, dict) and all(
|
elif isinstance(columns, dict) and all(isinstance(k, str) for k in columns):
|
||||||
isinstance(k, str) and isinstance(v, str) for k, v in columns.items()
|
if any(isinstance(v, Expr) for v in columns.values()):
|
||||||
):
|
# At least one value is an Expr — use the type-safe path.
|
||||||
|
from .expr import _coerce
|
||||||
|
|
||||||
|
pairs = [(k, _coerce(v)._inner) for k, v in columns.items()]
|
||||||
|
self._inner.select_expr(pairs)
|
||||||
|
elif all(isinstance(v, str) for v in columns.values()):
|
||||||
self._inner.select(list(columns.items()))
|
self._inner.select(list(columns.items()))
|
||||||
|
else:
|
||||||
|
raise TypeError(
|
||||||
|
"dict values must be str or Expr, got "
|
||||||
|
+ str({k: type(v) for k, v in columns.items()})
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise TypeError("columns must be a list of column names or a dict")
|
raise TypeError("columns must be a list of column names or a dict")
|
||||||
return self
|
return self
|
||||||
@@ -2529,11 +2456,13 @@ class AsyncStandardQuery(AsyncQueryBase):
|
|||||||
"""
|
"""
|
||||||
super().__init__(inner)
|
super().__init__(inner)
|
||||||
|
|
||||||
def where(self, predicate: str) -> Self:
|
def where(self, predicate: Union[str, Expr]) -> Self:
|
||||||
"""
|
"""
|
||||||
Only return rows matching the given predicate
|
Only return rows matching the given predicate
|
||||||
|
|
||||||
The predicate should be supplied as an SQL query string.
|
The predicate can be a SQL string or a type-safe
|
||||||
|
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
|
||||||
|
and :func:`~lancedb.expr.lit`.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
@@ -2545,6 +2474,9 @@ class AsyncStandardQuery(AsyncQueryBase):
|
|||||||
Filtering performance can often be improved by creating a scalar index
|
Filtering performance can often be improved by creating a scalar index
|
||||||
on the filter column(s).
|
on the filter column(s).
|
||||||
"""
|
"""
|
||||||
|
if isinstance(predicate, Expr):
|
||||||
|
self._inner.where_expr(predicate._inner)
|
||||||
|
else:
|
||||||
self._inner.where(predicate)
|
self._inner.where(predicate)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|||||||
@@ -145,6 +145,33 @@ class TlsConfig:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ClientConfig:
|
class ClientConfig:
|
||||||
|
"""Configuration for the LanceDB Cloud HTTP client.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
user_agent: str
|
||||||
|
User agent string sent with requests.
|
||||||
|
retry_config: RetryConfig
|
||||||
|
Configuration for retrying failed requests.
|
||||||
|
timeout_config: Optional[TimeoutConfig]
|
||||||
|
Configuration for request timeouts.
|
||||||
|
extra_headers: Optional[dict]
|
||||||
|
Additional headers to include in requests.
|
||||||
|
id_delimiter: Optional[str]
|
||||||
|
The delimiter to use when constructing object identifiers.
|
||||||
|
tls_config: Optional[TlsConfig]
|
||||||
|
TLS/mTLS configuration for secure connections.
|
||||||
|
header_provider: Optional[HeaderProvider]
|
||||||
|
Provider for dynamic headers to be added to each request.
|
||||||
|
user_id: Optional[str]
|
||||||
|
User identifier for tracking purposes. This is sent as the
|
||||||
|
`x-lancedb-user-id` header in requests to LanceDB Cloud/Enterprise.
|
||||||
|
|
||||||
|
This can also be set via the `LANCEDB_USER_ID` environment variable.
|
||||||
|
Alternatively, set `LANCEDB_USER_ID_ENV_KEY` to specify another
|
||||||
|
environment variable that contains the user ID value.
|
||||||
|
"""
|
||||||
|
|
||||||
user_agent: str = f"LanceDB-Python-Client/{__version__}"
|
user_agent: str = f"LanceDB-Python-Client/{__version__}"
|
||||||
retry_config: RetryConfig = field(default_factory=RetryConfig)
|
retry_config: RetryConfig = field(default_factory=RetryConfig)
|
||||||
timeout_config: Optional[TimeoutConfig] = field(default_factory=TimeoutConfig)
|
timeout_config: Optional[TimeoutConfig] = field(default_factory=TimeoutConfig)
|
||||||
@@ -152,6 +179,7 @@ class ClientConfig:
|
|||||||
id_delimiter: Optional[str] = None
|
id_delimiter: Optional[str] = None
|
||||||
tls_config: Optional[TlsConfig] = None
|
tls_config: Optional[TlsConfig] = None
|
||||||
header_provider: Optional["HeaderProvider"] = None
|
header_provider: Optional["HeaderProvider"] = None
|
||||||
|
user_id: Optional[str] = None
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
if isinstance(self.retry_config, dict):
|
if isinstance(self.retry_config, dict):
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from ..common import DATA
|
|||||||
from ..db import DBConnection, LOOP
|
from ..db import DBConnection, LOOP
|
||||||
from ..embeddings import EmbeddingFunctionConfig
|
from ..embeddings import EmbeddingFunctionConfig
|
||||||
from lance_namespace import (
|
from lance_namespace import (
|
||||||
|
LanceNamespace,
|
||||||
CreateNamespaceResponse,
|
CreateNamespaceResponse,
|
||||||
DescribeNamespaceResponse,
|
DescribeNamespaceResponse,
|
||||||
DropNamespaceResponse,
|
DropNamespaceResponse,
|
||||||
@@ -111,7 +112,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
@override
|
@override
|
||||||
def list_namespaces(
|
def list_namespaces(
|
||||||
self,
|
self,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
page_token: Optional[str] = None,
|
page_token: Optional[str] = None,
|
||||||
limit: Optional[int] = None,
|
limit: Optional[int] = None,
|
||||||
) -> ListNamespacesResponse:
|
) -> ListNamespacesResponse:
|
||||||
@@ -119,7 +120,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
namespace: List[str], optional
|
namespace_path: List[str], optional
|
||||||
The parent namespace to list namespaces in.
|
The parent namespace to list namespaces in.
|
||||||
None or empty list represents root namespace.
|
None or empty list represents root namespace.
|
||||||
page_token: str, optional
|
page_token: str, optional
|
||||||
@@ -133,18 +134,18 @@ class RemoteDBConnection(DBConnection):
|
|||||||
ListNamespacesResponse
|
ListNamespacesResponse
|
||||||
Response containing namespace names and optional page_token for pagination.
|
Response containing namespace names and optional page_token for pagination.
|
||||||
"""
|
"""
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
return LOOP.run(
|
return LOOP.run(
|
||||||
self._conn.list_namespaces(
|
self._conn.list_namespaces(
|
||||||
namespace=namespace, page_token=page_token, limit=limit
|
namespace_path=namespace_path, page_token=page_token, limit=limit
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def create_namespace(
|
def create_namespace(
|
||||||
self,
|
self,
|
||||||
namespace: List[str],
|
namespace_path: List[str],
|
||||||
mode: Optional[str] = None,
|
mode: Optional[str] = None,
|
||||||
properties: Optional[Dict[str, str]] = None,
|
properties: Optional[Dict[str, str]] = None,
|
||||||
) -> CreateNamespaceResponse:
|
) -> CreateNamespaceResponse:
|
||||||
@@ -152,7 +153,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
namespace: List[str]
|
namespace_path: List[str]
|
||||||
The namespace identifier to create.
|
The namespace identifier to create.
|
||||||
mode: str, optional
|
mode: str, optional
|
||||||
Creation mode - "create" (fail if exists), "exist_ok" (skip if exists),
|
Creation mode - "create" (fail if exists), "exist_ok" (skip if exists),
|
||||||
@@ -167,14 +168,14 @@ class RemoteDBConnection(DBConnection):
|
|||||||
"""
|
"""
|
||||||
return LOOP.run(
|
return LOOP.run(
|
||||||
self._conn.create_namespace(
|
self._conn.create_namespace(
|
||||||
namespace=namespace, mode=mode, properties=properties
|
namespace_path=namespace_path, mode=mode, properties=properties
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def drop_namespace(
|
def drop_namespace(
|
||||||
self,
|
self,
|
||||||
namespace: List[str],
|
namespace_path: List[str],
|
||||||
mode: Optional[str] = None,
|
mode: Optional[str] = None,
|
||||||
behavior: Optional[str] = None,
|
behavior: Optional[str] = None,
|
||||||
) -> DropNamespaceResponse:
|
) -> DropNamespaceResponse:
|
||||||
@@ -182,7 +183,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
namespace: List[str]
|
namespace_path: List[str]
|
||||||
The namespace identifier to drop.
|
The namespace identifier to drop.
|
||||||
mode: str, optional
|
mode: str, optional
|
||||||
Whether to skip if not exists ("SKIP") or fail ("FAIL"). Case insensitive.
|
Whether to skip if not exists ("SKIP") or fail ("FAIL"). Case insensitive.
|
||||||
@@ -196,16 +197,20 @@ class RemoteDBConnection(DBConnection):
|
|||||||
Response containing properties and transaction_id if applicable.
|
Response containing properties and transaction_id if applicable.
|
||||||
"""
|
"""
|
||||||
return LOOP.run(
|
return LOOP.run(
|
||||||
self._conn.drop_namespace(namespace=namespace, mode=mode, behavior=behavior)
|
self._conn.drop_namespace(
|
||||||
|
namespace_path=namespace_path, mode=mode, behavior=behavior
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def describe_namespace(self, namespace: List[str]) -> DescribeNamespaceResponse:
|
def describe_namespace(
|
||||||
|
self, namespace_path: List[str]
|
||||||
|
) -> DescribeNamespaceResponse:
|
||||||
"""Describe a namespace.
|
"""Describe a namespace.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
namespace: List[str]
|
namespace_path: List[str]
|
||||||
The namespace identifier to describe.
|
The namespace identifier to describe.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -213,12 +218,12 @@ class RemoteDBConnection(DBConnection):
|
|||||||
DescribeNamespaceResponse
|
DescribeNamespaceResponse
|
||||||
Response containing the namespace properties.
|
Response containing the namespace properties.
|
||||||
"""
|
"""
|
||||||
return LOOP.run(self._conn.describe_namespace(namespace=namespace))
|
return LOOP.run(self._conn.describe_namespace(namespace_path=namespace_path))
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def list_tables(
|
def list_tables(
|
||||||
self,
|
self,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
page_token: Optional[str] = None,
|
page_token: Optional[str] = None,
|
||||||
limit: Optional[int] = None,
|
limit: Optional[int] = None,
|
||||||
) -> ListTablesResponse:
|
) -> ListTablesResponse:
|
||||||
@@ -226,7 +231,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
namespace: List[str], optional
|
namespace_path: List[str], optional
|
||||||
The namespace to list tables in.
|
The namespace to list tables in.
|
||||||
None or empty list represents root namespace.
|
None or empty list represents root namespace.
|
||||||
page_token: str, optional
|
page_token: str, optional
|
||||||
@@ -240,11 +245,11 @@ class RemoteDBConnection(DBConnection):
|
|||||||
ListTablesResponse
|
ListTablesResponse
|
||||||
Response containing table names and optional page_token for pagination.
|
Response containing table names and optional page_token for pagination.
|
||||||
"""
|
"""
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
return LOOP.run(
|
return LOOP.run(
|
||||||
self._conn.list_tables(
|
self._conn.list_tables(
|
||||||
namespace=namespace, page_token=page_token, limit=limit
|
namespace_path=namespace_path, page_token=page_token, limit=limit
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -254,7 +259,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
page_token: Optional[str] = None,
|
page_token: Optional[str] = None,
|
||||||
limit: int = 10,
|
limit: int = 10,
|
||||||
*,
|
*,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
) -> Iterable[str]:
|
) -> Iterable[str]:
|
||||||
"""List the names of all tables in the database.
|
"""List the names of all tables in the database.
|
||||||
|
|
||||||
@@ -263,7 +268,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
namespace: List[str], default []
|
namespace_path: List[str], default []
|
||||||
The namespace to list tables in.
|
The namespace to list tables in.
|
||||||
Empty list represents root namespace.
|
Empty list represents root namespace.
|
||||||
page_token: str
|
page_token: str
|
||||||
@@ -282,11 +287,11 @@ class RemoteDBConnection(DBConnection):
|
|||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
return LOOP.run(
|
return LOOP.run(
|
||||||
self._conn.table_names(
|
self._conn.table_names(
|
||||||
namespace=namespace, start_after=page_token, limit=limit
|
namespace_path=namespace_path, start_after=page_token, limit=limit
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -295,7 +300,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
*,
|
*,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
index_cache_size: Optional[int] = None,
|
index_cache_size: Optional[int] = None,
|
||||||
) -> Table:
|
) -> Table:
|
||||||
@@ -305,7 +310,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
----------
|
----------
|
||||||
name: str
|
name: str
|
||||||
The name of the table.
|
The name of the table.
|
||||||
namespace: List[str], optional
|
namespace_path: List[str], optional
|
||||||
The namespace to open the table from.
|
The namespace to open the table from.
|
||||||
None or empty list represents root namespace.
|
None or empty list represents root namespace.
|
||||||
|
|
||||||
@@ -315,15 +320,15 @@ class RemoteDBConnection(DBConnection):
|
|||||||
"""
|
"""
|
||||||
from .table import RemoteTable
|
from .table import RemoteTable
|
||||||
|
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
if index_cache_size is not None:
|
if index_cache_size is not None:
|
||||||
logging.info(
|
logging.info(
|
||||||
"index_cache_size is ignored in LanceDb Cloud"
|
"index_cache_size is ignored in LanceDb Cloud"
|
||||||
" (there is no local cache to configure)"
|
" (there is no local cache to configure)"
|
||||||
)
|
)
|
||||||
|
|
||||||
table = LOOP.run(self._conn.open_table(name, namespace=namespace))
|
table = LOOP.run(self._conn.open_table(name, namespace_path=namespace_path))
|
||||||
return RemoteTable(table, self.db_name)
|
return RemoteTable(table, self.db_name)
|
||||||
|
|
||||||
def clone_table(
|
def clone_table(
|
||||||
@@ -331,7 +336,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
target_table_name: str,
|
target_table_name: str,
|
||||||
source_uri: str,
|
source_uri: str,
|
||||||
*,
|
*,
|
||||||
target_namespace: Optional[List[str]] = None,
|
target_namespace_path: Optional[List[str]] = None,
|
||||||
source_version: Optional[int] = None,
|
source_version: Optional[int] = None,
|
||||||
source_tag: Optional[str] = None,
|
source_tag: Optional[str] = None,
|
||||||
is_shallow: bool = True,
|
is_shallow: bool = True,
|
||||||
@@ -344,7 +349,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
The name of the target table to create.
|
The name of the target table to create.
|
||||||
source_uri: str
|
source_uri: str
|
||||||
The URI of the source table to clone from.
|
The URI of the source table to clone from.
|
||||||
target_namespace: List[str], optional
|
target_namespace_path: List[str], optional
|
||||||
The namespace for the target table.
|
The namespace for the target table.
|
||||||
None or empty list represents root namespace.
|
None or empty list represents root namespace.
|
||||||
source_version: int, optional
|
source_version: int, optional
|
||||||
@@ -361,13 +366,13 @@ class RemoteDBConnection(DBConnection):
|
|||||||
"""
|
"""
|
||||||
from .table import RemoteTable
|
from .table import RemoteTable
|
||||||
|
|
||||||
if target_namespace is None:
|
if target_namespace_path is None:
|
||||||
target_namespace = []
|
target_namespace_path = []
|
||||||
table = LOOP.run(
|
table = LOOP.run(
|
||||||
self._conn.clone_table(
|
self._conn.clone_table(
|
||||||
target_table_name,
|
target_table_name,
|
||||||
source_uri,
|
source_uri,
|
||||||
target_namespace=target_namespace,
|
target_namespace_path=target_namespace_path,
|
||||||
source_version=source_version,
|
source_version=source_version,
|
||||||
source_tag=source_tag,
|
source_tag=source_tag,
|
||||||
is_shallow=is_shallow,
|
is_shallow=is_shallow,
|
||||||
@@ -387,7 +392,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
exist_ok: bool = False,
|
exist_ok: bool = False,
|
||||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||||
*,
|
*,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
) -> Table:
|
) -> Table:
|
||||||
"""Create a [Table][lancedb.table.Table] in the database.
|
"""Create a [Table][lancedb.table.Table] in the database.
|
||||||
|
|
||||||
@@ -395,7 +400,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
----------
|
----------
|
||||||
name: str
|
name: str
|
||||||
The name of the table.
|
The name of the table.
|
||||||
namespace: List[str], optional
|
namespace_path: List[str], optional
|
||||||
The namespace to create the table in.
|
The namespace to create the table in.
|
||||||
None or empty list represents root namespace.
|
None or empty list represents root namespace.
|
||||||
data: The data to initialize the table, *optional*
|
data: The data to initialize the table, *optional*
|
||||||
@@ -495,8 +500,8 @@ class RemoteDBConnection(DBConnection):
|
|||||||
mode = "exist_ok"
|
mode = "exist_ok"
|
||||||
elif not mode:
|
elif not mode:
|
||||||
mode = "exist_ok"
|
mode = "exist_ok"
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
validate_table_name(name)
|
validate_table_name(name)
|
||||||
if embedding_functions is not None:
|
if embedding_functions is not None:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
@@ -511,7 +516,7 @@ class RemoteDBConnection(DBConnection):
|
|||||||
self._conn.create_table(
|
self._conn.create_table(
|
||||||
name,
|
name,
|
||||||
data,
|
data,
|
||||||
namespace=namespace,
|
namespace_path=namespace_path,
|
||||||
mode=mode,
|
mode=mode,
|
||||||
schema=schema,
|
schema=schema,
|
||||||
on_bad_vectors=on_bad_vectors,
|
on_bad_vectors=on_bad_vectors,
|
||||||
@@ -521,28 +526,28 @@ class RemoteDBConnection(DBConnection):
|
|||||||
return RemoteTable(table, self.db_name)
|
return RemoteTable(table, self.db_name)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def drop_table(self, name: str, namespace: Optional[List[str]] = None):
|
def drop_table(self, name: str, namespace_path: Optional[List[str]] = None):
|
||||||
"""Drop a table from the database.
|
"""Drop a table from the database.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
name: str
|
name: str
|
||||||
The name of the table.
|
The name of the table.
|
||||||
namespace: List[str], optional
|
namespace_path: List[str], optional
|
||||||
The namespace to drop the table from.
|
The namespace to drop the table from.
|
||||||
None or empty list represents root namespace.
|
None or empty list represents root namespace.
|
||||||
"""
|
"""
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
LOOP.run(self._conn.drop_table(name, namespace=namespace))
|
LOOP.run(self._conn.drop_table(name, namespace_path=namespace_path))
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def rename_table(
|
def rename_table(
|
||||||
self,
|
self,
|
||||||
cur_name: str,
|
cur_name: str,
|
||||||
new_name: str,
|
new_name: str,
|
||||||
cur_namespace: Optional[List[str]] = None,
|
cur_namespace_path: Optional[List[str]] = None,
|
||||||
new_namespace: Optional[List[str]] = None,
|
new_namespace_path: Optional[List[str]] = None,
|
||||||
):
|
):
|
||||||
"""Rename a table in the database.
|
"""Rename a table in the database.
|
||||||
|
|
||||||
@@ -553,19 +558,32 @@ class RemoteDBConnection(DBConnection):
|
|||||||
new_name: str
|
new_name: str
|
||||||
The new name of the table.
|
The new name of the table.
|
||||||
"""
|
"""
|
||||||
if cur_namespace is None:
|
if cur_namespace_path is None:
|
||||||
cur_namespace = []
|
cur_namespace_path = []
|
||||||
if new_namespace is None:
|
if new_namespace_path is None:
|
||||||
new_namespace = []
|
new_namespace_path = []
|
||||||
LOOP.run(
|
LOOP.run(
|
||||||
self._conn.rename_table(
|
self._conn.rename_table(
|
||||||
cur_name,
|
cur_name,
|
||||||
new_name,
|
new_name,
|
||||||
cur_namespace=cur_namespace,
|
cur_namespace_path=cur_namespace_path,
|
||||||
new_namespace=new_namespace,
|
new_namespace_path=new_namespace_path,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def namespace_client(self) -> LanceNamespace:
|
||||||
|
"""Get the equivalent namespace client for this connection.
|
||||||
|
|
||||||
|
Returns a RestNamespace with the same URI and authentication headers.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
LanceNamespace
|
||||||
|
The namespace client for this connection.
|
||||||
|
"""
|
||||||
|
return LOOP.run(self._conn.namespace_client())
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
"""Close the connection to the database."""
|
"""Close the connection to the database."""
|
||||||
self._client.close()
|
self._conn.close()
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
import logging
|
import logging
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import Dict, Iterable, List, Optional, Union, Literal
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Union, Literal
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from lancedb._lancedb import (
|
from lancedb._lancedb import (
|
||||||
@@ -35,6 +35,7 @@ import pyarrow as pa
|
|||||||
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
|
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||||
from lancedb.merge import LanceMergeInsertBuilder
|
from lancedb.merge import LanceMergeInsertBuilder
|
||||||
from lancedb.embeddings import EmbeddingFunctionRegistry
|
from lancedb.embeddings import EmbeddingFunctionRegistry
|
||||||
|
from lancedb.table import _normalize_progress
|
||||||
|
|
||||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
||||||
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
||||||
@@ -308,6 +309,7 @@ class RemoteTable(Table):
|
|||||||
mode: str = "append",
|
mode: str = "append",
|
||||||
on_bad_vectors: str = "error",
|
on_bad_vectors: str = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
|
progress: Optional[Union[bool, Callable, Any]] = None,
|
||||||
) -> AddResult:
|
) -> AddResult:
|
||||||
"""Add more data to the [Table](Table). It has the same API signature as
|
"""Add more data to the [Table](Table). It has the same API signature as
|
||||||
the OSS version.
|
the OSS version.
|
||||||
@@ -330,17 +332,29 @@ class RemoteTable(Table):
|
|||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
|
progress: bool, callable, or tqdm-like, optional
|
||||||
|
A callback or tqdm-compatible progress bar. See
|
||||||
|
:meth:`Table.add` for details.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
AddResult
|
AddResult
|
||||||
An object containing the new version number of the table after adding data.
|
An object containing the new version number of the table after adding data.
|
||||||
"""
|
"""
|
||||||
|
progress, owns = _normalize_progress(progress)
|
||||||
|
try:
|
||||||
return LOOP.run(
|
return LOOP.run(
|
||||||
self._table.add(
|
self._table.add(
|
||||||
data, mode=mode, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
data,
|
||||||
|
mode=mode,
|
||||||
|
on_bad_vectors=on_bad_vectors,
|
||||||
|
fill_value=fill_value,
|
||||||
|
progress=progress,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
finally:
|
||||||
|
if owns:
|
||||||
|
progress.close()
|
||||||
|
|
||||||
def search(
|
def search(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from functools import cached_property
|
|||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
Any,
|
Any,
|
||||||
|
Callable,
|
||||||
Dict,
|
Dict,
|
||||||
Iterable,
|
Iterable,
|
||||||
List,
|
List,
|
||||||
@@ -88,7 +89,6 @@ from .index import lang_mapping
|
|||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .db import LanceDBConnection
|
from .db import LanceDBConnection
|
||||||
from .io import StorageOptionsProvider
|
|
||||||
from ._lancedb import (
|
from ._lancedb import (
|
||||||
Table as LanceDBTable,
|
Table as LanceDBTable,
|
||||||
OptimizeStats,
|
OptimizeStats,
|
||||||
@@ -191,7 +191,7 @@ def _into_pyarrow_reader(
|
|||||||
f"Unknown data type {type(data)}. "
|
f"Unknown data type {type(data)}. "
|
||||||
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
"Supported types: list of dicts, pandas DataFrame, polars DataFrame, "
|
||||||
"pyarrow Table/RecordBatch, or Pydantic models. "
|
"pyarrow Table/RecordBatch, or Pydantic models. "
|
||||||
"See https://lancedb.com/docs/tables/ for examples."
|
"See https://docs.lancedb.com/tables/ for examples."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -270,15 +270,17 @@ def _sanitize_data(
|
|||||||
reader,
|
reader,
|
||||||
on_bad_vectors=on_bad_vectors,
|
on_bad_vectors=on_bad_vectors,
|
||||||
fill_value=fill_value,
|
fill_value=fill_value,
|
||||||
|
target_schema=target_schema,
|
||||||
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
if target_schema is None:
|
if target_schema is None:
|
||||||
target_schema, reader = _infer_target_schema(reader)
|
target_schema, reader = _infer_target_schema(reader)
|
||||||
|
|
||||||
if metadata:
|
if metadata:
|
||||||
new_metadata = target_schema.metadata or {}
|
target_schema = target_schema.with_metadata(
|
||||||
new_metadata = new_metadata.update(metadata)
|
_merge_metadata(target_schema.metadata, metadata)
|
||||||
target_schema = target_schema.with_metadata(new_metadata)
|
)
|
||||||
|
|
||||||
_validate_schema(target_schema)
|
_validate_schema(target_schema)
|
||||||
reader = _cast_to_target_schema(reader, target_schema, allow_subschema)
|
reader = _cast_to_target_schema(reader, target_schema, allow_subschema)
|
||||||
@@ -294,7 +296,7 @@ def _cast_to_target_schema(
|
|||||||
# pa.Table.cast expects field order not to be changed.
|
# pa.Table.cast expects field order not to be changed.
|
||||||
# Lance doesn't care about field order, so we don't need to rearrange fields
|
# Lance doesn't care about field order, so we don't need to rearrange fields
|
||||||
# to match the target schema. We just need to correctly cast the fields.
|
# to match the target schema. We just need to correctly cast the fields.
|
||||||
if reader.schema == target_schema:
|
if reader.schema.equals(target_schema, check_metadata=True):
|
||||||
# Fast path when the schemas are already the same
|
# Fast path when the schemas are already the same
|
||||||
return reader
|
return reader
|
||||||
|
|
||||||
@@ -314,7 +316,13 @@ def _cast_to_target_schema(
|
|||||||
def gen():
|
def gen():
|
||||||
for batch in reader:
|
for batch in reader:
|
||||||
# Table but not RecordBatch has cast.
|
# Table but not RecordBatch has cast.
|
||||||
yield pa.Table.from_batches([batch]).cast(reordered_schema).to_batches()[0]
|
cast_batches = (
|
||||||
|
pa.Table.from_batches([batch]).cast(reordered_schema).to_batches()
|
||||||
|
)
|
||||||
|
if cast_batches:
|
||||||
|
yield pa.RecordBatch.from_arrays(
|
||||||
|
cast_batches[0].columns, schema=reordered_schema
|
||||||
|
)
|
||||||
|
|
||||||
return pa.RecordBatchReader.from_batches(reordered_schema, gen())
|
return pa.RecordBatchReader.from_batches(reordered_schema, gen())
|
||||||
|
|
||||||
@@ -332,27 +340,37 @@ def _align_field_types(
|
|||||||
if target_field is None:
|
if target_field is None:
|
||||||
raise ValueError(f"Field '{field.name}' not found in target schema")
|
raise ValueError(f"Field '{field.name}' not found in target schema")
|
||||||
if pa.types.is_struct(target_field.type):
|
if pa.types.is_struct(target_field.type):
|
||||||
|
if pa.types.is_struct(field.type):
|
||||||
new_type = pa.struct(
|
new_type = pa.struct(
|
||||||
_align_field_types(
|
_align_field_types(
|
||||||
field.type.fields,
|
field.type.fields,
|
||||||
target_field.type.fields,
|
target_field.type.fields,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
new_type = target_field.type
|
||||||
elif pa.types.is_list(target_field.type):
|
elif pa.types.is_list(target_field.type):
|
||||||
|
if _is_list_like(field.type):
|
||||||
new_type = pa.list_(
|
new_type = pa.list_(
|
||||||
_align_field_types(
|
_align_field_types(
|
||||||
[field.type.value_field],
|
[field.type.value_field],
|
||||||
[target_field.type.value_field],
|
[target_field.type.value_field],
|
||||||
)[0]
|
)[0]
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
new_type = target_field.type
|
||||||
elif pa.types.is_large_list(target_field.type):
|
elif pa.types.is_large_list(target_field.type):
|
||||||
|
if _is_list_like(field.type):
|
||||||
new_type = pa.large_list(
|
new_type = pa.large_list(
|
||||||
_align_field_types(
|
_align_field_types(
|
||||||
[field.type.value_field],
|
[field.type.value_field],
|
||||||
[target_field.type.value_field],
|
[target_field.type.value_field],
|
||||||
)[0]
|
)[0]
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
new_type = target_field.type
|
||||||
elif pa.types.is_fixed_size_list(target_field.type):
|
elif pa.types.is_fixed_size_list(target_field.type):
|
||||||
|
if _is_list_like(field.type):
|
||||||
new_type = pa.list_(
|
new_type = pa.list_(
|
||||||
_align_field_types(
|
_align_field_types(
|
||||||
[field.type.value_field],
|
[field.type.value_field],
|
||||||
@@ -362,7 +380,11 @@ def _align_field_types(
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
new_type = target_field.type
|
new_type = target_field.type
|
||||||
new_fields.append(pa.field(field.name, new_type, field.nullable))
|
else:
|
||||||
|
new_type = target_field.type
|
||||||
|
new_fields.append(
|
||||||
|
pa.field(field.name, new_type, field.nullable, target_field.metadata)
|
||||||
|
)
|
||||||
return new_fields
|
return new_fields
|
||||||
|
|
||||||
|
|
||||||
@@ -440,6 +462,7 @@ def sanitize_create_table(
|
|||||||
schema = data.schema
|
schema = data.schema
|
||||||
|
|
||||||
if metadata:
|
if metadata:
|
||||||
|
metadata = _merge_metadata(schema.metadata, metadata)
|
||||||
schema = schema.with_metadata(metadata)
|
schema = schema.with_metadata(metadata)
|
||||||
# Need to apply metadata to the data as well
|
# Need to apply metadata to the data as well
|
||||||
if isinstance(data, pa.Table):
|
if isinstance(data, pa.Table):
|
||||||
@@ -492,9 +515,9 @@ def _append_vector_columns(
|
|||||||
vector columns to the table.
|
vector columns to the table.
|
||||||
"""
|
"""
|
||||||
if schema is None:
|
if schema is None:
|
||||||
metadata = metadata or {}
|
metadata = _merge_metadata(metadata)
|
||||||
else:
|
else:
|
||||||
metadata = schema.metadata or metadata or {}
|
metadata = _merge_metadata(schema.metadata, metadata)
|
||||||
functions = EmbeddingFunctionRegistry.get_instance().parse_functions(metadata)
|
functions = EmbeddingFunctionRegistry.get_instance().parse_functions(metadata)
|
||||||
|
|
||||||
if not functions:
|
if not functions:
|
||||||
@@ -556,6 +579,21 @@ def _table_uri(base: str, table_name: str) -> str:
|
|||||||
return join_uri(base, f"{table_name}.lance")
|
return join_uri(base, f"{table_name}.lance")
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_progress(progress):
|
||||||
|
"""Normalize a ``progress`` parameter for :meth:`Table.add`.
|
||||||
|
|
||||||
|
Returns ``(progress_obj, owns)`` where *owns* is True when we created a
|
||||||
|
tqdm bar that the caller must close.
|
||||||
|
"""
|
||||||
|
if progress is True:
|
||||||
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
|
return tqdm(unit=" rows"), True
|
||||||
|
if progress is False or progress is None:
|
||||||
|
return None, False
|
||||||
|
return progress, False
|
||||||
|
|
||||||
|
|
||||||
class Table(ABC):
|
class Table(ABC):
|
||||||
"""
|
"""
|
||||||
A Table is a collection of Records in a LanceDB Database.
|
A Table is a collection of Records in a LanceDB Database.
|
||||||
@@ -905,29 +943,26 @@ class Table(ABC):
|
|||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
field_names: str or list of str
|
field_names: str or list of str
|
||||||
The name(s) of the field to index.
|
The name of the field to index. Native FTS indexes can only be
|
||||||
If ``use_tantivy`` is False (default), only a single field name
|
created on a single field at a time. To search over multiple text
|
||||||
(str) is supported. To index multiple fields, create a separate
|
fields, create a separate FTS index for each field.
|
||||||
FTS index for each field.
|
|
||||||
replace: bool, default False
|
replace: bool, default False
|
||||||
If True, replace the existing index if it exists. Note that this is
|
If True, replace the existing index if it exists. Note that this is
|
||||||
not yet an atomic operation; the index will be temporarily
|
not yet an atomic operation; the index will be temporarily
|
||||||
unavailable while the new index is being created.
|
unavailable while the new index is being created.
|
||||||
writer_heap_size: int, default 1GB
|
writer_heap_size: int, default 1GB
|
||||||
Only available with use_tantivy=True
|
Deprecated legacy Tantivy parameter. Any value other than the
|
||||||
|
default raises an error.
|
||||||
ordering_field_names:
|
ordering_field_names:
|
||||||
A list of unsigned type fields to index to optionally order
|
Deprecated legacy Tantivy parameter. Setting this raises an error.
|
||||||
results on at search time.
|
|
||||||
only available with use_tantivy=True
|
|
||||||
tokenizer_name: str, default "default"
|
tokenizer_name: str, default "default"
|
||||||
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
|
A compatibility alias for native tokenizer configs. Can be "raw",
|
||||||
language code followed by "_stem". So for english it would be "en_stem".
|
"default" or the 2 letter language code followed by "_stem". So
|
||||||
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
|
for english it would be "en_stem".
|
||||||
use_tantivy: bool, default False
|
use_tantivy: bool, default False
|
||||||
If True, use the legacy full-text search implementation based on tantivy.
|
Deprecated legacy Tantivy parameter. Setting this to True raises an
|
||||||
If False, use the new full-text search implementation based on lance-index.
|
error.
|
||||||
with_position: bool, default False
|
with_position: bool, default False
|
||||||
Only available with use_tantivy=False
|
|
||||||
If False, do not store the positions of the terms in the text.
|
If False, do not store the positions of the terms in the text.
|
||||||
This can reduce the size of the index and improve indexing speed.
|
This can reduce the size of the index and improve indexing speed.
|
||||||
But it will raise an exception for phrase queries.
|
But it will raise an exception for phrase queries.
|
||||||
@@ -974,6 +1009,7 @@ class Table(ABC):
|
|||||||
mode: AddMode = "append",
|
mode: AddMode = "append",
|
||||||
on_bad_vectors: OnBadVectorsType = "error",
|
on_bad_vectors: OnBadVectorsType = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
|
progress: Optional[Union[bool, Callable, Any]] = None,
|
||||||
) -> AddResult:
|
) -> AddResult:
|
||||||
"""Add more data to the [Table](Table).
|
"""Add more data to the [Table](Table).
|
||||||
|
|
||||||
@@ -995,6 +1031,29 @@ class Table(ABC):
|
|||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
|
progress: bool, callable, or tqdm-like, optional
|
||||||
|
Progress reporting during the add operation. Can be:
|
||||||
|
|
||||||
|
- ``True`` to automatically create and display a tqdm progress
|
||||||
|
bar (requires ``tqdm`` to be installed)::
|
||||||
|
|
||||||
|
table.add(data, progress=True)
|
||||||
|
|
||||||
|
- A **callable** that receives a dict with keys ``output_rows``,
|
||||||
|
``output_bytes``, ``total_rows``, ``elapsed_seconds``,
|
||||||
|
``active_tasks``, ``total_tasks``, and ``done``::
|
||||||
|
|
||||||
|
def on_progress(p):
|
||||||
|
print(f"{p['output_rows']}/{p['total_rows']} rows, "
|
||||||
|
f"{p['active_tasks']}/{p['total_tasks']} workers")
|
||||||
|
table.add(data, progress=on_progress)
|
||||||
|
|
||||||
|
- A **tqdm-compatible** progress bar whose ``total`` and
|
||||||
|
``update()`` will be called automatically. The postfix shows
|
||||||
|
write throughput (MB/s) and active worker count::
|
||||||
|
|
||||||
|
with tqdm() as pbar:
|
||||||
|
table.add(data, progress=pbar)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -1684,6 +1743,16 @@ class Table(ABC):
|
|||||||
index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
|
index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
|
||||||
return (path, fs, index_exists)
|
return (path, fs, index_exists)
|
||||||
|
|
||||||
|
def _ensure_no_legacy_fts_index(self):
|
||||||
|
path, _, exists = self._get_fts_index_path()
|
||||||
|
if exists:
|
||||||
|
raise ValueError(
|
||||||
|
"Legacy Tantivy FTS index detected at "
|
||||||
|
f"{path}. Tantivy-based FTS has been removed. "
|
||||||
|
"Delete the legacy index and recreate it with "
|
||||||
|
"table.create_fts_index(...)."
|
||||||
|
)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def uses_v2_manifest_paths(self) -> bool:
|
def uses_v2_manifest_paths(self) -> bool:
|
||||||
"""
|
"""
|
||||||
@@ -1736,30 +1805,30 @@ class LanceTable(Table):
|
|||||||
connection: "LanceDBConnection",
|
connection: "LanceDBConnection",
|
||||||
name: str,
|
name: str,
|
||||||
*,
|
*,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
storage_options_provider: Optional["StorageOptionsProvider"] = None,
|
|
||||||
index_cache_size: Optional[int] = None,
|
index_cache_size: Optional[int] = None,
|
||||||
location: Optional[str] = None,
|
location: Optional[str] = None,
|
||||||
namespace_client: Optional[Any] = None,
|
namespace_client: Optional[Any] = None,
|
||||||
managed_versioning: Optional[bool] = None,
|
managed_versioning: Optional[bool] = None,
|
||||||
|
pushdown_operations: Optional[set] = None,
|
||||||
_async: AsyncTable = None,
|
_async: AsyncTable = None,
|
||||||
):
|
):
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
self._conn = connection
|
self._conn = connection
|
||||||
self._namespace = namespace
|
self._namespace_path = namespace_path
|
||||||
self._location = location # Store location for use in _dataset_path
|
self._location = location # Store location for use in _dataset_path
|
||||||
self._namespace_client = namespace_client
|
self._namespace_client = namespace_client
|
||||||
|
self._pushdown_operations = pushdown_operations or set()
|
||||||
if _async is not None:
|
if _async is not None:
|
||||||
self._table = _async
|
self._table = _async
|
||||||
else:
|
else:
|
||||||
self._table = LOOP.run(
|
self._table = LOOP.run(
|
||||||
connection._conn.open_table(
|
connection._conn.open_table(
|
||||||
name,
|
name,
|
||||||
namespace=namespace,
|
namespace_path=namespace_path,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
storage_options_provider=storage_options_provider,
|
|
||||||
index_cache_size=index_cache_size,
|
index_cache_size=index_cache_size,
|
||||||
location=location,
|
location=location,
|
||||||
namespace_client=namespace_client,
|
namespace_client=namespace_client,
|
||||||
@@ -1774,13 +1843,13 @@ class LanceTable(Table):
|
|||||||
@property
|
@property
|
||||||
def namespace(self) -> List[str]:
|
def namespace(self) -> List[str]:
|
||||||
"""Return the namespace path of the table."""
|
"""Return the namespace path of the table."""
|
||||||
return self._namespace
|
return self._namespace_path
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def id(self) -> str:
|
def id(self) -> str:
|
||||||
"""Return the full identifier of the table (namespace$name)."""
|
"""Return the full identifier of the table (namespace$name)."""
|
||||||
if self._namespace:
|
if self._namespace_path:
|
||||||
return "$".join(self._namespace + [self.name])
|
return "$".join(self._namespace_path + [self.name])
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -1801,26 +1870,26 @@ class LanceTable(Table):
|
|||||||
db,
|
db,
|
||||||
name,
|
name,
|
||||||
*,
|
*,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
storage_options_provider: Optional["StorageOptionsProvider"] = None,
|
|
||||||
index_cache_size: Optional[int] = None,
|
index_cache_size: Optional[int] = None,
|
||||||
location: Optional[str] = None,
|
location: Optional[str] = None,
|
||||||
namespace_client: Optional[Any] = None,
|
namespace_client: Optional[Any] = None,
|
||||||
managed_versioning: Optional[bool] = None,
|
managed_versioning: Optional[bool] = None,
|
||||||
|
pushdown_operations: Optional[set] = None,
|
||||||
):
|
):
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
tbl = cls(
|
tbl = cls(
|
||||||
db,
|
db,
|
||||||
name,
|
name,
|
||||||
namespace=namespace,
|
namespace_path=namespace_path,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
storage_options_provider=storage_options_provider,
|
|
||||||
index_cache_size=index_cache_size,
|
index_cache_size=index_cache_size,
|
||||||
location=location,
|
location=location,
|
||||||
namespace_client=namespace_client,
|
namespace_client=namespace_client,
|
||||||
managed_versioning=managed_versioning,
|
managed_versioning=managed_versioning,
|
||||||
|
pushdown_operations=pushdown_operations,
|
||||||
)
|
)
|
||||||
|
|
||||||
# check the dataset exists
|
# check the dataset exists
|
||||||
@@ -1853,11 +1922,11 @@ class LanceTable(Table):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if self._namespace_client is not None:
|
if self._namespace_client is not None:
|
||||||
table_id = self._namespace + [self.name]
|
table_id = self._namespace_path + [self.name]
|
||||||
return lance.dataset(
|
return lance.dataset(
|
||||||
version=self.version,
|
version=self.version,
|
||||||
storage_options=self._conn.storage_options,
|
storage_options=self._conn.storage_options,
|
||||||
namespace=self._namespace_client,
|
namespace_client=self._namespace_client,
|
||||||
table_id=table_id,
|
table_id=table_id,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
@@ -2343,13 +2412,34 @@ class LanceTable(Table):
|
|||||||
prefix_only: bool = False,
|
prefix_only: bool = False,
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
):
|
):
|
||||||
if not use_tantivy:
|
self._ensure_no_legacy_fts_index()
|
||||||
|
|
||||||
|
if use_tantivy:
|
||||||
|
raise ValueError(
|
||||||
|
"Tantivy-based FTS has been removed. "
|
||||||
|
"Remove use_tantivy and recreate the index with native FTS."
|
||||||
|
)
|
||||||
|
if ordering_field_names is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"ordering_field_names was only supported by the removed "
|
||||||
|
"Tantivy-based FTS implementation."
|
||||||
|
)
|
||||||
|
if writer_heap_size != 1024 * 1024 * 1024:
|
||||||
|
raise ValueError(
|
||||||
|
"writer_heap_size was only supported by the removed "
|
||||||
|
"Tantivy-based FTS implementation."
|
||||||
|
)
|
||||||
if not isinstance(field_names, str):
|
if not isinstance(field_names, str):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Native FTS indexes can only be created on a single field "
|
"Native FTS indexes can only be created on a single field "
|
||||||
"at a time. To search over multiple text fields, create a "
|
"at a time. To search over multiple text fields, create a "
|
||||||
"separate FTS index for each field."
|
"separate FTS index for each field."
|
||||||
)
|
)
|
||||||
|
if "." in field_names:
|
||||||
|
raise ValueError(
|
||||||
|
"Native FTS indexes can only be created on top-level fields. "
|
||||||
|
f"Received nested field path: {field_names!r}."
|
||||||
|
)
|
||||||
|
|
||||||
if tokenizer_name is None:
|
if tokenizer_name is None:
|
||||||
tokenizer_configs = {
|
tokenizer_configs = {
|
||||||
@@ -2372,12 +2462,6 @@ class LanceTable(Table):
|
|||||||
**tokenizer_configs,
|
**tokenizer_configs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# delete the existing legacy index if it exists
|
|
||||||
if replace:
|
|
||||||
path, fs, exist = self._get_fts_index_path()
|
|
||||||
if exist:
|
|
||||||
fs.delete_dir(path)
|
|
||||||
|
|
||||||
LOOP.run(
|
LOOP.run(
|
||||||
self._table.create_index(
|
self._table.create_index(
|
||||||
field_names,
|
field_names,
|
||||||
@@ -2386,42 +2470,6 @@ class LanceTable(Table):
|
|||||||
name=name,
|
name=name,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return
|
|
||||||
|
|
||||||
from .fts import create_index, populate_index
|
|
||||||
|
|
||||||
if isinstance(field_names, str):
|
|
||||||
field_names = [field_names]
|
|
||||||
|
|
||||||
if isinstance(ordering_field_names, str):
|
|
||||||
ordering_field_names = [ordering_field_names]
|
|
||||||
|
|
||||||
path, fs, exist = self._get_fts_index_path()
|
|
||||||
if exist:
|
|
||||||
if not replace:
|
|
||||||
raise ValueError("Index already exists. Use replace=True to overwrite.")
|
|
||||||
fs.delete_dir(path)
|
|
||||||
|
|
||||||
if not isinstance(fs, pa_fs.LocalFileSystem):
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Full-text search is only supported on the local filesystem"
|
|
||||||
)
|
|
||||||
|
|
||||||
if tokenizer_name is None:
|
|
||||||
tokenizer_name = "default"
|
|
||||||
index = create_index(
|
|
||||||
path,
|
|
||||||
field_names,
|
|
||||||
ordering_fields=ordering_field_names,
|
|
||||||
tokenizer_name=tokenizer_name,
|
|
||||||
)
|
|
||||||
populate_index(
|
|
||||||
index,
|
|
||||||
self,
|
|
||||||
field_names,
|
|
||||||
ordering_fields=ordering_field_names,
|
|
||||||
writer_heap_size=writer_heap_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
|
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
|
||||||
@@ -2492,6 +2540,7 @@ class LanceTable(Table):
|
|||||||
mode: AddMode = "append",
|
mode: AddMode = "append",
|
||||||
on_bad_vectors: OnBadVectorsType = "error",
|
on_bad_vectors: OnBadVectorsType = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
|
progress: Optional[Union[bool, Callable, Any]] = None,
|
||||||
) -> AddResult:
|
) -> AddResult:
|
||||||
"""Add data to the table.
|
"""Add data to the table.
|
||||||
If vector columns are missing and the table
|
If vector columns are missing and the table
|
||||||
@@ -2510,17 +2559,29 @@ class LanceTable(Table):
|
|||||||
One of "error", "drop", "fill", "null".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
|
progress: bool, callable, or tqdm-like, optional
|
||||||
|
A callback or tqdm-compatible progress bar. See
|
||||||
|
:meth:`Table.add` for details.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
int
|
int
|
||||||
The number of vectors in the table.
|
The number of vectors in the table.
|
||||||
"""
|
"""
|
||||||
|
progress, owns = _normalize_progress(progress)
|
||||||
|
try:
|
||||||
return LOOP.run(
|
return LOOP.run(
|
||||||
self._table.add(
|
self._table.add(
|
||||||
data, mode=mode, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
data,
|
||||||
|
mode=mode,
|
||||||
|
on_bad_vectors=on_bad_vectors,
|
||||||
|
fill_value=fill_value,
|
||||||
|
progress=progress,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
finally:
|
||||||
|
if owns:
|
||||||
|
progress.close()
|
||||||
|
|
||||||
def merge(
|
def merge(
|
||||||
self,
|
self,
|
||||||
@@ -2750,13 +2811,13 @@ class LanceTable(Table):
|
|||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||||
*,
|
*,
|
||||||
namespace: Optional[List[str]] = None,
|
namespace_path: Optional[List[str]] = None,
|
||||||
storage_options: Optional[Dict[str, str | bool]] = None,
|
storage_options: Optional[Dict[str, str | bool]] = None,
|
||||||
storage_options_provider: Optional["StorageOptionsProvider"] = None,
|
|
||||||
data_storage_version: Optional[str] = None,
|
data_storage_version: Optional[str] = None,
|
||||||
enable_v2_manifest_paths: Optional[bool] = None,
|
enable_v2_manifest_paths: Optional[bool] = None,
|
||||||
location: Optional[str] = None,
|
location: Optional[str] = None,
|
||||||
namespace_client: Optional[Any] = None,
|
namespace_client: Optional[Any] = None,
|
||||||
|
pushdown_operations: Optional[set] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create a new table.
|
Create a new table.
|
||||||
@@ -2811,13 +2872,14 @@ class LanceTable(Table):
|
|||||||
Deprecated. Set `storage_options` when connecting to the database and set
|
Deprecated. Set `storage_options` when connecting to the database and set
|
||||||
`new_table_enable_v2_manifest_paths` in the options.
|
`new_table_enable_v2_manifest_paths` in the options.
|
||||||
"""
|
"""
|
||||||
if namespace is None:
|
if namespace_path is None:
|
||||||
namespace = []
|
namespace_path = []
|
||||||
self = cls.__new__(cls)
|
self = cls.__new__(cls)
|
||||||
self._conn = db
|
self._conn = db
|
||||||
self._namespace = namespace
|
self._namespace_path = namespace_path
|
||||||
self._location = location
|
self._location = location
|
||||||
self._namespace_client = namespace_client
|
self._namespace_client = namespace_client
|
||||||
|
self._pushdown_operations = pushdown_operations or set()
|
||||||
|
|
||||||
if data_storage_version is not None:
|
if data_storage_version is not None:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
@@ -2850,10 +2912,10 @@ class LanceTable(Table):
|
|||||||
on_bad_vectors=on_bad_vectors,
|
on_bad_vectors=on_bad_vectors,
|
||||||
fill_value=fill_value,
|
fill_value=fill_value,
|
||||||
embedding_functions=embedding_functions,
|
embedding_functions=embedding_functions,
|
||||||
namespace=namespace,
|
namespace_path=namespace_path,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
storage_options_provider=storage_options_provider,
|
|
||||||
location=location,
|
location=location,
|
||||||
|
namespace_client=namespace_client,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return self
|
return self
|
||||||
@@ -2921,6 +2983,15 @@ class LanceTable(Table):
|
|||||||
batch_size: Optional[int] = None,
|
batch_size: Optional[int] = None,
|
||||||
timeout: Optional[timedelta] = None,
|
timeout: Optional[timedelta] = None,
|
||||||
) -> pa.RecordBatchReader:
|
) -> pa.RecordBatchReader:
|
||||||
|
if (
|
||||||
|
"QueryTable" in self._pushdown_operations
|
||||||
|
and self._namespace_client is not None
|
||||||
|
):
|
||||||
|
from lancedb.namespace import _execute_server_side_query
|
||||||
|
|
||||||
|
table_id = self._namespace_path + [self.name]
|
||||||
|
return _execute_server_side_query(self._namespace_client, table_id, query)
|
||||||
|
|
||||||
async_iter = LOOP.run(
|
async_iter = LOOP.run(
|
||||||
self._table._execute_query(query, batch_size=batch_size, timeout=timeout)
|
self._table._execute_query(query, batch_size=batch_size, timeout=timeout)
|
||||||
)
|
)
|
||||||
@@ -3150,43 +3221,157 @@ def _handle_bad_vectors(
|
|||||||
reader: pa.RecordBatchReader,
|
reader: pa.RecordBatchReader,
|
||||||
on_bad_vectors: Literal["error", "drop", "fill", "null"] = "error",
|
on_bad_vectors: Literal["error", "drop", "fill", "null"] = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
|
target_schema: Optional[pa.Schema] = None,
|
||||||
|
metadata: Optional[dict] = None,
|
||||||
) -> pa.RecordBatchReader:
|
) -> pa.RecordBatchReader:
|
||||||
vector_columns = []
|
vector_columns = _find_vector_columns(reader.schema, target_schema, metadata)
|
||||||
|
if not vector_columns:
|
||||||
|
return reader
|
||||||
|
|
||||||
for field in reader.schema:
|
output_schema = _vector_output_schema(reader.schema, vector_columns)
|
||||||
# They can provide a 'vector' column that isn't yet a FSL
|
|
||||||
named_vector_col = (
|
def gen():
|
||||||
(
|
for batch in reader:
|
||||||
pa.types.is_list(field.type)
|
pending_dims = []
|
||||||
or pa.types.is_large_list(field.type)
|
for vector_column in vector_columns:
|
||||||
or pa.types.is_fixed_size_list(field.type)
|
dim = vector_column["expected_dim"]
|
||||||
|
if target_schema is not None and dim is None:
|
||||||
|
dim = _infer_vector_dim(batch[vector_column["name"]])
|
||||||
|
pending_dims.append(vector_column)
|
||||||
|
batch = _handle_bad_vector_column(
|
||||||
|
batch,
|
||||||
|
vector_column_name=vector_column["name"],
|
||||||
|
on_bad_vectors=on_bad_vectors,
|
||||||
|
fill_value=fill_value,
|
||||||
|
expected_dim=dim,
|
||||||
|
expected_value_type=vector_column["expected_value_type"],
|
||||||
)
|
)
|
||||||
|
for vector_column in pending_dims:
|
||||||
|
if vector_column["expected_dim"] is None:
|
||||||
|
vector_column["expected_dim"] = _infer_vector_dim(
|
||||||
|
batch[vector_column["name"]]
|
||||||
|
)
|
||||||
|
if batch.schema.equals(output_schema, check_metadata=True):
|
||||||
|
yield batch
|
||||||
|
continue
|
||||||
|
|
||||||
|
cast_batches = (
|
||||||
|
pa.Table.from_batches([batch]).cast(output_schema).to_batches()
|
||||||
|
)
|
||||||
|
if cast_batches:
|
||||||
|
yield pa.RecordBatch.from_arrays(
|
||||||
|
cast_batches[0].columns,
|
||||||
|
schema=output_schema,
|
||||||
|
)
|
||||||
|
|
||||||
|
return pa.RecordBatchReader.from_batches(output_schema, gen())
|
||||||
|
|
||||||
|
|
||||||
|
def _find_vector_columns(
|
||||||
|
reader_schema: pa.Schema,
|
||||||
|
target_schema: Optional[pa.Schema],
|
||||||
|
metadata: Optional[dict],
|
||||||
|
) -> List[dict]:
|
||||||
|
if target_schema is None:
|
||||||
|
vector_columns = []
|
||||||
|
for field in reader_schema:
|
||||||
|
named_vector_col = (
|
||||||
|
_is_list_like(field.type)
|
||||||
and pa.types.is_floating(field.type.value_type)
|
and pa.types.is_floating(field.type.value_type)
|
||||||
and field.name == VECTOR_COLUMN_NAME
|
and field.name == VECTOR_COLUMN_NAME
|
||||||
)
|
)
|
||||||
# TODO: we're making an assumption that fixed size list of 10 or more
|
|
||||||
# is a vector column. This is definitely a bit hacky.
|
|
||||||
likely_vector_col = (
|
likely_vector_col = (
|
||||||
pa.types.is_fixed_size_list(field.type)
|
pa.types.is_fixed_size_list(field.type)
|
||||||
and pa.types.is_floating(field.type.value_type)
|
and pa.types.is_floating(field.type.value_type)
|
||||||
and (field.type.list_size >= 10)
|
and (field.type.list_size >= 10)
|
||||||
)
|
)
|
||||||
|
|
||||||
if named_vector_col or likely_vector_col:
|
if named_vector_col or likely_vector_col:
|
||||||
vector_columns.append(field.name)
|
vector_columns.append(
|
||||||
|
{
|
||||||
def gen():
|
"name": field.name,
|
||||||
for batch in reader:
|
"expected_dim": None,
|
||||||
for name in vector_columns:
|
"expected_value_type": None,
|
||||||
batch = _handle_bad_vector_column(
|
}
|
||||||
batch,
|
|
||||||
vector_column_name=name,
|
|
||||||
on_bad_vectors=on_bad_vectors,
|
|
||||||
fill_value=fill_value,
|
|
||||||
)
|
)
|
||||||
yield batch
|
return vector_columns
|
||||||
|
|
||||||
return pa.RecordBatchReader.from_batches(reader.schema, gen())
|
reader_column_names = set(reader_schema.names)
|
||||||
|
active_metadata = _merge_metadata(target_schema.metadata, metadata)
|
||||||
|
embedding_function_columns = set(
|
||||||
|
EmbeddingFunctionRegistry.get_instance().parse_functions(active_metadata).keys()
|
||||||
|
)
|
||||||
|
vector_columns = []
|
||||||
|
for field in target_schema:
|
||||||
|
if field.name not in reader_column_names:
|
||||||
|
continue
|
||||||
|
if not _is_list_like(field.type) or not pa.types.is_floating(
|
||||||
|
field.type.value_type
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
|
||||||
|
reader_field = reader_schema.field(field.name)
|
||||||
|
named_vector_col = (
|
||||||
|
field.name in embedding_function_columns
|
||||||
|
or field.name == VECTOR_COLUMN_NAME
|
||||||
|
or (field.name == "embedding" and pa.types.is_fixed_size_list(field.type))
|
||||||
|
)
|
||||||
|
typed_fixed_vector_col = (
|
||||||
|
pa.types.is_fixed_size_list(reader_field.type)
|
||||||
|
and pa.types.is_floating(reader_field.type.value_type)
|
||||||
|
and reader_field.type.list_size >= 10
|
||||||
|
)
|
||||||
|
|
||||||
|
if named_vector_col or typed_fixed_vector_col:
|
||||||
|
vector_columns.append(
|
||||||
|
{
|
||||||
|
"name": field.name,
|
||||||
|
"expected_dim": (
|
||||||
|
field.type.list_size
|
||||||
|
if pa.types.is_fixed_size_list(field.type)
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
"expected_value_type": field.type.value_type,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return vector_columns
|
||||||
|
|
||||||
|
|
||||||
|
def _vector_output_schema(
|
||||||
|
reader_schema: pa.Schema,
|
||||||
|
vector_columns: List[dict],
|
||||||
|
) -> pa.Schema:
|
||||||
|
columns_by_name = {column["name"]: column for column in vector_columns}
|
||||||
|
fields = []
|
||||||
|
for field in reader_schema:
|
||||||
|
column = columns_by_name.get(field.name)
|
||||||
|
if column is None:
|
||||||
|
output_type = field.type
|
||||||
|
else:
|
||||||
|
output_type = _vector_output_type(field, column)
|
||||||
|
fields.append(pa.field(field.name, output_type, field.nullable, field.metadata))
|
||||||
|
return pa.schema(fields, metadata=reader_schema.metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def _vector_output_type(field: pa.Field, vector_column: dict) -> pa.DataType:
|
||||||
|
if not _is_list_like(field.type):
|
||||||
|
return field.type
|
||||||
|
|
||||||
|
if vector_column["expected_value_type"] is not None and (
|
||||||
|
pa.types.is_null(field.type.value_type)
|
||||||
|
or pa.types.is_integer(field.type.value_type)
|
||||||
|
or pa.types.is_unsigned_integer(field.type.value_type)
|
||||||
|
):
|
||||||
|
return pa.list_(vector_column["expected_value_type"])
|
||||||
|
|
||||||
|
if (
|
||||||
|
vector_column["expected_dim"] is not None
|
||||||
|
and pa.types.is_fixed_size_list(field.type)
|
||||||
|
and field.type.list_size != vector_column["expected_dim"]
|
||||||
|
):
|
||||||
|
return pa.list_(field.type.value_type)
|
||||||
|
|
||||||
|
return field.type
|
||||||
|
|
||||||
|
|
||||||
def _handle_bad_vector_column(
|
def _handle_bad_vector_column(
|
||||||
@@ -3194,6 +3379,8 @@ def _handle_bad_vector_column(
|
|||||||
vector_column_name: str,
|
vector_column_name: str,
|
||||||
on_bad_vectors: str = "error",
|
on_bad_vectors: str = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
|
expected_dim: Optional[int] = None,
|
||||||
|
expected_value_type: Optional[pa.DataType] = None,
|
||||||
) -> pa.RecordBatch:
|
) -> pa.RecordBatch:
|
||||||
"""
|
"""
|
||||||
Ensure that the vector column exists and has type fixed_size_list(float)
|
Ensure that the vector column exists and has type fixed_size_list(float)
|
||||||
@@ -3210,14 +3397,39 @@ def _handle_bad_vector_column(
|
|||||||
fill_value: float, default 0.0
|
fill_value: float, default 0.0
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
"""
|
"""
|
||||||
|
position = data.column_names.index(vector_column_name)
|
||||||
vec_arr = data[vector_column_name]
|
vec_arr = data[vector_column_name]
|
||||||
|
if not _is_list_like(vec_arr.type):
|
||||||
|
return data
|
||||||
|
|
||||||
|
if (
|
||||||
|
expected_dim is not None
|
||||||
|
and pa.types.is_fixed_size_list(vec_arr.type)
|
||||||
|
and vec_arr.type.list_size != expected_dim
|
||||||
|
):
|
||||||
|
vec_arr = pa.array(vec_arr.to_pylist(), type=pa.list_(vec_arr.type.value_type))
|
||||||
|
data = data.set_column(position, vector_column_name, vec_arr)
|
||||||
|
|
||||||
|
if expected_value_type is not None and (
|
||||||
|
pa.types.is_integer(vec_arr.type.value_type)
|
||||||
|
or pa.types.is_unsigned_integer(vec_arr.type.value_type)
|
||||||
|
):
|
||||||
|
vec_arr = pa.array(vec_arr.to_pylist(), type=pa.list_(expected_value_type))
|
||||||
|
data = data.set_column(position, vector_column_name, vec_arr)
|
||||||
|
|
||||||
|
if pa.types.is_floating(vec_arr.type.value_type):
|
||||||
has_nan = has_nan_values(vec_arr)
|
has_nan = has_nan_values(vec_arr)
|
||||||
|
else:
|
||||||
|
has_nan = pa.array([False] * len(vec_arr))
|
||||||
|
|
||||||
if pa.types.is_fixed_size_list(vec_arr.type):
|
if expected_dim is not None:
|
||||||
|
dim = expected_dim
|
||||||
|
elif pa.types.is_fixed_size_list(vec_arr.type):
|
||||||
dim = vec_arr.type.list_size
|
dim = vec_arr.type.list_size
|
||||||
else:
|
else:
|
||||||
dim = _modal_list_size(vec_arr)
|
dim = _infer_vector_dim(vec_arr)
|
||||||
|
if dim is None:
|
||||||
|
return data
|
||||||
has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
|
has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
|
||||||
|
|
||||||
has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()
|
has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()
|
||||||
@@ -3255,13 +3467,12 @@ def _handle_bad_vector_column(
|
|||||||
)
|
)
|
||||||
vec_arr = pc.if_else(
|
vec_arr = pc.if_else(
|
||||||
is_bad,
|
is_bad,
|
||||||
pa.scalar([fill_value] * dim),
|
pa.scalar([fill_value] * dim, type=vec_arr.type),
|
||||||
vec_arr,
|
vec_arr,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid value for on_bad_vectors: {on_bad_vectors}")
|
raise ValueError(f"Invalid value for on_bad_vectors: {on_bad_vectors}")
|
||||||
|
|
||||||
position = data.column_names.index(vector_column_name)
|
|
||||||
return data.set_column(position, vector_column_name, vec_arr)
|
return data.set_column(position, vector_column_name, vec_arr)
|
||||||
|
|
||||||
|
|
||||||
@@ -3282,6 +3493,28 @@ def has_nan_values(arr: Union[pa.ListArray, pa.ChunkedArray]) -> pa.BooleanArray
|
|||||||
return pc.is_in(indices, has_nan_indices)
|
return pc.is_in(indices, has_nan_indices)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_list_like(data_type: pa.DataType) -> bool:
|
||||||
|
return (
|
||||||
|
pa.types.is_list(data_type)
|
||||||
|
or pa.types.is_large_list(data_type)
|
||||||
|
or pa.types.is_fixed_size_list(data_type)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_metadata(*metadata_dicts: Optional[dict]) -> dict:
|
||||||
|
merged = {}
|
||||||
|
for metadata in metadata_dicts:
|
||||||
|
if metadata is None:
|
||||||
|
continue
|
||||||
|
for key, value in metadata.items():
|
||||||
|
if isinstance(key, str):
|
||||||
|
key = key.encode("utf-8")
|
||||||
|
if isinstance(value, str):
|
||||||
|
value = value.encode("utf-8")
|
||||||
|
merged[key] = value
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
def _name_suggests_vector_column(field_name: str) -> bool:
|
def _name_suggests_vector_column(field_name: str) -> bool:
|
||||||
"""Check if a field name indicates a vector column."""
|
"""Check if a field name indicates a vector column."""
|
||||||
name_lower = field_name.lower()
|
name_lower = field_name.lower()
|
||||||
@@ -3349,6 +3582,16 @@ def _modal_list_size(arr: Union[pa.ListArray, pa.ChunkedArray]) -> int:
|
|||||||
return pc.mode(pc.list_value_length(arr))[0].as_py()["mode"]
|
return pc.mode(pc.list_value_length(arr))[0].as_py()["mode"]
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_vector_dim(arr: Union[pa.Array, pa.ChunkedArray]) -> Optional[int]:
|
||||||
|
if not _is_list_like(arr.type):
|
||||||
|
return None
|
||||||
|
lengths = pc.list_value_length(arr)
|
||||||
|
lengths = pc.filter(lengths, pc.greater(lengths, 0))
|
||||||
|
if len(lengths) == 0:
|
||||||
|
return None
|
||||||
|
return pc.mode(lengths)[0].as_py()["mode"]
|
||||||
|
|
||||||
|
|
||||||
def _validate_schema(schema: pa.Schema):
|
def _validate_schema(schema: pa.Schema):
|
||||||
"""
|
"""
|
||||||
Make sure the metadata is valid utf8
|
Make sure the metadata is valid utf8
|
||||||
@@ -3769,6 +4012,7 @@ class AsyncTable:
|
|||||||
mode: Optional[Literal["append", "overwrite"]] = "append",
|
mode: Optional[Literal["append", "overwrite"]] = "append",
|
||||||
on_bad_vectors: Optional[OnBadVectorsType] = None,
|
on_bad_vectors: Optional[OnBadVectorsType] = None,
|
||||||
fill_value: Optional[float] = None,
|
fill_value: Optional[float] = None,
|
||||||
|
progress: Optional[Union[bool, Callable, Any]] = None,
|
||||||
) -> AddResult:
|
) -> AddResult:
|
||||||
"""Add more data to the [Table](Table).
|
"""Add more data to the [Table](Table).
|
||||||
|
|
||||||
@@ -3790,6 +4034,9 @@ class AsyncTable:
|
|||||||
One of "error", "drop", "fill", "null".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
|
progress: callable or tqdm-like, optional
|
||||||
|
A callback or tqdm-compatible progress bar. See
|
||||||
|
:meth:`Table.add` for details.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
schema = await self.schema()
|
schema = await self.schema()
|
||||||
@@ -3800,7 +4047,13 @@ class AsyncTable:
|
|||||||
|
|
||||||
# _santitize_data is an old code path, but we will use it until the
|
# _santitize_data is an old code path, but we will use it until the
|
||||||
# new code path is ready.
|
# new code path is ready.
|
||||||
if on_bad_vectors != "error" or (
|
if mode == "overwrite":
|
||||||
|
# For overwrite, apply the same preprocessing as create_table
|
||||||
|
# so vector columns are inferred as FixedSizeList.
|
||||||
|
data, _ = sanitize_create_table(
|
||||||
|
data, None, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
||||||
|
)
|
||||||
|
elif on_bad_vectors != "error" or (
|
||||||
schema.metadata is not None and b"embedding_functions" in schema.metadata
|
schema.metadata is not None and b"embedding_functions" in schema.metadata
|
||||||
):
|
):
|
||||||
data = _sanitize_data(
|
data = _sanitize_data(
|
||||||
@@ -3813,8 +4066,9 @@ class AsyncTable:
|
|||||||
)
|
)
|
||||||
_register_optional_converters()
|
_register_optional_converters()
|
||||||
data = to_scannable(data)
|
data = to_scannable(data)
|
||||||
|
progress, owns = _normalize_progress(progress)
|
||||||
try:
|
try:
|
||||||
return await self._inner.add(data, mode or "append")
|
return await self._inner.add(data, mode or "append", progress=progress)
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
if "Cast error" in str(e):
|
if "Cast error" in str(e):
|
||||||
raise ValueError(e)
|
raise ValueError(e)
|
||||||
@@ -3822,6 +4076,9 @@ class AsyncTable:
|
|||||||
raise ValueError(e)
|
raise ValueError(e)
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
finally:
|
||||||
|
if owns:
|
||||||
|
progress.close()
|
||||||
|
|
||||||
def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
|
def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
|
||||||
"""
|
"""
|
||||||
@@ -4144,7 +4401,7 @@ class AsyncTable:
|
|||||||
async_query = async_query.offset(query.offset)
|
async_query = async_query.offset(query.offset)
|
||||||
if query.columns:
|
if query.columns:
|
||||||
async_query = async_query.select(query.columns)
|
async_query = async_query.select(query.columns)
|
||||||
if query.filter:
|
if query.filter is not None:
|
||||||
async_query = async_query.where(query.filter)
|
async_query = async_query.where(query.filter)
|
||||||
if query.fast_search:
|
if query.fast_search:
|
||||||
async_query = async_query.fast_search()
|
async_query = async_query.fast_search()
|
||||||
@@ -4751,7 +5008,16 @@ class IndexStatistics:
|
|||||||
num_indexed_rows: int
|
num_indexed_rows: int
|
||||||
num_unindexed_rows: int
|
num_unindexed_rows: int
|
||||||
index_type: Literal[
|
index_type: Literal[
|
||||||
"IVF_PQ", "IVF_HNSW_PQ", "IVF_HNSW_SQ", "FTS", "BTREE", "BITMAP", "LABEL_LIST"
|
"IVF_FLAT",
|
||||||
|
"IVF_SQ",
|
||||||
|
"IVF_PQ",
|
||||||
|
"IVF_RQ",
|
||||||
|
"IVF_HNSW_SQ",
|
||||||
|
"IVF_HNSW_PQ",
|
||||||
|
"FTS",
|
||||||
|
"BTREE",
|
||||||
|
"BITMAP",
|
||||||
|
"LABEL_LIST",
|
||||||
]
|
]
|
||||||
distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
|
distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
|
||||||
num_indices: Optional[int] = None
|
num_indices: Optional[int] = None
|
||||||
|
|||||||
@@ -180,7 +180,7 @@ def test_fts_fuzzy_query():
|
|||||||
),
|
),
|
||||||
mode="overwrite",
|
mode="overwrite",
|
||||||
)
|
)
|
||||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
table.create_fts_index("text", replace=True)
|
||||||
|
|
||||||
results = table.search(MatchQuery("foo", "text", fuzziness=1)).to_pandas()
|
results = table.search(MatchQuery("foo", "text", fuzziness=1)).to_pandas()
|
||||||
assert len(results) == 4
|
assert len(results) == 4
|
||||||
@@ -230,7 +230,7 @@ def test_fts_boost_query():
|
|||||||
),
|
),
|
||||||
mode="overwrite",
|
mode="overwrite",
|
||||||
)
|
)
|
||||||
table.create_fts_index("desc", use_tantivy=False, replace=True)
|
table.create_fts_index("desc", replace=True)
|
||||||
|
|
||||||
results = table.search(
|
results = table.search(
|
||||||
BoostQuery(
|
BoostQuery(
|
||||||
@@ -265,7 +265,7 @@ def test_fts_boolean_query(tmp_path):
|
|||||||
],
|
],
|
||||||
mode="overwrite",
|
mode="overwrite",
|
||||||
)
|
)
|
||||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
table.create_fts_index("text", replace=True)
|
||||||
|
|
||||||
# SHOULD
|
# SHOULD
|
||||||
results = table.search(
|
results = table.search(
|
||||||
@@ -319,9 +319,7 @@ def test_fts_native():
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
# passing `use_tantivy=False` to use lance FTS index
|
table.create_fts_index("text")
|
||||||
# `use_tantivy=True` by default
|
|
||||||
table.create_fts_index("text", use_tantivy=False)
|
|
||||||
table.search("puppy").limit(10).select(["text"]).to_list()
|
table.search("puppy").limit(10).select(["text"]).to_list()
|
||||||
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
|
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
|
||||||
# ...
|
# ...
|
||||||
@@ -332,7 +330,6 @@ def test_fts_native():
|
|||||||
# --8<-- [start:fts_config_folding]
|
# --8<-- [start:fts_config_folding]
|
||||||
table.create_fts_index(
|
table.create_fts_index(
|
||||||
"text",
|
"text",
|
||||||
use_tantivy=False,
|
|
||||||
language="French",
|
language="French",
|
||||||
stem=True,
|
stem=True,
|
||||||
ascii_folding=True,
|
ascii_folding=True,
|
||||||
@@ -346,7 +343,7 @@ def test_fts_native():
|
|||||||
table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
|
table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
|
||||||
# --8<-- [end:fts_postfiltering]
|
# --8<-- [end:fts_postfiltering]
|
||||||
# --8<-- [start:fts_with_position]
|
# --8<-- [start:fts_with_position]
|
||||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
table.create_fts_index("text", with_position=True, replace=True)
|
||||||
# --8<-- [end:fts_with_position]
|
# --8<-- [end:fts_with_position]
|
||||||
# --8<-- [start:fts_incremental_index]
|
# --8<-- [start:fts_incremental_index]
|
||||||
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
|
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@@ -14,8 +15,7 @@ import pytest
|
|||||||
from lancedb.pydantic import LanceModel, Vector
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_basic(tmp_path):
|
||||||
def test_basic(tmp_path, use_tantivy):
|
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
assert db.uri == str(tmp_path)
|
assert db.uri == str(tmp_path)
|
||||||
@@ -48,7 +48,7 @@ def test_basic(tmp_path, use_tantivy):
|
|||||||
assert len(rs) == 1
|
assert len(rs) == 1
|
||||||
assert rs["item"].iloc[0] == "foo"
|
assert rs["item"].iloc[0] == "foo"
|
||||||
|
|
||||||
table.create_fts_index("item", use_tantivy=use_tantivy)
|
table.create_fts_index("item")
|
||||||
rs = table.search("bar", query_type="fts").to_pandas()
|
rs = table.search("bar", query_type="fts").to_pandas()
|
||||||
assert len(rs) == 1
|
assert len(rs) == 1
|
||||||
assert rs["item"].iloc[0] == "bar"
|
assert rs["item"].iloc[0] == "bar"
|
||||||
@@ -183,8 +183,8 @@ def test_table_names(tmp_db: lancedb.DBConnection):
|
|||||||
result = list(tmp_db.table_names("test2", limit=2))
|
result = list(tmp_db.table_names("test2", limit=2))
|
||||||
assert result == ["test3"], f"Expected ['test3'], got {result}"
|
assert result == ["test3"], f"Expected ['test3'], got {result}"
|
||||||
|
|
||||||
# Test that namespace parameter can be passed as keyword
|
# Test that namespace_path parameter can be passed as keyword
|
||||||
result = list(tmp_db.table_names(namespace=[]))
|
result = list(tmp_db.table_names(namespace_path=[]))
|
||||||
assert len(result) == 3
|
assert len(result) == 3
|
||||||
|
|
||||||
|
|
||||||
@@ -896,42 +896,22 @@ def test_bypass_vector_index_sync(tmp_db: lancedb.DBConnection):
|
|||||||
|
|
||||||
|
|
||||||
def test_local_namespace_operations(tmp_path):
|
def test_local_namespace_operations(tmp_path):
|
||||||
"""Test that local mode namespace operations behave as expected."""
|
"""Test that local mode namespace operations work via directory namespace."""
|
||||||
# Create a local database connection
|
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
# Test list_namespaces returns empty list for root namespace
|
# Root namespace starts empty
|
||||||
namespaces = db.list_namespaces().namespaces
|
assert db.list_namespaces().namespaces == []
|
||||||
assert namespaces == []
|
|
||||||
|
|
||||||
# Test list_namespaces with non-empty namespace raises NotImplementedError
|
# Create and list child namespace
|
||||||
with pytest.raises(
|
db.create_namespace(["child"])
|
||||||
NotImplementedError,
|
assert "child" in db.list_namespaces().namespaces
|
||||||
match="Namespace operations are not supported for listing database",
|
|
||||||
):
|
|
||||||
db.list_namespaces(namespace=["test"])
|
|
||||||
|
|
||||||
|
# List namespaces under child
|
||||||
|
assert db.list_namespaces(namespace_path=["child"]).namespaces == []
|
||||||
|
|
||||||
def test_local_create_namespace_not_supported(tmp_path):
|
# Drop namespace
|
||||||
"""Test that create_namespace is not supported in local mode."""
|
db.drop_namespace(["child"])
|
||||||
db = lancedb.connect(tmp_path)
|
assert db.list_namespaces().namespaces == []
|
||||||
|
|
||||||
with pytest.raises(
|
|
||||||
NotImplementedError,
|
|
||||||
match="Namespace operations are not supported for listing database",
|
|
||||||
):
|
|
||||||
db.create_namespace(["test_namespace"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_local_drop_namespace_not_supported(tmp_path):
|
|
||||||
"""Test that drop_namespace is not supported in local mode."""
|
|
||||||
db = lancedb.connect(tmp_path)
|
|
||||||
|
|
||||||
with pytest.raises(
|
|
||||||
NotImplementedError,
|
|
||||||
match="Namespace operations are not supported for listing database",
|
|
||||||
):
|
|
||||||
db.drop_namespace(["test_namespace"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_clone_table_latest_version(tmp_path):
|
def test_clone_table_latest_version(tmp_path):
|
||||||
@@ -1048,3 +1028,59 @@ def test_clone_table_deep_clone_fails(tmp_path):
|
|||||||
source_uri = os.path.join(tmp_path, "source.lance")
|
source_uri = os.path.join(tmp_path, "source.lance")
|
||||||
with pytest.raises(Exception, match="Deep clone is not yet implemented"):
|
with pytest.raises(Exception, match="Deep clone is not yet implemented"):
|
||||||
db.clone_table("cloned", source_uri, is_shallow=False)
|
db.clone_table("cloned", source_uri, is_shallow=False)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||||
|
def test_namespace_client_native_storage(tmp_path):
|
||||||
|
"""Test namespace_client() returns DirectoryNamespace for native storage."""
|
||||||
|
from lance.namespace import DirectoryNamespace
|
||||||
|
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
ns_client = db.namespace_client()
|
||||||
|
|
||||||
|
assert isinstance(ns_client, DirectoryNamespace)
|
||||||
|
assert str(tmp_path) in ns_client.namespace_id()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||||
|
def test_namespace_client_with_storage_options(tmp_path):
|
||||||
|
"""Test namespace_client() preserves storage options."""
|
||||||
|
from lance.namespace import DirectoryNamespace
|
||||||
|
|
||||||
|
storage_options = {"timeout": "10s"}
|
||||||
|
db = lancedb.connect(tmp_path, storage_options=storage_options)
|
||||||
|
ns_client = db.namespace_client()
|
||||||
|
|
||||||
|
assert isinstance(ns_client, DirectoryNamespace)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||||
|
def test_namespace_client_operations(tmp_path):
|
||||||
|
"""Test that namespace_client() returns a functional namespace client."""
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
ns_client = db.namespace_client()
|
||||||
|
|
||||||
|
# Create a table through the main db connection
|
||||||
|
data = [{"id": 1, "text": "hello", "vector": [1.0, 2.0]}]
|
||||||
|
db.create_table("test_table", data=data)
|
||||||
|
|
||||||
|
# Verify the namespace client can see the table
|
||||||
|
from lance_namespace import ListTablesRequest
|
||||||
|
|
||||||
|
# id=[] means root namespace
|
||||||
|
response = ns_client.list_tables(ListTablesRequest(id=[]))
|
||||||
|
# Tables can be strings or objects with name attribute
|
||||||
|
table_names = [t.name if hasattr(t, "name") else t for t in response.tables]
|
||||||
|
assert "test_table" in table_names
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(sys.platform == "win32", reason="Namespace client issues")
|
||||||
|
def test_namespace_client_namespace_connection(tmp_path):
|
||||||
|
"""Test namespace_client() returns the backing client for namespace connections."""
|
||||||
|
from lance.namespace import DirectoryNamespace
|
||||||
|
|
||||||
|
db = lancedb.connect_namespace("dir", {"root": str(tmp_path)})
|
||||||
|
ns_client = db.namespace_client()
|
||||||
|
|
||||||
|
assert isinstance(ns_client, DirectoryNamespace)
|
||||||
|
assert str(tmp_path) in ns_client.namespace_id()
|
||||||
|
|||||||
@@ -546,3 +546,24 @@ def test_openai_no_retry_on_401(mock_sleep):
|
|||||||
assert mock_func.call_count == 1
|
assert mock_func.call_count == 1
|
||||||
# Verify that sleep was never called (no retries)
|
# Verify that sleep was never called (no retries)
|
||||||
assert mock_sleep.call_count == 0
|
assert mock_sleep.call_count == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_retrieve_downloads_image():
|
||||||
|
"""
|
||||||
|
Embedding functions like open-clip, siglip, and jinaai use url_retrieve()
|
||||||
|
to download images from HTTP URLs. For example, open_clip._to_pil() calls:
|
||||||
|
|
||||||
|
PIL_Image.open(io.BytesIO(url_retrieve(image)))
|
||||||
|
|
||||||
|
Verify that url_retrieve() can download an image and open it as PIL Image,
|
||||||
|
matching the real usage pattern in embedding functions.
|
||||||
|
"""
|
||||||
|
import io
|
||||||
|
|
||||||
|
Image = pytest.importorskip("PIL.Image")
|
||||||
|
from lancedb.embeddings.utils import url_retrieve
|
||||||
|
|
||||||
|
image_url = "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg"
|
||||||
|
image_bytes = url_retrieve(image_url)
|
||||||
|
img = Image.open(io.BytesIO(image_bytes))
|
||||||
|
assert img.size[0] > 0 and img.size[1] > 0
|
||||||
|
|||||||
@@ -36,9 +36,6 @@ import pytest
|
|||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
from utils import exception_output
|
from utils import exception_output
|
||||||
|
|
||||||
pytest.importorskip("lancedb.fts")
|
|
||||||
tantivy = pytest.importorskip("tantivy")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def table(tmp_path) -> ldb.table.LanceTable:
|
def table(tmp_path) -> ldb.table.LanceTable:
|
||||||
@@ -144,58 +141,53 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
|||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
||||||
def test_create_index(tmp_path):
|
@pytest.mark.parametrize(
|
||||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
("kwargs", "match"),
|
||||||
assert isinstance(index, tantivy.Index)
|
[
|
||||||
assert os.path.exists(str(tmp_path / "index"))
|
(
|
||||||
|
{"use_tantivy": True},
|
||||||
|
"Tantivy-based FTS has been removed",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
{"ordering_field_names": ["count"]},
|
||||||
|
"ordering_field_names was only supported",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
{"writer_heap_size": 128},
|
||||||
|
"writer_heap_size was only supported",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_reject_removed_tantivy_parameters(table, kwargs, match):
|
||||||
|
with pytest.raises(ValueError, match=match):
|
||||||
|
table.create_fts_index("text", **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def test_create_index_with_stemming(tmp_path, table):
|
def test_reject_legacy_tantivy_index(table):
|
||||||
index = ldb.fts.create_index(
|
path, _, _ = table._get_fts_index_path()
|
||||||
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
|
os.makedirs(path, exist_ok=True)
|
||||||
)
|
|
||||||
assert isinstance(index, tantivy.Index)
|
|
||||||
assert os.path.exists(str(tmp_path / "index"))
|
|
||||||
|
|
||||||
# Check stemming by running tokenizer on non empty table
|
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||||
table.create_fts_index("text", tokenizer_name="en_stem", use_tantivy=True)
|
table.search("puppy").limit(5).to_list()
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||||
|
table.create_fts_index("text")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
|
||||||
@pytest.mark.parametrize("with_position", [True, False])
|
@pytest.mark.parametrize("with_position", [True, False])
|
||||||
def test_create_inverted_index(table, use_tantivy, with_position):
|
def test_create_inverted_index(table, with_position):
|
||||||
if use_tantivy and not with_position:
|
|
||||||
pytest.skip("we don't support building a tantivy index without position")
|
|
||||||
table.create_fts_index(
|
table.create_fts_index(
|
||||||
"text",
|
"text",
|
||||||
use_tantivy=use_tantivy,
|
|
||||||
with_position=with_position,
|
with_position=with_position,
|
||||||
name="custom_fts_index",
|
name="custom_fts_index",
|
||||||
)
|
)
|
||||||
if not use_tantivy:
|
|
||||||
indices = table.list_indices()
|
indices = table.list_indices()
|
||||||
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
||||||
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
||||||
|
|
||||||
|
|
||||||
def test_populate_index(tmp_path, table):
|
def test_search_fts(table):
|
||||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
table.create_fts_index("text")
|
||||||
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
|
|
||||||
|
|
||||||
|
|
||||||
def test_search_index(tmp_path, table):
|
|
||||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
|
||||||
ldb.fts.populate_index(index, table, ["text"])
|
|
||||||
index.reload()
|
|
||||||
results = ldb.fts.search_index(index, query="puppy", limit=5)
|
|
||||||
assert len(results) == 2
|
|
||||||
assert len(results[0]) == 5 # row_ids
|
|
||||||
assert len(results[1]) == 5 # _score
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
|
||||||
def test_search_fts(table, use_tantivy):
|
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
|
||||||
results = table.search("puppy").select(["id", "text"]).limit(5).to_list()
|
results = table.search("puppy").select(["id", "text"]).limit(5).to_list()
|
||||||
assert len(results) == 5
|
assert len(results) == 5
|
||||||
assert len(results[0]) == 3 # id, text, _score
|
assert len(results[0]) == 3 # id, text, _score
|
||||||
@@ -204,7 +196,6 @@ def test_search_fts(table, use_tantivy):
|
|||||||
results = table.search("puppy").select(["id", "text"]).to_list()
|
results = table.search("puppy").select(["id", "text"]).to_list()
|
||||||
assert len(results) == 10
|
assert len(results) == 10
|
||||||
|
|
||||||
if not use_tantivy:
|
|
||||||
# Test with a query
|
# Test with a query
|
||||||
results = (
|
results = (
|
||||||
table.search(MatchQuery("puppy", "text"))
|
table.search(MatchQuery("puppy", "text"))
|
||||||
@@ -229,7 +220,7 @@ def test_search_fts(table, use_tantivy):
|
|||||||
assert len(results) == 5
|
assert len(results) == 5
|
||||||
|
|
||||||
# Test multi match query
|
# Test multi match query
|
||||||
table.create_fts_index("text2", use_tantivy=use_tantivy)
|
table.create_fts_index("text2")
|
||||||
results = (
|
results = (
|
||||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||||
.select(["id", "text"])
|
.select(["id", "text"])
|
||||||
@@ -318,13 +309,13 @@ async def test_fts_select_async(async_table):
|
|||||||
|
|
||||||
|
|
||||||
def test_search_fts_phrase_query(table):
|
def test_search_fts_phrase_query(table):
|
||||||
table.create_fts_index("text", use_tantivy=False, with_position=False)
|
table.create_fts_index("text", with_position=False)
|
||||||
try:
|
try:
|
||||||
phrase_results = table.search('"puppy runs"').limit(100).to_list()
|
phrase_results = table.search('"puppy runs"').limit(100).to_list()
|
||||||
assert False
|
assert False
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
table.create_fts_index("text", with_position=True, replace=True)
|
||||||
results = table.search("puppy").limit(100).to_list()
|
results = table.search("puppy").limit(100).to_list()
|
||||||
|
|
||||||
# Test with quotation marks
|
# Test with quotation marks
|
||||||
@@ -375,8 +366,8 @@ async def test_search_fts_phrase_query_async(async_table):
|
|||||||
|
|
||||||
|
|
||||||
def test_search_fts_specify_column(table):
|
def test_search_fts_specify_column(table):
|
||||||
table.create_fts_index("text", use_tantivy=False)
|
table.create_fts_index("text")
|
||||||
table.create_fts_index("text2", use_tantivy=False)
|
table.create_fts_index("text2")
|
||||||
|
|
||||||
results = table.search("puppy", fts_columns="text").limit(5).to_list()
|
results = table.search("puppy", fts_columns="text").limit(5).to_list()
|
||||||
assert len(results) == 5
|
assert len(results) == 5
|
||||||
@@ -470,42 +461,8 @@ async def test_search_fts_specify_column_async(async_table):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def test_search_ordering_field_index_table(tmp_path, table):
|
def test_create_index_from_table(tmp_path, table):
|
||||||
table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
|
table.create_fts_index("text")
|
||||||
rows = (
|
|
||||||
table.search("puppy", ordering_field_name="count")
|
|
||||||
.limit(20)
|
|
||||||
.select(["text", "count"])
|
|
||||||
.to_list()
|
|
||||||
)
|
|
||||||
for r in rows:
|
|
||||||
assert "puppy" in r["text"]
|
|
||||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
|
||||||
|
|
||||||
|
|
||||||
def test_search_ordering_field_index(tmp_path, table):
|
|
||||||
index = ldb.fts.create_index(
|
|
||||||
str(tmp_path / "index"), ["text"], ordering_fields=["count"]
|
|
||||||
)
|
|
||||||
|
|
||||||
ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
|
|
||||||
index.reload()
|
|
||||||
results = ldb.fts.search_index(
|
|
||||||
index, query="puppy", limit=5, ordering_field="count"
|
|
||||||
)
|
|
||||||
assert len(results) == 2
|
|
||||||
assert len(results[0]) == 5 # row_ids
|
|
||||||
assert len(results[1]) == 5 # _distance
|
|
||||||
rows = table.to_lance().take(results[0]).to_pylist()
|
|
||||||
|
|
||||||
for r in rows:
|
|
||||||
assert "puppy" in r["text"]
|
|
||||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
|
||||||
def test_create_index_from_table(tmp_path, table, use_tantivy):
|
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
|
||||||
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
|
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
|
||||||
assert len(df) <= 5
|
assert len(df) <= 5
|
||||||
assert "text" in df.columns
|
assert "text" in df.columns
|
||||||
@@ -525,36 +482,24 @@ def test_create_index_from_table(tmp_path, table, use_tantivy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
with pytest.raises(Exception, match="already exists"):
|
with pytest.raises(Exception, match="already exists"):
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
table.create_fts_index("text")
|
||||||
|
|
||||||
table.create_fts_index("text", replace=True, use_tantivy=use_tantivy)
|
table.create_fts_index("text", replace=True)
|
||||||
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
|
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_create_index_multiple_columns(tmp_path, table):
|
def test_create_index_multiple_columns(tmp_path, table):
|
||||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
with pytest.raises(ValueError, match="Native FTS indexes can only be created"):
|
||||||
df = table.search("puppy").limit(5).to_pandas()
|
table.create_fts_index(["text", "text2"])
|
||||||
assert len(df) == 5
|
|
||||||
assert "text" in df.columns
|
|
||||||
assert "text2" in df.columns
|
|
||||||
|
|
||||||
|
|
||||||
def test_empty_rs(tmp_path, table, mocker):
|
|
||||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
|
||||||
mocker.patch("lancedb.fts.search_index", return_value=([], []))
|
|
||||||
df = table.search("puppy").limit(5).to_pandas()
|
|
||||||
assert len(df) == 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_nested_schema(tmp_path, table):
|
def test_nested_schema(tmp_path, table):
|
||||||
table.create_fts_index("nested.text", use_tantivy=True)
|
with pytest.raises(ValueError, match="top-level fields"):
|
||||||
rs = table.search("puppy").limit(5).to_list()
|
table.create_fts_index("nested.text")
|
||||||
assert len(rs) == 5
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_search_index_with_filter(table):
|
||||||
def test_search_index_with_filter(table, use_tantivy):
|
table.create_fts_index("text")
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
|
||||||
orig_import = __import__
|
orig_import = __import__
|
||||||
|
|
||||||
def import_mock(name, *args):
|
def import_mock(name, *args):
|
||||||
@@ -584,8 +529,7 @@ def test_search_index_with_filter(table, use_tantivy):
|
|||||||
assert r["_rowid"] is not None
|
assert r["_rowid"] is not None
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_null_input(table):
|
||||||
def test_null_input(table, use_tantivy):
|
|
||||||
table.add(
|
table.add(
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
@@ -598,13 +542,12 @@ def test_null_input(table, use_tantivy):
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
table.create_fts_index("text")
|
||||||
|
|
||||||
|
|
||||||
def test_syntax(table):
|
def test_syntax(table):
|
||||||
# https://github.com/lancedb/lancedb/issues/769
|
# https://github.com/lancedb/lancedb/issues/769
|
||||||
table.create_fts_index("text", use_tantivy=True)
|
table.create_fts_index("text")
|
||||||
with pytest.raises(ValueError, match="Syntax Error"):
|
|
||||||
table.search("they could have been dogs OR").limit(10).to_list()
|
table.search("they could have been dogs OR").limit(10).to_list()
|
||||||
|
|
||||||
# these should work
|
# these should work
|
||||||
@@ -616,6 +559,7 @@ def test_syntax(table):
|
|||||||
).to_list()
|
).to_list()
|
||||||
|
|
||||||
# phrase queries
|
# phrase queries
|
||||||
|
table.create_fts_index("text", with_position=True, replace=True)
|
||||||
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
|
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
|
||||||
table.search('"they could have been dogs OR cats"').limit(10).to_list()
|
table.search('"they could have been dogs OR cats"').limit(10).to_list()
|
||||||
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
|
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
|
||||||
@@ -639,7 +583,7 @@ def test_language(mem_db: DBConnection):
|
|||||||
table = mem_db.create_table("test", data=data)
|
table = mem_db.create_table("test", data=data)
|
||||||
|
|
||||||
with pytest.raises(ValueError) as e:
|
with pytest.raises(ValueError) as e:
|
||||||
table.create_fts_index("text", use_tantivy=False, language="klingon")
|
table.create_fts_index("text", language="klingon")
|
||||||
|
|
||||||
assert exception_output(e) == (
|
assert exception_output(e) == (
|
||||||
"ValueError: LanceDB does not support the requested language: 'klingon'\n"
|
"ValueError: LanceDB does not support the requested language: 'klingon'\n"
|
||||||
@@ -650,7 +594,6 @@ def test_language(mem_db: DBConnection):
|
|||||||
|
|
||||||
table.create_fts_index(
|
table.create_fts_index(
|
||||||
"text",
|
"text",
|
||||||
use_tantivy=False,
|
|
||||||
language="French",
|
language="French",
|
||||||
stem=True,
|
stem=True,
|
||||||
ascii_folding=True,
|
ascii_folding=True,
|
||||||
@@ -690,7 +633,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
table = mem_db.create_table("test", data=data)
|
table = mem_db.create_table("test", data=data)
|
||||||
table.create_fts_index("text", use_tantivy=False, with_position=True)
|
table.create_fts_index("text", with_position=True)
|
||||||
|
|
||||||
res = table.search("lance").limit(5).to_list()
|
res = table.search("lance").limit(5).to_list()
|
||||||
assert len(res) == 3
|
assert len(res) == 3
|
||||||
@@ -702,7 +645,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
|||||||
def test_fts_ngram(mem_db: DBConnection):
|
def test_fts_ngram(mem_db: DBConnection):
|
||||||
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
||||||
table = mem_db.create_table("test", data=data)
|
table = mem_db.create_table("test", data=data)
|
||||||
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
|
table.create_fts_index("text", base_tokenizer="ngram")
|
||||||
|
|
||||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||||
assert len(results) == 2
|
assert len(results) == 2
|
||||||
@@ -721,7 +664,6 @@ def test_fts_ngram(mem_db: DBConnection):
|
|||||||
# test setting min_ngram_length and prefix_only
|
# test setting min_ngram_length and prefix_only
|
||||||
table.create_fts_index(
|
table.create_fts_index(
|
||||||
"text",
|
"text",
|
||||||
use_tantivy=False,
|
|
||||||
base_tokenizer="ngram",
|
base_tokenizer="ngram",
|
||||||
replace=True,
|
replace=True,
|
||||||
ngram_min_length=2,
|
ngram_min_length=2,
|
||||||
@@ -886,7 +828,7 @@ def test_fts_query_to_json():
|
|||||||
|
|
||||||
|
|
||||||
def test_fts_fast_search(table):
|
def test_fts_fast_search(table):
|
||||||
table.create_fts_index("text", use_tantivy=False)
|
table.create_fts_index("text")
|
||||||
|
|
||||||
# Insert some unindexed data
|
# Insert some unindexed data
|
||||||
table.add(
|
table.add(
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ def sync_table(tmpdir_factory) -> Table:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
table = db.create_table("test", data)
|
table = db.create_table("test", data)
|
||||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
table.create_fts_index("text", with_position=False)
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
||||||
@@ -192,7 +192,7 @@ def table_with_id(tmpdir_factory) -> Table:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
table = db.create_table("test_with_id", data)
|
table = db.create_table("test_with_id", data)
|
||||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
table.create_fts_index("text", with_position=False)
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
import random
|
import random
|
||||||
|
from typing import get_args, get_type_hints
|
||||||
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pytest
|
import pytest
|
||||||
@@ -22,6 +23,7 @@ from lancedb.index import (
|
|||||||
HnswSq,
|
HnswSq,
|
||||||
FTS,
|
FTS,
|
||||||
)
|
)
|
||||||
|
from lancedb.table import IndexStatistics
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
@pytest_asyncio.fixture
|
||||||
@@ -283,3 +285,23 @@ async def test_create_index_with_binary_vectors(binary_table: AsyncTable):
|
|||||||
for v in range(256):
|
for v in range(256):
|
||||||
res = await binary_table.query().nearest_to([v] * 128).to_arrow()
|
res = await binary_table.query().nearest_to([v] * 128).to_arrow()
|
||||||
assert res["id"][0].as_py() == v
|
assert res["id"][0].as_py() == v
|
||||||
|
|
||||||
|
|
||||||
|
def test_index_statistics_index_type_lists_all_supported_values():
|
||||||
|
expected_index_types = {
|
||||||
|
"IVF_FLAT",
|
||||||
|
"IVF_SQ",
|
||||||
|
"IVF_PQ",
|
||||||
|
"IVF_RQ",
|
||||||
|
"IVF_HNSW_SQ",
|
||||||
|
"IVF_HNSW_PQ",
|
||||||
|
"FTS",
|
||||||
|
"BTREE",
|
||||||
|
"BITMAP",
|
||||||
|
"LABEL_LIST",
|
||||||
|
}
|
||||||
|
|
||||||
|
assert (
|
||||||
|
set(get_args(get_type_hints(IndexStatistics)["index_type"]))
|
||||||
|
== expected_index_types
|
||||||
|
)
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import shutil
|
|||||||
import pytest
|
import pytest
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import lancedb
|
import lancedb
|
||||||
|
from lance_namespace.errors import NamespaceNotEmptyError, TableNotFoundError
|
||||||
|
|
||||||
|
|
||||||
class TestNamespaceConnection:
|
class TestNamespaceConnection:
|
||||||
@@ -32,6 +33,16 @@ class TestNamespaceConnection:
|
|||||||
# Initially no tables in root
|
# Initially no tables in root
|
||||||
assert len(list(db.table_names())) == 0
|
assert len(list(db.table_names())) == 0
|
||||||
|
|
||||||
|
def test_connect_via_connect_helper(self):
|
||||||
|
"""Connecting via lancedb.connect should delegate to namespace connection."""
|
||||||
|
db = lancedb.connect(
|
||||||
|
namespace_client_impl="dir",
|
||||||
|
namespace_client_properties={"root": self.temp_dir},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert isinstance(db, lancedb.LanceNamespaceDBConnection)
|
||||||
|
assert len(list(db.table_names())) == 0
|
||||||
|
|
||||||
def test_create_table_through_namespace(self):
|
def test_create_table_through_namespace(self):
|
||||||
"""Test creating a table through namespace."""
|
"""Test creating a table through namespace."""
|
||||||
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||||
@@ -49,14 +60,14 @@ class TestNamespaceConnection:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create empty table in child namespace
|
# Create empty table in child namespace
|
||||||
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
table = db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||||
assert table is not None
|
assert table is not None
|
||||||
assert table.name == "test_table"
|
assert table.name == "test_table"
|
||||||
assert table.namespace == ["test_ns"]
|
assert table.namespace == ["test_ns"]
|
||||||
assert table.id == "test_ns$test_table"
|
assert table.id == "test_ns$test_table"
|
||||||
|
|
||||||
# Table should appear in child namespace
|
# Table should appear in child namespace
|
||||||
table_names = list(db.table_names(namespace=["test_ns"]))
|
table_names = list(db.table_names(namespace_path=["test_ns"]))
|
||||||
assert "test_table" in table_names
|
assert "test_table" in table_names
|
||||||
assert len(table_names) == 1
|
assert len(table_names) == 1
|
||||||
|
|
||||||
@@ -79,10 +90,10 @@ class TestNamespaceConnection:
|
|||||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Open the table
|
# Open the table
|
||||||
table = db.open_table("test_table", namespace=["test_ns"])
|
table = db.open_table("test_table", namespace_path=["test_ns"])
|
||||||
assert table is not None
|
assert table is not None
|
||||||
assert table.name == "test_table"
|
assert table.name == "test_table"
|
||||||
assert table.namespace == ["test_ns"]
|
assert table.namespace == ["test_ns"]
|
||||||
@@ -107,31 +118,31 @@ class TestNamespaceConnection:
|
|||||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
db.create_table("table1", schema=schema, namespace=["test_ns"])
|
db.create_table("table1", schema=schema, namespace_path=["test_ns"])
|
||||||
db.create_table("table2", schema=schema, namespace=["test_ns"])
|
db.create_table("table2", schema=schema, namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify both tables exist in child namespace
|
# Verify both tables exist in child namespace
|
||||||
table_names = list(db.table_names(namespace=["test_ns"]))
|
table_names = list(db.table_names(namespace_path=["test_ns"]))
|
||||||
assert "table1" in table_names
|
assert "table1" in table_names
|
||||||
assert "table2" in table_names
|
assert "table2" in table_names
|
||||||
assert len(table_names) == 2
|
assert len(table_names) == 2
|
||||||
|
|
||||||
# Drop one table
|
# Drop one table
|
||||||
db.drop_table("table1", namespace=["test_ns"])
|
db.drop_table("table1", namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify only table2 remains
|
# Verify only table2 remains
|
||||||
table_names = list(db.table_names(namespace=["test_ns"]))
|
table_names = list(db.table_names(namespace_path=["test_ns"]))
|
||||||
assert "table1" not in table_names
|
assert "table1" not in table_names
|
||||||
assert "table2" in table_names
|
assert "table2" in table_names
|
||||||
assert len(table_names) == 1
|
assert len(table_names) == 1
|
||||||
|
|
||||||
# Drop the second table
|
# Drop the second table
|
||||||
db.drop_table("table2", namespace=["test_ns"])
|
db.drop_table("table2", namespace_path=["test_ns"])
|
||||||
assert len(list(db.table_names(namespace=["test_ns"]))) == 0
|
assert len(list(db.table_names(namespace_path=["test_ns"]))) == 0
|
||||||
|
|
||||||
# Should not be able to open dropped table
|
# Should not be able to open dropped table
|
||||||
with pytest.raises(RuntimeError):
|
with pytest.raises(TableNotFoundError):
|
||||||
db.open_table("table1", namespace=["test_ns"])
|
db.open_table("table1", namespace_path=["test_ns"])
|
||||||
|
|
||||||
def test_create_table_with_schema(self):
|
def test_create_table_with_schema(self):
|
||||||
"""Test creating a table with explicit schema through namespace."""
|
"""Test creating a table with explicit schema through namespace."""
|
||||||
@@ -150,7 +161,7 @@ class TestNamespaceConnection:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create table with schema in child namespace
|
# Create table with schema in child namespace
|
||||||
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
table = db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||||
assert table is not None
|
assert table is not None
|
||||||
assert table.namespace == ["test_ns"]
|
assert table.namespace == ["test_ns"]
|
||||||
|
|
||||||
@@ -174,7 +185,7 @@ class TestNamespaceConnection:
|
|||||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
db.create_table("old_name", schema=schema, namespace=["test_ns"])
|
db.create_table("old_name", schema=schema, namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Rename should raise NotImplementedError
|
# Rename should raise NotImplementedError
|
||||||
with pytest.raises(NotImplementedError, match="rename_table is not supported"):
|
with pytest.raises(NotImplementedError, match="rename_table is not supported"):
|
||||||
@@ -195,20 +206,20 @@ class TestNamespaceConnection:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
db.create_table(f"table{i}", schema=schema, namespace=["test_ns"])
|
db.create_table(f"table{i}", schema=schema, namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify tables exist in child namespace
|
# Verify tables exist in child namespace
|
||||||
assert len(list(db.table_names(namespace=["test_ns"]))) == 3
|
assert len(list(db.table_names(namespace_path=["test_ns"]))) == 3
|
||||||
|
|
||||||
# Drop all tables in child namespace
|
# Drop all tables in child namespace
|
||||||
db.drop_all_tables(namespace=["test_ns"])
|
db.drop_all_tables(namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify all tables are gone from child namespace
|
# Verify all tables are gone from child namespace
|
||||||
assert len(list(db.table_names(namespace=["test_ns"]))) == 0
|
assert len(list(db.table_names(namespace_path=["test_ns"]))) == 0
|
||||||
|
|
||||||
# Test that table_names works with keyword-only namespace parameter
|
# Test that table_names works with keyword-only namespace parameter
|
||||||
db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||||
result = list(db.table_names(namespace=["test_ns"]))
|
result = list(db.table_names(namespace_path=["test_ns"]))
|
||||||
assert "test_table" in result
|
assert "test_table" in result
|
||||||
|
|
||||||
def test_table_operations(self):
|
def test_table_operations(self):
|
||||||
@@ -226,7 +237,7 @@ class TestNamespaceConnection:
|
|||||||
pa.field("text", pa.string()),
|
pa.field("text", pa.string()),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
table = db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify empty table was created
|
# Verify empty table was created
|
||||||
result = table.to_pandas()
|
result = table.to_pandas()
|
||||||
@@ -297,25 +308,25 @@ class TestNamespaceConnection:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
table = db.create_table(
|
table = db.create_table(
|
||||||
"test_table", schema=schema, namespace=["test_namespace"]
|
"test_table", schema=schema, namespace_path=["test_namespace"]
|
||||||
)
|
)
|
||||||
assert table is not None
|
assert table is not None
|
||||||
|
|
||||||
# Verify table exists in namespace
|
# Verify table exists in namespace
|
||||||
tables_in_namespace = list(db.table_names(namespace=["test_namespace"]))
|
tables_in_namespace = list(db.table_names(namespace_path=["test_namespace"]))
|
||||||
assert "test_table" in tables_in_namespace
|
assert "test_table" in tables_in_namespace
|
||||||
assert len(tables_in_namespace) == 1
|
assert len(tables_in_namespace) == 1
|
||||||
|
|
||||||
# Open table from namespace
|
# Open table from namespace
|
||||||
table = db.open_table("test_table", namespace=["test_namespace"])
|
table = db.open_table("test_table", namespace_path=["test_namespace"])
|
||||||
assert table is not None
|
assert table is not None
|
||||||
assert table.name == "test_table"
|
assert table.name == "test_table"
|
||||||
|
|
||||||
# Drop table from namespace
|
# Drop table from namespace
|
||||||
db.drop_table("test_table", namespace=["test_namespace"])
|
db.drop_table("test_table", namespace_path=["test_namespace"])
|
||||||
|
|
||||||
# Verify table no longer exists in namespace
|
# Verify table no longer exists in namespace
|
||||||
tables_in_namespace = list(db.table_names(namespace=["test_namespace"]))
|
tables_in_namespace = list(db.table_names(namespace_path=["test_namespace"]))
|
||||||
assert len(tables_in_namespace) == 0
|
assert len(tables_in_namespace) == 0
|
||||||
|
|
||||||
# Drop namespace
|
# Drop namespace
|
||||||
@@ -337,14 +348,14 @@ class TestNamespaceConnection:
|
|||||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
db.create_table("test_table", schema=schema, namespace=["test_namespace"])
|
db.create_table("test_table", schema=schema, namespace_path=["test_namespace"])
|
||||||
|
|
||||||
# Try to drop namespace with tables - should fail
|
# Try to drop namespace with tables - should fail
|
||||||
with pytest.raises(RuntimeError, match="is not empty"):
|
with pytest.raises(NamespaceNotEmptyError):
|
||||||
db.drop_namespace(["test_namespace"])
|
db.drop_namespace(["test_namespace"])
|
||||||
|
|
||||||
# Drop table first
|
# Drop table first
|
||||||
db.drop_table("test_table", namespace=["test_namespace"])
|
db.drop_table("test_table", namespace_path=["test_namespace"])
|
||||||
|
|
||||||
# Now dropping namespace should work
|
# Now dropping namespace should work
|
||||||
db.drop_namespace(["test_namespace"])
|
db.drop_namespace(["test_namespace"])
|
||||||
@@ -367,10 +378,10 @@ class TestNamespaceConnection:
|
|||||||
|
|
||||||
# Create table with same name in both namespaces
|
# Create table with same name in both namespaces
|
||||||
table_a = db.create_table(
|
table_a = db.create_table(
|
||||||
"same_name_table", schema=schema, namespace=["namespace_a"]
|
"same_name_table", schema=schema, namespace_path=["namespace_a"]
|
||||||
)
|
)
|
||||||
table_b = db.create_table(
|
table_b = db.create_table(
|
||||||
"same_name_table", schema=schema, namespace=["namespace_b"]
|
"same_name_table", schema=schema, namespace_path=["namespace_b"]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add different data to each table
|
# Add different data to each table
|
||||||
@@ -388,7 +399,9 @@ class TestNamespaceConnection:
|
|||||||
table_b.add(data_b)
|
table_b.add(data_b)
|
||||||
|
|
||||||
# Verify data in namespace_a table
|
# Verify data in namespace_a table
|
||||||
opened_table_a = db.open_table("same_name_table", namespace=["namespace_a"])
|
opened_table_a = db.open_table(
|
||||||
|
"same_name_table", namespace_path=["namespace_a"]
|
||||||
|
)
|
||||||
result_a = opened_table_a.to_pandas().sort_values("id").reset_index(drop=True)
|
result_a = opened_table_a.to_pandas().sort_values("id").reset_index(drop=True)
|
||||||
assert len(result_a) == 2
|
assert len(result_a) == 2
|
||||||
assert result_a["id"].tolist() == [1, 2]
|
assert result_a["id"].tolist() == [1, 2]
|
||||||
@@ -399,7 +412,9 @@ class TestNamespaceConnection:
|
|||||||
assert [v.tolist() for v in result_a["vector"]] == [[1.0, 2.0], [3.0, 4.0]]
|
assert [v.tolist() for v in result_a["vector"]] == [[1.0, 2.0], [3.0, 4.0]]
|
||||||
|
|
||||||
# Verify data in namespace_b table
|
# Verify data in namespace_b table
|
||||||
opened_table_b = db.open_table("same_name_table", namespace=["namespace_b"])
|
opened_table_b = db.open_table(
|
||||||
|
"same_name_table", namespace_path=["namespace_b"]
|
||||||
|
)
|
||||||
result_b = opened_table_b.to_pandas().sort_values("id").reset_index(drop=True)
|
result_b = opened_table_b.to_pandas().sort_values("id").reset_index(drop=True)
|
||||||
assert len(result_b) == 3
|
assert len(result_b) == 3
|
||||||
assert result_b["id"].tolist() == [10, 20, 30]
|
assert result_b["id"].tolist() == [10, 20, 30]
|
||||||
@@ -419,8 +434,8 @@ class TestNamespaceConnection:
|
|||||||
assert "same_name_table" not in root_tables
|
assert "same_name_table" not in root_tables
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
db.drop_table("same_name_table", namespace=["namespace_a"])
|
db.drop_table("same_name_table", namespace_path=["namespace_a"])
|
||||||
db.drop_table("same_name_table", namespace=["namespace_b"])
|
db.drop_table("same_name_table", namespace_path=["namespace_b"])
|
||||||
db.drop_namespace(["namespace_a"])
|
db.drop_namespace(["namespace_a"])
|
||||||
db.drop_namespace(["namespace_b"])
|
db.drop_namespace(["namespace_b"])
|
||||||
|
|
||||||
@@ -448,6 +463,8 @@ class TestAsyncNamespaceConnection:
|
|||||||
table_names = await db.table_names()
|
table_names = await db.table_names()
|
||||||
assert len(list(table_names)) == 0
|
assert len(list(table_names)) == 0
|
||||||
|
|
||||||
|
# Async connect via namespace helper is not enabled yet.
|
||||||
|
|
||||||
async def test_create_table_async(self):
|
async def test_create_table_async(self):
|
||||||
"""Test creating a table asynchronously through namespace."""
|
"""Test creating a table asynchronously through namespace."""
|
||||||
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
|
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
|
||||||
@@ -466,13 +483,13 @@ class TestAsyncNamespaceConnection:
|
|||||||
|
|
||||||
# Create empty table in child namespace
|
# Create empty table in child namespace
|
||||||
table = await db.create_table(
|
table = await db.create_table(
|
||||||
"test_table", schema=schema, namespace=["test_ns"]
|
"test_table", schema=schema, namespace_path=["test_ns"]
|
||||||
)
|
)
|
||||||
assert table is not None
|
assert table is not None
|
||||||
assert isinstance(table, lancedb.AsyncTable)
|
assert isinstance(table, lancedb.AsyncTable)
|
||||||
|
|
||||||
# Table should appear in child namespace
|
# Table should appear in child namespace
|
||||||
table_names = await db.table_names(namespace=["test_ns"])
|
table_names = await db.table_names(namespace_path=["test_ns"])
|
||||||
assert "test_table" in list(table_names)
|
assert "test_table" in list(table_names)
|
||||||
|
|
||||||
async def test_open_table_async(self):
|
async def test_open_table_async(self):
|
||||||
@@ -489,10 +506,10 @@ class TestAsyncNamespaceConnection:
|
|||||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
await db.create_table("test_table", schema=schema, namespace=["test_ns"])
|
await db.create_table("test_table", schema=schema, namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Open the table
|
# Open the table
|
||||||
table = await db.open_table("test_table", namespace=["test_ns"])
|
table = await db.open_table("test_table", namespace_path=["test_ns"])
|
||||||
assert table is not None
|
assert table is not None
|
||||||
assert isinstance(table, lancedb.AsyncTable)
|
assert isinstance(table, lancedb.AsyncTable)
|
||||||
|
|
||||||
@@ -546,20 +563,20 @@ class TestAsyncNamespaceConnection:
|
|||||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
await db.create_table("table1", schema=schema, namespace=["test_ns"])
|
await db.create_table("table1", schema=schema, namespace_path=["test_ns"])
|
||||||
await db.create_table("table2", schema=schema, namespace=["test_ns"])
|
await db.create_table("table2", schema=schema, namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify both tables exist in child namespace
|
# Verify both tables exist in child namespace
|
||||||
table_names = list(await db.table_names(namespace=["test_ns"]))
|
table_names = list(await db.table_names(namespace_path=["test_ns"]))
|
||||||
assert "table1" in table_names
|
assert "table1" in table_names
|
||||||
assert "table2" in table_names
|
assert "table2" in table_names
|
||||||
assert len(table_names) == 2
|
assert len(table_names) == 2
|
||||||
|
|
||||||
# Drop one table
|
# Drop one table
|
||||||
await db.drop_table("table1", namespace=["test_ns"])
|
await db.drop_table("table1", namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify only table2 remains
|
# Verify only table2 remains
|
||||||
table_names = list(await db.table_names(namespace=["test_ns"]))
|
table_names = list(await db.table_names(namespace_path=["test_ns"]))
|
||||||
assert "table1" not in table_names
|
assert "table1" not in table_names
|
||||||
assert "table2" in table_names
|
assert "table2" in table_names
|
||||||
assert len(table_names) == 1
|
assert len(table_names) == 1
|
||||||
@@ -588,20 +605,24 @@ class TestAsyncNamespaceConnection:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
table = await db.create_table(
|
table = await db.create_table(
|
||||||
"test_table", schema=schema, namespace=["test_namespace"]
|
"test_table", schema=schema, namespace_path=["test_namespace"]
|
||||||
)
|
)
|
||||||
assert table is not None
|
assert table is not None
|
||||||
|
|
||||||
# Verify table exists in namespace
|
# Verify table exists in namespace
|
||||||
tables_in_namespace = list(await db.table_names(namespace=["test_namespace"]))
|
tables_in_namespace = list(
|
||||||
|
await db.table_names(namespace_path=["test_namespace"])
|
||||||
|
)
|
||||||
assert "test_table" in tables_in_namespace
|
assert "test_table" in tables_in_namespace
|
||||||
assert len(tables_in_namespace) == 1
|
assert len(tables_in_namespace) == 1
|
||||||
|
|
||||||
# Drop table from namespace
|
# Drop table from namespace
|
||||||
await db.drop_table("test_table", namespace=["test_namespace"])
|
await db.drop_table("test_table", namespace_path=["test_namespace"])
|
||||||
|
|
||||||
# Verify table no longer exists in namespace
|
# Verify table no longer exists in namespace
|
||||||
tables_in_namespace = list(await db.table_names(namespace=["test_namespace"]))
|
tables_in_namespace = list(
|
||||||
|
await db.table_names(namespace_path=["test_namespace"])
|
||||||
|
)
|
||||||
assert len(tables_in_namespace) == 0
|
assert len(tables_in_namespace) == 0
|
||||||
|
|
||||||
# Drop namespace
|
# Drop namespace
|
||||||
@@ -626,15 +647,98 @@ class TestAsyncNamespaceConnection:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
await db.create_table(f"table{i}", schema=schema, namespace=["test_ns"])
|
await db.create_table(
|
||||||
|
f"table{i}", schema=schema, namespace_path=["test_ns"]
|
||||||
|
)
|
||||||
|
|
||||||
# Verify tables exist in child namespace
|
# Verify tables exist in child namespace
|
||||||
table_names = await db.table_names(namespace=["test_ns"])
|
table_names = await db.table_names(namespace_path=["test_ns"])
|
||||||
assert len(list(table_names)) == 3
|
assert len(list(table_names)) == 3
|
||||||
|
|
||||||
# Drop all tables in child namespace
|
# Drop all tables in child namespace
|
||||||
await db.drop_all_tables(namespace=["test_ns"])
|
await db.drop_all_tables(namespace_path=["test_ns"])
|
||||||
|
|
||||||
# Verify all tables are gone from child namespace
|
# Verify all tables are gone from child namespace
|
||||||
table_names = await db.table_names(namespace=["test_ns"])
|
table_names = await db.table_names(namespace_path=["test_ns"])
|
||||||
assert len(list(table_names)) == 0
|
assert len(list(table_names)) == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestPushdownOperations:
|
||||||
|
"""Test pushdown operations on namespace connections."""
|
||||||
|
|
||||||
|
def setup_method(self):
|
||||||
|
"""Set up test fixtures."""
|
||||||
|
self.temp_dir = tempfile.mkdtemp()
|
||||||
|
|
||||||
|
def teardown_method(self):
|
||||||
|
"""Clean up test fixtures."""
|
||||||
|
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def test_query_table_pushdown_stored(self):
|
||||||
|
"""Test that QueryTable pushdown is stored on sync connection."""
|
||||||
|
db = lancedb.connect_namespace(
|
||||||
|
"dir",
|
||||||
|
{"root": self.temp_dir},
|
||||||
|
namespace_client_pushdown_operations=["QueryTable"],
|
||||||
|
)
|
||||||
|
assert "QueryTable" in db._namespace_client_pushdown_operations
|
||||||
|
|
||||||
|
def test_create_table_pushdown_stored(self):
|
||||||
|
"""Test that CreateTable pushdown is stored on sync connection."""
|
||||||
|
db = lancedb.connect_namespace(
|
||||||
|
"dir",
|
||||||
|
{"root": self.temp_dir},
|
||||||
|
namespace_client_pushdown_operations=["CreateTable"],
|
||||||
|
)
|
||||||
|
assert "CreateTable" in db._namespace_client_pushdown_operations
|
||||||
|
|
||||||
|
def test_both_pushdowns_stored(self):
|
||||||
|
"""Test that both pushdown operations can be set together."""
|
||||||
|
db = lancedb.connect_namespace(
|
||||||
|
"dir",
|
||||||
|
{"root": self.temp_dir},
|
||||||
|
namespace_client_pushdown_operations=["QueryTable", "CreateTable"],
|
||||||
|
)
|
||||||
|
assert "QueryTable" in db._namespace_client_pushdown_operations
|
||||||
|
assert "CreateTable" in db._namespace_client_pushdown_operations
|
||||||
|
|
||||||
|
def test_pushdown_defaults_to_empty(self):
|
||||||
|
"""Test that pushdown operations default to empty."""
|
||||||
|
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
|
||||||
|
assert len(db._namespace_client_pushdown_operations) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
class TestAsyncPushdownOperations:
|
||||||
|
"""Test pushdown operations on async namespace connections."""
|
||||||
|
|
||||||
|
def setup_method(self):
|
||||||
|
"""Set up test fixtures."""
|
||||||
|
self.temp_dir = tempfile.mkdtemp()
|
||||||
|
|
||||||
|
def teardown_method(self):
|
||||||
|
"""Clean up test fixtures."""
|
||||||
|
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
async def test_async_query_table_pushdown_stored(self):
|
||||||
|
"""Test that QueryTable pushdown is stored on async connection."""
|
||||||
|
db = lancedb.connect_namespace_async(
|
||||||
|
"dir",
|
||||||
|
{"root": self.temp_dir},
|
||||||
|
namespace_client_pushdown_operations=["QueryTable"],
|
||||||
|
)
|
||||||
|
assert "QueryTable" in db._namespace_client_pushdown_operations
|
||||||
|
|
||||||
|
async def test_async_create_table_pushdown_stored(self):
|
||||||
|
"""Test that CreateTable pushdown is stored on async connection."""
|
||||||
|
db = lancedb.connect_namespace_async(
|
||||||
|
"dir",
|
||||||
|
{"root": self.temp_dir},
|
||||||
|
namespace_client_pushdown_operations=["CreateTable"],
|
||||||
|
)
|
||||||
|
assert "CreateTable" in db._namespace_client_pushdown_operations
|
||||||
|
|
||||||
|
async def test_async_pushdown_defaults_to_empty(self):
|
||||||
|
"""Test that pushdown operations default to empty on async connection."""
|
||||||
|
db = lancedb.connect_namespace_async("dir", {"root": self.temp_dir})
|
||||||
|
assert len(db._namespace_client_pushdown_operations) == 0
|
||||||
|
|||||||
@@ -4,9 +4,11 @@
|
|||||||
"""
|
"""
|
||||||
Integration tests for LanceDB Namespace with S3 and credential refresh.
|
Integration tests for LanceDB Namespace with S3 and credential refresh.
|
||||||
|
|
||||||
This test simulates a namespace server that returns incrementing credentials
|
This test uses DirectoryNamespace with native ops_metrics and vend_input_storage_options
|
||||||
and verifies that the credential refresh mechanism works correctly for both
|
features to track API calls and test credential refresh mechanisms.
|
||||||
create_table and open_table operations.
|
|
||||||
|
Tests are parameterized to run with both DirectoryNamespace and a CustomNamespace
|
||||||
|
wrapper to verify Python-Rust binding works correctly for custom implementations.
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
- Storage options provider is auto-created and used
|
- Storage options provider is auto-created and used
|
||||||
@@ -16,24 +18,141 @@ Tests verify:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from threading import Lock
|
from typing import Dict, Optional
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pytest
|
import pytest
|
||||||
from lance_namespace import (
|
from lance.namespace import (
|
||||||
CreateEmptyTableRequest,
|
|
||||||
CreateEmptyTableResponse,
|
|
||||||
DeclareTableRequest,
|
DeclareTableRequest,
|
||||||
DeclareTableResponse,
|
DeclareTableResponse,
|
||||||
DescribeTableRequest,
|
DescribeTableRequest,
|
||||||
DescribeTableResponse,
|
DescribeTableResponse,
|
||||||
|
DirectoryNamespace,
|
||||||
LanceNamespace,
|
LanceNamespace,
|
||||||
)
|
)
|
||||||
|
from lance_namespace import (
|
||||||
|
CreateNamespaceRequest,
|
||||||
|
CreateNamespaceResponse,
|
||||||
|
CreateTableRequest,
|
||||||
|
CreateTableResponse,
|
||||||
|
CreateTableVersionRequest,
|
||||||
|
CreateTableVersionResponse,
|
||||||
|
DeregisterTableRequest,
|
||||||
|
DeregisterTableResponse,
|
||||||
|
DescribeNamespaceRequest,
|
||||||
|
DescribeNamespaceResponse,
|
||||||
|
DescribeTableVersionRequest,
|
||||||
|
DescribeTableVersionResponse,
|
||||||
|
DropNamespaceRequest,
|
||||||
|
DropNamespaceResponse,
|
||||||
|
DropTableRequest,
|
||||||
|
DropTableResponse,
|
||||||
|
ListNamespacesRequest,
|
||||||
|
ListNamespacesResponse,
|
||||||
|
ListTablesRequest,
|
||||||
|
ListTablesResponse,
|
||||||
|
ListTableVersionsRequest,
|
||||||
|
ListTableVersionsResponse,
|
||||||
|
NamespaceExistsRequest,
|
||||||
|
RegisterTableRequest,
|
||||||
|
RegisterTableResponse,
|
||||||
|
TableExistsRequest,
|
||||||
|
)
|
||||||
from lancedb.namespace import LanceNamespaceDBConnection
|
from lancedb.namespace import LanceNamespaceDBConnection
|
||||||
|
|
||||||
|
|
||||||
|
class CustomNamespace(LanceNamespace):
|
||||||
|
"""A custom namespace wrapper that delegates to DirectoryNamespace.
|
||||||
|
|
||||||
|
This class verifies that the Python-Rust binding works correctly for
|
||||||
|
custom namespace implementations that wrap the native DirectoryNamespace.
|
||||||
|
All methods simply delegate to the underlying DirectoryNamespace instance.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, inner: DirectoryNamespace):
|
||||||
|
self._inner = inner
|
||||||
|
|
||||||
|
def namespace_id(self) -> str:
|
||||||
|
return f"CustomNamespace[{self._inner.namespace_id()}]"
|
||||||
|
|
||||||
|
def create_namespace(
|
||||||
|
self, request: CreateNamespaceRequest
|
||||||
|
) -> CreateNamespaceResponse:
|
||||||
|
return self._inner.create_namespace(request)
|
||||||
|
|
||||||
|
def describe_namespace(
|
||||||
|
self, request: DescribeNamespaceRequest
|
||||||
|
) -> DescribeNamespaceResponse:
|
||||||
|
return self._inner.describe_namespace(request)
|
||||||
|
|
||||||
|
def namespace_exists(self, request: NamespaceExistsRequest) -> None:
|
||||||
|
return self._inner.namespace_exists(request)
|
||||||
|
|
||||||
|
def drop_namespace(self, request: DropNamespaceRequest) -> DropNamespaceResponse:
|
||||||
|
return self._inner.drop_namespace(request)
|
||||||
|
|
||||||
|
def list_namespaces(self, request: ListNamespacesRequest) -> ListNamespacesResponse:
|
||||||
|
return self._inner.list_namespaces(request)
|
||||||
|
|
||||||
|
def create_table(
|
||||||
|
self, request: CreateTableRequest, data: bytes
|
||||||
|
) -> CreateTableResponse:
|
||||||
|
return self._inner.create_table(request, data)
|
||||||
|
|
||||||
|
def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse:
|
||||||
|
return self._inner.declare_table(request)
|
||||||
|
|
||||||
|
def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
|
||||||
|
return self._inner.describe_table(request)
|
||||||
|
|
||||||
|
def table_exists(self, request: TableExistsRequest) -> None:
|
||||||
|
return self._inner.table_exists(request)
|
||||||
|
|
||||||
|
def drop_table(self, request: DropTableRequest) -> DropTableResponse:
|
||||||
|
return self._inner.drop_table(request)
|
||||||
|
|
||||||
|
def list_tables(self, request: ListTablesRequest) -> ListTablesResponse:
|
||||||
|
return self._inner.list_tables(request)
|
||||||
|
|
||||||
|
def register_table(self, request: RegisterTableRequest) -> RegisterTableResponse:
|
||||||
|
return self._inner.register_table(request)
|
||||||
|
|
||||||
|
def deregister_table(
|
||||||
|
self, request: DeregisterTableRequest
|
||||||
|
) -> DeregisterTableResponse:
|
||||||
|
return self._inner.deregister_table(request)
|
||||||
|
|
||||||
|
def list_table_versions(
|
||||||
|
self, request: ListTableVersionsRequest
|
||||||
|
) -> ListTableVersionsResponse:
|
||||||
|
return self._inner.list_table_versions(request)
|
||||||
|
|
||||||
|
def describe_table_version(
|
||||||
|
self, request: DescribeTableVersionRequest
|
||||||
|
) -> DescribeTableVersionResponse:
|
||||||
|
return self._inner.describe_table_version(request)
|
||||||
|
|
||||||
|
def create_table_version(
|
||||||
|
self, request: CreateTableVersionRequest
|
||||||
|
) -> CreateTableVersionResponse:
|
||||||
|
return self._inner.create_table_version(request)
|
||||||
|
|
||||||
|
def retrieve_ops_metrics(self) -> Optional[Dict[str, int]]:
|
||||||
|
return self._inner.retrieve_ops_metrics()
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap_if_custom(ns_client: DirectoryNamespace, use_custom: bool):
|
||||||
|
"""Wrap namespace client in CustomNamespace if use_custom is True."""
|
||||||
|
if use_custom:
|
||||||
|
return CustomNamespace(ns_client)
|
||||||
|
return ns_client
|
||||||
|
|
||||||
|
|
||||||
# LocalStack S3 configuration
|
# LocalStack S3 configuration
|
||||||
CONFIG = {
|
CONFIG = {
|
||||||
"allow_http": "true",
|
"allow_http": "true",
|
||||||
@@ -89,162 +208,88 @@ def delete_bucket(s3, bucket_name):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class TrackingNamespace(LanceNamespace):
|
def create_tracking_namespace(
|
||||||
"""
|
|
||||||
Mock namespace that wraps DirectoryNamespace and tracks API calls.
|
|
||||||
|
|
||||||
This namespace returns incrementing credentials with each API call to simulate
|
|
||||||
credential rotation. It also tracks the number of times each API is called
|
|
||||||
to verify caching behavior.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
bucket_name: str,
|
bucket_name: str,
|
||||||
storage_options: Dict[str, str],
|
storage_options: dict,
|
||||||
credential_expires_in_seconds: int = 60,
|
credential_expires_in_seconds: int = 60,
|
||||||
):
|
use_custom: bool = False,
|
||||||
from lance.namespace import DirectoryNamespace
|
):
|
||||||
|
"""Create a DirectoryNamespace with ops metrics and credential vending enabled.
|
||||||
|
|
||||||
self.bucket_name = bucket_name
|
Uses native DirectoryNamespace features:
|
||||||
self.base_storage_options = storage_options
|
- ops_metrics_enabled=true: Tracks API call counts via retrieve_ops_metrics()
|
||||||
self.credential_expires_in_seconds = credential_expires_in_seconds
|
- vend_input_storage_options=true: Returns input storage options in responses
|
||||||
self.describe_call_count = 0
|
- vend_input_storage_options_refresh_interval_millis: Adds expires_at_millis
|
||||||
self.create_call_count = 0
|
|
||||||
self.lock = Lock()
|
|
||||||
|
|
||||||
# Create underlying DirectoryNamespace with storage options
|
Args:
|
||||||
dir_props = {f"storage.{k}": v for k, v in storage_options.items()}
|
bucket_name: S3 bucket name or local path
|
||||||
|
storage_options: Storage options to pass through (credentials, endpoint, etc.)
|
||||||
|
credential_expires_in_seconds: Interval in seconds for credential expiration
|
||||||
|
use_custom: If True, wrap in CustomNamespace for testing custom implementations
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (namespace_client, inner_namespace_client) where inner is always
|
||||||
|
the DirectoryNamespace (used for metrics retrieval)
|
||||||
|
"""
|
||||||
|
# Add refresh_offset_millis to storage options so that credentials are not
|
||||||
|
# considered expired immediately. Set to 1 second (1000ms) so that refresh
|
||||||
|
# checks work correctly with short-lived credentials in tests.
|
||||||
|
storage_options_with_refresh = dict(storage_options)
|
||||||
|
storage_options_with_refresh["refresh_offset_millis"] = "1000"
|
||||||
|
|
||||||
|
dir_props = {f"storage.{k}": v for k, v in storage_options_with_refresh.items()}
|
||||||
|
|
||||||
# Use S3 path for bucket name, local path for file paths
|
|
||||||
if bucket_name.startswith("/") or bucket_name.startswith("file://"):
|
if bucket_name.startswith("/") or bucket_name.startswith("file://"):
|
||||||
dir_props["root"] = f"{bucket_name}/namespace_root"
|
dir_props["root"] = f"{bucket_name}/namespace_root"
|
||||||
else:
|
else:
|
||||||
dir_props["root"] = f"s3://{bucket_name}/namespace_root"
|
dir_props["root"] = f"s3://{bucket_name}/namespace_root"
|
||||||
|
|
||||||
self.inner = DirectoryNamespace(**dir_props)
|
# Enable ops metrics tracking
|
||||||
|
dir_props["ops_metrics_enabled"] = "true"
|
||||||
def get_describe_call_count(self) -> int:
|
# Enable storage options vending
|
||||||
"""Thread-safe getter for describe call count."""
|
dir_props["vend_input_storage_options"] = "true"
|
||||||
with self.lock:
|
# Set refresh interval in milliseconds
|
||||||
return self.describe_call_count
|
dir_props["vend_input_storage_options_refresh_interval_millis"] = str(
|
||||||
|
credential_expires_in_seconds * 1000
|
||||||
def get_create_call_count(self) -> int:
|
|
||||||
"""Thread-safe getter for create call count."""
|
|
||||||
with self.lock:
|
|
||||||
return self.create_call_count
|
|
||||||
|
|
||||||
def namespace_id(self) -> str:
|
|
||||||
"""Return namespace identifier."""
|
|
||||||
return f"TrackingNamespace {{ inner: {self.inner.namespace_id()} }}"
|
|
||||||
|
|
||||||
def _modify_storage_options(
|
|
||||||
self, storage_options: Dict[str, str], count: int
|
|
||||||
) -> Dict[str, str]:
|
|
||||||
"""
|
|
||||||
Add incrementing credentials with expiration timestamp.
|
|
||||||
|
|
||||||
This simulates a credential rotation system where each call returns
|
|
||||||
new credentials that expire after credential_expires_in_seconds.
|
|
||||||
"""
|
|
||||||
# Start from base storage options (endpoint, region, allow_http, etc.)
|
|
||||||
# because DirectoryNamespace returns None for storage_options from
|
|
||||||
# describe_table/declare_table when no credential vendor is configured.
|
|
||||||
modified = copy.deepcopy(self.base_storage_options)
|
|
||||||
if storage_options:
|
|
||||||
modified.update(storage_options)
|
|
||||||
|
|
||||||
# Increment credentials to simulate rotation
|
|
||||||
modified["aws_access_key_id"] = f"AKID_{count}"
|
|
||||||
modified["aws_secret_access_key"] = f"SECRET_{count}"
|
|
||||||
modified["aws_session_token"] = f"TOKEN_{count}"
|
|
||||||
|
|
||||||
# Set expiration time
|
|
||||||
expires_at_millis = int(
|
|
||||||
(time.time() + self.credential_expires_in_seconds) * 1000
|
|
||||||
)
|
|
||||||
modified["expires_at_millis"] = str(expires_at_millis)
|
|
||||||
|
|
||||||
return modified
|
|
||||||
|
|
||||||
def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse:
|
|
||||||
"""Track declare_table calls and inject rotating credentials."""
|
|
||||||
with self.lock:
|
|
||||||
self.create_call_count += 1
|
|
||||||
count = self.create_call_count
|
|
||||||
|
|
||||||
response = self.inner.declare_table(request)
|
|
||||||
response.storage_options = self._modify_storage_options(
|
|
||||||
response.storage_options, count
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return response
|
inner_ns_client = DirectoryNamespace(**dir_props)
|
||||||
|
ns_client = _wrap_if_custom(inner_ns_client, use_custom)
|
||||||
|
return ns_client, inner_ns_client
|
||||||
|
|
||||||
def create_empty_table(
|
|
||||||
self, request: CreateEmptyTableRequest
|
|
||||||
) -> CreateEmptyTableResponse:
|
|
||||||
"""Track create_empty_table calls and inject rotating credentials."""
|
|
||||||
with self.lock:
|
|
||||||
self.create_call_count += 1
|
|
||||||
count = self.create_call_count
|
|
||||||
|
|
||||||
response = self.inner.create_empty_table(request)
|
def get_describe_call_count(namespace_client) -> int:
|
||||||
response.storage_options = self._modify_storage_options(
|
"""Get the number of describe_table calls made to the namespace client."""
|
||||||
response.storage_options, count
|
return namespace_client.retrieve_ops_metrics().get("describe_table", 0)
|
||||||
)
|
|
||||||
|
|
||||||
return response
|
|
||||||
|
|
||||||
def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
|
def get_declare_call_count(namespace_client) -> int:
|
||||||
"""Track describe_table calls and inject rotating credentials."""
|
"""Get the number of declare_table calls made to the namespace client."""
|
||||||
with self.lock:
|
return namespace_client.retrieve_ops_metrics().get("declare_table", 0)
|
||||||
self.describe_call_count += 1
|
|
||||||
count = self.describe_call_count
|
|
||||||
|
|
||||||
response = self.inner.describe_table(request)
|
|
||||||
response.storage_options = self._modify_storage_options(
|
|
||||||
response.storage_options, count
|
|
||||||
)
|
|
||||||
|
|
||||||
return response
|
|
||||||
|
|
||||||
# Pass through other methods to inner namespace
|
|
||||||
def list_tables(self, request):
|
|
||||||
return self.inner.list_tables(request)
|
|
||||||
|
|
||||||
def drop_table(self, request):
|
|
||||||
return self.inner.drop_table(request)
|
|
||||||
|
|
||||||
def list_namespaces(self, request):
|
|
||||||
return self.inner.list_namespaces(request)
|
|
||||||
|
|
||||||
def create_namespace(self, request):
|
|
||||||
return self.inner.create_namespace(request)
|
|
||||||
|
|
||||||
def drop_namespace(self, request):
|
|
||||||
return self.inner.drop_namespace(request)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.s3_test
|
@pytest.mark.s3_test
|
||||||
def test_namespace_create_table_with_provider(s3_bucket: str):
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_create_table_with_provider(s3_bucket: str, use_custom: bool):
|
||||||
"""
|
"""
|
||||||
Test creating a table through namespace with storage options provider.
|
Test creating a table through namespace with storage options provider.
|
||||||
|
|
||||||
Verifies:
|
Verifies:
|
||||||
- create_empty_table is called once to reserve location
|
- declare_table is called once to reserve location
|
||||||
- Storage options provider is auto-created
|
- Storage options provider is auto-created
|
||||||
- Table can be written successfully
|
- Table can be written successfully
|
||||||
- Credentials are cached during write operations
|
- Credentials are cached during write operations
|
||||||
"""
|
"""
|
||||||
storage_options = copy.deepcopy(CONFIG)
|
storage_options = copy.deepcopy(CONFIG)
|
||||||
|
|
||||||
namespace = TrackingNamespace(
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
bucket_name=s3_bucket,
|
bucket_name=s3_bucket,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
credential_expires_in_seconds=3600, # 1 hour
|
credential_expires_in_seconds=3600, # 1 hour
|
||||||
|
use_custom=use_custom,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = LanceNamespaceDBConnection(namespace)
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
# Create unique namespace for this test
|
# Create unique namespace for this test
|
||||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
@@ -254,8 +299,8 @@ def test_namespace_create_table_with_provider(s3_bucket: str):
|
|||||||
namespace_path = [namespace_name]
|
namespace_path = [namespace_name]
|
||||||
|
|
||||||
# Verify initial state
|
# Verify initial state
|
||||||
assert namespace.get_create_call_count() == 0
|
assert get_declare_call_count(inner_ns_client) == 0
|
||||||
assert namespace.get_describe_call_count() == 0
|
assert get_describe_call_count(inner_ns_client) == 0
|
||||||
|
|
||||||
# Create table with data
|
# Create table with data
|
||||||
data = pa.table(
|
data = pa.table(
|
||||||
@@ -266,12 +311,12 @@ def test_namespace_create_table_with_provider(s3_bucket: str):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
table = db.create_table(table_name, data, namespace=namespace_path)
|
table = db.create_table(table_name, data, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Verify create_empty_table was called exactly once
|
# Verify declare_table was called exactly once
|
||||||
assert namespace.get_create_call_count() == 1
|
assert get_declare_call_count(inner_ns_client) == 1
|
||||||
# describe_table should NOT be called during create in create mode
|
# describe_table should NOT be called during create in create mode
|
||||||
assert namespace.get_describe_call_count() == 0
|
assert get_describe_call_count(inner_ns_client) == 0
|
||||||
|
|
||||||
# Verify table was created successfully
|
# Verify table was created successfully
|
||||||
assert table.name == table_name
|
assert table.name == table_name
|
||||||
@@ -281,7 +326,8 @@ def test_namespace_create_table_with_provider(s3_bucket: str):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.s3_test
|
@pytest.mark.s3_test
|
||||||
def test_namespace_open_table_with_provider(s3_bucket: str):
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_open_table_with_provider(s3_bucket: str, use_custom: bool):
|
||||||
"""
|
"""
|
||||||
Test opening a table through namespace with storage options provider.
|
Test opening a table through namespace with storage options provider.
|
||||||
|
|
||||||
@@ -293,13 +339,14 @@ def test_namespace_open_table_with_provider(s3_bucket: str):
|
|||||||
"""
|
"""
|
||||||
storage_options = copy.deepcopy(CONFIG)
|
storage_options = copy.deepcopy(CONFIG)
|
||||||
|
|
||||||
namespace = TrackingNamespace(
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
bucket_name=s3_bucket,
|
bucket_name=s3_bucket,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
credential_expires_in_seconds=3600,
|
credential_expires_in_seconds=3600,
|
||||||
|
use_custom=use_custom,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = LanceNamespaceDBConnection(namespace)
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
# Create unique namespace for this test
|
# Create unique namespace for this test
|
||||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
@@ -317,21 +364,21 @@ def test_namespace_open_table_with_provider(s3_bucket: str):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
db.create_table(table_name, data, namespace=namespace_path)
|
db.create_table(table_name, data, namespace_path=namespace_path)
|
||||||
|
|
||||||
initial_create_count = namespace.get_create_call_count()
|
initial_declare_count = get_declare_call_count(inner_ns_client)
|
||||||
assert initial_create_count == 1
|
assert initial_declare_count == 1
|
||||||
|
|
||||||
# Open the table
|
# Open the table
|
||||||
opened_table = db.open_table(table_name, namespace=namespace_path)
|
opened_table = db.open_table(table_name, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Verify describe_table was called exactly once
|
# Verify describe_table was called exactly once
|
||||||
assert namespace.get_describe_call_count() == 1
|
assert get_describe_call_count(inner_ns_client) == 1
|
||||||
# create_empty_table should not be called again
|
# declare_table should not be called again
|
||||||
assert namespace.get_create_call_count() == initial_create_count
|
assert get_declare_call_count(inner_ns_client) == initial_declare_count
|
||||||
|
|
||||||
# Perform multiple read operations
|
# Perform multiple read operations
|
||||||
describe_count_after_open = namespace.get_describe_call_count()
|
describe_count_after_open = get_describe_call_count(inner_ns_client)
|
||||||
|
|
||||||
for _ in range(3):
|
for _ in range(3):
|
||||||
result = opened_table.to_pandas()
|
result = opened_table.to_pandas()
|
||||||
@@ -340,11 +387,72 @@ def test_namespace_open_table_with_provider(s3_bucket: str):
|
|||||||
assert count == 5
|
assert count == 5
|
||||||
|
|
||||||
# Verify credentials were cached (no additional describe_table calls)
|
# Verify credentials were cached (no additional describe_table calls)
|
||||||
assert namespace.get_describe_call_count() == describe_count_after_open
|
assert get_describe_call_count(inner_ns_client) == describe_count_after_open
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
sys.platform == "win32",
|
||||||
|
reason="TODO: fix schema-only namespace metrics test on Windows",
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_create_schema_only_with_provider(use_custom: bool):
|
||||||
|
"""
|
||||||
|
Test creating a schema-only table through namespace with storage options provider.
|
||||||
|
|
||||||
|
Verifies:
|
||||||
|
- declare_table is called once to reserve the location
|
||||||
|
- describe_table is not needed during create in create mode
|
||||||
|
- the table can be reopened successfully afterward
|
||||||
|
- opening the table triggers exactly one describe_table call
|
||||||
|
"""
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
|
bucket_name=temp_dir,
|
||||||
|
storage_options={},
|
||||||
|
credential_expires_in_seconds=3600,
|
||||||
|
use_custom=use_custom,
|
||||||
|
)
|
||||||
|
|
||||||
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
|
db.create_namespace([namespace_name])
|
||||||
|
|
||||||
|
table_name = f"test_table_{uuid.uuid4().hex}"
|
||||||
|
namespace_path = [namespace_name]
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||||
|
pa.field("text", pa.string()),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
assert get_declare_call_count(inner_ns_client) == 0
|
||||||
|
assert get_describe_call_count(inner_ns_client) == 0
|
||||||
|
|
||||||
|
table = db.create_table(
|
||||||
|
table_name, schema=schema, namespace_path=namespace_path
|
||||||
|
)
|
||||||
|
|
||||||
|
assert table.name == table_name
|
||||||
|
assert table.namespace == namespace_path
|
||||||
|
assert get_declare_call_count(inner_ns_client) == 1
|
||||||
|
assert get_describe_call_count(inner_ns_client) == 0
|
||||||
|
|
||||||
|
reopened_table = db.open_table(table_name, namespace_path=namespace_path)
|
||||||
|
|
||||||
|
assert reopened_table.schema == schema
|
||||||
|
assert get_declare_call_count(inner_ns_client) == 1
|
||||||
|
assert get_describe_call_count(inner_ns_client) == 1
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.s3_test
|
@pytest.mark.s3_test
|
||||||
def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_credential_refresh_on_read(s3_bucket: str, use_custom: bool):
|
||||||
"""
|
"""
|
||||||
Test credential refresh when credentials expire during read operations.
|
Test credential refresh when credentials expire during read operations.
|
||||||
|
|
||||||
@@ -355,13 +463,14 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
|||||||
"""
|
"""
|
||||||
storage_options = copy.deepcopy(CONFIG)
|
storage_options = copy.deepcopy(CONFIG)
|
||||||
|
|
||||||
namespace = TrackingNamespace(
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
bucket_name=s3_bucket,
|
bucket_name=s3_bucket,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
credential_expires_in_seconds=3, # Short expiration for testing
|
credential_expires_in_seconds=3, # Short expiration for testing
|
||||||
|
use_custom=use_custom,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = LanceNamespaceDBConnection(namespace)
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
# Create unique namespace for this test
|
# Create unique namespace for this test
|
||||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
@@ -378,16 +487,16 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
db.create_table(table_name, data, namespace=namespace_path)
|
db.create_table(table_name, data, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Open table (triggers describe_table)
|
# Open table (triggers describe_table)
|
||||||
opened_table = db.open_table(table_name, namespace=namespace_path)
|
opened_table = db.open_table(table_name, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Perform an immediate read (should use credentials from open)
|
# Perform an immediate read (should use credentials from open)
|
||||||
result = opened_table.to_pandas()
|
result = opened_table.to_pandas()
|
||||||
assert len(result) == 3
|
assert len(result) == 3
|
||||||
|
|
||||||
describe_count_after_first_read = namespace.get_describe_call_count()
|
describe_count_after_first_read = get_describe_call_count(inner_ns_client)
|
||||||
|
|
||||||
# Wait for credentials to expire (3 seconds + buffer)
|
# Wait for credentials to expire (3 seconds + buffer)
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
@@ -396,7 +505,7 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
|||||||
result = opened_table.to_pandas()
|
result = opened_table.to_pandas()
|
||||||
assert len(result) == 3
|
assert len(result) == 3
|
||||||
|
|
||||||
describe_count_after_refresh = namespace.get_describe_call_count()
|
describe_count_after_refresh = get_describe_call_count(inner_ns_client)
|
||||||
# Verify describe_table was called again (credential refresh)
|
# Verify describe_table was called again (credential refresh)
|
||||||
refresh_delta = describe_count_after_refresh - describe_count_after_first_read
|
refresh_delta = describe_count_after_refresh - describe_count_after_first_read
|
||||||
|
|
||||||
@@ -409,7 +518,8 @@ def test_namespace_credential_refresh_on_read(s3_bucket: str):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.s3_test
|
@pytest.mark.s3_test
|
||||||
def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_credential_refresh_on_write(s3_bucket: str, use_custom: bool):
|
||||||
"""
|
"""
|
||||||
Test credential refresh when credentials expire during write operations.
|
Test credential refresh when credentials expire during write operations.
|
||||||
|
|
||||||
@@ -420,13 +530,14 @@ def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
|||||||
"""
|
"""
|
||||||
storage_options = copy.deepcopy(CONFIG)
|
storage_options = copy.deepcopy(CONFIG)
|
||||||
|
|
||||||
namespace = TrackingNamespace(
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
bucket_name=s3_bucket,
|
bucket_name=s3_bucket,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
credential_expires_in_seconds=3, # Short expiration
|
credential_expires_in_seconds=3, # Short expiration
|
||||||
|
use_custom=use_custom,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = LanceNamespaceDBConnection(namespace)
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
# Create unique namespace for this test
|
# Create unique namespace for this test
|
||||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
@@ -443,7 +554,7 @@ def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
table = db.create_table(table_name, initial_data, namespace=namespace_path)
|
table = db.create_table(table_name, initial_data, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Add more data (should use cached credentials)
|
# Add more data (should use cached credentials)
|
||||||
new_data = pa.table(
|
new_data = pa.table(
|
||||||
@@ -471,24 +582,26 @@ def test_namespace_credential_refresh_on_write(s3_bucket: str):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.s3_test
|
@pytest.mark.s3_test
|
||||||
def test_namespace_overwrite_mode(s3_bucket: str):
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_overwrite_mode(s3_bucket: str, use_custom: bool):
|
||||||
"""
|
"""
|
||||||
Test creating table in overwrite mode with credential tracking.
|
Test creating table in overwrite mode with credential tracking.
|
||||||
|
|
||||||
Verifies:
|
Verifies:
|
||||||
- First create calls create_empty_table exactly once
|
- First create calls declare_table exactly once
|
||||||
- Overwrite mode calls describe_table exactly once to check existence
|
- Overwrite mode calls describe_table exactly once to check existence
|
||||||
- Storage options provider works in overwrite mode
|
- Storage options provider works in overwrite mode
|
||||||
"""
|
"""
|
||||||
storage_options = copy.deepcopy(CONFIG)
|
storage_options = copy.deepcopy(CONFIG)
|
||||||
|
|
||||||
namespace = TrackingNamespace(
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
bucket_name=s3_bucket,
|
bucket_name=s3_bucket,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
credential_expires_in_seconds=3600,
|
credential_expires_in_seconds=3600,
|
||||||
|
use_custom=use_custom,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = LanceNamespaceDBConnection(namespace)
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
# Create unique namespace for this test
|
# Create unique namespace for this test
|
||||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
@@ -505,11 +618,11 @@ def test_namespace_overwrite_mode(s3_bucket: str):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
table = db.create_table(table_name, data1, namespace=namespace_path)
|
table = db.create_table(table_name, data1, namespace_path=namespace_path)
|
||||||
# Exactly one create_empty_table call for initial create
|
# Exactly one declare_table call for initial create
|
||||||
assert namespace.get_create_call_count() == 1
|
assert get_declare_call_count(inner_ns_client) == 1
|
||||||
# No describe_table calls in create mode
|
# No describe_table calls in create mode
|
||||||
assert namespace.get_describe_call_count() == 0
|
assert get_describe_call_count(inner_ns_client) == 0
|
||||||
assert table.count_rows() == 2
|
assert table.count_rows() == 2
|
||||||
|
|
||||||
# Overwrite the table
|
# Overwrite the table
|
||||||
@@ -521,14 +634,14 @@ def test_namespace_overwrite_mode(s3_bucket: str):
|
|||||||
)
|
)
|
||||||
|
|
||||||
table2 = db.create_table(
|
table2 = db.create_table(
|
||||||
table_name, data2, namespace=namespace_path, mode="overwrite"
|
table_name, data2, namespace_path=namespace_path, mode="overwrite"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Should still have only 1 create_empty_table call
|
# Should still have only 1 declare_table call
|
||||||
# (overwrite reuses location from describe_table)
|
# (overwrite reuses location from describe_table)
|
||||||
assert namespace.get_create_call_count() == 1
|
assert get_declare_call_count(inner_ns_client) == 1
|
||||||
# Should have called describe_table exactly once to get existing table location
|
# Should have called describe_table exactly once to get existing table location
|
||||||
assert namespace.get_describe_call_count() == 1
|
assert get_describe_call_count(inner_ns_client) == 1
|
||||||
|
|
||||||
# Verify new data
|
# Verify new data
|
||||||
assert table2.count_rows() == 3
|
assert table2.count_rows() == 3
|
||||||
@@ -537,7 +650,8 @@ def test_namespace_overwrite_mode(s3_bucket: str):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.s3_test
|
@pytest.mark.s3_test
|
||||||
def test_namespace_multiple_tables(s3_bucket: str):
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_multiple_tables(s3_bucket: str, use_custom: bool):
|
||||||
"""
|
"""
|
||||||
Test creating and opening multiple tables in the same namespace.
|
Test creating and opening multiple tables in the same namespace.
|
||||||
|
|
||||||
@@ -548,13 +662,14 @@ def test_namespace_multiple_tables(s3_bucket: str):
|
|||||||
"""
|
"""
|
||||||
storage_options = copy.deepcopy(CONFIG)
|
storage_options = copy.deepcopy(CONFIG)
|
||||||
|
|
||||||
namespace = TrackingNamespace(
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
bucket_name=s3_bucket,
|
bucket_name=s3_bucket,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
credential_expires_in_seconds=3600,
|
credential_expires_in_seconds=3600,
|
||||||
|
use_custom=use_custom,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = LanceNamespaceDBConnection(namespace)
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
# Create unique namespace for this test
|
# Create unique namespace for this test
|
||||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
@@ -564,22 +679,22 @@ def test_namespace_multiple_tables(s3_bucket: str):
|
|||||||
# Create first table
|
# Create first table
|
||||||
table1_name = f"table1_{uuid.uuid4().hex}"
|
table1_name = f"table1_{uuid.uuid4().hex}"
|
||||||
data1 = pa.table({"id": [1, 2], "value": [10, 20]})
|
data1 = pa.table({"id": [1, 2], "value": [10, 20]})
|
||||||
db.create_table(table1_name, data1, namespace=namespace_path)
|
db.create_table(table1_name, data1, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Create second table
|
# Create second table
|
||||||
table2_name = f"table2_{uuid.uuid4().hex}"
|
table2_name = f"table2_{uuid.uuid4().hex}"
|
||||||
data2 = pa.table({"id": [3, 4], "value": [30, 40]})
|
data2 = pa.table({"id": [3, 4], "value": [30, 40]})
|
||||||
db.create_table(table2_name, data2, namespace=namespace_path)
|
db.create_table(table2_name, data2, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Should have 2 create calls (one per table)
|
# Should have 2 declare calls (one per table)
|
||||||
assert namespace.get_create_call_count() == 2
|
assert get_declare_call_count(inner_ns_client) == 2
|
||||||
|
|
||||||
# Open both tables
|
# Open both tables
|
||||||
opened1 = db.open_table(table1_name, namespace=namespace_path)
|
opened1 = db.open_table(table1_name, namespace_path=namespace_path)
|
||||||
opened2 = db.open_table(table2_name, namespace=namespace_path)
|
opened2 = db.open_table(table2_name, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Should have 2 describe calls (one per open)
|
# Should have 2 describe calls (one per open)
|
||||||
assert namespace.get_describe_call_count() == 2
|
assert get_describe_call_count(inner_ns_client) == 2
|
||||||
|
|
||||||
# Verify both tables work independently
|
# Verify both tables work independently
|
||||||
assert opened1.count_rows() == 2
|
assert opened1.count_rows() == 2
|
||||||
@@ -593,7 +708,8 @@ def test_namespace_multiple_tables(s3_bucket: str):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.s3_test
|
@pytest.mark.s3_test
|
||||||
def test_namespace_with_schema_only(s3_bucket: str):
|
@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
|
||||||
|
def test_namespace_with_schema_only(s3_bucket: str, use_custom: bool):
|
||||||
"""
|
"""
|
||||||
Test creating empty table with schema only (no data).
|
Test creating empty table with schema only (no data).
|
||||||
|
|
||||||
@@ -604,13 +720,14 @@ def test_namespace_with_schema_only(s3_bucket: str):
|
|||||||
"""
|
"""
|
||||||
storage_options = copy.deepcopy(CONFIG)
|
storage_options = copy.deepcopy(CONFIG)
|
||||||
|
|
||||||
namespace = TrackingNamespace(
|
ns_client, inner_ns_client = create_tracking_namespace(
|
||||||
bucket_name=s3_bucket,
|
bucket_name=s3_bucket,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
credential_expires_in_seconds=3600,
|
credential_expires_in_seconds=3600,
|
||||||
|
use_custom=use_custom,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = LanceNamespaceDBConnection(namespace)
|
db = LanceNamespaceDBConnection(ns_client)
|
||||||
|
|
||||||
# Create unique namespace for this test
|
# Create unique namespace for this test
|
||||||
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
namespace_name = f"test_ns_{uuid.uuid4().hex[:8]}"
|
||||||
@@ -628,12 +745,12 @@ def test_namespace_with_schema_only(s3_bucket: str):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
table = db.create_table(table_name, schema=schema, namespace=namespace_path)
|
table = db.create_table(table_name, schema=schema, namespace_path=namespace_path)
|
||||||
|
|
||||||
# Should have called create_empty_table once
|
# Should have called declare_table once
|
||||||
assert namespace.get_create_call_count() == 1
|
assert get_declare_call_count(inner_ns_client) == 1
|
||||||
# Should NOT have called describe_table in create mode
|
# Should NOT have called describe_table in create mode
|
||||||
assert namespace.get_describe_call_count() == 0
|
assert get_describe_call_count(inner_ns_client) == 0
|
||||||
|
|
||||||
# Verify empty table
|
# Verify empty table
|
||||||
assert table.count_rows() == 0
|
assert table.count_rows() == 0
|
||||||
|
|||||||
@@ -522,6 +522,50 @@ def test_no_split_names(some_table: Table):
|
|||||||
assert permutations[1].num_rows == 500
|
assert permutations[1].num_rows == 500
|
||||||
|
|
||||||
|
|
||||||
|
def test_permutations_metadata_without_split_names_key(mem_db: DBConnection):
|
||||||
|
"""Regression: schema metadata present but missing split_names key must not crash.
|
||||||
|
|
||||||
|
Previously, `.get(b"split_names", None).decode()` was called unconditionally,
|
||||||
|
so any permutation table whose metadata dict had other keys but no split_names
|
||||||
|
raised AttributeError: 'NoneType' has no attribute 'decode'.
|
||||||
|
"""
|
||||||
|
base = mem_db.create_table("base_nosplit", pa.table({"x": range(10)}))
|
||||||
|
|
||||||
|
# Build a permutation-like table that carries some metadata but NOT split_names.
|
||||||
|
raw = pa.table(
|
||||||
|
{
|
||||||
|
"row_id": pa.array(range(10), type=pa.uint64()),
|
||||||
|
"split_id": pa.array([0] * 10, type=pa.uint32()),
|
||||||
|
}
|
||||||
|
).replace_schema_metadata({b"other_key": b"other_value"})
|
||||||
|
perm_tbl = mem_db.create_table("perm_nosplit", raw)
|
||||||
|
|
||||||
|
permutations = Permutations(base, perm_tbl)
|
||||||
|
assert permutations.split_names == []
|
||||||
|
assert permutations.split_dict == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_from_tables_string_split_missing_names_key(mem_db: DBConnection):
|
||||||
|
"""Regression: from_tables() with a string split must raise ValueError, not
|
||||||
|
AttributeError.
|
||||||
|
|
||||||
|
Previously, `.get(b"split_names", None).decode()` crashed with AttributeError
|
||||||
|
when the metadata dict existed but had no split_names key.
|
||||||
|
"""
|
||||||
|
base = mem_db.create_table("base_strsplit", pa.table({"x": range(10)}))
|
||||||
|
|
||||||
|
raw = pa.table(
|
||||||
|
{
|
||||||
|
"row_id": pa.array(range(10), type=pa.uint64()),
|
||||||
|
"split_id": pa.array([0] * 10, type=pa.uint32()),
|
||||||
|
}
|
||||||
|
).replace_schema_metadata({b"other_key": b"other_value"})
|
||||||
|
perm_tbl = mem_db.create_table("perm_strsplit", raw)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="no split names are defined"):
|
||||||
|
Permutation.from_tables(base, perm_tbl, split="train")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def some_perm_table(some_table: Table) -> Table:
|
def some_perm_table(some_table: Table) -> Table:
|
||||||
return (
|
return (
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
from datetime import date, datetime
|
from datetime import date, datetime
|
||||||
|
from enum import Enum
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
@@ -673,3 +674,29 @@ async def test_aliases_in_lance_model_async(mem_db_async):
|
|||||||
assert hasattr(model, "name")
|
assert hasattr(model, "name")
|
||||||
assert hasattr(model, "distance")
|
assert hasattr(model, "distance")
|
||||||
assert model.distance < 0.01
|
assert model.distance < 0.01
|
||||||
|
|
||||||
|
|
||||||
|
def test_enum_types():
|
||||||
|
"""Enum fields should map to the Arrow type of their value (issue #1846)."""
|
||||||
|
|
||||||
|
class StrStatus(str, Enum):
|
||||||
|
PENDING = "pending"
|
||||||
|
RUNNING = "running"
|
||||||
|
DONE = "done"
|
||||||
|
|
||||||
|
class IntPriority(int, Enum):
|
||||||
|
LOW = 1
|
||||||
|
MEDIUM = 2
|
||||||
|
HIGH = 3
|
||||||
|
|
||||||
|
class TestModel(pydantic.BaseModel):
|
||||||
|
status: StrStatus
|
||||||
|
priority: IntPriority
|
||||||
|
opt_status: Optional[StrStatus] = None
|
||||||
|
|
||||||
|
schema = pydantic_to_schema(TestModel)
|
||||||
|
|
||||||
|
assert schema.field("status").type == pa.dictionary(pa.int32(), pa.utf8())
|
||||||
|
assert schema.field("priority").type == pa.int64()
|
||||||
|
assert schema.field("opt_status").type == pa.dictionary(pa.int32(), pa.utf8())
|
||||||
|
assert schema.field("opt_status").nullable
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from lancedb.query import (
|
|||||||
PhraseQuery,
|
PhraseQuery,
|
||||||
Query,
|
Query,
|
||||||
FullTextSearchQuery,
|
FullTextSearchQuery,
|
||||||
|
ensure_vector_query,
|
||||||
)
|
)
|
||||||
from lancedb.rerankers.cross_encoder import CrossEncoderReranker
|
from lancedb.rerankers.cross_encoder import CrossEncoderReranker
|
||||||
from lancedb.table import AsyncTable, LanceTable
|
from lancedb.table import AsyncTable, LanceTable
|
||||||
@@ -1384,7 +1385,7 @@ def test_query_timeout(tmp_path):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
table = db.create_table("test", data)
|
table = db.create_table("test", data)
|
||||||
table.create_fts_index("text", use_tantivy=False)
|
table.create_fts_index("text")
|
||||||
|
|
||||||
with pytest.raises(Exception, match="Query timeout"):
|
with pytest.raises(Exception, match="Query timeout"):
|
||||||
table.search().where("text = 'a'").to_list(timeout=timedelta(0))
|
table.search().where("text = 'a'").to_list(timeout=timedelta(0))
|
||||||
@@ -1501,6 +1502,18 @@ def test_search_empty_table(mem_db):
|
|||||||
assert results == []
|
assert results == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_ensure_vector_query_empty_list():
|
||||||
|
"""Regression: ensure_vector_query used to return instead of raise ValueError."""
|
||||||
|
with pytest.raises(ValueError, match="non-empty"):
|
||||||
|
ensure_vector_query([])
|
||||||
|
|
||||||
|
|
||||||
|
def test_ensure_vector_query_nested_empty_list():
|
||||||
|
"""Regression: ensure_vector_query used to return instead of raise ValueError."""
|
||||||
|
with pytest.raises(ValueError, match="non-empty"):
|
||||||
|
ensure_vector_query([[]])
|
||||||
|
|
||||||
|
|
||||||
def test_fast_search(tmp_path):
|
def test_fast_search(tmp_path):
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
|
|||||||
@@ -1201,6 +1201,18 @@ async def test_header_provider_overrides_static_headers():
|
|||||||
await db.table_names()
|
await db.table_names()
|
||||||
|
|
||||||
|
|
||||||
|
def test_close():
|
||||||
|
"""Test that close() works without AttributeError."""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
def handler(req):
|
||||||
|
req.send_response(200)
|
||||||
|
req.end_headers()
|
||||||
|
|
||||||
|
with mock_lancedb_connection(handler) as db:
|
||||||
|
asyncio.run(db.close())
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("exception", [KeyboardInterrupt, SystemExit, GeneratorExit])
|
@pytest.mark.parametrize("exception", [KeyboardInterrupt, SystemExit, GeneratorExit])
|
||||||
def test_background_loop_cancellation(exception):
|
def test_background_loop_cancellation(exception):
|
||||||
"""Test that BackgroundEventLoop.run() cancels the future on interrupt."""
|
"""Test that BackgroundEventLoop.run() cancels the future on interrupt."""
|
||||||
|
|||||||
@@ -26,11 +26,8 @@ from lancedb.rerankers import (
|
|||||||
)
|
)
|
||||||
from lancedb.table import LanceTable
|
from lancedb.table import LanceTable
|
||||||
|
|
||||||
# Tests rely on FTS index
|
|
||||||
pytest.importorskip("lancedb.fts")
|
|
||||||
|
|
||||||
|
def get_test_table(tmp_path):
|
||||||
def get_test_table(tmp_path, use_tantivy):
|
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
# Create a LanceDB table schema with a vector and a text column
|
# Create a LanceDB table schema with a vector and a text column
|
||||||
emb = EmbeddingFunctionRegistry.get_instance().get("test").create()
|
emb = EmbeddingFunctionRegistry.get_instance().get("test").create()
|
||||||
@@ -98,7 +95,7 @@ def get_test_table(tmp_path, use_tantivy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create a fts index
|
# Create a fts index
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
|
table.create_fts_index("text", replace=True)
|
||||||
|
|
||||||
return table, MyTable
|
return table, MyTable
|
||||||
|
|
||||||
@@ -208,8 +205,8 @@ def _run_test_reranker(reranker, table, query, query_vector, schema):
|
|||||||
assert len(result) == 20 and result == result_arrow
|
assert len(result) == 20 and result == result_arrow
|
||||||
|
|
||||||
|
|
||||||
def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
|
def _run_test_hybrid_reranker(reranker, tmp_path):
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
# The default reranker
|
# The default reranker
|
||||||
result1 = (
|
result1 = (
|
||||||
table.search(
|
table.search(
|
||||||
@@ -285,8 +282,7 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_linear_combination(tmp_path):
|
||||||
def test_linear_combination(tmp_path, use_tantivy):
|
|
||||||
reranker = LinearCombinationReranker()
|
reranker = LinearCombinationReranker()
|
||||||
|
|
||||||
vector_results = pa.Table.from_pydict(
|
vector_results = pa.Table.from_pydict(
|
||||||
@@ -313,22 +309,20 @@ def test_linear_combination(tmp_path, use_tantivy):
|
|||||||
assert "_score" not in combined_results.column_names
|
assert "_score" not in combined_results.column_names
|
||||||
assert "_relevance_score" in combined_results.column_names
|
assert "_relevance_score" in combined_results.column_names
|
||||||
|
|
||||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_rrf_reranker(tmp_path):
|
||||||
def test_rrf_reranker(tmp_path, use_tantivy):
|
|
||||||
reranker = RRFReranker()
|
reranker = RRFReranker()
|
||||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_mrr_reranker(tmp_path):
|
||||||
def test_mrr_reranker(tmp_path, use_tantivy):
|
|
||||||
reranker = MRRReranker()
|
reranker = MRRReranker()
|
||||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
_run_test_hybrid_reranker(reranker, tmp_path)
|
||||||
|
|
||||||
# Test multi-vector part
|
# Test multi-vector part
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
query = "single player experience"
|
query = "single player experience"
|
||||||
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
|
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
|
||||||
rs2 = (
|
rs2 = (
|
||||||
@@ -363,7 +357,7 @@ def test_rrf_reranker_distance():
|
|||||||
table = db.create_table("test", data)
|
table = db.create_table("test", data)
|
||||||
|
|
||||||
table.create_index(num_partitions=1, num_sub_vectors=2)
|
table.create_index(num_partitions=1, num_sub_vectors=2)
|
||||||
table.create_fts_index("text", use_tantivy=False)
|
table.create_fts_index("text")
|
||||||
|
|
||||||
reranker = RRFReranker(return_score="all")
|
reranker = RRFReranker(return_score="all")
|
||||||
|
|
||||||
@@ -422,35 +416,31 @@ def test_rrf_reranker_distance():
|
|||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
|
os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_cohere_reranker(tmp_path):
|
||||||
def test_cohere_reranker(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("cohere")
|
pytest.importorskip("cohere")
|
||||||
reranker = CohereReranker()
|
reranker = CohereReranker()
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_cross_encoder_reranker(tmp_path):
|
||||||
def test_cross_encoder_reranker(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("sentence_transformers")
|
pytest.importorskip("sentence_transformers")
|
||||||
reranker = CrossEncoderReranker()
|
reranker = CrossEncoderReranker()
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_colbert_reranker(tmp_path):
|
||||||
def test_colbert_reranker(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("rerankers")
|
pytest.importorskip("rerankers")
|
||||||
reranker = ColbertReranker()
|
reranker = ColbertReranker()
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_answerdotai_reranker(tmp_path):
|
||||||
def test_answerdotai_reranker(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("rerankers")
|
pytest.importorskip("rerankers")
|
||||||
reranker = AnswerdotaiRerankers()
|
reranker = AnswerdotaiRerankers()
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
|
|
||||||
@@ -459,10 +449,9 @@ def test_answerdotai_reranker(tmp_path, use_tantivy):
|
|||||||
or os.environ.get("OPENAI_BASE_URL") is not None,
|
or os.environ.get("OPENAI_BASE_URL") is not None,
|
||||||
reason="OPENAI_API_KEY not set",
|
reason="OPENAI_API_KEY not set",
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_openai_reranker(tmp_path):
|
||||||
def test_openai_reranker(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("openai")
|
pytest.importorskip("openai")
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
reranker = OpenaiReranker()
|
reranker = OpenaiReranker()
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
@@ -470,10 +459,9 @@ def test_openai_reranker(tmp_path, use_tantivy):
|
|||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
os.environ.get("JINA_API_KEY") is None, reason="JINA_API_KEY not set"
|
os.environ.get("JINA_API_KEY") is None, reason="JINA_API_KEY not set"
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_jina_reranker(tmp_path):
|
||||||
def test_jina_reranker(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("jina")
|
pytest.importorskip("jina")
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
reranker = JinaReranker()
|
reranker = JinaReranker()
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
@@ -481,11 +469,10 @@ def test_jina_reranker(tmp_path, use_tantivy):
|
|||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_voyageai_reranker(tmp_path):
|
||||||
def test_voyageai_reranker(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("voyageai")
|
pytest.importorskip("voyageai")
|
||||||
reranker = VoyageAIReranker(model_name="rerank-2.5")
|
reranker = VoyageAIReranker(model_name="rerank-2.5")
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
|
|
||||||
@@ -504,7 +491,7 @@ def test_empty_result_reranker():
|
|||||||
|
|
||||||
# Create empty table with schema
|
# Create empty table with schema
|
||||||
empty_table = db.create_table("empty_table", schema=schema, mode="overwrite")
|
empty_table = db.create_table("empty_table", schema=schema, mode="overwrite")
|
||||||
empty_table.create_fts_index("text", use_tantivy=False, replace=True)
|
empty_table.create_fts_index("text", replace=True)
|
||||||
for reranker in [
|
for reranker in [
|
||||||
CrossEncoderReranker(),
|
CrossEncoderReranker(),
|
||||||
# ColbertReranker(),
|
# ColbertReranker(),
|
||||||
@@ -603,11 +590,10 @@ def test_empty_hybrid_result_reranker():
|
|||||||
assert "_rowid" in result.column_names
|
assert "_rowid" in result.column_names
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
def test_cross_encoder_reranker_return_all(tmp_path):
|
||||||
def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
|
|
||||||
pytest.importorskip("sentence_transformers")
|
pytest.importorskip("sentence_transformers")
|
||||||
reranker = CrossEncoderReranker(return_score="all")
|
reranker = CrossEncoderReranker(return_score="all")
|
||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path)
|
||||||
query = "single player experience"
|
query = "single player experience"
|
||||||
result = (
|
result = (
|
||||||
table.search(query, query_type="hybrid", vector_column_name="vector")
|
table.search(query, query_type="hybrid", vector_column_name="vector")
|
||||||
|
|||||||
@@ -242,8 +242,8 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch):
|
|||||||
|
|
||||||
# FTS indices should error since they are not supported yet.
|
# FTS indices should error since they are not supported yet.
|
||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
NotImplementedError,
|
ValueError,
|
||||||
match="Full-text search is only supported on the local filesystem",
|
match="Tantivy-based FTS has been removed",
|
||||||
):
|
):
|
||||||
table.create_fts_index("x", use_tantivy=True)
|
table.create_fts_index("x", use_tantivy=True)
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
from datetime import date, datetime, timedelta
|
from datetime import date, datetime, timedelta
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import List
|
from typing import List
|
||||||
@@ -527,6 +528,132 @@ async def test_add_async(mem_db_async: AsyncConnection):
|
|||||||
assert await table.count_rows() == 3
|
assert await table.count_rows() == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_overwrite_infers_vector_schema(mem_db: DBConnection):
|
||||||
|
"""Overwrite should infer vector columns the same way create_table does.
|
||||||
|
|
||||||
|
Regression test for https://github.com/lancedb/lancedb/issues/3183
|
||||||
|
"""
|
||||||
|
table = mem_db.create_table(
|
||||||
|
"test_overwrite_vec",
|
||||||
|
data=[
|
||||||
|
{"vector": [1.0, 2.0, 3.0, 4.0], "item": "foo"},
|
||||||
|
{"vector": [5.0, 6.0, 7.0, 8.0], "item": "bar"},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# create_table infers vector as fixed_size_list<float32, 4>
|
||||||
|
original_type = table.schema.field("vector").type
|
||||||
|
assert pa.types.is_fixed_size_list(original_type)
|
||||||
|
|
||||||
|
# overwrite with plain Python lists (PyArrow infers list<double>)
|
||||||
|
table.add(
|
||||||
|
[
|
||||||
|
{"vector": [10.0, 20.0, 30.0, 40.0], "item": "baz"},
|
||||||
|
],
|
||||||
|
mode="overwrite",
|
||||||
|
)
|
||||||
|
# overwrite should infer vector column the same way as create_table
|
||||||
|
new_type = table.schema.field("vector").type
|
||||||
|
assert pa.types.is_fixed_size_list(new_type), (
|
||||||
|
f"Expected fixed_size_list after overwrite, got {new_type}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_progress_callback(mem_db: DBConnection):
|
||||||
|
table = mem_db.create_table(
|
||||||
|
"test",
|
||||||
|
data=[{"id": 1}, {"id": 2}],
|
||||||
|
)
|
||||||
|
|
||||||
|
updates = []
|
||||||
|
table.add([{"id": 3}, {"id": 4}], progress=lambda p: updates.append(dict(p)))
|
||||||
|
|
||||||
|
assert len(table) == 4
|
||||||
|
# The done callback always fires, so we should always get at least one.
|
||||||
|
assert len(updates) >= 1, "expected at least one progress callback"
|
||||||
|
for p in updates:
|
||||||
|
assert "output_rows" in p
|
||||||
|
assert "output_bytes" in p
|
||||||
|
assert "total_rows" in p
|
||||||
|
assert "elapsed_seconds" in p
|
||||||
|
assert "active_tasks" in p
|
||||||
|
assert "total_tasks" in p
|
||||||
|
assert "done" in p
|
||||||
|
# The last callback should have done=True.
|
||||||
|
assert updates[-1]["done"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_progress_tqdm_like(mem_db: DBConnection):
|
||||||
|
"""Test that a tqdm-like object gets total set and update() called."""
|
||||||
|
|
||||||
|
class FakeBar:
|
||||||
|
def __init__(self):
|
||||||
|
self.total = None
|
||||||
|
self.n = 0
|
||||||
|
self.postfix = None
|
||||||
|
|
||||||
|
def update(self, n):
|
||||||
|
self.n += n
|
||||||
|
|
||||||
|
def set_postfix_str(self, s):
|
||||||
|
self.postfix = s
|
||||||
|
|
||||||
|
def refresh(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
table = mem_db.create_table(
|
||||||
|
"test",
|
||||||
|
data=[{"id": 1}, {"id": 2}],
|
||||||
|
)
|
||||||
|
|
||||||
|
bar = FakeBar()
|
||||||
|
table.add([{"id": 3}, {"id": 4}], progress=bar)
|
||||||
|
|
||||||
|
assert len(table) == 4
|
||||||
|
# Postfix should contain throughput and worker count
|
||||||
|
if bar.postfix is not None:
|
||||||
|
assert "MB/s" in bar.postfix
|
||||||
|
assert "workers" in bar.postfix
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_progress_bool(mem_db: DBConnection):
|
||||||
|
"""Test that progress=True creates and closes a tqdm bar automatically."""
|
||||||
|
table = mem_db.create_table(
|
||||||
|
"test",
|
||||||
|
data=[{"id": 1}, {"id": 2}],
|
||||||
|
)
|
||||||
|
|
||||||
|
table.add([{"id": 3}, {"id": 4}], progress=True)
|
||||||
|
assert len(table) == 4
|
||||||
|
|
||||||
|
# progress=False should be the same as None
|
||||||
|
table.add([{"id": 5}], progress=False)
|
||||||
|
assert len(table) == 5
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_add_progress_callback_async(mem_db_async: AsyncConnection):
|
||||||
|
"""Progress callbacks work through the async path too."""
|
||||||
|
table = await mem_db_async.create_table("test", data=[{"id": 1}, {"id": 2}])
|
||||||
|
|
||||||
|
updates = []
|
||||||
|
await table.add([{"id": 3}, {"id": 4}], progress=lambda p: updates.append(dict(p)))
|
||||||
|
|
||||||
|
assert await table.count_rows() == 4
|
||||||
|
assert len(updates) >= 1
|
||||||
|
assert updates[-1]["done"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_progress_callback_error(mem_db: DBConnection):
|
||||||
|
"""A failing callback must not prevent the write from succeeding."""
|
||||||
|
table = mem_db.create_table("test", data=[{"id": 1}, {"id": 2}])
|
||||||
|
|
||||||
|
def bad_callback(p):
|
||||||
|
raise RuntimeError("boom")
|
||||||
|
|
||||||
|
table.add([{"id": 3}, {"id": 4}], progress=bad_callback)
|
||||||
|
assert len(table) == 4
|
||||||
|
|
||||||
|
|
||||||
def test_polars(mem_db: DBConnection):
|
def test_polars(mem_db: DBConnection):
|
||||||
data = {
|
data = {
|
||||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||||
@@ -923,6 +1050,231 @@ def test_add_with_nans(mem_db: DBConnection):
|
|||||||
assert np.allclose(v, np.array([0.0, 0.0]))
|
assert np.allclose(v, np.array([0.0, 0.0]))
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_with_empty_fixed_size_list_drops_bad_rows(mem_db: DBConnection):
|
||||||
|
class Schema(LanceModel):
|
||||||
|
text: str
|
||||||
|
embedding: Vector(16)
|
||||||
|
|
||||||
|
table = mem_db.create_table("test_empty_embeddings", schema=Schema)
|
||||||
|
table.add(
|
||||||
|
[
|
||||||
|
{"text": "hello", "embedding": []},
|
||||||
|
{"text": "bar", "embedding": [0.1] * 16},
|
||||||
|
],
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
)
|
||||||
|
|
||||||
|
data = table.to_arrow()
|
||||||
|
assert data["text"].to_pylist() == ["bar"]
|
||||||
|
assert np.allclose(data["embedding"].to_pylist()[0], np.array([0.1] * 16))
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_with_integer_embeddings_preserves_casting(mem_db: DBConnection):
|
||||||
|
class Schema(LanceModel):
|
||||||
|
text: str
|
||||||
|
embedding: Vector(4)
|
||||||
|
|
||||||
|
table = mem_db.create_table("test_integer_embeddings", schema=Schema)
|
||||||
|
table.add(
|
||||||
|
[{"text": "foo", "embedding": [1, 2, 3, 4]}],
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert table.to_arrow()["embedding"].to_pylist() == [[1.0, 2.0, 3.0, 4.0]]
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_does_not_handle_non_vector_fixed_size_lists(
|
||||||
|
mem_db: DBConnection,
|
||||||
|
):
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 4)),
|
||||||
|
pa.field("bbox", pa.list_(pa.float32(), 4)),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
table = mem_db.create_table("test_bbox_schema", schema=schema)
|
||||||
|
|
||||||
|
with pytest.raises(RuntimeError, match="FixedSizeListType"):
|
||||||
|
table.add(
|
||||||
|
[{"vector": [1.0, 2.0, 3.0, 4.0], "bbox": [0.0, 1.0]}],
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_does_not_handle_custom_named_fixed_size_lists(
|
||||||
|
mem_db: DBConnection,
|
||||||
|
):
|
||||||
|
schema = pa.schema([pa.field("features", pa.list_(pa.float32(), 16))])
|
||||||
|
table = mem_db.create_table("test_custom_named_fixed_size_vector", schema=schema)
|
||||||
|
|
||||||
|
with pytest.raises(RuntimeError, match="FixedSizeListType"):
|
||||||
|
table.add(
|
||||||
|
[
|
||||||
|
{"features": []},
|
||||||
|
{"features": [0.1] * 16},
|
||||||
|
],
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_with_schema_list_vector_still_sanitizes(mem_db: DBConnection):
|
||||||
|
schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||||
|
table = mem_db.create_table("test_schema_list_vector", schema=schema)
|
||||||
|
table.add(
|
||||||
|
[
|
||||||
|
{"vector": [1.0, 2.0]},
|
||||||
|
{"vector": [3.0]},
|
||||||
|
{"vector": [4.0, 5.0]},
|
||||||
|
],
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert table.to_arrow()["vector"].to_pylist() == [[1.0, 2.0], [4.0, 5.0]]
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_handles_typed_custom_fixed_vectors_for_list_schema(
|
||||||
|
mem_db: DBConnection,
|
||||||
|
):
|
||||||
|
schema = pa.schema([pa.field("vec", pa.list_(pa.float32()))])
|
||||||
|
table = mem_db.create_table("test_typed_custom_fixed_vector", schema=schema)
|
||||||
|
data = pa.table(
|
||||||
|
{
|
||||||
|
"vec": pa.array(
|
||||||
|
[[float("nan")] * 16, [1.0] * 16],
|
||||||
|
type=pa.list_(pa.float32(), 16),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
table.add(data, on_bad_vectors="drop")
|
||||||
|
|
||||||
|
assert table.to_arrow()["vec"].to_pylist() == [[1.0] * 16]
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_fill_preserves_arrow_nested_vector_type(mem_db: DBConnection):
|
||||||
|
schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||||
|
table = mem_db.create_table("test_fill_arrow_nested_type", schema=schema)
|
||||||
|
data = pa.table(
|
||||||
|
{
|
||||||
|
"vector": pa.array(
|
||||||
|
[[1.0, 2.0], [float("nan"), 3.0]],
|
||||||
|
type=pa.list_(pa.float32(), 2),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
table.add(
|
||||||
|
data,
|
||||||
|
on_bad_vectors="fill",
|
||||||
|
fill_value=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert table.to_arrow()["vector"].to_pylist() == [[1.0, 2.0], [0.0, 0.0]]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("table_name", "batch1", "expected"),
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"test_schema_list_vector_empty_prefix",
|
||||||
|
pa.record_batch({"vector": [[], []]}),
|
||||||
|
[[], [], [1.0, 2.0], [3.0, 4.0]],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"test_schema_list_vector_all_bad_prefix",
|
||||||
|
pa.record_batch({"vector": [[float("nan")] * 3, [float("nan")] * 3]}),
|
||||||
|
[[1.0, 2.0], [3.0, 4.0]],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_on_bad_vectors_with_schema_list_vector_ignores_invalid_prefix_batches(
|
||||||
|
mem_db: DBConnection,
|
||||||
|
table_name: str,
|
||||||
|
batch1: pa.RecordBatch,
|
||||||
|
expected: list,
|
||||||
|
):
|
||||||
|
schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||||
|
table = mem_db.create_table(table_name, schema=schema)
|
||||||
|
batch2 = pa.record_batch({"vector": [[1.0, 2.0], [3.0, 4.0]]})
|
||||||
|
reader = pa.RecordBatchReader.from_batches(batch1.schema, [batch1, batch2])
|
||||||
|
|
||||||
|
table.add(reader, on_bad_vectors="drop")
|
||||||
|
|
||||||
|
assert table.to_arrow()["vector"].to_pylist() == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_with_multiple_vectors_locks_dim_after_final_drop(
|
||||||
|
mem_db: DBConnection,
|
||||||
|
):
|
||||||
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
func = MockTextEmbeddingFunction.create()
|
||||||
|
metadata = registry.get_table_metadata(
|
||||||
|
[
|
||||||
|
EmbeddingFunctionConfig(
|
||||||
|
source_column="text1", vector_column="vec1", function=func
|
||||||
|
),
|
||||||
|
EmbeddingFunctionConfig(
|
||||||
|
source_column="text2", vector_column="vec2", function=func
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vec1", pa.list_(pa.float32())),
|
||||||
|
pa.field("vec2", pa.list_(pa.float32())),
|
||||||
|
],
|
||||||
|
metadata=metadata,
|
||||||
|
)
|
||||||
|
table = mem_db.create_table("test_multi_vector_dim_lock", schema=schema)
|
||||||
|
batch1 = pa.record_batch(
|
||||||
|
{
|
||||||
|
"vec1": [[1.0, 2.0, 3.0], [10.0, 11.0]],
|
||||||
|
"vec2": [[float("nan"), 0.0], [5.0, 6.0]],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
batch2 = pa.record_batch(
|
||||||
|
{
|
||||||
|
"vec1": [[20.0, 21.0], [30.0, 31.0]],
|
||||||
|
"vec2": [[7.0, 8.0], [9.0, 10.0]],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
reader = pa.RecordBatchReader.from_batches(batch1.schema, [batch1, batch2])
|
||||||
|
|
||||||
|
table.add(reader, on_bad_vectors="drop")
|
||||||
|
|
||||||
|
data = table.to_arrow()
|
||||||
|
assert data["vec1"].to_pylist() == [[10.0, 11.0], [20.0, 21.0], [30.0, 31.0]]
|
||||||
|
assert data["vec2"].to_pylist() == [[5.0, 6.0], [7.0, 8.0], [9.0, 10.0]]
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_does_not_handle_non_vector_list_columns(mem_db: DBConnection):
|
||||||
|
schema = pa.schema([pa.field("embedding_history", pa.list_(pa.float32()))])
|
||||||
|
table = mem_db.create_table("test_non_vector_list_schema", schema=schema)
|
||||||
|
table.add(
|
||||||
|
[
|
||||||
|
{"embedding_history": [1.0, 2.0]},
|
||||||
|
{"embedding_history": [3.0]},
|
||||||
|
],
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert table.to_arrow()["embedding_history"].to_pylist() == [
|
||||||
|
[1.0, 2.0],
|
||||||
|
[3.0],
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_on_bad_vectors_all_null_schema_vector_batches_do_not_crash(
|
||||||
|
mem_db: DBConnection,
|
||||||
|
):
|
||||||
|
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2), nullable=True)])
|
||||||
|
table = mem_db.create_table("test_all_null_vector_batch", schema=schema)
|
||||||
|
|
||||||
|
table.add([{"vector": None}], on_bad_vectors="drop")
|
||||||
|
|
||||||
|
assert table.to_arrow()["vector"].to_pylist() == [None]
|
||||||
|
|
||||||
|
|
||||||
def test_restore(mem_db: DBConnection):
|
def test_restore(mem_db: DBConnection):
|
||||||
table = mem_db.create_table(
|
table = mem_db.create_table(
|
||||||
"my_table",
|
"my_table",
|
||||||
@@ -1596,7 +1948,6 @@ def setup_hybrid_search_table(db: DBConnection, embedding_func):
|
|||||||
|
|
||||||
def test_hybrid_search(tmp_db: DBConnection):
|
def test_hybrid_search(tmp_db: DBConnection):
|
||||||
# This test uses an FTS index
|
# This test uses an FTS index
|
||||||
pytest.importorskip("lancedb.fts")
|
|
||||||
pytest.importorskip("lance")
|
pytest.importorskip("lance")
|
||||||
|
|
||||||
table, MyTable, emb = setup_hybrid_search_table(tmp_db, "test")
|
table, MyTable, emb = setup_hybrid_search_table(tmp_db, "test")
|
||||||
@@ -1667,7 +2018,6 @@ def test_hybrid_search(tmp_db: DBConnection):
|
|||||||
|
|
||||||
def test_hybrid_search_metric_type(tmp_db: DBConnection):
|
def test_hybrid_search_metric_type(tmp_db: DBConnection):
|
||||||
# This test uses an FTS index
|
# This test uses an FTS index
|
||||||
pytest.importorskip("lancedb.fts")
|
|
||||||
pytest.importorskip("lance")
|
pytest.importorskip("lance")
|
||||||
|
|
||||||
# Need to use nonnorm as the embedding function so l2 and dot results
|
# Need to use nonnorm as the embedding function so l2 and dot results
|
||||||
@@ -1689,6 +2039,13 @@ def test_hybrid_search_metric_type(tmp_db: DBConnection):
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"consistency_interval", [None, timedelta(seconds=0), timedelta(seconds=0.1)]
|
"consistency_interval", [None, timedelta(seconds=0), timedelta(seconds=0.1)]
|
||||||
)
|
)
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
sys.platform == "win32",
|
||||||
|
reason=(
|
||||||
|
"TODO: directory namespace is not supported on Windows yet; "
|
||||||
|
"re-enable after that is fixed."
|
||||||
|
),
|
||||||
|
)
|
||||||
def test_consistency(tmp_path, consistency_interval):
|
def test_consistency(tmp_path, consistency_interval):
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
table = db.create_table("my_table", data=[{"id": 0}])
|
table = db.create_table("my_table", data=[{"id": 0}])
|
||||||
@@ -1709,7 +2066,6 @@ def test_consistency(tmp_path, consistency_interval):
|
|||||||
elif consistency_interval == timedelta(seconds=0):
|
elif consistency_interval == timedelta(seconds=0):
|
||||||
assert table2.version == table.version
|
assert table2.version == table.version
|
||||||
else:
|
else:
|
||||||
# (consistency_interval == timedelta(seconds=0.1)
|
|
||||||
assert table2.version == table.version - 1
|
assert table2.version == table.version - 1
|
||||||
sleep(0.1)
|
sleep(0.1)
|
||||||
assert table2.version == table.version
|
assert table2.version == table.version
|
||||||
@@ -1982,7 +2338,7 @@ def test_stats(mem_db: DBConnection):
|
|||||||
stats = table.stats()
|
stats = table.stats()
|
||||||
print(f"{stats=}")
|
print(f"{stats=}")
|
||||||
assert stats == {
|
assert stats == {
|
||||||
"total_bytes": 38,
|
"total_bytes": 60,
|
||||||
"num_rows": 2,
|
"num_rows": 2,
|
||||||
"num_indices": 0,
|
"num_indices": 0,
|
||||||
"fragment_stats": {
|
"fragment_stats": {
|
||||||
@@ -2047,3 +2403,33 @@ def test_table_uri(tmp_path):
|
|||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
table = db.create_table("my_table", data=[{"x": 0}])
|
table = db.create_table("my_table", data=[{"x": 0}])
|
||||||
assert table.uri == str(tmp_path / "my_table.lance")
|
assert table.uri == str(tmp_path / "my_table.lance")
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_data_metadata_not_stripped():
|
||||||
|
"""Regression test: dict.update() returns None, so assigning its result
|
||||||
|
would silently replace metadata with None, causing with_metadata(None)
|
||||||
|
to strip all schema metadata from the target schema."""
|
||||||
|
from lancedb.table import _sanitize_data
|
||||||
|
|
||||||
|
schema = pa.schema(
|
||||||
|
[pa.field("x", pa.int64())],
|
||||||
|
metadata={b"existing_key": b"existing_value"},
|
||||||
|
)
|
||||||
|
batch = pa.record_batch([pa.array([1, 2, 3])], schema=schema)
|
||||||
|
|
||||||
|
# Use a different field type so the reader and target schemas differ,
|
||||||
|
# forcing _cast_to_target_schema to rebuild the schema with the
|
||||||
|
# target's metadata (instead of taking the fast-path).
|
||||||
|
target_schema = pa.schema(
|
||||||
|
[pa.field("x", pa.int32())],
|
||||||
|
metadata={b"existing_key": b"existing_value"},
|
||||||
|
)
|
||||||
|
|
||||||
|
reader = pa.RecordBatchReader.from_batches(schema, [batch])
|
||||||
|
metadata = {b"new_key": b"new_value"}
|
||||||
|
result = _sanitize_data(reader, target_schema=target_schema, metadata=metadata)
|
||||||
|
|
||||||
|
result_schema = result.schema
|
||||||
|
assert result_schema.metadata is not None
|
||||||
|
assert result_schema.metadata[b"existing_key"] == b"existing_value"
|
||||||
|
assert result_schema.metadata[b"new_key"] == b"new_value"
|
||||||
|
|||||||
@@ -15,8 +15,10 @@ from lancedb.table import (
|
|||||||
_cast_to_target_schema,
|
_cast_to_target_schema,
|
||||||
_handle_bad_vectors,
|
_handle_bad_vectors,
|
||||||
_into_pyarrow_reader,
|
_into_pyarrow_reader,
|
||||||
_sanitize_data,
|
|
||||||
_infer_target_schema,
|
_infer_target_schema,
|
||||||
|
_merge_metadata,
|
||||||
|
_sanitize_data,
|
||||||
|
sanitize_create_table,
|
||||||
)
|
)
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -304,6 +306,117 @@ def test_handle_bad_vectors_noop():
|
|||||||
assert output["vector"] == vector
|
assert output["vector"] == vector
|
||||||
|
|
||||||
|
|
||||||
|
def test_handle_bad_vectors_updates_reader_schema_for_target_schema():
|
||||||
|
data = pa.table({"vector": [[1, 2, 3, 4]]})
|
||||||
|
target_schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 4))])
|
||||||
|
|
||||||
|
output = _handle_bad_vectors(
|
||||||
|
data.to_reader(),
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
target_schema=target_schema,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert output.schema == pa.schema([pa.field("vector", pa.list_(pa.float32()))])
|
||||||
|
assert output.read_all()["vector"].to_pylist() == [[1.0, 2.0, 3.0, 4.0]]
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_data_keeps_target_field_metadata():
|
||||||
|
source_field = pa.field(
|
||||||
|
"vector",
|
||||||
|
pa.list_(pa.float32(), 2),
|
||||||
|
metadata={b"source": b"drop-me"},
|
||||||
|
)
|
||||||
|
target_field = pa.field(
|
||||||
|
"vector",
|
||||||
|
pa.list_(pa.float32(), 2),
|
||||||
|
metadata={b"target": b"keep-me"},
|
||||||
|
)
|
||||||
|
data = pa.table(
|
||||||
|
{"vector": pa.array([[1.0, 2.0]], type=pa.list_(pa.float32(), 2))},
|
||||||
|
schema=pa.schema([source_field]),
|
||||||
|
)
|
||||||
|
|
||||||
|
output = _sanitize_data(
|
||||||
|
data,
|
||||||
|
target_schema=pa.schema([target_field]),
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
).read_all()
|
||||||
|
|
||||||
|
assert output.schema.field("vector").metadata == {b"target": b"keep-me"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_data_uses_separate_embedding_metadata_for_bad_vectors():
|
||||||
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
conf = EmbeddingFunctionConfig(
|
||||||
|
source_column="text",
|
||||||
|
vector_column="custom_vector",
|
||||||
|
function=MockTextEmbeddingFunction.create(),
|
||||||
|
)
|
||||||
|
metadata = registry.get_table_metadata([conf])
|
||||||
|
schema = pa.schema(
|
||||||
|
{
|
||||||
|
"text": pa.string(),
|
||||||
|
"custom_vector": pa.list_(pa.float32(), 10),
|
||||||
|
},
|
||||||
|
metadata={b"note": b"keep-me"},
|
||||||
|
)
|
||||||
|
data = pa.table(
|
||||||
|
{
|
||||||
|
"text": ["bad", "good"],
|
||||||
|
"custom_vector": [[1.0] * 9, [2.0] * 10],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
output = _sanitize_data(
|
||||||
|
data,
|
||||||
|
target_schema=schema,
|
||||||
|
metadata=metadata,
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
).read_all()
|
||||||
|
|
||||||
|
assert output["text"].to_pylist() == ["good"]
|
||||||
|
assert output.schema.metadata[b"note"] == b"keep-me"
|
||||||
|
assert b"embedding_functions" in output.schema.metadata
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_create_table_merges_and_overrides_embedding_metadata():
|
||||||
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
old_conf = EmbeddingFunctionConfig(
|
||||||
|
source_column="text",
|
||||||
|
vector_column="old_vector",
|
||||||
|
function=MockTextEmbeddingFunction.create(),
|
||||||
|
)
|
||||||
|
new_conf = EmbeddingFunctionConfig(
|
||||||
|
source_column="text",
|
||||||
|
vector_column="custom_vector",
|
||||||
|
function=MockTextEmbeddingFunction.create(),
|
||||||
|
)
|
||||||
|
metadata = registry.get_table_metadata([new_conf])
|
||||||
|
schema = pa.schema(
|
||||||
|
{
|
||||||
|
"text": pa.string(),
|
||||||
|
"custom_vector": pa.list_(pa.float32(), 10),
|
||||||
|
},
|
||||||
|
metadata=_merge_metadata(
|
||||||
|
{b"note": b"keep-me"},
|
||||||
|
registry.get_table_metadata([old_conf]),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
data, schema = sanitize_create_table(
|
||||||
|
pa.table({"text": ["good"]}),
|
||||||
|
schema,
|
||||||
|
metadata=metadata,
|
||||||
|
on_bad_vectors="drop",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert schema.metadata[b"note"] == b"keep-me"
|
||||||
|
assert b"embedding_functions" in schema.metadata
|
||||||
|
assert data.schema.metadata[b"note"] == b"keep-me"
|
||||||
|
funcs = EmbeddingFunctionRegistry.get_instance().parse_functions(schema.metadata)
|
||||||
|
assert set(funcs.keys()) == {"custom_vector"}
|
||||||
|
|
||||||
|
|
||||||
class TestModel(lancedb.pydantic.LanceModel):
|
class TestModel(lancedb.pydantic.LanceModel):
|
||||||
a: Optional[int]
|
a: Optional[int]
|
||||||
b: Optional[int]
|
b: Optional[int]
|
||||||
|
|||||||
@@ -1,11 +1,17 @@
|
|||||||
// SPDX-License-Identifier: Apache-2.0
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
use std::{
|
||||||
|
collections::{HashMap, HashSet},
|
||||||
|
sync::Arc,
|
||||||
|
time::Duration,
|
||||||
|
};
|
||||||
|
|
||||||
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
|
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
|
||||||
use lancedb::{
|
use lancedb::{
|
||||||
connection::Connection as LanceConnection,
|
connection::Connection as LanceConnection,
|
||||||
|
connection::NamespaceClientPushdownOperation,
|
||||||
|
database::namespace::LanceNamespaceDatabase,
|
||||||
database::{CreateTableMode, Database, ReadConsistency},
|
database::{CreateTableMode, Database, ReadConsistency},
|
||||||
};
|
};
|
||||||
use pyo3::{
|
use pyo3::{
|
||||||
@@ -17,8 +23,9 @@ use pyo3::{
|
|||||||
use pyo3_async_runtimes::tokio::future_into_py;
|
use pyo3_async_runtimes::tokio::future_into_py;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
error::PythonErrorExt, namespace::extract_namespace_arc,
|
error::PythonErrorExt,
|
||||||
storage_options::py_object_to_storage_options_provider, table::Table,
|
namespace::{create_namespace_storage_options_provider, extract_namespace_arc},
|
||||||
|
table::Table,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
@@ -38,6 +45,29 @@ impl Connection {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_namespace_client_pushdown_operations(
|
||||||
|
operations: Option<Vec<String>>,
|
||||||
|
) -> PyResult<HashSet<NamespaceClientPushdownOperation>> {
|
||||||
|
let mut parsed = HashSet::new();
|
||||||
|
for operation in operations.unwrap_or_default() {
|
||||||
|
match operation.as_str() {
|
||||||
|
"QueryTable" => {
|
||||||
|
parsed.insert(NamespaceClientPushdownOperation::QueryTable);
|
||||||
|
}
|
||||||
|
"CreateTable" => {
|
||||||
|
parsed.insert(NamespaceClientPushdownOperation::CreateTable);
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
return Err(PyValueError::new_err(format!(
|
||||||
|
"Invalid pushdown operation: {}",
|
||||||
|
operation
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(parsed)
|
||||||
|
}
|
||||||
|
|
||||||
impl Connection {
|
impl Connection {
|
||||||
fn parse_create_mode_str(mode: &str) -> PyResult<CreateTableMode> {
|
fn parse_create_mode_str(mode: &str) -> PyResult<CreateTableMode> {
|
||||||
match mode {
|
match mode {
|
||||||
@@ -87,16 +117,16 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (namespace=vec![], start_after=None, limit=None))]
|
#[pyo3(signature = (namespace_path=None, start_after=None, limit=None))]
|
||||||
pub fn table_names(
|
pub fn table_names(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
start_after: Option<String>,
|
start_after: Option<String>,
|
||||||
limit: Option<u32>,
|
limit: Option<u32>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
let mut op = inner.table_names();
|
let mut op = inner.table_names();
|
||||||
op = op.namespace(namespace);
|
op = op.namespace(namespace_path.unwrap_or_default());
|
||||||
if let Some(start_after) = start_after {
|
if let Some(start_after) = start_after {
|
||||||
op = op.start_after(start_after);
|
op = op.start_after(start_after);
|
||||||
}
|
}
|
||||||
@@ -107,34 +137,43 @@ impl Connection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[pyo3(signature = (name, mode, data, namespace=vec![], storage_options=None, storage_options_provider=None, location=None))]
|
#[pyo3(signature = (name, mode, data, namespace_path=None, storage_options=None, location=None, namespace_client=None))]
|
||||||
pub fn create_table<'a>(
|
pub fn create_table<'a>(
|
||||||
self_: PyRef<'a, Self>,
|
self_: PyRef<'a, Self>,
|
||||||
name: String,
|
name: String,
|
||||||
mode: &str,
|
mode: &str,
|
||||||
data: Bound<'_, PyAny>,
|
data: Bound<'_, PyAny>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
storage_options: Option<HashMap<String, String>>,
|
storage_options: Option<HashMap<String, String>>,
|
||||||
storage_options_provider: Option<Py<PyAny>>,
|
|
||||||
location: Option<String>,
|
location: Option<String>,
|
||||||
|
namespace_client: Option<Py<PyAny>>,
|
||||||
) -> PyResult<Bound<'a, PyAny>> {
|
) -> PyResult<Bound<'a, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
|
let py = self_.py();
|
||||||
|
|
||||||
let mode = Self::parse_create_mode_str(mode)?;
|
let mode = Self::parse_create_mode_str(mode)?;
|
||||||
|
|
||||||
let batches: Box<dyn arrow::array::RecordBatchReader + Send> =
|
let batches: Box<dyn arrow::array::RecordBatchReader + Send> =
|
||||||
Box::new(ArrowArrayStreamReader::from_pyarrow_bound(&data)?);
|
Box::new(ArrowArrayStreamReader::from_pyarrow_bound(&data)?);
|
||||||
|
|
||||||
let mut builder = inner.create_table(name, batches).mode(mode);
|
let ns_path = namespace_path.clone().unwrap_or_default();
|
||||||
|
let mut builder = inner.create_table(name.clone(), batches).mode(mode);
|
||||||
|
|
||||||
builder = builder.namespace(namespace);
|
builder = builder.namespace(ns_path.clone());
|
||||||
if let Some(storage_options) = storage_options {
|
if let Some(storage_options) = storage_options {
|
||||||
builder = builder.storage_options(storage_options);
|
builder = builder.storage_options(storage_options);
|
||||||
}
|
}
|
||||||
if let Some(provider_obj) = storage_options_provider {
|
|
||||||
let provider = py_object_to_storage_options_provider(provider_obj)?;
|
// Auto-create storage options provider from namespace_client
|
||||||
|
if let Some(ns_obj) = namespace_client {
|
||||||
|
let ns_client = extract_namespace_arc(py, ns_obj)?;
|
||||||
|
// Create table_id by combining namespace_path with table name
|
||||||
|
let mut table_id = ns_path;
|
||||||
|
table_id.push(name);
|
||||||
|
let provider = create_namespace_storage_options_provider(ns_client, table_id);
|
||||||
builder = builder.storage_options_provider(provider);
|
builder = builder.storage_options_provider(provider);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(location) = location {
|
if let Some(location) = location {
|
||||||
builder = builder.location(location);
|
builder = builder.location(location);
|
||||||
}
|
}
|
||||||
@@ -146,33 +185,44 @@ impl Connection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[pyo3(signature = (name, mode, schema, namespace=vec![], storage_options=None, storage_options_provider=None, location=None))]
|
#[pyo3(signature = (name, mode, schema, namespace_path=None, storage_options=None, location=None, namespace_client=None))]
|
||||||
pub fn create_empty_table<'a>(
|
pub fn create_empty_table<'a>(
|
||||||
self_: PyRef<'a, Self>,
|
self_: PyRef<'a, Self>,
|
||||||
name: String,
|
name: String,
|
||||||
mode: &str,
|
mode: &str,
|
||||||
schema: Bound<'_, PyAny>,
|
schema: Bound<'_, PyAny>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
storage_options: Option<HashMap<String, String>>,
|
storage_options: Option<HashMap<String, String>>,
|
||||||
storage_options_provider: Option<Py<PyAny>>,
|
|
||||||
location: Option<String>,
|
location: Option<String>,
|
||||||
|
namespace_client: Option<Py<PyAny>>,
|
||||||
) -> PyResult<Bound<'a, PyAny>> {
|
) -> PyResult<Bound<'a, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
|
let py = self_.py();
|
||||||
|
|
||||||
let mode = Self::parse_create_mode_str(mode)?;
|
let mode = Self::parse_create_mode_str(mode)?;
|
||||||
|
|
||||||
let schema = Schema::from_pyarrow_bound(&schema)?;
|
let schema = Schema::from_pyarrow_bound(&schema)?;
|
||||||
|
|
||||||
let mut builder = inner.create_empty_table(name, Arc::new(schema)).mode(mode);
|
let ns_path = namespace_path.clone().unwrap_or_default();
|
||||||
|
let mut builder = inner
|
||||||
|
.create_empty_table(name.clone(), Arc::new(schema))
|
||||||
|
.mode(mode);
|
||||||
|
|
||||||
builder = builder.namespace(namespace);
|
builder = builder.namespace(ns_path.clone());
|
||||||
if let Some(storage_options) = storage_options {
|
if let Some(storage_options) = storage_options {
|
||||||
builder = builder.storage_options(storage_options);
|
builder = builder.storage_options(storage_options);
|
||||||
}
|
}
|
||||||
if let Some(provider_obj) = storage_options_provider {
|
|
||||||
let provider = py_object_to_storage_options_provider(provider_obj)?;
|
// Auto-create storage options provider from namespace_client
|
||||||
|
if let Some(ns_obj) = namespace_client {
|
||||||
|
let ns_client = extract_namespace_arc(py, ns_obj)?;
|
||||||
|
// Create table_id by combining namespace_path with table name
|
||||||
|
let mut table_id = ns_path;
|
||||||
|
table_id.push(name);
|
||||||
|
let provider = create_namespace_storage_options_provider(ns_client, table_id);
|
||||||
builder = builder.storage_options_provider(provider);
|
builder = builder.storage_options_provider(provider);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(location) = location {
|
if let Some(location) = location {
|
||||||
builder = builder.location(location);
|
builder = builder.location(location);
|
||||||
}
|
}
|
||||||
@@ -184,45 +234,44 @@ impl Connection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[pyo3(signature = (name, namespace=vec![], storage_options = None, storage_options_provider=None, index_cache_size = None, location=None, namespace_client=None, managed_versioning=None))]
|
#[pyo3(signature = (name, namespace_path=None, storage_options=None, index_cache_size=None, location=None, namespace_client=None, managed_versioning=None))]
|
||||||
pub fn open_table(
|
pub fn open_table(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
name: String,
|
name: String,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
storage_options: Option<HashMap<String, String>>,
|
storage_options: Option<HashMap<String, String>>,
|
||||||
storage_options_provider: Option<Py<PyAny>>,
|
|
||||||
index_cache_size: Option<u32>,
|
index_cache_size: Option<u32>,
|
||||||
location: Option<String>,
|
location: Option<String>,
|
||||||
namespace_client: Option<Py<PyAny>>,
|
namespace_client: Option<Py<PyAny>>,
|
||||||
managed_versioning: Option<bool>,
|
managed_versioning: Option<bool>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
|
let py = self_.py();
|
||||||
|
|
||||||
let mut builder = inner.open_table(name);
|
let ns_path = namespace_path.clone().unwrap_or_default();
|
||||||
builder = builder.namespace(namespace.clone());
|
let mut builder = inner.open_table(name.clone());
|
||||||
|
builder = builder.namespace(ns_path.clone());
|
||||||
if let Some(storage_options) = storage_options {
|
if let Some(storage_options) = storage_options {
|
||||||
builder = builder.storage_options(storage_options);
|
builder = builder.storage_options(storage_options);
|
||||||
}
|
}
|
||||||
if let Some(provider_obj) = storage_options_provider {
|
|
||||||
let provider = py_object_to_storage_options_provider(provider_obj)?;
|
// Auto-create storage options provider from namespace_client
|
||||||
|
if let Some(ns_obj) = namespace_client {
|
||||||
|
let ns_client = extract_namespace_arc(py, ns_obj)?;
|
||||||
|
// Create table_id by combining namespace_path with table name
|
||||||
|
let mut table_id = ns_path;
|
||||||
|
table_id.push(name);
|
||||||
|
let provider = create_namespace_storage_options_provider(ns_client.clone(), table_id);
|
||||||
builder = builder.storage_options_provider(provider);
|
builder = builder.storage_options_provider(provider);
|
||||||
|
builder = builder.namespace_client(ns_client);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(index_cache_size) = index_cache_size {
|
if let Some(index_cache_size) = index_cache_size {
|
||||||
builder = builder.index_cache_size(index_cache_size);
|
builder = builder.index_cache_size(index_cache_size);
|
||||||
}
|
}
|
||||||
if let Some(location) = location {
|
if let Some(location) = location {
|
||||||
builder = builder.location(location);
|
builder = builder.location(location);
|
||||||
}
|
}
|
||||||
// Extract namespace client from Python object if provided
|
|
||||||
let ns_client = if let Some(ns_obj) = namespace_client {
|
|
||||||
let py = self_.py();
|
|
||||||
Some(extract_namespace_arc(py, ns_obj)?)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
if let Some(ns_client) = ns_client {
|
|
||||||
builder = builder.namespace_client(ns_client);
|
|
||||||
}
|
|
||||||
// Pass managed_versioning if provided to avoid redundant describe_table call
|
// Pass managed_versioning if provided to avoid redundant describe_table call
|
||||||
if let Some(enabled) = managed_versioning {
|
if let Some(enabled) = managed_versioning {
|
||||||
builder = builder.managed_versioning(enabled);
|
builder = builder.managed_versioning(enabled);
|
||||||
@@ -234,12 +283,12 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (target_table_name, source_uri, target_namespace=vec![], source_version=None, source_tag=None, is_shallow=true))]
|
#[pyo3(signature = (target_table_name, source_uri, target_namespace_path=None, source_version=None, source_tag=None, is_shallow=true))]
|
||||||
pub fn clone_table(
|
pub fn clone_table(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
target_table_name: String,
|
target_table_name: String,
|
||||||
source_uri: String,
|
source_uri: String,
|
||||||
target_namespace: Vec<String>,
|
target_namespace_path: Option<Vec<String>>,
|
||||||
source_version: Option<u64>,
|
source_version: Option<u64>,
|
||||||
source_tag: Option<String>,
|
source_tag: Option<String>,
|
||||||
is_shallow: bool,
|
is_shallow: bool,
|
||||||
@@ -247,7 +296,7 @@ impl Connection {
|
|||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
|
|
||||||
let mut builder = inner.clone_table(target_table_name, source_uri);
|
let mut builder = inner.clone_table(target_table_name, source_uri);
|
||||||
builder = builder.target_namespace(target_namespace);
|
builder = builder.target_namespace(target_namespace_path.unwrap_or_default());
|
||||||
if let Some(version) = source_version {
|
if let Some(version) = source_version {
|
||||||
builder = builder.source_version(version);
|
builder = builder.source_version(version);
|
||||||
}
|
}
|
||||||
@@ -262,52 +311,56 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (cur_name, new_name, cur_namespace=vec![], new_namespace=vec![]))]
|
#[pyo3(signature = (cur_name, new_name, cur_namespace_path=None, new_namespace_path=None))]
|
||||||
pub fn rename_table(
|
pub fn rename_table(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
cur_name: String,
|
cur_name: String,
|
||||||
new_name: String,
|
new_name: String,
|
||||||
cur_namespace: Vec<String>,
|
cur_namespace_path: Option<Vec<String>>,
|
||||||
new_namespace: Vec<String>,
|
new_namespace_path: Option<Vec<String>>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
|
let cur_ns_path = cur_namespace_path.unwrap_or_default();
|
||||||
|
let new_ns_path = new_namespace_path.unwrap_or_default();
|
||||||
future_into_py(self_.py(), async move {
|
future_into_py(self_.py(), async move {
|
||||||
inner
|
inner
|
||||||
.rename_table(cur_name, new_name, &cur_namespace, &new_namespace)
|
.rename_table(cur_name, new_name, &cur_ns_path, &new_ns_path)
|
||||||
.await
|
.await
|
||||||
.infer_error()
|
.infer_error()
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (name, namespace=vec![]))]
|
#[pyo3(signature = (name, namespace_path=None))]
|
||||||
pub fn drop_table(
|
pub fn drop_table(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
name: String,
|
name: String,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
|
let ns_path = namespace_path.unwrap_or_default();
|
||||||
future_into_py(self_.py(), async move {
|
future_into_py(self_.py(), async move {
|
||||||
inner.drop_table(name, &namespace).await.infer_error()
|
inner.drop_table(name, &ns_path).await.infer_error()
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (namespace=vec![],))]
|
#[pyo3(signature = (namespace_path=None,))]
|
||||||
pub fn drop_all_tables(
|
pub fn drop_all_tables(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
|
let ns_path = namespace_path.unwrap_or_default();
|
||||||
future_into_py(self_.py(), async move {
|
future_into_py(self_.py(), async move {
|
||||||
inner.drop_all_tables(&namespace).await.infer_error()
|
inner.drop_all_tables(&ns_path).await.infer_error()
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Namespace management methods
|
// Namespace management methods
|
||||||
|
|
||||||
#[pyo3(signature = (namespace=vec![], page_token=None, limit=None))]
|
#[pyo3(signature = (namespace_path=None, page_token=None, limit=None))]
|
||||||
pub fn list_namespaces(
|
pub fn list_namespaces(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
page_token: Option<String>,
|
page_token: Option<String>,
|
||||||
limit: Option<u32>,
|
limit: Option<u32>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
@@ -316,11 +369,7 @@ impl Connection {
|
|||||||
future_into_py(py, async move {
|
future_into_py(py, async move {
|
||||||
use lance_namespace::models::ListNamespacesRequest;
|
use lance_namespace::models::ListNamespacesRequest;
|
||||||
let request = ListNamespacesRequest {
|
let request = ListNamespacesRequest {
|
||||||
id: if namespace.is_empty() {
|
id: namespace_path,
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(namespace)
|
|
||||||
},
|
|
||||||
page_token,
|
page_token,
|
||||||
limit: limit.map(|l| l as i32),
|
limit: limit.map(|l| l as i32),
|
||||||
..Default::default()
|
..Default::default()
|
||||||
@@ -335,10 +384,10 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (namespace, mode=None, properties=None))]
|
#[pyo3(signature = (namespace_path, mode=None, properties=None))]
|
||||||
pub fn create_namespace(
|
pub fn create_namespace(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Vec<String>,
|
||||||
mode: Option<String>,
|
mode: Option<String>,
|
||||||
properties: Option<std::collections::HashMap<String, String>>,
|
properties: Option<std::collections::HashMap<String, String>>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
@@ -354,11 +403,7 @@ impl Connection {
|
|||||||
_ => None,
|
_ => None,
|
||||||
});
|
});
|
||||||
let request = CreateNamespaceRequest {
|
let request = CreateNamespaceRequest {
|
||||||
id: if namespace.is_empty() {
|
id: Some(namespace_path),
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(namespace)
|
|
||||||
},
|
|
||||||
mode: mode_str,
|
mode: mode_str,
|
||||||
properties,
|
properties,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
@@ -372,10 +417,10 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (namespace, mode=None, behavior=None))]
|
#[pyo3(signature = (namespace_path, mode=None, behavior=None))]
|
||||||
pub fn drop_namespace(
|
pub fn drop_namespace(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Vec<String>,
|
||||||
mode: Option<String>,
|
mode: Option<String>,
|
||||||
behavior: Option<String>,
|
behavior: Option<String>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
@@ -395,11 +440,7 @@ impl Connection {
|
|||||||
_ => None,
|
_ => None,
|
||||||
});
|
});
|
||||||
let request = DropNamespaceRequest {
|
let request = DropNamespaceRequest {
|
||||||
id: if namespace.is_empty() {
|
id: Some(namespace_path),
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(namespace)
|
|
||||||
},
|
|
||||||
mode: mode_str,
|
mode: mode_str,
|
||||||
behavior: behavior_str,
|
behavior: behavior_str,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
@@ -414,21 +455,17 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (namespace,))]
|
#[pyo3(signature = (namespace_path,))]
|
||||||
pub fn describe_namespace(
|
pub fn describe_namespace(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Vec<String>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.get_inner()?.clone();
|
let inner = self_.get_inner()?.clone();
|
||||||
let py = self_.py();
|
let py = self_.py();
|
||||||
future_into_py(py, async move {
|
future_into_py(py, async move {
|
||||||
use lance_namespace::models::DescribeNamespaceRequest;
|
use lance_namespace::models::DescribeNamespaceRequest;
|
||||||
let request = DescribeNamespaceRequest {
|
let request = DescribeNamespaceRequest {
|
||||||
id: if namespace.is_empty() {
|
id: Some(namespace_path),
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(namespace)
|
|
||||||
},
|
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let response = inner.describe_namespace(request).await.infer_error()?;
|
let response = inner.describe_namespace(request).await.infer_error()?;
|
||||||
@@ -440,10 +477,10 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyo3(signature = (namespace=vec![], page_token=None, limit=None))]
|
#[pyo3(signature = (namespace_path=None, page_token=None, limit=None))]
|
||||||
pub fn list_tables(
|
pub fn list_tables(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
namespace: Vec<String>,
|
namespace_path: Option<Vec<String>>,
|
||||||
page_token: Option<String>,
|
page_token: Option<String>,
|
||||||
limit: Option<u32>,
|
limit: Option<u32>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
@@ -452,11 +489,7 @@ impl Connection {
|
|||||||
future_into_py(py, async move {
|
future_into_py(py, async move {
|
||||||
use lance_namespace::models::ListTablesRequest;
|
use lance_namespace::models::ListTablesRequest;
|
||||||
let request = ListTablesRequest {
|
let request = ListTablesRequest {
|
||||||
id: if namespace.is_empty() {
|
id: namespace_path,
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(namespace)
|
|
||||||
},
|
|
||||||
page_token,
|
page_token,
|
||||||
limit: limit.map(|l| l as i32),
|
limit: limit.map(|l| l as i32),
|
||||||
..Default::default()
|
..Default::default()
|
||||||
@@ -470,10 +503,29 @@ impl Connection {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the configuration for constructing an equivalent namespace client.
|
||||||
|
/// Returns a dict with:
|
||||||
|
/// - "impl": "dir" for DirectoryNamespace, "rest" for RestNamespace
|
||||||
|
/// - "properties": configuration properties for the namespace
|
||||||
|
#[pyo3(signature = ())]
|
||||||
|
pub fn namespace_client_config(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||||
|
let inner = self_.get_inner()?.clone();
|
||||||
|
let py = self_.py();
|
||||||
|
future_into_py(py, async move {
|
||||||
|
let (impl_type, properties) = inner.namespace_client_config().await.infer_error()?;
|
||||||
|
Python::attach(|py| -> PyResult<Py<PyDict>> {
|
||||||
|
let dict = PyDict::new(py);
|
||||||
|
dict.set_item("impl", impl_type)?;
|
||||||
|
dict.set_item("properties", properties)?;
|
||||||
|
Ok(dict.unbind())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyfunction]
|
#[pyfunction]
|
||||||
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None))]
|
#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None, manifest_enabled=false, namespace_client_properties=None))]
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub fn connect(
|
pub fn connect(
|
||||||
py: Python<'_>,
|
py: Python<'_>,
|
||||||
@@ -485,6 +537,8 @@ pub fn connect(
|
|||||||
client_config: Option<PyClientConfig>,
|
client_config: Option<PyClientConfig>,
|
||||||
storage_options: Option<HashMap<String, String>>,
|
storage_options: Option<HashMap<String, String>>,
|
||||||
session: Option<crate::session::Session>,
|
session: Option<crate::session::Session>,
|
||||||
|
manifest_enabled: bool,
|
||||||
|
namespace_client_properties: Option<HashMap<String, String>>,
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
future_into_py(py, async move {
|
future_into_py(py, async move {
|
||||||
let mut builder = lancedb::connect(&uri);
|
let mut builder = lancedb::connect(&uri);
|
||||||
@@ -504,6 +558,12 @@ pub fn connect(
|
|||||||
if let Some(storage_options) = storage_options {
|
if let Some(storage_options) = storage_options {
|
||||||
builder = builder.storage_options(storage_options);
|
builder = builder.storage_options(storage_options);
|
||||||
}
|
}
|
||||||
|
if manifest_enabled {
|
||||||
|
builder = builder.manifest_enabled(true);
|
||||||
|
}
|
||||||
|
if let Some(namespace_client_properties) = namespace_client_properties {
|
||||||
|
builder = builder.namespace_client_properties(namespace_client_properties);
|
||||||
|
}
|
||||||
#[cfg(feature = "remote")]
|
#[cfg(feature = "remote")]
|
||||||
if let Some(client_config) = client_config {
|
if let Some(client_config) = client_config {
|
||||||
builder = builder.client_config(client_config.into());
|
builder = builder.client_config(client_config.into());
|
||||||
@@ -515,6 +575,52 @@ pub fn connect(
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyfunction]
|
||||||
|
#[pyo3(signature = (
|
||||||
|
namespace_client,
|
||||||
|
read_consistency_interval=None,
|
||||||
|
storage_options=None,
|
||||||
|
session=None,
|
||||||
|
namespace_client_pushdown_operations=None,
|
||||||
|
namespace_client_impl=None,
|
||||||
|
namespace_client_properties=None,
|
||||||
|
))]
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub fn connect_namespace_client(
|
||||||
|
py: Python<'_>,
|
||||||
|
namespace_client: Py<PyAny>,
|
||||||
|
read_consistency_interval: Option<f64>,
|
||||||
|
storage_options: Option<HashMap<String, String>>,
|
||||||
|
session: Option<crate::session::Session>,
|
||||||
|
namespace_client_pushdown_operations: Option<Vec<String>>,
|
||||||
|
namespace_client_impl: Option<String>,
|
||||||
|
namespace_client_properties: Option<HashMap<String, String>>,
|
||||||
|
) -> PyResult<Connection> {
|
||||||
|
let namespace_client = extract_namespace_arc(py, namespace_client)?;
|
||||||
|
let read_consistency_interval = read_consistency_interval.map(Duration::from_secs_f64);
|
||||||
|
let namespace_client_pushdown_operations =
|
||||||
|
parse_namespace_client_pushdown_operations(namespace_client_pushdown_operations)?;
|
||||||
|
let ns_impl = namespace_client_impl.unwrap_or_else(|| "python".to_string());
|
||||||
|
let ns_properties = namespace_client_properties.unwrap_or_default();
|
||||||
|
let storage_options = storage_options.unwrap_or_default();
|
||||||
|
let session = session.map(|s| s.inner.clone());
|
||||||
|
|
||||||
|
let database = LanceNamespaceDatabase::from_namespace_client(
|
||||||
|
namespace_client,
|
||||||
|
ns_impl,
|
||||||
|
ns_properties,
|
||||||
|
storage_options,
|
||||||
|
read_consistency_interval,
|
||||||
|
session,
|
||||||
|
namespace_client_pushdown_operations,
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(Connection::new(LanceConnection::new(
|
||||||
|
Arc::new(database),
|
||||||
|
Arc::new(lancedb::embeddings::MemoryRegistry::new()),
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(FromPyObject)]
|
#[derive(FromPyObject)]
|
||||||
pub struct PyClientConfig {
|
pub struct PyClientConfig {
|
||||||
user_agent: String,
|
user_agent: String,
|
||||||
@@ -524,6 +630,7 @@ pub struct PyClientConfig {
|
|||||||
id_delimiter: Option<String>,
|
id_delimiter: Option<String>,
|
||||||
tls_config: Option<PyClientTlsConfig>,
|
tls_config: Option<PyClientTlsConfig>,
|
||||||
header_provider: Option<Py<PyAny>>,
|
header_provider: Option<Py<PyAny>>,
|
||||||
|
user_id: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(FromPyObject)]
|
#[derive(FromPyObject)]
|
||||||
@@ -608,6 +715,7 @@ impl From<PyClientConfig> for lancedb::remote::ClientConfig {
|
|||||||
id_delimiter: value.id_delimiter,
|
id_delimiter: value.id_delimiter,
|
||||||
tls_config: value.tls_config.map(Into::into),
|
tls_config: value.tls_config.map(Into::into),
|
||||||
header_provider,
|
header_provider,
|
||||||
|
user_id: value.user_id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
175
python/src/expr.rs
Normal file
175
python/src/expr.rs
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
//! PyO3 bindings for the LanceDB expression builder API.
|
||||||
|
//!
|
||||||
|
//! This module exposes [`PyExpr`] and helper free functions so Python can
|
||||||
|
//! build type-safe filter / projection expressions that map directly to
|
||||||
|
//! DataFusion [`Expr`] nodes, bypassing SQL string parsing.
|
||||||
|
|
||||||
|
use arrow::{datatypes::DataType, pyarrow::PyArrowType};
|
||||||
|
use lancedb::expr::{DfExpr, col as ldb_col, contains, expr_cast, lit as df_lit, lower, upper};
|
||||||
|
use pyo3::{Bound, PyAny, PyResult, exceptions::PyValueError, prelude::*, pyfunction};
|
||||||
|
|
||||||
|
/// A type-safe DataFusion expression.
|
||||||
|
///
|
||||||
|
/// Instances are constructed via the free functions [`expr_col`] and
|
||||||
|
/// [`expr_lit`] and combined with the methods on this struct. On the Python
|
||||||
|
/// side a thin wrapper class (`lancedb.expr.Expr`) delegates to these methods
|
||||||
|
/// and adds Python operator overloads.
|
||||||
|
#[pyclass(name = "PyExpr", from_py_object)]
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct PyExpr(pub DfExpr);
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl PyExpr {
|
||||||
|
// ── comparisons ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn eq(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().eq(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ne(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().not_eq(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn lt(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().lt(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn lte(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().lt_eq(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn gt(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().gt(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn gte(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().gt_eq(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── logical ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn and_(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().and(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn or_(&self, other: &Self) -> Self {
|
||||||
|
Self(self.0.clone().or(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn not_(&self) -> Self {
|
||||||
|
use std::ops::Not;
|
||||||
|
Self(self.0.clone().not())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── arithmetic ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn add(&self, other: &Self) -> Self {
|
||||||
|
use std::ops::Add;
|
||||||
|
Self(self.0.clone().add(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sub(&self, other: &Self) -> Self {
|
||||||
|
use std::ops::Sub;
|
||||||
|
Self(self.0.clone().sub(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn mul(&self, other: &Self) -> Self {
|
||||||
|
use std::ops::Mul;
|
||||||
|
Self(self.0.clone().mul(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn div(&self, other: &Self) -> Self {
|
||||||
|
use std::ops::Div;
|
||||||
|
Self(self.0.clone().div(other.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── string functions ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Convert string column to lowercase.
|
||||||
|
fn lower(&self) -> Self {
|
||||||
|
Self(lower(self.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert string column to uppercase.
|
||||||
|
fn upper(&self) -> Self {
|
||||||
|
Self(upper(self.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test whether the string contains `substr`.
|
||||||
|
fn contains(&self, substr: &Self) -> Self {
|
||||||
|
Self(contains(self.0.clone(), substr.0.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── type cast ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Cast the expression to `data_type`.
|
||||||
|
///
|
||||||
|
/// `data_type` must be a PyArrow `DataType` (e.g. `pa.int32()`).
|
||||||
|
/// On the Python side, `lancedb.expr.Expr.cast` also accepts type name
|
||||||
|
/// strings via `pa.lib.ensure_type` before forwarding here.
|
||||||
|
fn cast(&self, data_type: PyArrowType<DataType>) -> Self {
|
||||||
|
Self(expr_cast(self.0.clone(), data_type.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── utilities ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Render the expression as a SQL string (useful for debugging).
|
||||||
|
fn to_sql(&self) -> PyResult<String> {
|
||||||
|
lancedb::expr::expr_to_sql_string(&self.0).map_err(|e| PyValueError::new_err(e.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn __repr__(&self) -> PyResult<String> {
|
||||||
|
let sql =
|
||||||
|
lancedb::expr::expr_to_sql_string(&self.0).unwrap_or_else(|_| "<expr>".to_string());
|
||||||
|
Ok(format!("PyExpr({})", sql))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── free functions ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Create a column reference expression.
|
||||||
|
///
|
||||||
|
/// The column name is preserved exactly as given (case-sensitive), so
|
||||||
|
/// `col("firstName")` correctly references a field named `firstName`.
|
||||||
|
#[pyfunction]
|
||||||
|
pub fn expr_col(name: &str) -> PyExpr {
|
||||||
|
PyExpr(ldb_col(name))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a literal value expression.
|
||||||
|
///
|
||||||
|
/// Supported Python types: `bool`, `int`, `float`, `str`.
|
||||||
|
#[pyfunction]
|
||||||
|
pub fn expr_lit(value: Bound<'_, PyAny>) -> PyResult<PyExpr> {
|
||||||
|
// bool must be checked before int because bool is a subclass of int in Python
|
||||||
|
if let Ok(b) = value.extract::<bool>() {
|
||||||
|
return Ok(PyExpr(df_lit(b)));
|
||||||
|
}
|
||||||
|
if let Ok(i) = value.extract::<i64>() {
|
||||||
|
return Ok(PyExpr(df_lit(i)));
|
||||||
|
}
|
||||||
|
if let Ok(f) = value.extract::<f64>() {
|
||||||
|
return Ok(PyExpr(df_lit(f)));
|
||||||
|
}
|
||||||
|
if let Ok(s) = value.extract::<String>() {
|
||||||
|
return Ok(PyExpr(df_lit(s)));
|
||||||
|
}
|
||||||
|
Err(PyValueError::new_err(format!(
|
||||||
|
"unsupported literal type: {}. Supported: bool, int, float, str",
|
||||||
|
value.get_type().name()?
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Call an arbitrary registered SQL function by name.
|
||||||
|
///
|
||||||
|
/// See `lancedb::expr::func` for the list of supported function names.
|
||||||
|
#[pyfunction]
|
||||||
|
pub fn expr_func(name: &str, args: Vec<PyExpr>) -> PyResult<PyExpr> {
|
||||||
|
let df_args: Vec<DfExpr> = args.into_iter().map(|e| e.0).collect();
|
||||||
|
lancedb::expr::func(name, df_args)
|
||||||
|
.map(PyExpr)
|
||||||
|
.map_err(|e| PyValueError::new_err(e.to_string()))
|
||||||
|
}
|
||||||
@@ -33,7 +33,7 @@ impl PyHeaderProvider {
|
|||||||
Ok(headers_py) => {
|
Ok(headers_py) => {
|
||||||
// Convert Python dict to Rust HashMap
|
// Convert Python dict to Rust HashMap
|
||||||
let bound_headers = headers_py.bind(py);
|
let bound_headers = headers_py.bind(py);
|
||||||
let dict: &Bound<PyDict> = bound_headers.downcast().map_err(|e| {
|
let dict: &Bound<PyDict> = bound_headers.cast().map_err(|e| {
|
||||||
format!("HeaderProvider.get_headers must return a dict: {}", e)
|
format!("HeaderProvider.get_headers must return a dict: {}", e)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ use pyo3::{
|
|||||||
Bound, FromPyObject, PyAny, PyResult, Python,
|
Bound, FromPyObject, PyAny, PyResult, Python,
|
||||||
exceptions::{PyKeyError, PyValueError},
|
exceptions::{PyKeyError, PyValueError},
|
||||||
intern, pyclass, pymethods,
|
intern, pyclass, pymethods,
|
||||||
types::PyAnyMethods,
|
types::{PyAnyMethods, PyString},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::util::parse_distance_type;
|
use crate::util::parse_distance_type;
|
||||||
@@ -22,7 +22,7 @@ pub fn class_name(ob: &'_ Bound<'_, PyAny>) -> PyResult<String> {
|
|||||||
let full_name = ob
|
let full_name = ob
|
||||||
.getattr(intern!(ob.py(), "__class__"))?
|
.getattr(intern!(ob.py(), "__class__"))?
|
||||||
.getattr(intern!(ob.py(), "__name__"))?;
|
.getattr(intern!(ob.py(), "__name__"))?;
|
||||||
let full_name = full_name.downcast()?.to_string_lossy();
|
let full_name = full_name.cast::<PyString>()?.to_string_lossy();
|
||||||
|
|
||||||
match full_name.rsplit_once('.') {
|
match full_name.rsplit_once('.') {
|
||||||
Some((_, name)) => Ok(name.to_string()),
|
Some((_, name)) => Ok(name.to_string()),
|
||||||
|
|||||||
@@ -2,8 +2,9 @@
|
|||||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
use arrow::RecordBatchStream;
|
use arrow::RecordBatchStream;
|
||||||
use connection::{Connection, connect};
|
use connection::{Connection, connect, connect_namespace_client};
|
||||||
use env_logger::Env;
|
use env_logger::Env;
|
||||||
|
use expr::{PyExpr, expr_col, expr_func, expr_lit};
|
||||||
use index::IndexConfig;
|
use index::IndexConfig;
|
||||||
use permutation::{PyAsyncPermutationBuilder, PyPermutationReader};
|
use permutation::{PyAsyncPermutationBuilder, PyPermutationReader};
|
||||||
use pyo3::{
|
use pyo3::{
|
||||||
@@ -21,13 +22,13 @@ use table::{
|
|||||||
pub mod arrow;
|
pub mod arrow;
|
||||||
pub mod connection;
|
pub mod connection;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
pub mod expr;
|
||||||
pub mod header;
|
pub mod header;
|
||||||
pub mod index;
|
pub mod index;
|
||||||
pub mod namespace;
|
pub mod namespace;
|
||||||
pub mod permutation;
|
pub mod permutation;
|
||||||
pub mod query;
|
pub mod query;
|
||||||
pub mod session;
|
pub mod session;
|
||||||
pub mod storage_options;
|
|
||||||
pub mod table;
|
pub mod table;
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
|
||||||
@@ -55,10 +56,15 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|||||||
m.add_class::<UpdateResult>()?;
|
m.add_class::<UpdateResult>()?;
|
||||||
m.add_class::<PyAsyncPermutationBuilder>()?;
|
m.add_class::<PyAsyncPermutationBuilder>()?;
|
||||||
m.add_class::<PyPermutationReader>()?;
|
m.add_class::<PyPermutationReader>()?;
|
||||||
|
m.add_class::<PyExpr>()?;
|
||||||
m.add_function(wrap_pyfunction!(connect, m)?)?;
|
m.add_function(wrap_pyfunction!(connect, m)?)?;
|
||||||
|
m.add_function(wrap_pyfunction!(connect_namespace_client, m)?)?;
|
||||||
m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
|
m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
|
||||||
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
|
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
|
||||||
m.add_function(wrap_pyfunction!(query::fts_query_to_json, m)?)?;
|
m.add_function(wrap_pyfunction!(query::fts_query_to_json, m)?)?;
|
||||||
|
m.add_function(wrap_pyfunction!(expr_col, m)?)?;
|
||||||
|
m.add_function(wrap_pyfunction!(expr_lit, m)?)?;
|
||||||
|
m.add_function(wrap_pyfunction!(expr_func, m)?)?;
|
||||||
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
|
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user