mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-13 01:50:42 +00:00
Compare commits
26 Commits
v0.28.0-be
...
jack/idp-o
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9665f698a3 | ||
|
|
650f173236 | ||
|
|
9b21c136c6 | ||
|
|
694aa48e19 | ||
|
|
455ba5abbf | ||
|
|
5338aeb006 | ||
|
|
47a34f5cca | ||
|
|
a17c241e86 | ||
|
|
1fc23e5473 | ||
|
|
87b831bcae | ||
|
|
59db036118 | ||
|
|
c091243d5b | ||
|
|
a2aea7b4e5 | ||
|
|
4a5341edb1 | ||
|
|
25dfe2cfd4 | ||
|
|
4dcd7f4314 | ||
|
|
2e36cd9dad | ||
|
|
f31e27768a | ||
|
|
b84150a53e | ||
|
|
d135c18db6 | ||
|
|
ef399de092 | ||
|
|
0d767abd0e | ||
|
|
a92ae0ded5 | ||
|
|
c54888a83a | ||
|
|
ba6c44abc9 | ||
|
|
75b0a8e0a3 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.28.0-beta.9"
|
||||
current_version = "0.28.0-beta.11"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
18
.github/dependabot.yml
vendored
Normal file
18
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
version: 2
|
||||
|
||||
# Scope: the root Cargo workspace, which produces the Rust binaries we
|
||||
# ship to users (the Node.js and Python native extensions). The
|
||||
# `rust/lancedb` library crate shares the same lockfile; its consumers
|
||||
# pick their own dependency versions, but bumping transitive deps here
|
||||
# keeps the binaries we ship current.
|
||||
updates:
|
||||
- package-ecosystem: cargo
|
||||
directory: /
|
||||
schedule:
|
||||
interval: weekly
|
||||
open-pull-requests-limit: 10
|
||||
groups:
|
||||
rust-minor-patch:
|
||||
update-types:
|
||||
- minor
|
||||
- patch
|
||||
3
.github/workflows/dev.yml
vendored
3
.github/workflows/dev.yml
vendored
@@ -8,6 +8,9 @@ concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
labeler:
|
||||
permissions:
|
||||
|
||||
8
.github/workflows/java-publish.yml
vendored
8
.github/workflows/java-publish.yml
vendored
@@ -19,6 +19,9 @@ on:
|
||||
paths:
|
||||
- .github/workflows/java-publish.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
name: Build and Publish
|
||||
@@ -40,7 +43,7 @@ jobs:
|
||||
server-username: SONATYPE_USER
|
||||
server-password: SONATYPE_TOKEN
|
||||
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
|
||||
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
|
||||
gpg-passphrase: MAVEN_GPG_PASSPHRASE
|
||||
- name: Set git config
|
||||
run: |
|
||||
git config --global user.email "dev+gha@lancedb.com"
|
||||
@@ -55,10 +58,11 @@ jobs:
|
||||
echo "use-agent" >> ~/.gnupg/gpg.conf
|
||||
echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
|
||||
export GPG_TTY=$(tty)
|
||||
./mvnw --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -pl lancedb-core -am -P deploy-to-ossrh
|
||||
./mvnw --batch-mode -DskipTests -DpushChanges=false deploy -pl lancedb-core -am -P deploy-to-ossrh
|
||||
env:
|
||||
SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
|
||||
SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
|
||||
MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
|
||||
|
||||
report-failure:
|
||||
name: Report Workflow Failure
|
||||
|
||||
3
.github/workflows/java.yml
vendored
3
.github/workflows/java.yml
vendored
@@ -24,6 +24,9 @@ on:
|
||||
- java/**
|
||||
- .github/workflows/java.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-java:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
4
.github/workflows/license-header-check.yml
vendored
4
.github/workflows/license-header-check.yml
vendored
@@ -10,6 +10,10 @@ on:
|
||||
- nodejs/**
|
||||
- java/**
|
||||
- .github/workflows/license-header-check.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
check-licenses:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
3
.github/workflows/nodejs.yml
vendored
3
.github/workflows/nodejs.yml
vendored
@@ -15,6 +15,9 @@ on:
|
||||
- .github/workflows/nodejs.yml
|
||||
- docker-compose.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
15
.github/workflows/pypi-publish.yml
vendored
15
.github/workflows/pypi-publish.yml
vendored
@@ -14,10 +14,16 @@ on:
|
||||
env:
|
||||
PIP_EXTRA_INDEX_URL: "https://pypi.fury.io/lance-format/ https://pypi.fury.io/lancedb/"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
|
||||
timeout-minutes: 60
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
@@ -57,10 +63,12 @@ jobs:
|
||||
- uses: ./.github/workflows/upload_wheel
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
with:
|
||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||
mac:
|
||||
timeout-minutes: 90
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -85,10 +93,12 @@ jobs:
|
||||
- uses: ./.github/workflows/upload_wheel
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
with:
|
||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||
windows:
|
||||
timeout-minutes: 60
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -107,7 +117,6 @@ jobs:
|
||||
- uses: ./.github/workflows/upload_wheel
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
with:
|
||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||
gh-release:
|
||||
if: startsWith(github.ref, 'refs/tags/python-v')
|
||||
|
||||
5
.github/workflows/python.yml
vendored
5
.github/workflows/python.yml
vendored
@@ -17,6 +17,9 @@ on:
|
||||
- .github/workflows/build_windows_wheel/**
|
||||
- .github/workflows/run_tests/**
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
@@ -108,7 +111,6 @@ jobs:
|
||||
- name: Install
|
||||
run: |
|
||||
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests,dev,embeddings]
|
||||
pip install tantivy
|
||||
pip install mlx
|
||||
- name: Doctest
|
||||
run: pytest --doctest-modules python/lancedb
|
||||
@@ -227,6 +229,5 @@ jobs:
|
||||
pip install "pydantic<2"
|
||||
pip install pyarrow==16
|
||||
pip install --extra-index-url https://pypi.fury.io/lance-format/ --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
|
||||
pip install tantivy
|
||||
- name: Run tests
|
||||
run: pytest -m "not slow and not s3_test" -x -v --durations=30 python/tests
|
||||
|
||||
17
.github/workflows/rust.yml
vendored
17
.github/workflows/rust.yml
vendored
@@ -9,9 +9,15 @@ on:
|
||||
- Cargo.toml
|
||||
- Cargo.lock
|
||||
- rust-toolchain.toml
|
||||
- deny.toml
|
||||
- rust/**
|
||||
- nodejs/Cargo.toml
|
||||
- python/Cargo.toml
|
||||
- .github/workflows/rust.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
@@ -53,6 +59,17 @@ jobs:
|
||||
- name: Run clippy (without remote feature)
|
||||
run: cargo clippy --profile ci --workspace --tests -- -D warnings
|
||||
|
||||
deny:
|
||||
# Supply-chain checks: advisories, licenses, banned crates, and source
|
||||
# restrictions. Configuration lives in `deny.toml` at the workspace root.
|
||||
timeout-minutes: 10
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: EmbarkStudios/cargo-deny-action@v2
|
||||
with:
|
||||
command: check advisories bans licenses sources
|
||||
|
||||
build-no-lock:
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 30
|
||||
|
||||
@@ -3,6 +3,9 @@ name: Update package-lock.json
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -3,6 +3,9 @@ name: Update NodeJs package-lock.json
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
31
.github/workflows/upload_wheel/action.yml
vendored
31
.github/workflows/upload_wheel/action.yml
vendored
@@ -2,9 +2,6 @@ name: upload-wheel
|
||||
|
||||
description: "Upload wheels to Pypi"
|
||||
inputs:
|
||||
pypi_token:
|
||||
required: true
|
||||
description: "release token for the repo"
|
||||
fury_token:
|
||||
required: true
|
||||
description: "release token for the fury repo"
|
||||
@@ -12,12 +9,6 @@ inputs:
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
shell: bash
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install twine
|
||||
python3 -m pip install --upgrade pkginfo
|
||||
- name: Choose repo
|
||||
shell: bash
|
||||
id: choose_repo
|
||||
@@ -27,19 +18,17 @@ runs:
|
||||
else
|
||||
echo "repo=pypi" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Fury
|
||||
if: steps.choose_repo.outputs.repo == 'fury'
|
||||
shell: bash
|
||||
env:
|
||||
FURY_TOKEN: ${{ inputs.fury_token }}
|
||||
PYPI_TOKEN: ${{ inputs.pypi_token }}
|
||||
run: |
|
||||
if [[ ${{ steps.choose_repo.outputs.repo }} == fury ]]; then
|
||||
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
|
||||
echo "Uploading $WHEEL to Fury"
|
||||
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
|
||||
else
|
||||
twine upload --repository ${{ steps.choose_repo.outputs.repo }} \
|
||||
--username __token__ \
|
||||
--password $PYPI_TOKEN \
|
||||
target/wheels/lancedb-*.whl
|
||||
fi
|
||||
WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
|
||||
echo "Uploading $WHEEL to Fury"
|
||||
curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
|
||||
- name: Publish to PyPI
|
||||
if: steps.choose_repo.outputs.repo == 'pypi'
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
packages-dir: target/wheels/
|
||||
|
||||
3081
Cargo.lock
generated
3081
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
66
Cargo.toml
66
Cargo.toml
@@ -1,7 +1,5 @@
|
||||
[workspace]
|
||||
members = ["rust/lancedb", "nodejs", "python"]
|
||||
# Python package needs to be built by maturin.
|
||||
exclude = ["python"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.package]
|
||||
@@ -15,40 +13,40 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.91.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=6.0.0-beta.1", default-features = false, "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=6.0.0-beta.1", default-features = false, "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=6.0.0-beta.1", default-features = false, "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=6.0.0-beta.1", "tag" = "v6.0.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance = { "version" = "=7.0.0-beta.7", default-features = false, "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=7.0.0-beta.7", default-features = false, "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=7.0.0-beta.7", default-features = false, "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=7.0.0-beta.7", "tag" = "v7.0.0-beta.7", "git" = "https://github.com/lance-format/lance.git" }
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "57.2", optional = false }
|
||||
arrow-array = "57.2"
|
||||
arrow-data = "57.2"
|
||||
arrow-ipc = "57.2"
|
||||
arrow-ord = "57.2"
|
||||
arrow-schema = "57.2"
|
||||
arrow-select = "57.2"
|
||||
arrow-cast = "57.2"
|
||||
arrow = { version = "58.0.0", optional = false }
|
||||
arrow-array = "58.0.0"
|
||||
arrow-data = "58.0.0"
|
||||
arrow-ipc = "58.0.0"
|
||||
arrow-ord = "58.0.0"
|
||||
arrow-schema = "58.0.0"
|
||||
arrow-select = "58.0.0"
|
||||
arrow-cast = "58.0.0"
|
||||
async-trait = "0"
|
||||
datafusion = { version = "52.1", default-features = false }
|
||||
datafusion-catalog = "52.1"
|
||||
datafusion-common = { version = "52.1", default-features = false }
|
||||
datafusion-execution = "52.1"
|
||||
datafusion-expr = "52.1"
|
||||
datafusion-functions = "52.1"
|
||||
datafusion-physical-plan = "52.1"
|
||||
datafusion-physical-expr = "52.1"
|
||||
datafusion-sql = "52.1"
|
||||
datafusion = { version = "53.0.0", default-features = false }
|
||||
datafusion-catalog = "53.0.0"
|
||||
datafusion-common = { version = "53.0.0", default-features = false }
|
||||
datafusion-execution = "53.0.0"
|
||||
datafusion-expr = "53.0.0"
|
||||
datafusion-functions = "53.0.0"
|
||||
datafusion-physical-plan = "53.0.0"
|
||||
datafusion-physical-expr = "53.0.0"
|
||||
datafusion-sql = "53.0.0"
|
||||
env_logger = "0.11"
|
||||
half = { "version" = "2.7.1", default-features = false, features = [
|
||||
"num-traits",
|
||||
@@ -56,7 +54,7 @@ half = { "version" = "2.7.1", default-features = false, features = [
|
||||
futures = "0"
|
||||
log = "0.4"
|
||||
moka = { version = "0.12", features = ["future"] }
|
||||
object_store = "0.12.0"
|
||||
object_store = "0.13.2"
|
||||
pin-project = "1.0.7"
|
||||
rand = "0.9"
|
||||
snafu = "0.8"
|
||||
|
||||
196
deny.toml
Normal file
196
deny.toml
Normal file
@@ -0,0 +1,196 @@
|
||||
# cargo-deny configuration for LanceDB.
|
||||
#
|
||||
# Run locally with `cargo deny check`. See
|
||||
# https://embarkstudios.github.io/cargo-deny/ for the full reference.
|
||||
|
||||
# The set of target triples we care about. cargo-deny will only consider
|
||||
# dependencies that are used on at least one of these targets. Keeping this
|
||||
# explicit avoids noise from platform-specific crates (e.g. wasm, android,
|
||||
# ios) that we never actually ship.
|
||||
[graph]
|
||||
targets = [
|
||||
"x86_64-unknown-linux-gnu",
|
||||
"aarch64-unknown-linux-gnu",
|
||||
"x86_64-apple-darwin",
|
||||
"aarch64-apple-darwin",
|
||||
"x86_64-pc-windows-msvc",
|
||||
"aarch64-pc-windows-msvc",
|
||||
]
|
||||
all-features = true
|
||||
|
||||
[output]
|
||||
feature-depth = 1
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Advisories: security vulnerabilities and yanked crates.
|
||||
# ---------------------------------------------------------------------------
|
||||
[advisories]
|
||||
version = 2
|
||||
# Fail the check if any crate in the lockfile has been yanked from crates.io.
|
||||
# Yanked crates are a signal the author retracted the release (often due to
|
||||
# bugs or security issues) and should not be depended on.
|
||||
yanked = "deny"
|
||||
# Advisory IDs we have explicitly reviewed and chosen to accept. Every
|
||||
# entry must include a rationale and, where possible, an upstream issue
|
||||
# pointing to a fix. Revisit this list whenever dependencies are updated.
|
||||
ignore = [
|
||||
# rsa: Marvin Attack timing side-channel in PKCS#1 v1.5 decryption.
|
||||
# Reached only through opendal → reqsign → rsa. We do not use RSA
|
||||
# decryption in LanceDB ourselves; this is dormant in the signing path.
|
||||
# No fixed release exists upstream as of this writing.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2023-0071
|
||||
{ id = "RUSTSEC-2023-0071", reason = "rsa crate via opendal/reqsign; no fixed upstream release" },
|
||||
|
||||
# instant: unmaintained. Pulled in via backoff → instant. Upstream
|
||||
# recommends switching to `web-time`; fix has to come from backoff.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0384
|
||||
{ id = "RUSTSEC-2024-0384", reason = "transitive via backoff; waiting on backoff replacement" },
|
||||
|
||||
# paste: unmaintained (author archived the repo). Used transitively by
|
||||
# datafusion and the arrow ecosystem; widespread, no drop-in replacement.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0436
|
||||
{ id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" },
|
||||
|
||||
# encoding: unmaintained. Reached through lindera-dictionary, which is
|
||||
# required by the native Lindera tokenizer path. Lindera has not migrated
|
||||
# off this crate yet.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2021-0153
|
||||
{ id = "RUSTSEC-2021-0153", reason = "transitive via lindera-dictionary for native Lindera tokenizer" },
|
||||
|
||||
# fast-float: unsound and unmaintained. Reached only through polars-arrow
|
||||
# from the optional Polars integration; replacement requires a Polars
|
||||
# dependency upgrade.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2024-0379
|
||||
{ id = "RUSTSEC-2024-0379", reason = "transitive via polars-arrow; waiting on Polars migration" },
|
||||
|
||||
# tantivy: segfault on malformed input due to missing bounds check.
|
||||
# Pulled in via lance for full-text search. We only feed tantivy
|
||||
# documents we construct ourselves, not attacker-controlled bytes.
|
||||
# Tracked for a lance dependency bump.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0003
|
||||
{ id = "RUSTSEC-2025-0003", reason = "tantivy via lance; inputs are internally produced, not user-supplied bytes" },
|
||||
|
||||
# backoff: unmaintained. Reached only via async-openai. Replacement
|
||||
# requires async-openai to migrate (or us to drop async-openai).
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0012
|
||||
{ id = "RUSTSEC-2025-0012", reason = "transitive via async-openai; waiting on upstream migration" },
|
||||
|
||||
# number_prefix: unmaintained. Transitive via indicatif → hf-hub.
|
||||
# No security impact, just maintenance status.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0119
|
||||
{ id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" },
|
||||
|
||||
# bincode: unmaintained. Reached through lindera and lindera-dictionary,
|
||||
# which are required by the native Lindera tokenizer path. Lindera has not
|
||||
# migrated to another serialization format yet.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2025-0141
|
||||
{ id = "RUSTSEC-2025-0141", reason = "transitive via lindera/lindera-dictionary for native Lindera tokenizer" },
|
||||
|
||||
# lru: soundness issue in IterMut. Reached only through aws-sdk-s3 in
|
||||
# LanceDB's dev-dependency graph; LanceDB does not use that iterator
|
||||
# directly. Clearing this requires the AWS SDK chain to update lru.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0002
|
||||
{ id = "RUSTSEC-2026-0002", reason = "transitive via aws-sdk-s3 dev-dependency; waiting on AWS SDK lru upgrade" },
|
||||
|
||||
# rustls-webpki 0.101.7 (old major line): name-constraint checks for
|
||||
# URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain
|
||||
# from aws-smithy-http-client. The 0.103 line we actively use is patched.
|
||||
# Clearing the 0.101 copy requires the aws-sdk chain to migrate off
|
||||
# rustls 0.21.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0098
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0099
|
||||
{ id = "RUSTSEC-2026-0098", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||
{ id = "RUSTSEC-2026-0099", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||
|
||||
# rustls-webpki 0.101.7: reachable panic in CRL parsing. Same legacy
|
||||
# rustls 0.21 chain from aws-smithy-http-client as above. The 0.103 line
|
||||
# we actively use is upgraded to 0.103.13 which contains the fix.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0104
|
||||
{ id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" },
|
||||
|
||||
# rand 0.8.5: soundness issue only when ThreadRng reseeds inside a custom
|
||||
# logger. Reached through several transitive chains. LanceDB does not use
|
||||
# rand from a custom logger; upgrade once all pinned chains accept 0.8.6+.
|
||||
# https://rustsec.org/advisories/RUSTSEC-2026-0097
|
||||
{ id = "RUSTSEC-2026-0097", reason = "transitive rand 0.8.5; LanceDB does not call ThreadRng from custom logging" },
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Licenses: only allow licenses we've reviewed as compatible with Apache-2.0.
|
||||
# ---------------------------------------------------------------------------
|
||||
[licenses]
|
||||
version = 2
|
||||
# SPDX identifiers for licenses that are compatible with our Apache-2.0
|
||||
# distribution. Additions require legal review.
|
||||
allow = [
|
||||
"Apache-2.0",
|
||||
"Apache-2.0 WITH LLVM-exception",
|
||||
"MIT",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"ISC",
|
||||
"Unicode-3.0",
|
||||
"Unicode-DFS-2016",
|
||||
"Zlib",
|
||||
"CC0-1.0",
|
||||
"MPL-2.0",
|
||||
"BSL-1.0",
|
||||
"OpenSSL",
|
||||
# 0BSD ("BSD Zero Clause") is effectively public domain — no attribution
|
||||
# required. Pulled in by `mock_instant`.
|
||||
"0BSD",
|
||||
# bzip2-1.0.6 is the permissive upstream bzip2 license (BSD-like). Pulled
|
||||
# in by `libbz2-rs-sys`, the pure-Rust bzip2 implementation.
|
||||
"bzip2-1.0.6",
|
||||
# CDLA-Permissive-2.0 is a permissive data license used by `webpki-roots`
|
||||
# for the Mozilla CA root bundle. Data-only, distribution-compatible.
|
||||
"CDLA-Permissive-2.0",
|
||||
]
|
||||
confidence-threshold = 0.8
|
||||
# Crates whose license cannot be determined from Cargo metadata but whose
|
||||
# license we've manually confirmed from upstream. Keep this list minimal.
|
||||
[[licenses.clarify]]
|
||||
# polars-arrow-format omits the `license` field in its Cargo.toml, but the
|
||||
# upstream repo (pola-rs/polars-arrow-format) is dual-licensed Apache-2.0 OR
|
||||
# MIT. See https://github.com/pola-rs/polars-arrow-format/blob/main/LICENSE
|
||||
crate = "polars-arrow-format"
|
||||
expression = "Apache-2.0 OR MIT"
|
||||
license-files = []
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bans: disallow specific crates and flag dependency hygiene issues.
|
||||
# ---------------------------------------------------------------------------
|
||||
[bans]
|
||||
# Warn (not deny) on duplicate versions of the same crate. In a large
|
||||
# workspace like this one, duplicates are common and often unavoidable
|
||||
# transitively. We surface them to discourage growth, but don't fail CI.
|
||||
multiple-versions = "warn"
|
||||
# Wildcard version requirements (`foo = "*"`) are a footgun — they let any
|
||||
# future release in without review. Ban them outright.
|
||||
wildcards = "deny"
|
||||
# Internal workspace crates reference each other via `path = "..."`, which
|
||||
# cargo-deny sees as a wildcard version. That's fine for private workspace
|
||||
# members (not published to crates.io), so allow it specifically for paths.
|
||||
allow-wildcard-paths = true
|
||||
# Features that, if enabled, should cause the check to fail.
|
||||
deny = []
|
||||
# Crates to skip when checking for duplicate versions.
|
||||
skip = []
|
||||
# Similar to `skip`, but also skips the entire transitive subtree.
|
||||
skip-tree = []
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sources: restrict where crates can come from.
|
||||
# ---------------------------------------------------------------------------
|
||||
[sources]
|
||||
# Deny any registry other than the ones explicitly listed below.
|
||||
unknown-registry = "deny"
|
||||
# Deny any git dependency whose host isn't in the allow-list below. This
|
||||
# prevents accidental pulls from arbitrary forks.
|
||||
unknown-git = "deny"
|
||||
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
|
||||
# Lance is developed in a sibling repo and pulled as a git dependency until
|
||||
# releases are cut to crates.io. Allow that specific host.
|
||||
allow-git = [
|
||||
"https://github.com/lance-format/lance",
|
||||
]
|
||||
@@ -24,4 +24,4 @@ RUN python --version && \
|
||||
rustc --version && \
|
||||
protoc --version
|
||||
|
||||
RUN pip install --no-cache-dir tantivy lancedb
|
||||
RUN pip install --no-cache-dir lancedb
|
||||
|
||||
@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-core</artifactId>
|
||||
<version>0.28.0-beta.9</version>
|
||||
<version>0.28.0-beta.11</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
||||
@@ -25,8 +25,7 @@ new BooleanQuery(queries): BooleanQuery
|
||||
Creates an instance of BooleanQuery.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][]
|
||||
* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][]
|
||||
An array of (Occur, FullTextQuery objects) to combine.
|
||||
Occur specifies whether the query must match, or should match.
|
||||
|
||||
|
||||
@@ -31,18 +31,14 @@ but penalizes those that match the negative query.
|
||||
the penalty is controlled by the `negativeBoost` parameter.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **positive**: [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **positive**: [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
The positive query that boosts the relevance score.
|
||||
|
||||
* **negative**: [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **negative**: [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
The negative query that reduces the relevance score.
|
||||
|
||||
* **options?**
|
||||
* **options?**
|
||||
Optional parameters for the boost query.
|
||||
- `negativeBoost`: The boost factor for the negative query (default is 0.0).
|
||||
|
||||
* **options.negativeBoost?**: `number`
|
||||
* **options.negativeBoost?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -42,26 +42,19 @@ both the source and cloned tables to evolve independently while initially
|
||||
sharing the same data, deletion, and index files.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **targetTableName**: `string`
|
||||
* **targetTableName**: `string`
|
||||
The name of the target table to create.
|
||||
|
||||
* **sourceUri**: `string`
|
||||
* **sourceUri**: `string`
|
||||
The URI of the source table to clone from.
|
||||
|
||||
* **options?**
|
||||
* **options?**
|
||||
Clone options.
|
||||
|
||||
* **options.isShallow?**: `boolean`
|
||||
* **options.isShallow?**: `boolean`
|
||||
Whether to perform a shallow clone (defaults to true).
|
||||
|
||||
* **options.sourceTag?**: `string`
|
||||
* **options.sourceTag?**: `string`
|
||||
The tag of the source table to clone.
|
||||
|
||||
* **options.sourceVersion?**: `number`
|
||||
* **options.sourceVersion?**: `number`
|
||||
The version of the source table to clone.
|
||||
|
||||
* **options.targetNamespacePath?**: `string`[]
|
||||
* **options.targetNamespacePath?**: `string`[]
|
||||
The namespace path for the target table (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
@@ -102,14 +95,11 @@ abstract createEmptyTable(
|
||||
Creates a new empty Table
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the table.
|
||||
|
||||
* **schema**: [`SchemaLike`](../type-aliases/SchemaLike.md)
|
||||
* **schema**: [`SchemaLike`](../type-aliases/SchemaLike.md)
|
||||
The schema of the table
|
||||
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
Additional options (backwards compatibility)
|
||||
|
||||
##### Returns
|
||||
@@ -129,17 +119,13 @@ abstract createEmptyTable(
|
||||
Creates a new empty Table
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the table.
|
||||
|
||||
* **schema**: [`SchemaLike`](../type-aliases/SchemaLike.md)
|
||||
* **schema**: [`SchemaLike`](../type-aliases/SchemaLike.md)
|
||||
The schema of the table
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to create the table in (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
Additional options
|
||||
|
||||
##### Returns
|
||||
@@ -159,11 +145,9 @@ abstract createTable(options, namespacePath?): Promise<Table>
|
||||
Creates a new Table and initialize it with new data.
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **options**: `object` & `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
* **options**: `object` & `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
The options object.
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to create the table in (defaults to root namespace)
|
||||
|
||||
##### Returns
|
||||
@@ -182,15 +166,12 @@ abstract createTable(
|
||||
Creates a new Table and initialize it with new data.
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the table.
|
||||
|
||||
* **data**: [`TableLike`](../type-aliases/TableLike.md) \| `Record`<`string`, `unknown`>[]
|
||||
* **data**: [`TableLike`](../type-aliases/TableLike.md) \| `Record`<`string`, `unknown`>[]
|
||||
Non-empty Array of Records
|
||||
to be inserted into the table
|
||||
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
Additional options (backwards compatibility)
|
||||
|
||||
##### Returns
|
||||
@@ -210,18 +191,14 @@ abstract createTable(
|
||||
Creates a new Table and initialize it with new data.
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the table.
|
||||
|
||||
* **data**: [`TableLike`](../type-aliases/TableLike.md) \| `Record`<`string`, `unknown`>[]
|
||||
* **data**: [`TableLike`](../type-aliases/TableLike.md) \| `Record`<`string`, `unknown`>[]
|
||||
Non-empty Array of Records
|
||||
to be inserted into the table
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to create the table in (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
* **options?**: `Partial`<[`CreateTableOptions`](../interfaces/CreateTableOptions.md)>
|
||||
Additional options
|
||||
|
||||
##### Returns
|
||||
@@ -253,8 +230,7 @@ abstract dropAllTables(namespacePath?): Promise<void>
|
||||
Drop all tables in the database.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to drop tables from (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
@@ -272,11 +248,9 @@ abstract dropTable(name, namespacePath?): Promise<void>
|
||||
Drop an existing table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the table to drop.
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path of the table (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
@@ -311,14 +285,11 @@ abstract openTable(
|
||||
Open a table in the database.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the table
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path of the table (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`OpenTableOptions`](../interfaces/OpenTableOptions.md)>
|
||||
* **options?**: `Partial`<[`OpenTableOptions`](../interfaces/OpenTableOptions.md)>
|
||||
Additional options
|
||||
|
||||
#### Returns
|
||||
@@ -340,8 +311,7 @@ List all the table names in this database.
|
||||
Tables will be returned in lexicographical order.
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`TableNamesOptions`](../interfaces/TableNamesOptions.md)>
|
||||
* **options?**: `Partial`<[`TableNamesOptions`](../interfaces/TableNamesOptions.md)>
|
||||
options to control the
|
||||
paging / start point (backwards compatibility)
|
||||
|
||||
@@ -360,11 +330,9 @@ List all the table names in this database.
|
||||
Tables will be returned in lexicographical order.
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **namespacePath?**: `string`[]
|
||||
* **namespacePath?**: `string`[]
|
||||
The namespace path to list tables from (defaults to root namespace)
|
||||
|
||||
* **options?**: `Partial`<[`TableNamesOptions`](../interfaces/TableNamesOptions.md)>
|
||||
* **options?**: `Partial`<[`TableNamesOptions`](../interfaces/TableNamesOptions.md)>
|
||||
options to control the
|
||||
paging / start point
|
||||
|
||||
|
||||
@@ -73,8 +73,7 @@ The results of a full text search are ordered by relevance measured by BM25.
|
||||
You can combine filters with full text search.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`FtsOptions`](../interfaces/FtsOptions.md)>
|
||||
* **options?**: `Partial`<[`FtsOptions`](../interfaces/FtsOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -95,8 +94,7 @@ It is a variant of the HNSW algorithm that uses product quantization to compress
|
||||
the vectors.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`HnswPqOptions`](../interfaces/HnswPqOptions.md)>
|
||||
* **options?**: `Partial`<[`HnswPqOptions`](../interfaces/HnswPqOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -117,8 +115,7 @@ It is a variant of the HNSW algorithm that uses scalar quantization to compress
|
||||
the vectors.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`HnswSqOptions`](../interfaces/HnswSqOptions.md)>
|
||||
* **options?**: `Partial`<[`HnswSqOptions`](../interfaces/HnswSqOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -148,8 +145,7 @@ Note that training an IVF FLAT index on a large dataset is a slow operation and
|
||||
currently is also a memory intensive operation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`IvfFlatOptions`](../interfaces/IvfFlatOptions.md)>
|
||||
* **options?**: `Partial`<[`IvfFlatOptions`](../interfaces/IvfFlatOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -185,8 +181,7 @@ Note that training an IVF PQ index on a large dataset is a slow operation and
|
||||
currently is also a memory intensive operation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`IvfPqOptions`](../interfaces/IvfPqOptions.md)>
|
||||
* **options?**: `Partial`<[`IvfPqOptions`](../interfaces/IvfPqOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -216,8 +211,7 @@ Note that training an IVF RQ index on a large dataset is a slow operation and
|
||||
currently is also a memory intensive operation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`IvfRqOptions`](../interfaces/IvfRqOptions.md)>
|
||||
* **options?**: `Partial`<[`IvfRqOptions`](../interfaces/IvfRqOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -17,8 +17,7 @@ new MakeArrowTableOptions(values?): MakeArrowTableOptions
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **values?**: `Partial`<[`MakeArrowTableOptions`](MakeArrowTableOptions.md)>
|
||||
* **values?**: `Partial`<[`MakeArrowTableOptions`](MakeArrowTableOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -28,30 +28,22 @@ new MatchQuery(
|
||||
Creates an instance of MatchQuery.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
* **query**: `string`
|
||||
The text query to search for.
|
||||
|
||||
* **column**: `string`
|
||||
* **column**: `string`
|
||||
The name of the column to search within.
|
||||
|
||||
* **options?**
|
||||
* **options?**
|
||||
Optional parameters for the match query.
|
||||
- `boost`: The boost factor for the query (default is 1.0).
|
||||
- `fuzziness`: The fuzziness level for the query (default is 0).
|
||||
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
||||
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
- `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
|
||||
|
||||
* **options.boost?**: `number`
|
||||
|
||||
* **options.fuzziness?**: `number`
|
||||
|
||||
* **options.maxExpansions?**: `number`
|
||||
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
|
||||
* **options.prefixLength?**: `number`
|
||||
* **options.boost?**: `number`
|
||||
* **options.fuzziness?**: `number`
|
||||
* **options.maxExpansions?**: `number`
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
* **options.prefixLength?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -19,10 +19,8 @@ new MergeInsertBuilder(native, schema): MergeInsertBuilder
|
||||
Construct a MergeInsertBuilder. __Internal use only.__
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **native**: `NativeMergeInsertBuilder`
|
||||
|
||||
* **schema**: `Schema`<`any`> \| `Promise`<`Schema`<`any`>>
|
||||
* **native**: `NativeMergeInsertBuilder`
|
||||
* **schema**: `Schema`<`any`> \| `Promise`<`Schema`<`any`>>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -39,10 +37,8 @@ execute(data, execOptions?): Promise<MergeResult>
|
||||
Executes the merge insert operation
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: [`Data`](../type-aliases/Data.md)
|
||||
|
||||
* **execOptions?**: `Partial`<[`WriteExecutionOptions`](../interfaces/WriteExecutionOptions.md)>
|
||||
* **data**: [`Data`](../type-aliases/Data.md)
|
||||
* **execOptions?**: `Partial`<[`WriteExecutionOptions`](../interfaces/WriteExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -66,8 +62,7 @@ table scan even if an index exists. This can be useful for benchmarking or when
|
||||
the query optimizer chooses a suboptimal path.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **useIndex**: `boolean`
|
||||
* **useIndex**: `boolean`
|
||||
Whether to use indices for the merge operation. Defaults to `true`.
|
||||
|
||||
#### Returns
|
||||
@@ -104,10 +99,8 @@ table (new data).
|
||||
For example, "target.last_update < source.last_update"
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**
|
||||
|
||||
* **options.where?**: `string`
|
||||
* **options?**
|
||||
* **options.where?**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -126,10 +119,8 @@ deleted. An optional condition can be provided to limit what
|
||||
data is deleted.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**
|
||||
|
||||
* **options.where?**: `string`
|
||||
* **options?**
|
||||
* **options.where?**: `string`
|
||||
An optional condition to limit what data is deleted
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -28,21 +28,16 @@ new MultiMatchQuery(
|
||||
Creates an instance of MultiMatchQuery.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
* **query**: `string`
|
||||
The text query to search for across multiple columns.
|
||||
|
||||
* **columns**: `string`[]
|
||||
* **columns**: `string`[]
|
||||
An array of column names to search within.
|
||||
|
||||
* **options?**
|
||||
* **options?**
|
||||
Optional parameters for the multi-match query.
|
||||
- `boosts`: An array of boost factors for each column (default is 1.0 for all).
|
||||
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
|
||||
* **options.boosts?**: `number`[]
|
||||
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
* **options.boosts?**: `number`[]
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -21,8 +21,7 @@ new NativeJsHeaderProvider(getHeadersCallback): NativeJsHeaderProvider
|
||||
Create a new JsHeaderProvider from a JavaScript callback
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **getHeadersCallback**
|
||||
* **getHeadersCallback**
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -51,11 +51,9 @@ new OAuthHeaderProvider(tokenFetcher, refreshBufferSeconds): OAuthHeaderProvider
|
||||
Initialize the OAuth provider.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tokenFetcher**
|
||||
* **tokenFetcher**
|
||||
Function to fetch new tokens. Should return object with 'accessToken' and optionally 'expiresIn'.
|
||||
|
||||
* **refreshBufferSeconds**: `number` = `300`
|
||||
* **refreshBufferSeconds**: `number` = `300`
|
||||
Seconds before expiry to refresh token. Default 300 (5 minutes).
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -46,8 +46,7 @@ filter(filter): PermutationBuilder
|
||||
Configure filtering for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **filter**: `string`
|
||||
* **filter**: `string`
|
||||
SQL filter expression
|
||||
|
||||
#### Returns
|
||||
@@ -73,11 +72,9 @@ persist(connection, tableName): PermutationBuilder
|
||||
Configure the permutation to be persisted.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **connection**: [`Connection`](Connection.md)
|
||||
* **connection**: [`Connection`](Connection.md)
|
||||
The connection to persist the permutation to
|
||||
|
||||
* **tableName**: `string`
|
||||
* **tableName**: `string`
|
||||
The name of the table to create
|
||||
|
||||
#### Returns
|
||||
@@ -103,8 +100,7 @@ shuffle(options): PermutationBuilder
|
||||
Configure shuffling for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`ShuffleOptions`](../interfaces/ShuffleOptions.md)
|
||||
* **options**: [`ShuffleOptions`](../interfaces/ShuffleOptions.md)
|
||||
Configuration for shuffling
|
||||
|
||||
#### Returns
|
||||
@@ -134,8 +130,7 @@ splitCalculated(options): PermutationBuilder
|
||||
Configure calculated splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`SplitCalculatedOptions`](../interfaces/SplitCalculatedOptions.md)
|
||||
* **options**: [`SplitCalculatedOptions`](../interfaces/SplitCalculatedOptions.md)
|
||||
Configuration for calculated splitting
|
||||
|
||||
#### Returns
|
||||
@@ -161,8 +156,7 @@ splitHash(options): PermutationBuilder
|
||||
Configure hash-based splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`SplitHashOptions`](../interfaces/SplitHashOptions.md)
|
||||
* **options**: [`SplitHashOptions`](../interfaces/SplitHashOptions.md)
|
||||
Configuration for hash-based splitting
|
||||
|
||||
#### Returns
|
||||
@@ -192,8 +186,7 @@ splitRandom(options): PermutationBuilder
|
||||
Configure random splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`SplitRandomOptions`](../interfaces/SplitRandomOptions.md)
|
||||
* **options**: [`SplitRandomOptions`](../interfaces/SplitRandomOptions.md)
|
||||
Configuration for random splitting
|
||||
|
||||
#### Returns
|
||||
@@ -226,8 +219,7 @@ splitSequential(options): PermutationBuilder
|
||||
Configure sequential splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`SplitSequentialOptions`](../interfaces/SplitSequentialOptions.md)
|
||||
* **options**: [`SplitSequentialOptions`](../interfaces/SplitSequentialOptions.md)
|
||||
Configuration for sequential splitting
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -28,18 +28,14 @@ new PhraseQuery(
|
||||
Creates an instance of `PhraseQuery`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
* **query**: `string`
|
||||
The phrase to search for in the specified column.
|
||||
|
||||
* **column**: `string`
|
||||
* **column**: `string`
|
||||
The name of the column to search within.
|
||||
|
||||
* **options?**
|
||||
* **options?**
|
||||
Optional parameters for the phrase query.
|
||||
- `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
|
||||
|
||||
* **options.slop?**: `number`
|
||||
* **options.slop?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -86,8 +86,7 @@ protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
Execute the query and return the results as an
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -120,8 +119,7 @@ explainPlan(verbose): Promise<string>
|
||||
Generates an explanation of the query execution plan.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **verbose**: `boolean` = `false`
|
||||
* **verbose**: `boolean` = `false`
|
||||
If true, provides a more detailed explanation. Defaults to false.
|
||||
|
||||
#### Returns
|
||||
@@ -177,8 +175,7 @@ filter(predicate): this
|
||||
A filter statement to be applied to this query.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **predicate**: `string`
|
||||
* **predicate**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -205,10 +202,8 @@ fullTextSearch(query, options?): this
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -232,8 +227,7 @@ By default, a plain search has no limit. If this method is not
|
||||
called then every valid row from the table will be returned.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **limit**: `number`
|
||||
* **limit**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -268,8 +262,7 @@ fixed size list of floats) then the column does not need to be specified.
|
||||
If there is more than one vector column you must use
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -308,10 +301,8 @@ nearestToText(query, columns?): Query
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
* **columns?**: `string`[]
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **columns?**: `string`[]
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -330,8 +321,7 @@ Set the number of rows to skip before returning results.
|
||||
This is useful for pagination.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **offset**: `number`
|
||||
* **offset**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -393,8 +383,7 @@ For example, an SQL query might state `SELECT a + b AS combined, c`. The equiva
|
||||
input to this method would be:
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -428,8 +417,7 @@ toArray(options?): Promise<any[]>
|
||||
Collect the results as an array of objects.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -450,8 +438,7 @@ toArrow(options?): Promise<Table<any>>
|
||||
Collect the results as an Arrow
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -478,8 +465,7 @@ A filter statement to be applied to this query.
|
||||
The filter should be supplied as an SQL query string. For example:
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **predicate**: `string`
|
||||
* **predicate**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -87,8 +87,7 @@ protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
Execute the query and return the results as an
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -117,8 +116,7 @@ explainPlan(verbose): Promise<string>
|
||||
Generates an explanation of the query execution plan.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **verbose**: `boolean` = `false`
|
||||
* **verbose**: `boolean` = `false`
|
||||
If true, provides a more detailed explanation. Defaults to false.
|
||||
|
||||
#### Returns
|
||||
@@ -186,8 +184,7 @@ For example, an SQL query might state `SELECT a + b AS combined, c`. The equiva
|
||||
input to this method would be:
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -217,8 +214,7 @@ toArray(options?): Promise<any[]>
|
||||
Collect the results as an array of objects.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -235,8 +231,7 @@ toArrow(options?): Promise<Table<any>>
|
||||
Collect the results as an Arrow
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -33,10 +33,8 @@ Create a new session with custom cache sizes.
|
||||
Defaults to 1GB if not specified.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **indexCacheSizeBytes?**: `null` \| `bigint`
|
||||
|
||||
* **metadataCacheSizeBytes?**: `null` \| `bigint`
|
||||
* **indexCacheSizeBytes?**: `null` \| `bigint`
|
||||
* **metadataCacheSizeBytes?**: `null` \| `bigint`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -37,8 +37,7 @@ new StaticHeaderProvider(headers): StaticHeaderProvider
|
||||
Initialize with static headers.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **headers**: `Record`<`string`, `string`>
|
||||
* **headers**: `Record`<`string`, `string`>
|
||||
Headers to return for every request.
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -46,11 +46,9 @@ abstract add(data, options?): Promise<AddResult>
|
||||
Insert records into this Table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: [`Data`](../type-aliases/Data.md)
|
||||
* **data**: [`Data`](../type-aliases/Data.md)
|
||||
Records to be inserted into the Table
|
||||
|
||||
* **options?**: `Partial`<[`AddDataOptions`](../interfaces/AddDataOptions.md)>
|
||||
* **options?**: `Partial`<[`AddDataOptions`](../interfaces/AddDataOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -70,8 +68,7 @@ abstract addColumns(newColumnTransforms): Promise<AddColumnsResult>
|
||||
Add new columns with defined values.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **newColumnTransforms**: `Field`<`any`> \| `Field`<`any`>[] \| `Schema`<`any`> \| [`AddColumnsSql`](../interfaces/AddColumnsSql.md)[]
|
||||
* **newColumnTransforms**: `Field`<`any`> \| `Field`<`any`>[] \| `Schema`<`any`> \| [`AddColumnsSql`](../interfaces/AddColumnsSql.md)[]
|
||||
Either:
|
||||
- An array of objects with column names and SQL expressions to calculate values
|
||||
- A single Arrow Field defining one column with its data type (column will be initialized with null values)
|
||||
@@ -96,8 +93,7 @@ abstract alterColumns(columnAlterations): Promise<AlterColumnsResult>
|
||||
Alter the name or nullability of columns.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columnAlterations**: [`ColumnAlteration`](../interfaces/ColumnAlteration.md)[]
|
||||
* **columnAlterations**: [`ColumnAlteration`](../interfaces/ColumnAlteration.md)[]
|
||||
One or more alterations to
|
||||
apply to columns.
|
||||
|
||||
@@ -126,8 +122,7 @@ Calling this method will set the table into time-travel mode. If you
|
||||
wish to return to standard mode, call `checkoutLatest`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **version**: `string` \| `number`
|
||||
* **version**: `string` \| `number`
|
||||
The version to checkout, could be version number or tag
|
||||
|
||||
#### Returns
|
||||
@@ -196,8 +191,7 @@ abstract countRows(filter?): Promise<number>
|
||||
Count the total number of rows in the dataset.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **filter?**: `string`
|
||||
* **filter?**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -222,10 +216,8 @@ We currently don't support custom named indexes.
|
||||
The index name will always be `${column}_idx`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **column**: `string`
|
||||
|
||||
* **options?**: `Partial`<[`IndexOptions`](../interfaces/IndexOptions.md)>
|
||||
* **column**: `string`
|
||||
* **options?**: `Partial`<[`IndexOptions`](../interfaces/IndexOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -268,8 +260,7 @@ abstract delete(predicate): Promise<DeleteResult>
|
||||
Delete the rows that satisfy the predicate.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **predicate**: `string`
|
||||
* **predicate**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -308,8 +299,7 @@ call ``compact_files`` to rewrite the data without the removed columns and
|
||||
then call ``cleanup_files`` to remove the old files.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columnNames**: `string`[]
|
||||
* **columnNames**: `string`[]
|
||||
The names of the columns to drop. These can
|
||||
be nested column references (e.g. "a.b.c") or top-level column names
|
||||
(e.g. "a").
|
||||
@@ -332,8 +322,7 @@ abstract dropIndex(name): Promise<void>
|
||||
Drop an index from the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the index.
|
||||
This does not delete the index from disk, it just removes it from the table.
|
||||
To delete the index, run [Table#optimize](Table.md#optimize) after dropping the index.
|
||||
@@ -354,8 +343,7 @@ abstract indexStats(name): Promise<undefined | IndexStatistics>
|
||||
List all the stats of a specified index
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the index.
|
||||
|
||||
#### Returns
|
||||
@@ -460,8 +448,7 @@ abstract mergeInsert(on): MergeInsertBuilder
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **on**: `string` \| `string`[]
|
||||
* **on**: `string` \| `string`[]
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -492,8 +479,7 @@ Modeled after ``VACUUM`` in PostgreSQL.
|
||||
modification operations.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`OptimizeOptions`](../interfaces/OptimizeOptions.md)>
|
||||
* **options?**: `Partial`<[`OptimizeOptions`](../interfaces/OptimizeOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -510,8 +496,7 @@ abstract prewarmIndex(name): Promise<void>
|
||||
Prewarm an index in the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the index.
|
||||
This will load the index into memory. This may reduce the cold-start time for
|
||||
future queries. If the index does not fit in the cache then this call may be
|
||||
@@ -643,14 +628,11 @@ Create a search query to find the nearest neighbors
|
||||
of the given query
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
the query, a vector or string
|
||||
|
||||
* **queryType?**: `string`
|
||||
* **queryType?**: `string`
|
||||
the type of the query, "vector", "fts", or "auto"
|
||||
|
||||
* **ftsColumns?**: `string` \| `string`[]
|
||||
* **ftsColumns?**: `string` \| `string`[]
|
||||
the columns to search in for full text search
|
||||
for now, only one column can be searched at a time.
|
||||
when "auto" is used, if the query is a string and an embedding function is defined, it will be treated as a vector query
|
||||
@@ -715,8 +697,7 @@ abstract takeOffsets(offsets): TakeQuery
|
||||
Create a query that returns a subset of the rows in the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **offsets**: `number`[]
|
||||
* **offsets**: `number`[]
|
||||
The offsets of the rows to return.
|
||||
|
||||
#### Returns
|
||||
@@ -736,8 +717,7 @@ abstract takeRowIds(rowIds): TakeQuery
|
||||
Create a query that returns a subset of the rows in the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **rowIds**: readonly (`number` \| `bigint`)[]
|
||||
* **rowIds**: readonly (`number` \| `bigint`)[]
|
||||
The row ids of the rows to return.
|
||||
Row ids returned by `withRowId()` are `bigint`, so `bigint[]` is supported.
|
||||
For convenience / backwards compatibility, `number[]` is also accepted (for
|
||||
@@ -776,8 +756,7 @@ abstract update(opts): Promise<UpdateResult>
|
||||
Update existing records in the Table
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **opts**: `object` & `Partial`<[`UpdateOptions`](../interfaces/UpdateOptions.md)>
|
||||
* **opts**: `object` & `Partial`<[`UpdateOptions`](../interfaces/UpdateOptions.md)>
|
||||
|
||||
##### Returns
|
||||
|
||||
@@ -801,8 +780,7 @@ abstract update(opts): Promise<UpdateResult>
|
||||
Update existing records in the Table
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **opts**: `object` & `Partial`<[`UpdateOptions`](../interfaces/UpdateOptions.md)>
|
||||
* **opts**: `object` & `Partial`<[`UpdateOptions`](../interfaces/UpdateOptions.md)>
|
||||
|
||||
##### Returns
|
||||
|
||||
@@ -839,12 +817,10 @@ better performance with a single [`merge_insert`] call instead of
|
||||
repeatedly calilng this method.
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **updates**: `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
* **updates**: `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
the
|
||||
columns to update
|
||||
|
||||
* **options?**: `Partial`<[`UpdateOptions`](../interfaces/UpdateOptions.md)>
|
||||
* **options?**: `Partial`<[`UpdateOptions`](../interfaces/UpdateOptions.md)>
|
||||
additional options to control
|
||||
the update behavior
|
||||
|
||||
@@ -875,8 +851,7 @@ is the same thing as calling `nearestTo` on the builder returned
|
||||
by `query`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -911,11 +886,9 @@ abstract waitForIndex(indexNames, timeoutSeconds): Promise<void>
|
||||
Waits for asynchronous indexing to complete on the table.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **indexNames**: `string`[]
|
||||
* **indexNames**: `string`[]
|
||||
The name of the indices to wait for
|
||||
|
||||
* **timeoutSeconds**: `number`
|
||||
* **timeoutSeconds**: `number`
|
||||
The number of seconds to wait before timing out
|
||||
This will raise an error if the indices are not created and fully indexed within the timeout.
|
||||
|
||||
|
||||
@@ -27,10 +27,8 @@ create(tag, version): Promise<void>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
|
||||
* **version**: `number`
|
||||
* **tag**: `string`
|
||||
* **version**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -45,8 +43,7 @@ delete(tag): Promise<void>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
* **tag**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -61,8 +58,7 @@ getVersion(tag): Promise<number>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
* **tag**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -89,10 +85,8 @@ update(tag, version): Promise<void>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **tag**: `string`
|
||||
|
||||
* **version**: `number`
|
||||
* **tag**: `string`
|
||||
* **version**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -82,8 +82,7 @@ protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
Execute the query and return the results as an
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -116,8 +115,7 @@ explainPlan(verbose): Promise<string>
|
||||
Generates an explanation of the query execution plan.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **verbose**: `boolean` = `false`
|
||||
* **verbose**: `boolean` = `false`
|
||||
If true, provides a more detailed explanation. Defaults to false.
|
||||
|
||||
#### Returns
|
||||
@@ -193,8 +191,7 @@ For example, an SQL query might state `SELECT a + b AS combined, c`. The equiva
|
||||
input to this method would be:
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -228,8 +225,7 @@ toArray(options?): Promise<any[]>
|
||||
Collect the results as an array of objects.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -250,8 +246,7 @@ toArrow(options?): Promise<Table<any>>
|
||||
Collect the results as an Arrow
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -15,8 +15,7 @@ new VectorColumnOptions(values?): VectorColumnOptions
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **values?**: `Partial`<[`VectorColumnOptions`](VectorColumnOptions.md)>
|
||||
* **values?**: `Partial`<[`VectorColumnOptions`](VectorColumnOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -39,8 +39,7 @@ addQueryVector(vector): VectorQuery
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -127,8 +126,7 @@ This controls which column is compared to the query vector supplied in
|
||||
the call to
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **column**: `string`
|
||||
* **column**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -150,10 +148,8 @@ distanceRange(lowerBound?, upperBound?): VectorQuery
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **lowerBound?**: `number`
|
||||
|
||||
* **upperBound?**: `number`
|
||||
* **lowerBound?**: `number`
|
||||
* **upperBound?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -174,8 +170,7 @@ to some kind of distance metric. This parameter controls which distance metric
|
||||
use. See
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **distanceType**: `"l2"` \| `"cosine"` \| `"dot"`
|
||||
* **distanceType**: `"l2"` \| `"cosine"` \| `"dot"`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -209,8 +204,7 @@ Increasing this value will increase the recall of your query but will
|
||||
also increase the latency of your query. The default value is 1.5*limit.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **ef**: `number`
|
||||
* **ef**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -227,8 +221,7 @@ protected execute(options?): AsyncGenerator<RecordBatch<any>, void, unknown>
|
||||
Execute the query and return the results as an
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -261,8 +254,7 @@ explainPlan(verbose): Promise<string>
|
||||
Generates an explanation of the query execution plan.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **verbose**: `boolean` = `false`
|
||||
* **verbose**: `boolean` = `false`
|
||||
If true, provides a more detailed explanation. Defaults to false.
|
||||
|
||||
#### Returns
|
||||
@@ -318,8 +310,7 @@ filter(predicate): this
|
||||
A filter statement to be applied to this query.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **predicate**: `string`
|
||||
* **predicate**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -346,10 +337,8 @@ fullTextSearch(query, options?): this
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
|
||||
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
|
||||
* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -373,8 +362,7 @@ By default, a plain search has no limit. If this method is not
|
||||
called then every valid row from the table will be returned.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **limit**: `number`
|
||||
* **limit**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -401,8 +389,7 @@ a narrow filter to allow these queries to spend more time searching and avoid
|
||||
potential false negatives.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **maximumNprobes**: `number`
|
||||
* **maximumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -424,8 +411,7 @@ filter. See `nprobes` for more details. Higher values will increase recall
|
||||
but will also increase latency.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **minimumNprobes**: `number`
|
||||
* **minimumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -465,8 +451,7 @@ you can use `minimumNprobes` and `maximumNprobes`. This method sets both
|
||||
the minimum and maximum to the same value.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **nprobes**: `number`
|
||||
* **nprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -485,8 +470,7 @@ Set the number of rows to skip before returning results.
|
||||
This is useful for pagination.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **offset**: `number`
|
||||
* **offset**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -590,8 +574,7 @@ and the quantized result vectors. This can be considerably different than the t
|
||||
distance between the query vector and the actual uncompressed vector.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **refineFactor**: `number`
|
||||
* **refineFactor**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -606,8 +589,7 @@ rerank(reranker): VectorQuery
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **reranker**: [`Reranker`](../namespaces/rerankers/interfaces/Reranker.md)
|
||||
* **reranker**: [`Reranker`](../namespaces/rerankers/interfaces/Reranker.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -642,8 +624,7 @@ For example, an SQL query might state `SELECT a + b AS combined, c`. The equiva
|
||||
input to this method would be:
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -677,8 +658,7 @@ toArray(options?): Promise<any[]>
|
||||
Collect the results as an array of objects.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -699,8 +679,7 @@ toArrow(options?): Promise<Table<any>>
|
||||
Collect the results as an Arrow
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -727,8 +706,7 @@ A filter statement to be applied to this query.
|
||||
The filter should be supplied as an SQL query string. For example:
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **predicate**: `string`
|
||||
* **predicate**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
59
docs/src/js/enumerations/OAuthFlowType.md
Normal file
59
docs/src/js/enumerations/OAuthFlowType.md
Normal file
@@ -0,0 +1,59 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / OAuthFlowType
|
||||
|
||||
# Enumeration: OAuthFlowType
|
||||
|
||||
OAuth authentication flow types.
|
||||
|
||||
## Enumeration Members
|
||||
|
||||
### AuthorizationCodePKCE
|
||||
|
||||
```ts
|
||||
AuthorizationCodePKCE: "authorization_code_pkce";
|
||||
```
|
||||
|
||||
Authorization Code with PKCE (interactive browser-based auth).
|
||||
|
||||
***
|
||||
|
||||
### AzureManagedIdentity
|
||||
|
||||
```ts
|
||||
AzureManagedIdentity: "azure_managed_identity";
|
||||
```
|
||||
|
||||
Azure Managed Identity via IMDS.
|
||||
|
||||
***
|
||||
|
||||
### ClientCredentials
|
||||
|
||||
```ts
|
||||
ClientCredentials: "client_credentials";
|
||||
```
|
||||
|
||||
Client Credentials grant (service-to-service / M2M).
|
||||
|
||||
***
|
||||
|
||||
### DeviceCode
|
||||
|
||||
```ts
|
||||
DeviceCode: "device_code";
|
||||
```
|
||||
|
||||
Device Code grant (CLI / headless environments).
|
||||
|
||||
***
|
||||
|
||||
### WorkloadIdentity
|
||||
|
||||
```ts
|
||||
WorkloadIdentity: "workload_identity";
|
||||
```
|
||||
|
||||
Workload Identity Federation (K8s, GitHub Actions).
|
||||
@@ -11,8 +11,7 @@ function RecordBatchIterator(promisedInner): AsyncGenerator<RecordBatch<any>, vo
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
* **promisedInner**: `Promise`<`RecordBatchIterator`>
|
||||
* **promisedInner**: `Promise`<`RecordBatchIterator`>
|
||||
|
||||
## Returns
|
||||
|
||||
|
||||
@@ -25,17 +25,13 @@ Accepted formats:
|
||||
- `db://host:port` - remote database (LanceDB cloud)
|
||||
|
||||
### Parameters
|
||||
|
||||
* **uri**: `string`
|
||||
* **uri**: `string`
|
||||
The uri of the database. If the database uri starts
|
||||
with `db://` then it connects to a remote database.
|
||||
|
||||
* **options?**: `Partial`<[`ConnectionOptions`](../interfaces/ConnectionOptions.md)>
|
||||
* **options?**: `Partial`<[`ConnectionOptions`](../interfaces/ConnectionOptions.md)>
|
||||
The options to use when connecting to the database
|
||||
|
||||
* **session?**: [`Session`](../classes/Session.md)
|
||||
|
||||
* **headerProvider?**: [`HeaderProvider`](../classes/HeaderProvider.md) \| () => `Record`<`string`, `string`> \| () => `Promise`<`Record`<`string`, `string`>>
|
||||
* **session?**: [`Session`](../classes/Session.md)
|
||||
* **headerProvider?**: [`HeaderProvider`](../classes/HeaderProvider.md) \| () => `Record`<`string`, `string`> \| () => `Promise`<`Record`<`string`, `string`>>
|
||||
|
||||
### Returns
|
||||
|
||||
@@ -85,8 +81,7 @@ Accepted formats:
|
||||
- `db://host:port` - remote database (LanceDB cloud)
|
||||
|
||||
### Parameters
|
||||
|
||||
* **options**: `Partial`<[`ConnectionOptions`](../interfaces/ConnectionOptions.md)> & `object`
|
||||
* **options**: `Partial`<[`ConnectionOptions`](../interfaces/ConnectionOptions.md)> & `object`
|
||||
The options to use when connecting to the database
|
||||
|
||||
### Returns
|
||||
|
||||
@@ -46,12 +46,9 @@ rules are as follows:
|
||||
- Array<any> => List
|
||||
|
||||
## Parameters
|
||||
|
||||
* **data**: `Record`<`string`, `unknown`>[]
|
||||
|
||||
* **options?**: `Partial`<[`MakeArrowTableOptions`](../classes/MakeArrowTableOptions.md)>
|
||||
|
||||
* **metadata?**: `Map`<`string`, `string`>
|
||||
* **data**: `Record`<`string`, `unknown`>[]
|
||||
* **options?**: `Partial`<[`MakeArrowTableOptions`](../classes/MakeArrowTableOptions.md)>
|
||||
* **metadata?**: `Map`<`string`, `string`>
|
||||
|
||||
## Returns
|
||||
|
||||
|
||||
@@ -11,8 +11,7 @@ function packBits(data): number[]
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
* **data**: `number`[]
|
||||
* **data**: `number`[]
|
||||
|
||||
## Returns
|
||||
|
||||
|
||||
@@ -13,8 +13,7 @@ function permutationBuilder(table): PermutationBuilder
|
||||
Create a permutation builder for the given table.
|
||||
|
||||
## Parameters
|
||||
|
||||
* **table**: [`Table`](../classes/Table.md)
|
||||
* **table**: [`Table`](../classes/Table.md)
|
||||
The source table to create a permutation from
|
||||
|
||||
## Returns
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
## Enumerations
|
||||
|
||||
- [FullTextQueryType](enumerations/FullTextQueryType.md)
|
||||
- [OAuthFlowType](enumerations/OAuthFlowType.md)
|
||||
- [Occur](enumerations/Occur.md)
|
||||
- [Operator](enumerations/Operator.md)
|
||||
|
||||
@@ -70,6 +71,8 @@
|
||||
- [IvfPqOptions](interfaces/IvfPqOptions.md)
|
||||
- [IvfRqOptions](interfaces/IvfRqOptions.md)
|
||||
- [MergeResult](interfaces/MergeResult.md)
|
||||
- [NativeOAuthConfig](interfaces/NativeOAuthConfig.md)
|
||||
- [OAuthConfig](interfaces/OAuthConfig.md)
|
||||
- [OpenTableOptions](interfaces/OpenTableOptions.md)
|
||||
- [OptimizeOptions](interfaces/OptimizeOptions.md)
|
||||
- [OptimizeStats](interfaces/OptimizeStats.md)
|
||||
|
||||
@@ -41,6 +41,41 @@ for testing purposes.
|
||||
|
||||
***
|
||||
|
||||
### manifestEnabled?
|
||||
|
||||
```ts
|
||||
optional manifestEnabled: boolean;
|
||||
```
|
||||
|
||||
(For LanceDB OSS only): use directory namespace manifests as the source
|
||||
of truth for table metadata. Existing directory-listed root tables are
|
||||
migrated into the manifest on access.
|
||||
|
||||
***
|
||||
|
||||
### namespaceClientProperties?
|
||||
|
||||
```ts
|
||||
optional namespaceClientProperties: Record<string, string>;
|
||||
```
|
||||
|
||||
(For LanceDB OSS only): extra properties for the backing namespace
|
||||
client used by manifest-enabled native connections.
|
||||
|
||||
***
|
||||
|
||||
### oauthConfig?
|
||||
|
||||
```ts
|
||||
optional oauthConfig: NativeOAuthConfig;
|
||||
```
|
||||
|
||||
(For LanceDB cloud only): OAuth configuration for IdP-based
|
||||
authentication (e.g., Azure Entra ID). When set, token acquisition
|
||||
and refresh are handled entirely in Rust.
|
||||
|
||||
***
|
||||
|
||||
### readConsistencyInterval?
|
||||
|
||||
```ts
|
||||
|
||||
112
docs/src/js/interfaces/NativeOAuthConfig.md
Normal file
112
docs/src/js/interfaces/NativeOAuthConfig.md
Normal file
@@ -0,0 +1,112 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / NativeOAuthConfig
|
||||
|
||||
# Interface: NativeOAuthConfig
|
||||
|
||||
OAuth configuration for LanceDB authentication.
|
||||
All token acquisition and refresh is handled in the Rust layer.
|
||||
|
||||
## Properties
|
||||
|
||||
### callbackPort?
|
||||
|
||||
```ts
|
||||
optional callbackPort: number;
|
||||
```
|
||||
|
||||
Port for local callback server (authorization_code_pkce, default: 8400).
|
||||
|
||||
***
|
||||
|
||||
### clientId
|
||||
|
||||
```ts
|
||||
clientId: string;
|
||||
```
|
||||
|
||||
Application / Client ID.
|
||||
|
||||
***
|
||||
|
||||
### clientSecret?
|
||||
|
||||
```ts
|
||||
optional clientSecret: string;
|
||||
```
|
||||
|
||||
Client secret (required for client_credentials).
|
||||
|
||||
***
|
||||
|
||||
### flow?
|
||||
|
||||
```ts
|
||||
optional flow: string;
|
||||
```
|
||||
|
||||
Authentication flow: "client_credentials", "authorization_code_pkce",
|
||||
"device_code", "azure_managed_identity", "workload_identity"
|
||||
|
||||
***
|
||||
|
||||
### issuerUrl
|
||||
|
||||
```ts
|
||||
issuerUrl: string;
|
||||
```
|
||||
|
||||
OIDC issuer URL or OAuth authority URL.
|
||||
For Azure: `https://login.microsoftonline.com/{tenant_id}/v2.0`
|
||||
|
||||
***
|
||||
|
||||
### managedIdentityClientId?
|
||||
|
||||
```ts
|
||||
optional managedIdentityClientId: string;
|
||||
```
|
||||
|
||||
Client ID for user-assigned managed identity (azure_managed_identity).
|
||||
|
||||
***
|
||||
|
||||
### redirectUri?
|
||||
|
||||
```ts
|
||||
optional redirectUri: string;
|
||||
```
|
||||
|
||||
Redirect URI (authorization_code_pkce flow).
|
||||
|
||||
***
|
||||
|
||||
### refreshBufferSecs?
|
||||
|
||||
```ts
|
||||
optional refreshBufferSecs: number;
|
||||
```
|
||||
|
||||
Seconds before expiry to trigger proactive refresh (default: 300).
|
||||
|
||||
***
|
||||
|
||||
### scopes
|
||||
|
||||
```ts
|
||||
scopes: string[];
|
||||
```
|
||||
|
||||
OAuth scopes to request. For Azure: `["api://{app_id}/.default"]`
|
||||
|
||||
***
|
||||
|
||||
### tokenFile?
|
||||
|
||||
```ts
|
||||
optional tokenFile: string;
|
||||
```
|
||||
|
||||
Path to federated token file (workload_identity).
|
||||
134
docs/src/js/interfaces/OAuthConfig.md
Normal file
134
docs/src/js/interfaces/OAuthConfig.md
Normal file
@@ -0,0 +1,134 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / OAuthConfig
|
||||
|
||||
# Interface: OAuthConfig
|
||||
|
||||
OAuth configuration for LanceDB authentication.
|
||||
|
||||
All token acquisition and refresh is handled in the Rust layer.
|
||||
This config is passed through to Rust via napi-rs.
|
||||
|
||||
## Examples
|
||||
|
||||
```typescript
|
||||
const config: OAuthConfig = {
|
||||
issuerUrl: "https://login.microsoftonline.com/{tenant}/v2.0",
|
||||
clientId: "app-id",
|
||||
clientSecret: "secret",
|
||||
scopes: ["api://lancedb-api/.default"],
|
||||
};
|
||||
```
|
||||
|
||||
```typescript
|
||||
const config: OAuthConfig = {
|
||||
issuerUrl: "https://login.microsoftonline.com/{tenant}/v2.0",
|
||||
clientId: "app-id",
|
||||
scopes: ["api://lancedb-api/.default"],
|
||||
flow: OAuthFlowType.AzureManagedIdentity,
|
||||
};
|
||||
```
|
||||
|
||||
## Properties
|
||||
|
||||
### callbackPort?
|
||||
|
||||
```ts
|
||||
optional callbackPort: number;
|
||||
```
|
||||
|
||||
Port for local callback server (AuthorizationCodePKCE, default: 8400).
|
||||
|
||||
***
|
||||
|
||||
### clientId
|
||||
|
||||
```ts
|
||||
clientId: string;
|
||||
```
|
||||
|
||||
Application / Client ID.
|
||||
|
||||
***
|
||||
|
||||
### clientSecret?
|
||||
|
||||
```ts
|
||||
optional clientSecret: string;
|
||||
```
|
||||
|
||||
Client secret (required for ClientCredentials).
|
||||
|
||||
***
|
||||
|
||||
### flow?
|
||||
|
||||
```ts
|
||||
optional flow: OAuthFlowType;
|
||||
```
|
||||
|
||||
Authentication flow (default: ClientCredentials).
|
||||
|
||||
***
|
||||
|
||||
### issuerUrl
|
||||
|
||||
```ts
|
||||
issuerUrl: string;
|
||||
```
|
||||
|
||||
OIDC issuer URL or OAuth authority URL.
|
||||
For Azure: `https://login.microsoftonline.com/{tenant_id}/v2.0`
|
||||
|
||||
***
|
||||
|
||||
### managedIdentityClientId?
|
||||
|
||||
```ts
|
||||
optional managedIdentityClientId: string;
|
||||
```
|
||||
|
||||
Client ID for user-assigned managed identity (AzureManagedIdentity).
|
||||
|
||||
***
|
||||
|
||||
### redirectUri?
|
||||
|
||||
```ts
|
||||
optional redirectUri: string;
|
||||
```
|
||||
|
||||
Redirect URI (AuthorizationCodePKCE flow).
|
||||
|
||||
***
|
||||
|
||||
### refreshBufferSecs?
|
||||
|
||||
```ts
|
||||
optional refreshBufferSecs: number;
|
||||
```
|
||||
|
||||
Seconds before expiry to trigger proactive refresh (default: 300).
|
||||
|
||||
***
|
||||
|
||||
### scopes
|
||||
|
||||
```ts
|
||||
scopes: string[];
|
||||
```
|
||||
|
||||
OAuth scopes to request.
|
||||
For Azure: `["api://{app_id}/.default"]`
|
||||
|
||||
***
|
||||
|
||||
### tokenFile?
|
||||
|
||||
```ts
|
||||
optional tokenFile: string;
|
||||
```
|
||||
|
||||
Path to federated token file (WorkloadIdentity).
|
||||
@@ -58,8 +58,7 @@ computeQueryEmbeddings(data): Promise<number[] | Uint8Array | Float32Array | Flo
|
||||
Compute the embeddings for a single query
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: `T`
|
||||
* **data**: `T`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -76,8 +75,7 @@ abstract computeSourceEmbeddings(data): Promise<number[][] | Float32Array[] | Fl
|
||||
Creates a vector representation for the given values.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: `T`[]
|
||||
* **data**: `T`[]
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -155,8 +153,7 @@ protected resolveVariables(config): Partial<M>
|
||||
Apply variables to the config.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **config**: `Partial`<`M`>
|
||||
* **config**: `Partial`<`M`>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -173,8 +170,7 @@ sourceField(optionsOrDatatype): [DataType<Type, any>, Map<string, EmbeddingFunct
|
||||
sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **optionsOrDatatype**: `DataType`<`Type`, `any`> \| `Partial`<[`FieldOptions`](../interfaces/FieldOptions.md)<`DataType`<`Type`, `any`>>>
|
||||
* **optionsOrDatatype**: `DataType`<`Type`, `any`> \| `Partial`<[`FieldOptions`](../interfaces/FieldOptions.md)<`DataType`<`Type`, `any`>>>
|
||||
The options for the field or the datatype
|
||||
|
||||
#### Returns
|
||||
@@ -211,8 +207,7 @@ vectorField(optionsOrDatatype?): [DataType<Type, any>, Map<string, EmbeddingFunc
|
||||
vectorField is used in combination with `LanceSchema` to provide a declarative data model
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **optionsOrDatatype?**: `DataType`<`Type`, `any`> \| `Partial`<[`FieldOptions`](../interfaces/FieldOptions.md)<`DataType`<`Type`, `any`>>>
|
||||
* **optionsOrDatatype?**: `DataType`<`Type`, `any`> \| `Partial`<[`FieldOptions`](../interfaces/FieldOptions.md)<`DataType`<`Type`, `any`>>>
|
||||
The options for the field
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -32,8 +32,7 @@ functionToMetadata(conf): Record<string, any>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **conf**: [`EmbeddingFunctionConfig`](../interfaces/EmbeddingFunctionConfig.md)
|
||||
* **conf**: [`EmbeddingFunctionConfig`](../interfaces/EmbeddingFunctionConfig.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -54,8 +53,7 @@ Fetch an embedding function by name
|
||||
• **T** *extends* [`EmbeddingFunction`](EmbeddingFunction.md)<`unknown`, [`FunctionOptions`](../interfaces/FunctionOptions.md)>
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
The name of the function
|
||||
|
||||
#### Returns
|
||||
@@ -71,8 +69,7 @@ getTableMetadata(functions): Map<string, string>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **functions**: [`EmbeddingFunctionConfig`](../interfaces/EmbeddingFunctionConfig.md)[]
|
||||
* **functions**: [`EmbeddingFunctionConfig`](../interfaces/EmbeddingFunctionConfig.md)[]
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -89,8 +86,7 @@ getVar(name): undefined | string
|
||||
Get a variable.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
* **name**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -129,18 +125,15 @@ Register an embedding function
|
||||
• **T** *extends* [`EmbeddingFunctionConstructor`](../interfaces/EmbeddingFunctionConstructor.md)<[`EmbeddingFunction`](EmbeddingFunction.md)<`any`, [`FunctionOptions`](../interfaces/FunctionOptions.md)>> = [`EmbeddingFunctionConstructor`](../interfaces/EmbeddingFunctionConstructor.md)<[`EmbeddingFunction`](EmbeddingFunction.md)<`any`, [`FunctionOptions`](../interfaces/FunctionOptions.md)>>
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **this**: [`EmbeddingFunctionRegistry`](EmbeddingFunctionRegistry.md)
|
||||
|
||||
* **alias?**: `string`
|
||||
* **this**: [`EmbeddingFunctionRegistry`](EmbeddingFunctionRegistry.md)
|
||||
* **alias?**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
`Function`
|
||||
|
||||
##### Parameters
|
||||
|
||||
* **ctor**: `T`
|
||||
* **ctor**: `T`
|
||||
|
||||
##### Returns
|
||||
|
||||
@@ -161,8 +154,7 @@ reset(this): void
|
||||
reset the registry to the initial state
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **this**: [`EmbeddingFunctionRegistry`](EmbeddingFunctionRegistry.md)
|
||||
* **this**: [`EmbeddingFunctionRegistry`](EmbeddingFunctionRegistry.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -187,10 +179,8 @@ whether to use a GPU for inference.
|
||||
The name must not contain colons. The default value can contain colons.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **name**: `string`
|
||||
|
||||
* **value**: `string`
|
||||
* **name**: `string`
|
||||
* **value**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -43,8 +43,7 @@ computeQueryEmbeddings(data): Promise<number[] | Uint8Array | Float32Array | Flo
|
||||
Compute the embeddings for a single query
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: `string`
|
||||
* **data**: `string`
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -65,8 +64,7 @@ computeSourceEmbeddings(data): Promise<number[][] | Float32Array[] | Float64Arra
|
||||
Creates a vector representation for the given values.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **data**: `string`[]
|
||||
* **data**: `string`[]
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -103,10 +101,8 @@ abstract generateEmbeddings(texts, ...args): Promise<number[][] | Float32Array[]
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **texts**: `string`[]
|
||||
|
||||
* ...**args**: `any`[]
|
||||
* **texts**: `string`[]
|
||||
* ...**args**: `any`[]
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -182,8 +178,7 @@ protected resolveVariables(config): Partial<M>
|
||||
Apply variables to the config.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **config**: `Partial`<`M`>
|
||||
* **config**: `Partial`<`M`>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -245,8 +240,7 @@ vectorField(optionsOrDatatype?): [DataType<Type, any>, Map<string, EmbeddingFunc
|
||||
vectorField is used in combination with `LanceSchema` to provide a declarative data model
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **optionsOrDatatype?**: `DataType`<`Type`, `any`> \| `Partial`<[`FieldOptions`](../interfaces/FieldOptions.md)<`DataType`<`Type`, `any`>>>
|
||||
* **optionsOrDatatype?**: `DataType`<`Type`, `any`> \| `Partial`<[`FieldOptions`](../interfaces/FieldOptions.md)<`DataType`<`Type`, `any`>>>
|
||||
The options for the field
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -13,8 +13,7 @@ function LanceSchema(fields): Schema
|
||||
Create a schema with embedding functions.
|
||||
|
||||
## Parameters
|
||||
|
||||
* **fields**: `Record`<`string`, `object` \| [`object`, `Map`<`string`, [`EmbeddingFunction`](../classes/EmbeddingFunction.md)<`any`, [`FunctionOptions`](../interfaces/FunctionOptions.md)>>]>
|
||||
* **fields**: `Record`<`string`, `object` \| [`object`, `Map`<`string`, [`EmbeddingFunction`](../classes/EmbeddingFunction.md)<`any`, [`FunctionOptions`](../interfaces/FunctionOptions.md)>>]>
|
||||
|
||||
## Returns
|
||||
|
||||
|
||||
@@ -11,16 +11,14 @@ function register(name?): (ctor) => any
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
* **name?**: `string`
|
||||
* **name?**: `string`
|
||||
|
||||
## Returns
|
||||
|
||||
`Function`
|
||||
|
||||
### Parameters
|
||||
|
||||
* **ctor**: [`EmbeddingFunctionConstructor`](../interfaces/EmbeddingFunctionConstructor.md)<[`EmbeddingFunction`](../classes/EmbeddingFunction.md)<`any`, [`FunctionOptions`](../interfaces/FunctionOptions.md)>>
|
||||
* **ctor**: [`EmbeddingFunctionConstructor`](../interfaces/EmbeddingFunctionConstructor.md)<[`EmbeddingFunction`](../classes/EmbeddingFunction.md)<`any`, [`FunctionOptions`](../interfaces/FunctionOptions.md)>>
|
||||
|
||||
### Returns
|
||||
|
||||
|
||||
@@ -19,8 +19,7 @@ new EmbeddingFunctionConstructor(modelOptions?): T
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **modelOptions?**: `T`\[`"TOptions"`\]
|
||||
* **modelOptions?**: `T`\[`"TOptions"`\]
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -19,8 +19,7 @@ create(options?): CreateReturnType<T>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `T`\[`"TOptions"`\]
|
||||
* **options?**: `T`\[`"TOptions"`\]
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -20,12 +20,9 @@ rerankHybrid(
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
|
||||
* **vecResults**: `RecordBatch`<`any`>
|
||||
|
||||
* **ftsResults**: `RecordBatch`<`any`>
|
||||
* **query**: `string`
|
||||
* **vecResults**: `RecordBatch`<`any`>
|
||||
* **ftsResults**: `RecordBatch`<`any`>
|
||||
|
||||
#### Returns
|
||||
|
||||
@@ -40,8 +37,7 @@ static create(k): Promise<RRFReranker>
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **k**: `number` = `60`
|
||||
* **k**: `number` = `60`
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -18,12 +18,9 @@ rerankHybrid(
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string`
|
||||
|
||||
* **vecResults**: `RecordBatch`<`any`>
|
||||
|
||||
* **ftsResults**: `RecordBatch`<`any`>
|
||||
* **query**: `string`
|
||||
* **vecResults**: `RecordBatch`<`any`>
|
||||
* **ftsResults**: `RecordBatch`<`any`>
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and
|
||||
|
||||
## Full text search
|
||||
|
||||
::: lancedb.fts.create_index
|
||||
Use [lancedb.table.Table.create_fts_index][] for the synchronous API or
|
||||
[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the
|
||||
asynchronous API.
|
||||
|
||||
::: lancedb.fts.populate_index
|
||||
|
||||
::: lancedb.fts.search_index
|
||||
::: lancedb.index.FTS
|
||||
|
||||
## Utilities
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.28.0-beta.9</version>
|
||||
<version>0.28.0-beta.11</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.28.0-beta.9</version>
|
||||
<version>0.28.0-beta.11</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
@@ -28,7 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-core.version>6.0.0-beta.1</lance-core.version>
|
||||
<lance-core.version>7.0.0-beta.7</lance-core.version>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.28.0-beta.9"
|
||||
version = "0.28.0-beta.11"
|
||||
publish = false
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
@@ -15,7 +16,7 @@ crate-type = ["cdylib"]
|
||||
async-trait.workspace = true
|
||||
arrow-ipc.workspace = true
|
||||
arrow-array.workspace = true
|
||||
arrow-buffer = "57.2"
|
||||
arrow-buffer = "58.0.0"
|
||||
half.workspace = true
|
||||
arrow-schema.workspace = true
|
||||
env_logger.workspace = true
|
||||
@@ -31,8 +32,8 @@ lzma-sys = { version = "0.1", features = ["static"] }
|
||||
log.workspace = true
|
||||
|
||||
# Pin to resolve build failures; update periodically for security patches.
|
||||
aws-lc-sys = "=0.38.0"
|
||||
aws-lc-rs = "=1.16.1"
|
||||
aws-lc-sys = "=0.40.0"
|
||||
aws-lc-rs = "=1.16.3"
|
||||
|
||||
[build-dependencies]
|
||||
napi-build = "2.3.1"
|
||||
|
||||
@@ -48,6 +48,7 @@ export {
|
||||
SplitHashOptions,
|
||||
SplitSequentialOptions,
|
||||
ShuffleOptions,
|
||||
OAuthConfig as NativeOAuthConfig,
|
||||
} from "./native.js";
|
||||
|
||||
export {
|
||||
@@ -113,6 +114,8 @@ export {
|
||||
TokenResponse,
|
||||
} from "./header";
|
||||
|
||||
export { OAuthConfig, OAuthFlowType } from "./oauth";
|
||||
|
||||
export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";
|
||||
|
||||
export * as embedding from "./embedding";
|
||||
|
||||
82
nodejs/lancedb/oauth.ts
Normal file
82
nodejs/lancedb/oauth.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
/**
|
||||
* OAuth authentication flow types.
|
||||
*/
|
||||
export enum OAuthFlowType {
|
||||
/** Client Credentials grant (service-to-service / M2M). */
|
||||
ClientCredentials = "client_credentials",
|
||||
/** Authorization Code with PKCE (interactive browser-based auth). */
|
||||
AuthorizationCodePKCE = "authorization_code_pkce",
|
||||
/** Device Code grant (CLI / headless environments). */
|
||||
DeviceCode = "device_code",
|
||||
/** Azure Managed Identity via IMDS. */
|
||||
AzureManagedIdentity = "azure_managed_identity",
|
||||
/** Workload Identity Federation (K8s, GitHub Actions). */
|
||||
WorkloadIdentity = "workload_identity",
|
||||
}
|
||||
|
||||
/**
|
||||
* OAuth configuration for LanceDB authentication.
|
||||
*
|
||||
* All token acquisition and refresh is handled in the Rust layer.
|
||||
* This config is passed through to Rust via napi-rs.
|
||||
*
|
||||
* @example Client Credentials (service-to-service):
|
||||
* ```typescript
|
||||
* const config: OAuthConfig = {
|
||||
* issuerUrl: "https://login.microsoftonline.com/{tenant}/v2.0",
|
||||
* clientId: "app-id",
|
||||
* clientSecret: "secret",
|
||||
* scopes: ["api://lancedb-api/.default"],
|
||||
* };
|
||||
* ```
|
||||
*
|
||||
* @example Azure Managed Identity:
|
||||
* ```typescript
|
||||
* const config: OAuthConfig = {
|
||||
* issuerUrl: "https://login.microsoftonline.com/{tenant}/v2.0",
|
||||
* clientId: "app-id",
|
||||
* scopes: ["api://lancedb-api/.default"],
|
||||
* flow: OAuthFlowType.AzureManagedIdentity,
|
||||
* };
|
||||
* ```
|
||||
*/
|
||||
export interface OAuthConfig {
|
||||
/**
|
||||
* OIDC issuer URL or OAuth authority URL.
|
||||
* For Azure: `https://login.microsoftonline.com/{tenant_id}/v2.0`
|
||||
*/
|
||||
issuerUrl: string;
|
||||
|
||||
/** Application / Client ID. */
|
||||
clientId: string;
|
||||
|
||||
/**
|
||||
* OAuth scopes to request.
|
||||
* For Azure: `["api://{app_id}/.default"]`
|
||||
*/
|
||||
scopes: string[];
|
||||
|
||||
/** Authentication flow (default: ClientCredentials). */
|
||||
flow?: OAuthFlowType;
|
||||
|
||||
/** Client secret (required for ClientCredentials). */
|
||||
clientSecret?: string;
|
||||
|
||||
/** Redirect URI (AuthorizationCodePKCE flow). */
|
||||
redirectUri?: string;
|
||||
|
||||
/** Port for local callback server (AuthorizationCodePKCE, default: 8400). */
|
||||
callbackPort?: number;
|
||||
|
||||
/** Client ID for user-assigned managed identity (AzureManagedIdentity). */
|
||||
managedIdentityClientId?: string;
|
||||
|
||||
/** Path to federated token file (WorkloadIdentity). */
|
||||
tokenFile?: string;
|
||||
|
||||
/** Seconds before expiry to trigger proactive refresh (default: 300). */
|
||||
refreshBufferSecs?: number;
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.28.0-beta.8",
|
||||
"version": "0.28.0-beta.11",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.28.0-beta.8",
|
||||
"version": "0.28.0-beta.11",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.28.0-beta.9",
|
||||
"version": "0.28.0-beta.11",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
@@ -75,7 +75,6 @@
|
||||
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
|
||||
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
||||
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
|
||||
"postbuild:release": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
||||
"build": "npm run build:debug && npm run tsc",
|
||||
"build-release": "npm run build:release && npm run tsc",
|
||||
"tsc": "tsc -b",
|
||||
|
||||
@@ -67,6 +67,12 @@ impl Connection {
|
||||
builder = builder.storage_option(key, value);
|
||||
}
|
||||
}
|
||||
if let Some(manifest_enabled) = options.manifest_enabled {
|
||||
builder = builder.manifest_enabled(manifest_enabled);
|
||||
}
|
||||
if let Some(namespace_client_properties) = options.namespace_client_properties {
|
||||
builder = builder.namespace_client_properties(namespace_client_properties);
|
||||
}
|
||||
|
||||
// Create client config, optionally with header provider
|
||||
let client_config = options.client_config.unwrap_or_default();
|
||||
@@ -79,6 +85,11 @@ impl Connection {
|
||||
|
||||
builder = builder.client_config(rust_config);
|
||||
|
||||
if let Some(oauth_config) = options.oauth_config {
|
||||
let config: lancedb::remote::oauth::OAuthConfig = oauth_config.into();
|
||||
builder = builder.oauth_config(config);
|
||||
}
|
||||
|
||||
if let Some(api_key) = options.api_key {
|
||||
builder = builder.api_key(&api_key);
|
||||
}
|
||||
|
||||
@@ -37,6 +37,13 @@ pub struct ConnectionOptions {
|
||||
///
|
||||
/// The available options are described at https://docs.lancedb.com/storage/
|
||||
pub storage_options: Option<HashMap<String, String>>,
|
||||
/// (For LanceDB OSS only): use directory namespace manifests as the source
|
||||
/// of truth for table metadata. Existing directory-listed root tables are
|
||||
/// migrated into the manifest on access.
|
||||
pub manifest_enabled: Option<bool>,
|
||||
/// (For LanceDB OSS only): extra properties for the backing namespace
|
||||
/// client used by manifest-enabled native connections.
|
||||
pub namespace_client_properties: Option<HashMap<String, String>>,
|
||||
/// (For LanceDB OSS only): the session to use for this connection. Holds
|
||||
/// shared caches and other session-specific state.
|
||||
pub session: Option<session::Session>,
|
||||
@@ -53,6 +60,10 @@ pub struct ConnectionOptions {
|
||||
/// (For LanceDB cloud only): the host to use for LanceDB cloud. Used
|
||||
/// for testing purposes.
|
||||
pub host_override: Option<String>,
|
||||
/// (For LanceDB cloud only): OAuth configuration for IdP-based
|
||||
/// authentication (e.g., Azure Entra ID). When set, token acquisition
|
||||
/// and refresh are handled entirely in Rust.
|
||||
pub oauth_config: Option<remote::OAuthConfig>,
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
|
||||
@@ -140,6 +140,67 @@ impl From<TlsConfig> for lancedb::remote::TlsConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/// OAuth configuration for LanceDB authentication.
|
||||
/// All token acquisition and refresh is handled in the Rust layer.
|
||||
#[napi(object)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OAuthConfig {
|
||||
/// OIDC issuer URL or OAuth authority URL.
|
||||
/// For Azure: `https://login.microsoftonline.com/{tenant_id}/v2.0`
|
||||
pub issuer_url: String,
|
||||
/// Application / Client ID.
|
||||
pub client_id: String,
|
||||
/// OAuth scopes to request. For Azure: `["api://{app_id}/.default"]`
|
||||
pub scopes: Vec<String>,
|
||||
/// Authentication flow: "client_credentials", "authorization_code_pkce",
|
||||
/// "device_code", "azure_managed_identity", "workload_identity"
|
||||
pub flow: Option<String>,
|
||||
/// Client secret (required for client_credentials).
|
||||
pub client_secret: Option<String>,
|
||||
/// Redirect URI (authorization_code_pkce flow).
|
||||
pub redirect_uri: Option<String>,
|
||||
/// Port for local callback server (authorization_code_pkce, default: 8400).
|
||||
pub callback_port: Option<u16>,
|
||||
/// Client ID for user-assigned managed identity (azure_managed_identity).
|
||||
pub managed_identity_client_id: Option<String>,
|
||||
/// Path to federated token file (workload_identity).
|
||||
pub token_file: Option<String>,
|
||||
/// Seconds before expiry to trigger proactive refresh (default: 300).
|
||||
pub refresh_buffer_secs: Option<u32>,
|
||||
}
|
||||
|
||||
impl From<OAuthConfig> for lancedb::remote::oauth::OAuthConfig {
|
||||
fn from(config: OAuthConfig) -> Self {
|
||||
use lancedb::remote::oauth::OAuthFlow;
|
||||
|
||||
let flow = match config.flow.as_deref().unwrap_or("client_credentials") {
|
||||
"authorization_code_pkce" => OAuthFlow::AuthorizationCodePKCE {
|
||||
redirect_uri: config.redirect_uri,
|
||||
callback_port: config.callback_port,
|
||||
},
|
||||
"device_code" => OAuthFlow::DeviceCode,
|
||||
"azure_managed_identity" => OAuthFlow::AzureManagedIdentity {
|
||||
client_id: config.managed_identity_client_id,
|
||||
},
|
||||
"workload_identity" => OAuthFlow::WorkloadIdentity {
|
||||
token_file: config
|
||||
.token_file
|
||||
.expect("tokenFile is required for workload_identity flow"),
|
||||
},
|
||||
other => panic!("Unknown OAuth flow type: {other}"),
|
||||
};
|
||||
|
||||
Self {
|
||||
issuer_url: config.issuer_url,
|
||||
client_id: config.client_id,
|
||||
client_secret: config.client_secret,
|
||||
scopes: config.scopes,
|
||||
flow,
|
||||
refresh_buffer_secs: config.refresh_buffer_secs.map(|v| v as u64),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ClientConfig> for lancedb::remote::ClientConfig {
|
||||
fn from(config: ClientConfig) -> Self {
|
||||
Self {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.31.0-beta.9"
|
||||
current_version = "0.31.0-beta.11"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.31.0-beta.9"
|
||||
version = "0.31.0-beta.11"
|
||||
publish = false
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
@@ -14,7 +15,7 @@ name = "_lancedb"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
arrow = { version = "57.2", features = ["pyarrow"] }
|
||||
arrow = { version = "58.0.0", features = ["pyarrow"] }
|
||||
async-trait = "0.1"
|
||||
bytes = "1"
|
||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||
@@ -24,8 +25,8 @@ lance-namespace-impls.workspace = true
|
||||
lance-io.workspace = true
|
||||
env_logger.workspace = true
|
||||
log.workspace = true
|
||||
pyo3 = { version = "0.26", features = ["extension-module", "abi3-py39"] }
|
||||
pyo3-async-runtimes = { version = "0.26", features = [
|
||||
pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
|
||||
pyo3-async-runtimes = { version = "0.28", features = [
|
||||
"attributes",
|
||||
"tokio-runtime",
|
||||
] }
|
||||
@@ -34,10 +35,11 @@ futures.workspace = true
|
||||
serde = "1"
|
||||
serde_json = "1"
|
||||
snafu.workspace = true
|
||||
tokio = { version = "1.40", features = ["sync"] }
|
||||
tokio = { version = "1.40", features = ["sync", "rt-multi-thread"] }
|
||||
libc = "0.2"
|
||||
|
||||
[build-dependencies]
|
||||
pyo3-build-config = { version = "0.26", features = [
|
||||
pyo3-build-config = { version = "0.28", features = [
|
||||
"extension-module",
|
||||
"abi3-py39",
|
||||
] }
|
||||
|
||||
@@ -183,7 +183,6 @@
|
||||
| stack-data | 0.6.3 | MIT License | http://github.com/alexmojaki/stack_data |
|
||||
| sympy | 1.14.0 | BSD License | https://sympy.org |
|
||||
| tabulate | 0.9.0 | MIT License | https://github.com/astanin/python-tabulate |
|
||||
| tantivy | 0.25.1 | UNKNOWN | UNKNOWN |
|
||||
| threadpoolctl | 3.6.0 | BSD License | https://github.com/joblib/threadpoolctl |
|
||||
| timm | 1.0.24 | Apache Software License | https://github.com/huggingface/pytorch-image-models |
|
||||
| tinycss2 | 1.4.0 | BSD License | https://www.courtbouillon.org/tinycss2 |
|
||||
|
||||
@@ -57,7 +57,6 @@ tests = [
|
||||
"duckdb>=0.9.0",
|
||||
"pytz>=2023.3",
|
||||
"polars>=0.19, <=1.3.0",
|
||||
"tantivy>=0.20.0",
|
||||
"pyarrow-stubs>=16.0",
|
||||
"pylance>=5.0.0b5",
|
||||
"requests>=2.31.0",
|
||||
|
||||
@@ -7,7 +7,6 @@ import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import timedelta
|
||||
from typing import Dict, Optional, Union, Any, List
|
||||
import warnings
|
||||
|
||||
__version__ = importlib.metadata.version("lancedb")
|
||||
|
||||
@@ -73,6 +72,7 @@ def connect(
|
||||
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
manifest_enabled: bool = False,
|
||||
namespace_client_impl: Optional[str] = None,
|
||||
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||
namespace_client_pushdown_operations: Optional[List[str]] = None,
|
||||
@@ -111,6 +111,10 @@ def connect(
|
||||
storage_options: dict, optional
|
||||
Additional options for the storage backend. See available options at
|
||||
<https://docs.lancedb.com/storage/>
|
||||
manifest_enabled : bool, default False
|
||||
When true for local/native connections, use directory namespace
|
||||
manifests as the source of truth for table metadata. Existing
|
||||
directory-listed root tables are migrated into the manifest on access.
|
||||
session: Session, optional
|
||||
(For LanceDB OSS only)
|
||||
A session to use for this connection. Sessions allow you to configure
|
||||
@@ -158,11 +162,11 @@ def connect(
|
||||
conn : DBConnection
|
||||
A connection to a LanceDB database.
|
||||
"""
|
||||
if namespace_client_impl is not None or namespace_client_properties is not None:
|
||||
if namespace_client_impl is None or namespace_client_properties is None:
|
||||
if namespace_client_impl is not None:
|
||||
if namespace_client_properties is None:
|
||||
raise ValueError(
|
||||
"Both namespace_client_impl and "
|
||||
"namespace_client_properties must be provided"
|
||||
"namespace_client_properties must be provided when "
|
||||
"namespace_client_impl is set"
|
||||
)
|
||||
if kwargs:
|
||||
raise ValueError(f"Unknown keyword arguments: {kwargs}")
|
||||
@@ -175,6 +179,12 @@ def connect(
|
||||
namespace_client_pushdown_operations=namespace_client_pushdown_operations,
|
||||
)
|
||||
|
||||
if namespace_client_properties is not None and not manifest_enabled:
|
||||
raise ValueError(
|
||||
"namespace_client_impl must be provided when using "
|
||||
"namespace_client_properties unless manifest_enabled=True"
|
||||
)
|
||||
|
||||
if namespace_client_pushdown_operations is not None:
|
||||
raise ValueError(
|
||||
"namespace_client_pushdown_operations is only valid when "
|
||||
@@ -212,6 +222,8 @@ def connect(
|
||||
read_consistency_interval=read_consistency_interval,
|
||||
storage_options=storage_options,
|
||||
session=session,
|
||||
manifest_enabled=manifest_enabled,
|
||||
namespace_client_properties=namespace_client_properties,
|
||||
)
|
||||
|
||||
|
||||
@@ -289,6 +301,8 @@ def deserialize_conn(
|
||||
parsed["uri"],
|
||||
read_consistency_interval=rci,
|
||||
storage_options=storage_options,
|
||||
manifest_enabled=parsed.get("manifest_enabled", False),
|
||||
namespace_client_properties=parsed.get("namespace_client_properties"),
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown connection_type: {connection_type}")
|
||||
@@ -304,6 +318,9 @@ async def connect_async(
|
||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
manifest_enabled: bool = False,
|
||||
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||
oauth_config=None,
|
||||
) -> AsyncConnection:
|
||||
"""Connect to a LanceDB database.
|
||||
|
||||
@@ -343,6 +360,13 @@ async def connect_async(
|
||||
cache sizes for index and metadata caches, which can significantly
|
||||
impact memory use and performance. They can also be re-used across
|
||||
multiple connections to share the same cache state.
|
||||
manifest_enabled : bool, default False
|
||||
When true for local/native connections, use directory namespace
|
||||
manifests as the source of truth for table metadata. Existing
|
||||
directory-listed root tables are migrated into the manifest on access.
|
||||
namespace_client_properties : dict, optional
|
||||
Additional directory namespace client properties to use with
|
||||
``manifest_enabled=True``.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -385,6 +409,9 @@ async def connect_async(
|
||||
client_config,
|
||||
storage_options,
|
||||
session,
|
||||
manifest_enabled,
|
||||
namespace_client_properties,
|
||||
oauth_config,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -412,13 +439,3 @@ __all__ = [
|
||||
"Table",
|
||||
"__version__",
|
||||
]
|
||||
|
||||
|
||||
def __warn_on_fork():
|
||||
warnings.warn(
|
||||
"lance is not fork-safe. If you are using multiprocessing, use spawn instead.",
|
||||
)
|
||||
|
||||
|
||||
if hasattr(os, "register_at_fork"):
|
||||
os.register_at_fork(before=__warn_on_fork) # type: ignore[attr-defined]
|
||||
|
||||
@@ -12,6 +12,7 @@ from .index import (
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from lance_namespace import (
|
||||
@@ -25,6 +26,7 @@ from .remote import ClientConfig
|
||||
|
||||
IvfHnswPq: type[HnswPq] = HnswPq
|
||||
IvfHnswSq: type[HnswSq] = HnswSq
|
||||
IvfHnswFlat: type[HnswFlat] = HnswFlat
|
||||
|
||||
class PyExpr:
|
||||
"""A type-safe DataFusion expression node (Rust-side handle)."""
|
||||
@@ -180,6 +182,7 @@ class Table:
|
||||
IvfPq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
@@ -242,6 +245,9 @@ async def connect(
|
||||
client_config: Optional[Union[ClientConfig, Dict[str, Any]]],
|
||||
storage_options: Optional[Dict[str, str]],
|
||||
session: Optional[Session],
|
||||
manifest_enabled: bool = False,
|
||||
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||
oauth_config: Optional[Any] = None,
|
||||
) -> Connection: ...
|
||||
|
||||
class RecordBatchStream:
|
||||
@@ -440,7 +446,7 @@ class AsyncPermutationBuilder:
|
||||
async def execute(self) -> Table: ...
|
||||
|
||||
def async_permutation_builder(
|
||||
table: Table, dest_table_name: str
|
||||
table: Table,
|
||||
) -> AsyncPermutationBuilder: ...
|
||||
def fts_query_to_json(query: Any) -> str: ...
|
||||
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import threading
|
||||
import warnings
|
||||
|
||||
|
||||
class BackgroundEventLoop:
|
||||
@@ -13,6 +15,9 @@ class BackgroundEventLoop:
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._start()
|
||||
|
||||
def _start(self):
|
||||
self.loop = asyncio.new_event_loop()
|
||||
self.thread = threading.Thread(
|
||||
target=self.loop.run_forever,
|
||||
@@ -31,3 +36,30 @@ class BackgroundEventLoop:
|
||||
|
||||
|
||||
LOOP = BackgroundEventLoop()
|
||||
|
||||
_FORK_WARNED = False
|
||||
|
||||
|
||||
def _reset_after_fork():
|
||||
# Threads do not survive fork(), so the asyncio loop in LOOP.thread is
|
||||
# dead in the child. Re-initialize the singleton in place so existing
|
||||
# `from .background_loop import LOOP` references in other modules see
|
||||
# the new state. The Rust-side tokio runtime is reset analogously by a
|
||||
# pthread_atfork hook installed in the _lancedb extension.
|
||||
LOOP._start()
|
||||
global _FORK_WARNED
|
||||
if not _FORK_WARNED:
|
||||
_FORK_WARNED = True
|
||||
warnings.warn(
|
||||
"lancedb fork support is experimental: the internal async "
|
||||
"runtime has been reset in the forked child, but a small chance "
|
||||
"of deadlock remains if other state was mid-operation at fork "
|
||||
"time. The 'forkserver' or 'spawn' multiprocessing start method "
|
||||
"is likely a safer alternative.",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
if hasattr(os, "register_at_fork"):
|
||||
os.register_at_fork(after_in_child=_reset_after_fork)
|
||||
|
||||
@@ -590,8 +590,13 @@ class LanceDBConnection(DBConnection):
|
||||
read_consistency_interval: Optional[timedelta] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
manifest_enabled: bool = False,
|
||||
namespace_client_properties: Optional[Dict[str, str]] = None,
|
||||
_inner: Optional[LanceDbConnection] = None,
|
||||
):
|
||||
self.storage_options = storage_options
|
||||
self._manifest_enabled = manifest_enabled
|
||||
self._namespace_client_properties = namespace_client_properties
|
||||
if _inner is not None:
|
||||
self._conn = _inner
|
||||
self._cached_namespace_client = None
|
||||
@@ -633,6 +638,8 @@ class LanceDBConnection(DBConnection):
|
||||
None,
|
||||
storage_options,
|
||||
session,
|
||||
manifest_enabled,
|
||||
namespace_client_properties,
|
||||
)
|
||||
|
||||
# TODO: It would be nice if we didn't store self.storage_options but it is
|
||||
@@ -640,7 +647,6 @@ class LanceDBConnection(DBConnection):
|
||||
# work because some paths like LanceDBConnection.from_inner will lose the
|
||||
# storage_options. Also, this class really shouldn't be holding any state
|
||||
# beyond _conn.
|
||||
self.storage_options = storage_options
|
||||
self._conn = AsyncConnection(LOOP.run(do_connect()))
|
||||
self._cached_namespace_client: Optional[LanceNamespace] = None
|
||||
|
||||
@@ -677,6 +683,8 @@ class LanceDBConnection(DBConnection):
|
||||
"connection_type": "local",
|
||||
"uri": self.uri,
|
||||
"storage_options": self.storage_options,
|
||||
"manifest_enabled": self._manifest_enabled,
|
||||
"namespace_client_properties": self._namespace_client_properties,
|
||||
"read_consistency_interval_seconds": (
|
||||
rci.total_seconds() if rci else None
|
||||
),
|
||||
|
||||
@@ -1,201 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
"""Full text search index using tantivy-py"""
|
||||
|
||||
import os
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
try:
|
||||
import tantivy
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
|
||||
)
|
||||
|
||||
from .table import LanceTable
|
||||
|
||||
|
||||
def create_index(
|
||||
index_path: str,
|
||||
text_fields: List[str],
|
||||
ordering_fields: Optional[List[str]] = None,
|
||||
tokenizer_name: str = "default",
|
||||
) -> tantivy.Index:
|
||||
"""
|
||||
Create a new Index (not populated)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index_path : str
|
||||
Path to the index directory
|
||||
text_fields : List[str]
|
||||
List of text fields to index
|
||||
ordering_fields: List[str]
|
||||
List of unsigned type fields to order by at search time
|
||||
tokenizer_name : str, default "default"
|
||||
The tokenizer to use
|
||||
|
||||
Returns
|
||||
-------
|
||||
index : tantivy.Index
|
||||
The index object (not yet populated)
|
||||
"""
|
||||
if ordering_fields is None:
|
||||
ordering_fields = []
|
||||
# Declaring our schema.
|
||||
schema_builder = tantivy.SchemaBuilder()
|
||||
# special field that we'll populate with row_id
|
||||
schema_builder.add_integer_field("doc_id", stored=True)
|
||||
# data fields
|
||||
for name in text_fields:
|
||||
schema_builder.add_text_field(name, stored=True, tokenizer_name=tokenizer_name)
|
||||
if ordering_fields:
|
||||
for name in ordering_fields:
|
||||
schema_builder.add_unsigned_field(name, fast=True)
|
||||
schema = schema_builder.build()
|
||||
os.makedirs(index_path, exist_ok=True)
|
||||
index = tantivy.Index(schema, path=index_path)
|
||||
return index
|
||||
|
||||
|
||||
def populate_index(
|
||||
index: tantivy.Index,
|
||||
table: LanceTable,
|
||||
fields: List[str],
|
||||
writer_heap_size: Optional[int] = None,
|
||||
ordering_fields: Optional[List[str]] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Populate an index with data from a LanceTable
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : tantivy.Index
|
||||
The index object
|
||||
table : LanceTable
|
||||
The table to index
|
||||
fields : List[str]
|
||||
List of fields to index
|
||||
writer_heap_size : int
|
||||
The writer heap size in bytes, defaults to 1GB
|
||||
|
||||
Returns
|
||||
-------
|
||||
int
|
||||
The number of rows indexed
|
||||
"""
|
||||
if ordering_fields is None:
|
||||
ordering_fields = []
|
||||
writer_heap_size = writer_heap_size or 1024 * 1024 * 1024
|
||||
# first check the fields exist and are string or large string type
|
||||
nested = []
|
||||
|
||||
for name in fields:
|
||||
try:
|
||||
f = table.schema.field(name) # raises KeyError if not found
|
||||
except KeyError:
|
||||
f = resolve_path(table.schema, name)
|
||||
nested.append(name)
|
||||
|
||||
if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
|
||||
raise TypeError(f"Field {name} is not a string type")
|
||||
|
||||
# create a tantivy writer
|
||||
writer = index.writer(heap_size=writer_heap_size)
|
||||
# write data into index
|
||||
dataset = table.to_lance()
|
||||
row_id = 0
|
||||
|
||||
max_nested_level = 0
|
||||
if len(nested) > 0:
|
||||
max_nested_level = max([len(name.split(".")) for name in nested])
|
||||
|
||||
for b in dataset.to_batches(columns=fields + ordering_fields):
|
||||
if max_nested_level > 0:
|
||||
b = pa.Table.from_batches([b])
|
||||
for _ in range(max_nested_level - 1):
|
||||
b = b.flatten()
|
||||
for i in range(b.num_rows):
|
||||
doc = tantivy.Document()
|
||||
for name in fields:
|
||||
value = b[name][i].as_py()
|
||||
if value is not None:
|
||||
doc.add_text(name, value)
|
||||
for name in ordering_fields:
|
||||
value = b[name][i].as_py()
|
||||
if value is not None:
|
||||
doc.add_unsigned(name, value)
|
||||
if not doc.is_empty:
|
||||
doc.add_integer("doc_id", row_id)
|
||||
writer.add_document(doc)
|
||||
row_id += 1
|
||||
# commit changes
|
||||
writer.commit()
|
||||
return row_id
|
||||
|
||||
|
||||
def resolve_path(schema, field_name: str) -> pa.Field:
|
||||
"""
|
||||
Resolve a nested field path to a list of field names
|
||||
|
||||
Parameters
|
||||
----------
|
||||
field_name : str
|
||||
The field name to resolve
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[str]
|
||||
The resolved path
|
||||
"""
|
||||
path = field_name.split(".")
|
||||
field = schema.field(path.pop(0))
|
||||
for segment in path:
|
||||
if pa.types.is_struct(field.type):
|
||||
field = field.type.field(segment)
|
||||
else:
|
||||
raise KeyError(f"field {field_name} not found in schema {schema}")
|
||||
return field
|
||||
|
||||
|
||||
def search_index(
|
||||
index: tantivy.Index, query: str, limit: int = 10, ordering_field=None
|
||||
) -> Tuple[Tuple[int], Tuple[float]]:
|
||||
"""
|
||||
Search an index for a query
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : tantivy.Index
|
||||
The index object
|
||||
query : str
|
||||
The query string
|
||||
limit : int
|
||||
The maximum number of results to return
|
||||
|
||||
Returns
|
||||
-------
|
||||
ids_and_score: list[tuple[int], tuple[float]]
|
||||
A tuple of two tuples, the first containing the document ids
|
||||
and the second containing the scores
|
||||
"""
|
||||
searcher = index.searcher()
|
||||
query = index.parse_query(query)
|
||||
# get top results
|
||||
if ordering_field:
|
||||
results = searcher.search(query, limit, order_by_field=ordering_field)
|
||||
else:
|
||||
results = searcher.search(query, limit)
|
||||
if results.count == 0:
|
||||
return tuple(), tuple()
|
||||
return tuple(
|
||||
zip(
|
||||
*[
|
||||
(searcher.doc(doc_address)["doc_id"][0], score)
|
||||
for score, doc_address in results.hits
|
||||
]
|
||||
)
|
||||
)
|
||||
@@ -7,6 +7,7 @@ from typing import Literal, Optional
|
||||
from ._lancedb import (
|
||||
IndexConfig,
|
||||
)
|
||||
from .types import BaseTokenizerType
|
||||
|
||||
lang_mapping = {
|
||||
"ar": "Arabic",
|
||||
@@ -111,8 +112,12 @@ class FTS:
|
||||
- "simple": Splits text by whitespace and punctuation.
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-gram tokenizer for substring-style matching.
|
||||
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
|
||||
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not the
|
||||
primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -127,10 +132,17 @@ class FTS:
|
||||
ascii_folding : bool, default True
|
||||
Whether to fold ASCII characters. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models in Lance's language model home. Set
|
||||
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
|
||||
directory under ``lance/language_models``.
|
||||
"""
|
||||
|
||||
with_position: bool = False
|
||||
base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
|
||||
base_tokenizer: BaseTokenizerType = "simple"
|
||||
language: str = "English"
|
||||
max_token_length: Optional[int] = 40
|
||||
lower_case: bool = True
|
||||
@@ -376,9 +388,98 @@ class HnswSq:
|
||||
target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class HnswFlat:
|
||||
"""Describe a HNSW-FLAT index configuration.
|
||||
|
||||
HNSW-FLAT stands for Hierarchical Navigable Small World without quantization.
|
||||
It stores raw vectors in the HNSW graph, providing the highest recall among
|
||||
the IVF_HNSW family at the cost of more memory and disk space compared to
|
||||
:class:`HnswSq` or :class:`HnswPq`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
distance_type: str, default "l2"
|
||||
|
||||
The distance metric used to train the index.
|
||||
|
||||
The following distance types are available:
|
||||
|
||||
"l2" - Euclidean distance. This is a very common distance metric that
|
||||
accounts for both magnitude and direction when determining the distance
|
||||
between vectors. l2 distance has a range of [0, ∞).
|
||||
|
||||
"cosine" - Cosine distance. Cosine distance is a distance metric
|
||||
calculated from the cosine similarity between two vectors. Cosine
|
||||
similarity is a measure of similarity between two non-zero vectors of an
|
||||
inner product space. It is defined to equal the cosine of the angle
|
||||
between them. Unlike l2, the cosine distance is not affected by the
|
||||
magnitude of the vectors. Cosine distance has a range of [0, 2].
|
||||
|
||||
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
|
||||
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
|
||||
l2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
|
||||
num_partitions, default sqrt(num_rows)
|
||||
|
||||
The number of IVF partitions to create.
|
||||
|
||||
For HNSW, we recommend a small number of partitions. Setting this to 1
|
||||
works well for most tables. For very large tables, training just one HNSW
|
||||
graph will require too much memory. Each partition becomes its own HNSW
|
||||
graph, so setting this value higher reduces the peak memory use of
|
||||
training.
|
||||
|
||||
max_iterations, default 50
|
||||
|
||||
Max iterations to train kmeans.
|
||||
|
||||
When training an IVF index we use kmeans to calculate the partitions.
|
||||
This parameter controls how many iterations of kmeans to run.
|
||||
|
||||
sample_rate, default 256
|
||||
|
||||
The rate used to calculate the number of training vectors for kmeans.
|
||||
|
||||
m, default 20
|
||||
|
||||
The number of neighbors to select for each vector in the HNSW graph.
|
||||
|
||||
This value controls the tradeoff between search speed and accuracy.
|
||||
The higher the value the more accurate the search but the slower it
|
||||
will be.
|
||||
|
||||
ef_construction, default 300
|
||||
|
||||
The number of candidates to evaluate during the construction of the HNSW
|
||||
graph.
|
||||
|
||||
This value controls the tradeoff between build speed and accuracy.
|
||||
The higher the value the more accurate the build but the slower it will
|
||||
be. 150 to 300 is the typical range. 100 is a minimum for good quality
|
||||
search results. In most cases, there is no benefit to setting this higher
|
||||
than 500. This value should be set to a value that is not less than `ef`
|
||||
in the search phase.
|
||||
|
||||
target_partition_size, default is 1,048,576
|
||||
|
||||
The target size of each partition.
|
||||
"""
|
||||
|
||||
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
||||
num_partitions: Optional[int] = None
|
||||
max_iterations: int = 50
|
||||
sample_rate: int = 256
|
||||
m: int = 20
|
||||
ef_construction: int = 300
|
||||
target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
# Backwards-compatible aliases
|
||||
IvfHnswPq = HnswPq
|
||||
IvfHnswSq = HnswSq
|
||||
IvfHnswFlat = HnswFlat
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -698,11 +799,13 @@ __all__ = [
|
||||
"IvfPq",
|
||||
"IvfHnswPq",
|
||||
"IvfHnswSq",
|
||||
"IvfHnswFlat",
|
||||
"IvfSq",
|
||||
"IvfRq",
|
||||
"IvfFlat",
|
||||
"HnswPq",
|
||||
"HnswSq",
|
||||
"HnswFlat",
|
||||
"IndexConfig",
|
||||
"FTS",
|
||||
"Bitmap",
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
from deprecation import deprecated
|
||||
from lancedb import AsyncConnection, DBConnection
|
||||
import pyarrow as pa
|
||||
import copy
|
||||
import json
|
||||
|
||||
from deprecation import deprecated
|
||||
import pyarrow as pa
|
||||
|
||||
from ._lancedb import async_permutation_builder, PermutationReader
|
||||
from .table import LanceTable
|
||||
from .background_loop import LOOP
|
||||
@@ -36,10 +37,7 @@ class PermutationBuilder:
|
||||
be referenced by name in the future. If names are not provided then they can only
|
||||
be referenced by their ordinal index. There is no requirement to name every split.
|
||||
|
||||
By default, the permutation will be stored in memory and will be lost when the
|
||||
program exits. To persist the permutation (for very large datasets or to share
|
||||
the permutation across multiple workers) use the [persist](#persist) method to
|
||||
create a permanent table.
|
||||
The permutation is stored in memory and will be lost when the program exits.
|
||||
"""
|
||||
|
||||
def __init__(self, table: LanceTable):
|
||||
@@ -51,15 +49,6 @@ class PermutationBuilder:
|
||||
"""
|
||||
self._async = async_permutation_builder(table)
|
||||
|
||||
def persist(
|
||||
self, database: Union[DBConnection, AsyncConnection], table_name: str
|
||||
) -> "PermutationBuilder":
|
||||
"""
|
||||
Persist the permutation to the given database.
|
||||
"""
|
||||
self._async.persist(database, table_name)
|
||||
return self
|
||||
|
||||
def split_random(
|
||||
self,
|
||||
*,
|
||||
@@ -380,20 +369,44 @@ class Permutation:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
reader: PermutationReader,
|
||||
base_table: LanceTable,
|
||||
permutation_table: Optional[LanceTable],
|
||||
split: int,
|
||||
selection: dict[str, str],
|
||||
batch_size: int,
|
||||
transform_fn: Callable[pa.RecordBatch, Any],
|
||||
offset: Optional[int] = None,
|
||||
limit: Optional[int] = None,
|
||||
connection_factory: Optional[Callable[[str], LanceTable]] = None,
|
||||
_reader: Optional[PermutationReader] = None,
|
||||
):
|
||||
"""
|
||||
Internal constructor. Use [from_tables](#from_tables) instead.
|
||||
"""
|
||||
assert reader is not None, "reader is required"
|
||||
assert base_table is not None, "base_table is required"
|
||||
assert selection is not None, "selection is required"
|
||||
self.reader = reader
|
||||
self.base_table = base_table
|
||||
self.permutation_table = permutation_table
|
||||
self.split = split
|
||||
self.selection = selection
|
||||
self.transform_fn = transform_fn
|
||||
self.batch_size = batch_size
|
||||
self.offset = offset
|
||||
self.limit = limit
|
||||
self.connection_factory = connection_factory
|
||||
if _reader is None:
|
||||
_reader = LOOP.run(self._build_reader())
|
||||
self.reader: PermutationReader = _reader
|
||||
|
||||
async def _build_reader(self) -> PermutationReader:
|
||||
reader = await PermutationReader.from_tables(
|
||||
self.base_table, self.permutation_table, self.split
|
||||
)
|
||||
if self.offset is not None:
|
||||
reader = await reader.with_offset(self.offset)
|
||||
if self.limit is not None:
|
||||
reader = await reader.with_limit(self.limit)
|
||||
return reader
|
||||
|
||||
def _with_selection(self, selection: dict[str, str]) -> "Permutation":
|
||||
"""
|
||||
@@ -402,21 +415,97 @@ class Permutation:
|
||||
Does not validation of the selection and it replaces it entirely. This is not
|
||||
intended for public use.
|
||||
"""
|
||||
return Permutation(self.reader, selection, self.batch_size, self.transform_fn)
|
||||
|
||||
def _with_reader(self, reader: PermutationReader) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation with the given reader
|
||||
|
||||
This is an internal method and should not be used directly.
|
||||
"""
|
||||
return Permutation(reader, self.selection, self.batch_size, self.transform_fn)
|
||||
new = copy.copy(self)
|
||||
new.selection = selection
|
||||
return new
|
||||
|
||||
def with_batch_size(self, batch_size: int) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation with the given batch size
|
||||
"""
|
||||
return Permutation(self.reader, self.selection, batch_size, self.transform_fn)
|
||||
new = copy.copy(self)
|
||||
new.batch_size = batch_size
|
||||
return new
|
||||
|
||||
def with_connection_factory(
|
||||
self, connection_factory: Callable[[str], LanceTable]
|
||||
) -> "Permutation":
|
||||
"""
|
||||
Creates a new permutation that will use ``connection_factory`` to reopen
|
||||
the base table when this permutation is unpickled in a worker process.
|
||||
|
||||
The factory is a callable that takes a single argument — the base table
|
||||
name — and returns a [LanceTable]. It must be picklable; the worker
|
||||
will pickle it via standard ``pickle`` and call it to recover the base
|
||||
table. Picklable callables in practice means top-level (module-level)
|
||||
functions, ``functools.partial`` of such functions, or instances of
|
||||
picklable classes implementing ``__call__``. Lambdas and closures over
|
||||
local variables don't pickle with the default protocol.
|
||||
|
||||
Setting a factory is necessary when the URI alone is not enough to
|
||||
re-open the connection — most importantly for LanceDB Cloud (``db://``)
|
||||
connections, where ``api_key`` and ``region`` aren't recoverable from
|
||||
the connection object after construction.
|
||||
|
||||
For local file or cloud-storage paths the factory is optional: if not
|
||||
set, ``__getstate__`` falls back to capturing
|
||||
``(uri, storage_options, namespace_path)`` and re-opening via
|
||||
``lancedb.connect(uri, storage_options=...)``.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Basic native (file-system path), parameterized via ``functools.partial``::
|
||||
|
||||
import functools, lancedb
|
||||
from lancedb.permutation import Permutation
|
||||
|
||||
def open_native_table(uri: str, table_name: str):
|
||||
return lancedb.connect(uri).open_table(table_name)
|
||||
|
||||
factory = functools.partial(open_native_table, "/data/lance_db")
|
||||
permutation = Permutation.identity(
|
||||
factory("training")
|
||||
).with_connection_factory(factory)
|
||||
|
||||
Native via :func:`lancedb.connect_namespace` (e.g. a directory- or
|
||||
REST-backed namespace client). The factory takes the
|
||||
implementation name and properties dict as partial-bound args so
|
||||
the worker can rebuild the same namespace connection::
|
||||
|
||||
def open_via_namespace(
|
||||
impl: str, properties: dict[str, str], table_name: str,
|
||||
):
|
||||
return lancedb.connect_namespace(impl, properties).open_table(
|
||||
table_name,
|
||||
)
|
||||
|
||||
factory = functools.partial(
|
||||
open_via_namespace,
|
||||
"dir",
|
||||
{"root": "/data/lance_db"},
|
||||
)
|
||||
|
||||
LanceDB Cloud, reading credentials from env vars at worker startup
|
||||
so secrets aren't pickled into the dataset::
|
||||
|
||||
import os, lancedb
|
||||
|
||||
def open_remote_table(table_name: str):
|
||||
db = lancedb.connect(
|
||||
"db://my-database",
|
||||
api_key=os.environ["LANCEDB_API_KEY"],
|
||||
region=os.environ.get("LANCEDB_REGION", "us-east-1"),
|
||||
)
|
||||
return db.open_table(table_name)
|
||||
|
||||
permutation = Permutation.identity(
|
||||
open_remote_table("training")
|
||||
).with_connection_factory(open_remote_table)
|
||||
"""
|
||||
assert connection_factory is not None, "connection_factory is required"
|
||||
new = copy.copy(self)
|
||||
new.connection_factory = connection_factory
|
||||
return new
|
||||
|
||||
@classmethod
|
||||
def identity(cls, table: LanceTable) -> "Permutation":
|
||||
@@ -489,11 +578,126 @@ class Permutation:
|
||||
schema = await reader.output_schema(None)
|
||||
initial_selection = {name: name for name in schema.names}
|
||||
return cls(
|
||||
reader, initial_selection, DEFAULT_BATCH_SIZE, Transforms.arrow2python
|
||||
base_table,
|
||||
permutation_table,
|
||||
split,
|
||||
initial_selection,
|
||||
DEFAULT_BATCH_SIZE,
|
||||
Transforms.arrow2python,
|
||||
_reader=reader,
|
||||
)
|
||||
|
||||
return LOOP.run(do_from_tables())
|
||||
|
||||
def __getstate__(self) -> dict[str, Any]:
|
||||
"""Build a picklable state dict for this permutation.
|
||||
|
||||
The base table is captured either via a user-supplied
|
||||
``connection_factory`` (see [with_connection_factory]) or, as a
|
||||
fallback, by introspecting ``(uri, storage_options, namespace_path)``
|
||||
on the connection. The permutation table — always an in-memory
|
||||
LanceDB table — is captured as a pyarrow Table (which pickles via
|
||||
Arrow IPC natively). The reader is dropped from the wire format;
|
||||
``__setstate__`` rebuilds it from the restored tables.
|
||||
"""
|
||||
permutation_data: Optional[pa.Table] = None
|
||||
if self.permutation_table is not None:
|
||||
permutation_data = self.permutation_table.to_arrow()
|
||||
|
||||
common = {
|
||||
"base_table_name": self.base_table.name,
|
||||
"permutation_data": permutation_data,
|
||||
"split": self.split,
|
||||
"selection": self.selection,
|
||||
"batch_size": self.batch_size,
|
||||
"transform_fn": self.transform_fn,
|
||||
"offset": self.offset,
|
||||
"limit": self.limit,
|
||||
"connection_factory": self.connection_factory,
|
||||
}
|
||||
|
||||
if self.connection_factory is not None:
|
||||
# The factory carries enough state to recover the base table on
|
||||
# its own; we don't need to capture the URI / storage options /
|
||||
# namespace from the existing connection.
|
||||
return common
|
||||
|
||||
# URI-introspection fallback: only viable for native (OSS) connections
|
||||
# where (uri, storage_options) is enough to reopen. Remote / cloud
|
||||
# connections don't expose recoverable api_key / region — those users
|
||||
# must call with_connection_factory().
|
||||
try:
|
||||
base_uri = self.base_table._conn.uri
|
||||
storage_options = self.base_table._conn.storage_options
|
||||
except AttributeError as e:
|
||||
raise ValueError(
|
||||
"Cannot pickle this Permutation: the base table's connection "
|
||||
"does not expose a uri/storage_options, which usually means it "
|
||||
"is a remote (LanceDB Cloud) connection. Call "
|
||||
"Permutation.with_connection_factory(...) first to provide a "
|
||||
"picklable callable that re-opens the base table from a worker "
|
||||
"process."
|
||||
) from e
|
||||
|
||||
if base_uri.startswith("memory://"):
|
||||
# In-memory base tables don't exist in any worker process by
|
||||
# default, so dump the entire base table into the pickle. This
|
||||
# can be expensive for large datasets — users with large
|
||||
# in-memory base tables should either persist them or set a
|
||||
# connection_factory.
|
||||
return {
|
||||
**common,
|
||||
"base_table_data": self.base_table.to_arrow(),
|
||||
}
|
||||
|
||||
return {
|
||||
**common,
|
||||
"base_table_uri": base_uri,
|
||||
"base_table_namespace": self.base_table._namespace_path,
|
||||
"base_table_storage_options": storage_options,
|
||||
}
|
||||
|
||||
def __setstate__(self, state: dict[str, Any]) -> None:
|
||||
from . import connect
|
||||
|
||||
connection_factory = state["connection_factory"]
|
||||
if connection_factory is not None:
|
||||
base_table = connection_factory(state["base_table_name"])
|
||||
elif "base_table_data" in state:
|
||||
# In-memory base table inlined into the pickle; rebuild the same
|
||||
# way we rebuild the in-memory permutation table.
|
||||
mem_db = connect("memory://")
|
||||
base_table = mem_db.create_table(
|
||||
state["base_table_name"], state["base_table_data"]
|
||||
)
|
||||
else:
|
||||
base_db = connect(
|
||||
state["base_table_uri"],
|
||||
storage_options=state["base_table_storage_options"],
|
||||
)
|
||||
base_table = base_db.open_table(
|
||||
state["base_table_name"],
|
||||
namespace_path=state["base_table_namespace"] or None,
|
||||
)
|
||||
|
||||
permutation_table: Optional[LanceTable] = None
|
||||
if state["permutation_data"] is not None:
|
||||
mem_db = connect("memory://")
|
||||
permutation_table = mem_db.create_table(
|
||||
"permutation", state["permutation_data"]
|
||||
)
|
||||
|
||||
self.base_table = base_table
|
||||
self.permutation_table = permutation_table
|
||||
self.split = state["split"]
|
||||
self.selection = state["selection"]
|
||||
self.batch_size = state["batch_size"]
|
||||
self.transform_fn = state["transform_fn"]
|
||||
self.offset = state["offset"]
|
||||
self.limit = state["limit"]
|
||||
self.connection_factory = connection_factory
|
||||
self.reader = LOOP.run(self._build_reader())
|
||||
|
||||
@property
|
||||
def schema(self) -> pa.Schema:
|
||||
async def do_output_schema():
|
||||
@@ -760,7 +964,9 @@ class Permutation:
|
||||
for expensive operations such as image decoding.
|
||||
"""
|
||||
assert transform is not None, "transform is required"
|
||||
return Permutation(self.reader, self.selection, self.batch_size, transform)
|
||||
new = copy.copy(self)
|
||||
new.transform_fn = transform
|
||||
return new
|
||||
|
||||
def __getitem__(self, index: int) -> Any:
|
||||
"""
|
||||
@@ -795,12 +1001,10 @@ class Permutation:
|
||||
"""
|
||||
Skip the first `skip` rows of the permutation
|
||||
"""
|
||||
|
||||
async def do_with_skip():
|
||||
reader = await self.reader.with_offset(skip)
|
||||
return self._with_reader(reader)
|
||||
|
||||
return LOOP.run(do_with_skip())
|
||||
new = copy.copy(self)
|
||||
new.offset = skip
|
||||
new.reader = LOOP.run(new._build_reader())
|
||||
return new
|
||||
|
||||
@deprecated(details="Use with_take instead")
|
||||
def take(self, limit: int) -> "Permutation":
|
||||
@@ -818,12 +1022,10 @@ class Permutation:
|
||||
"""
|
||||
Limit the permutation to `limit` rows (following any `skip`)
|
||||
"""
|
||||
|
||||
async def do_with_take():
|
||||
reader = await self.reader.with_limit(limit)
|
||||
return self._with_reader(reader)
|
||||
|
||||
return LOOP.run(do_with_take())
|
||||
new = copy.copy(self)
|
||||
new.limit = limit
|
||||
new.reader = LOOP.run(new._build_reader())
|
||||
return new
|
||||
|
||||
@deprecated(details="Use with_repeat instead")
|
||||
def repeat(self, times: int) -> "Permutation":
|
||||
|
||||
@@ -25,7 +25,6 @@ import deprecation
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
import pyarrow.fs as pa_fs
|
||||
import pydantic
|
||||
|
||||
from lancedb.pydantic import PYDANTIC_VERSION
|
||||
@@ -1526,9 +1525,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
return self._table._output_schema(self.to_query_object())
|
||||
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
path, fs, exist = self._table._get_fts_index_path()
|
||||
if exist:
|
||||
return self.tantivy_to_arrow()
|
||||
self._table._ensure_no_legacy_fts_index()
|
||||
|
||||
query = self._query
|
||||
if self._phrase_query:
|
||||
@@ -1552,90 +1549,6 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
):
|
||||
raise NotImplementedError("to_batches on an FTS query")
|
||||
|
||||
def tantivy_to_arrow(self) -> pa.Table:
|
||||
try:
|
||||
import tantivy
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install tantivy-py `pip install tantivy` to use the full text search feature." # noqa: E501
|
||||
)
|
||||
|
||||
from .fts import search_index
|
||||
|
||||
# get the index path
|
||||
path, fs, exist = self._table._get_fts_index_path()
|
||||
|
||||
# check if the index exist
|
||||
if not exist:
|
||||
raise FileNotFoundError(
|
||||
"Fts index does not exist. "
|
||||
"Please first call table.create_fts_index(['<field_names>']) to "
|
||||
"create the fts index."
|
||||
)
|
||||
|
||||
# Check that we are on local filesystem
|
||||
if not isinstance(fs, pa_fs.LocalFileSystem):
|
||||
raise NotImplementedError(
|
||||
"Tantivy-based full text search "
|
||||
"is only supported on the local filesystem"
|
||||
)
|
||||
# open the index
|
||||
index = tantivy.Index.open(path)
|
||||
# get the scores and doc ids
|
||||
query = self._query
|
||||
if self._phrase_query:
|
||||
query = query.replace('"', "'")
|
||||
query = f'"{query}"'
|
||||
limit = self._limit if self._limit is not None else 10
|
||||
row_ids, scores = search_index(
|
||||
index, query, limit, ordering_field=self.ordering_field_name
|
||||
)
|
||||
if len(row_ids) == 0:
|
||||
empty_schema = pa.schema([pa.field("_score", pa.float32())])
|
||||
return pa.Table.from_batches([], schema=empty_schema)
|
||||
scores = pa.array(scores)
|
||||
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
|
||||
output_tbl = output_tbl.append_column("_score", scores)
|
||||
# this needs to match vector search results which are uint64
|
||||
row_ids = pa.array(row_ids, type=pa.uint64())
|
||||
|
||||
if self._where is not None:
|
||||
tmp_name = "__lancedb__duckdb__indexer__"
|
||||
output_tbl = output_tbl.append_column(
|
||||
tmp_name, pa.array(range(len(output_tbl)))
|
||||
)
|
||||
try:
|
||||
# TODO would be great to have Substrait generate pyarrow compute
|
||||
# expressions or conversely have pyarrow support SQL expressions
|
||||
# using Substrait
|
||||
import duckdb
|
||||
|
||||
indexer = duckdb.sql(
|
||||
f"SELECT {tmp_name} FROM output_tbl WHERE {self._where}"
|
||||
).to_arrow_table()[tmp_name]
|
||||
output_tbl = output_tbl.take(indexer).drop([tmp_name])
|
||||
row_ids = row_ids.take(indexer)
|
||||
|
||||
except ImportError:
|
||||
import tempfile
|
||||
|
||||
import lance
|
||||
|
||||
# TODO Use "memory://" instead once that's supported
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
ds = lance.write_dataset(output_tbl, tmp)
|
||||
output_tbl = ds.to_table(filter=self._where)
|
||||
indexer = output_tbl[tmp_name]
|
||||
row_ids = row_ids.take(indexer)
|
||||
output_tbl = output_tbl.drop([tmp_name])
|
||||
|
||||
if self._with_row_id:
|
||||
output_tbl = output_tbl.append_column("_rowid", row_ids)
|
||||
|
||||
if self._reranker is not None:
|
||||
output_tbl = self._reranker.rerank_fts(self._query, output_tbl)
|
||||
return output_tbl
|
||||
|
||||
def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
|
||||
"""Rerank the results using the specified reranker.
|
||||
|
||||
@@ -1730,7 +1643,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
def _validate_query(self, query, vector=None, text=None):
|
||||
if query is not None and (vector is not None or text is not None):
|
||||
raise ValueError(
|
||||
"You can either provide a string query in search() method"
|
||||
"You can either provide a string query in search() method "
|
||||
"or set `vector()` and `text()` explicitly for hybrid search."
|
||||
"But not both."
|
||||
)
|
||||
|
||||
@@ -9,6 +9,7 @@ from typing import List, Optional
|
||||
from lancedb import __version__
|
||||
|
||||
from .header import HeaderProvider
|
||||
from .oauth import OAuthConfig, OAuthFlowType
|
||||
|
||||
__all__ = [
|
||||
"TimeoutConfig",
|
||||
@@ -16,6 +17,8 @@ __all__ = [
|
||||
"TlsConfig",
|
||||
"ClientConfig",
|
||||
"HeaderProvider",
|
||||
"OAuthConfig",
|
||||
"OAuthFlowType",
|
||||
]
|
||||
|
||||
|
||||
|
||||
90
python/python/lancedb/remote/oauth.py
Normal file
90
python/python/lancedb/remote/oauth.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class OAuthFlowType(str, Enum):
|
||||
"""OAuth authentication flow types."""
|
||||
|
||||
CLIENT_CREDENTIALS = "client_credentials"
|
||||
"""Client Credentials grant (service-to-service / M2M)."""
|
||||
|
||||
AUTHORIZATION_CODE_PKCE = "authorization_code_pkce"
|
||||
"""Authorization Code with PKCE (interactive browser-based auth)."""
|
||||
|
||||
DEVICE_CODE = "device_code"
|
||||
"""Device Code grant (CLI / headless environments)."""
|
||||
|
||||
AZURE_MANAGED_IDENTITY = "azure_managed_identity"
|
||||
"""Azure Managed Identity via IMDS."""
|
||||
|
||||
WORKLOAD_IDENTITY = "workload_identity"
|
||||
"""Workload Identity Federation (K8s, GitHub Actions)."""
|
||||
|
||||
|
||||
@dataclass
|
||||
class OAuthConfig:
|
||||
"""OAuth configuration for LanceDB authentication.
|
||||
|
||||
All token acquisition and refresh is handled in the Rust layer.
|
||||
This config is passed through to Rust via PyO3.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
issuer_url : str
|
||||
OIDC issuer URL or OAuth authority URL.
|
||||
For Azure: ``https://login.microsoftonline.com/{tenant_id}/v2.0``
|
||||
client_id : str
|
||||
Application / Client ID.
|
||||
scopes : List[str]
|
||||
OAuth scopes to request.
|
||||
For Azure: ``["api://{app_id}/.default"]``
|
||||
flow : OAuthFlowType
|
||||
Authentication flow to use. Default: CLIENT_CREDENTIALS.
|
||||
client_secret : Optional[str]
|
||||
Client secret (required for CLIENT_CREDENTIALS).
|
||||
redirect_uri : Optional[str]
|
||||
Redirect URI for AUTHORIZATION_CODE_PKCE flow.
|
||||
callback_port : Optional[int]
|
||||
Port for local HTTP callback server (AUTHORIZATION_CODE_PKCE, default: 8400).
|
||||
managed_identity_client_id : Optional[str]
|
||||
Client ID for user-assigned managed identity (AZURE_MANAGED_IDENTITY).
|
||||
token_file : Optional[str]
|
||||
Path to federated token file (WORKLOAD_IDENTITY).
|
||||
refresh_buffer_secs : Optional[int]
|
||||
Seconds before expiry to trigger proactive refresh (default: 300).
|
||||
|
||||
Examples
|
||||
--------
|
||||
Client Credentials (service-to-service):
|
||||
|
||||
>>> config = OAuthConfig(
|
||||
... issuer_url="https://login.microsoftonline.com/{tenant}/v2.0",
|
||||
... client_id="app-id",
|
||||
... client_secret="secret",
|
||||
... scopes=["api://lancedb-api/.default"],
|
||||
... )
|
||||
|
||||
Azure Managed Identity:
|
||||
|
||||
>>> config = OAuthConfig(
|
||||
... issuer_url="https://login.microsoftonline.com/{tenant}/v2.0",
|
||||
... client_id="app-id",
|
||||
... scopes=["api://lancedb-api/.default"],
|
||||
... flow=OAuthFlowType.AZURE_MANAGED_IDENTITY,
|
||||
... )
|
||||
"""
|
||||
|
||||
issuer_url: str
|
||||
client_id: str
|
||||
scopes: List[str]
|
||||
flow: OAuthFlowType = OAuthFlowType.CLIENT_CREDENTIALS
|
||||
client_secret: Optional[str] = None
|
||||
redirect_uri: Optional[str] = None
|
||||
callback_port: Optional[int] = None
|
||||
managed_identity_client_id: Optional[str] = None
|
||||
token_file: Optional[str] = None
|
||||
refresh_buffer_secs: Optional[int] = None
|
||||
@@ -22,6 +22,7 @@ from lancedb.index import (
|
||||
FTS,
|
||||
BTree,
|
||||
Bitmap,
|
||||
HnswFlat,
|
||||
HnswSq,
|
||||
IvfFlat,
|
||||
IvfPq,
|
||||
@@ -39,6 +40,7 @@ from lancedb.table import _normalize_progress
|
||||
|
||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
||||
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
||||
from ..types import BaseTokenizerType
|
||||
|
||||
|
||||
class RemoteTable(Table):
|
||||
@@ -167,7 +169,7 @@ class RemoteTable(Table):
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
with_position: bool = False,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
base_tokenizer: BaseTokenizerType = "simple",
|
||||
language: str = "English",
|
||||
max_token_length: Optional[int] = 40,
|
||||
lower_case: bool = True,
|
||||
@@ -284,13 +286,15 @@ class RemoteTable(Table):
|
||||
)
|
||||
elif index_type == "IVF_HNSW_SQ":
|
||||
config = HnswSq(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_HNSW_FLAT":
|
||||
config = HnswFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
elif index_type == "IVF_FLAT":
|
||||
config = IvfFlat(distance_type=metric, num_partitions=num_partitions)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown vector index type: {index_type}. Valid options are"
|
||||
" 'IVF_FLAT', 'IVF_PQ', 'IVF_RQ', 'IVF_SQ',"
|
||||
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
|
||||
" 'IVF_HNSW_PQ', 'IVF_HNSW_SQ', 'IVF_HNSW_FLAT'"
|
||||
)
|
||||
|
||||
LOOP.run(
|
||||
|
||||
@@ -57,6 +57,7 @@ from .index import (
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
FTS,
|
||||
)
|
||||
from .merge import LanceMergeInsertBuilder
|
||||
@@ -86,6 +87,59 @@ from .util import (
|
||||
)
|
||||
from .index import lang_mapping
|
||||
|
||||
_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera")
|
||||
_MODEL_BACKED_TOKENIZER_ERRORS = (
|
||||
"unknown base tokenizer",
|
||||
"Invalid directory path:",
|
||||
"Failed to load Jieba",
|
||||
"Failed to load tokenizer config",
|
||||
"Failed to initialize default tokenizer",
|
||||
)
|
||||
|
||||
|
||||
def _add_unique_note(exception: BaseException, note: str) -> None:
|
||||
existing_notes = getattr(exception, "__notes__", ()) or ()
|
||||
message = (
|
||||
exception.args[0]
|
||||
if exception.args and isinstance(exception.args[0], str)
|
||||
else ""
|
||||
)
|
||||
if note not in existing_notes and note not in message:
|
||||
add_note(exception, note)
|
||||
|
||||
|
||||
def _is_model_backed_tokenizer(base_tokenizer: str) -> bool:
|
||||
return any(
|
||||
base_tokenizer == prefix or base_tokenizer.startswith(f"{prefix}/")
|
||||
for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES
|
||||
)
|
||||
|
||||
|
||||
def _maybe_add_fts_error_note(
|
||||
exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None
|
||||
) -> None:
|
||||
message = str(exception)
|
||||
if language is not None and "not support the requested language" in message:
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
_add_unique_note(exception, f"Supported languages: {supported_langs}")
|
||||
return
|
||||
|
||||
if not _is_model_backed_tokenizer(base_tokenizer):
|
||||
return
|
||||
|
||||
if not any(marker in message for marker in _MODEL_BACKED_TOKENIZER_ERRORS):
|
||||
return
|
||||
|
||||
_add_unique_note(
|
||||
exception,
|
||||
"Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' "
|
||||
"require tokenizer models in Lance's language model home. Set "
|
||||
"LANCE_LANGUAGE_MODEL_HOME to override the default platform data "
|
||||
"directory under 'lance/language_models'. Expected layouts include "
|
||||
"'<model-home>/jieba/default/...' and "
|
||||
"'<model-home>/lindera/ipadic/...'.",
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .db import LanceDBConnection
|
||||
@@ -943,29 +997,29 @@ class Table(ABC):
|
||||
Parameters
|
||||
----------
|
||||
field_names: str or list of str
|
||||
The name(s) of the field to index.
|
||||
If ``use_tantivy`` is False (default), only a single field name
|
||||
(str) is supported. To index multiple fields, create a separate
|
||||
FTS index for each field.
|
||||
The name of the field to index. Native FTS indexes can only be
|
||||
created on a single field at a time. To search over multiple text
|
||||
fields, create a separate FTS index for each field.
|
||||
replace: bool, default False
|
||||
If True, replace the existing index if it exists. Note that this is
|
||||
not yet an atomic operation; the index will be temporarily
|
||||
unavailable while the new index is being created.
|
||||
writer_heap_size: int, default 1GB
|
||||
Only available with use_tantivy=True
|
||||
Deprecated legacy Tantivy parameter. Any value other than the
|
||||
default raises an error.
|
||||
ordering_field_names:
|
||||
A list of unsigned type fields to index to optionally order
|
||||
results on at search time.
|
||||
only available with use_tantivy=True
|
||||
Deprecated legacy Tantivy parameter. Setting this raises an error.
|
||||
tokenizer_name: str, default "default"
|
||||
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
|
||||
language code followed by "_stem". So for english it would be "en_stem".
|
||||
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
|
||||
A compatibility alias for native tokenizer configs. Can be "raw",
|
||||
"default" or the 2 letter language code followed by "_stem". So
|
||||
for english it would be "en_stem". For new native FTS indexes, use
|
||||
``base_tokenizer`` directly; ``tokenizer_name`` is a legacy
|
||||
compatibility alias and does not expose model-backed tokenizer names
|
||||
such as ``jieba/default`` or ``lindera/ipadic``.
|
||||
use_tantivy: bool, default False
|
||||
If True, use the legacy full-text search implementation based on tantivy.
|
||||
If False, use the new full-text search implementation based on lance-index.
|
||||
Deprecated legacy Tantivy parameter. Setting this to True raises an
|
||||
error.
|
||||
with_position: bool, default False
|
||||
Only available with use_tantivy=False
|
||||
If False, do not store the positions of the terms in the text.
|
||||
This can reduce the size of the index and improve indexing speed.
|
||||
But it will raise an exception for phrase queries.
|
||||
@@ -975,8 +1029,11 @@ class Table(ABC):
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-Gram tokenizer.
|
||||
- "jieba/*": Jieba tokenizer loaded from Lance's language model home.
|
||||
- "lindera/*": Lindera tokenizer loaded from Lance's language model home.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not
|
||||
the primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -1002,6 +1059,13 @@ class Table(ABC):
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
The name of the index. If not provided, a default name will be generated.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models in Lance's language model home. Set
|
||||
``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data
|
||||
directory under ``lance/language_models``.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -1746,6 +1810,16 @@ class Table(ABC):
|
||||
index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
|
||||
return (path, fs, index_exists)
|
||||
|
||||
def _ensure_no_legacy_fts_index(self):
|
||||
path, _, exists = self._get_fts_index_path()
|
||||
if exists:
|
||||
raise ValueError(
|
||||
"Legacy Tantivy FTS index detected at "
|
||||
f"{path}. Tantivy-based FTS has been removed. "
|
||||
"Delete the legacy index and recreate it with "
|
||||
"table.create_fts_index(...)."
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def uses_v2_manifest_paths(self) -> bool:
|
||||
"""
|
||||
@@ -2163,7 +2237,13 @@ class LanceTable(Table):
|
||||
index_cache_size: Optional[int] = None,
|
||||
num_bits: int = 8,
|
||||
index_type: Literal[
|
||||
"IVF_FLAT", "IVF_SQ", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
|
||||
"IVF_FLAT",
|
||||
"IVF_SQ",
|
||||
"IVF_PQ",
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
] = "IVF_PQ",
|
||||
max_iterations: int = 50,
|
||||
sample_rate: int = 256,
|
||||
@@ -2250,6 +2330,16 @@ class LanceTable(Table):
|
||||
ef_construction=ef_construction,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
elif index_type == "IVF_HNSW_FLAT":
|
||||
config = HnswFlat(
|
||||
distance_type=metric,
|
||||
num_partitions=num_partitions,
|
||||
max_iterations=max_iterations,
|
||||
sample_rate=sample_rate,
|
||||
m=m,
|
||||
ef_construction=ef_construction,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown index type {index_type}")
|
||||
|
||||
@@ -2405,41 +2495,57 @@ class LanceTable(Table):
|
||||
prefix_only: bool = False,
|
||||
name: Optional[str] = None,
|
||||
):
|
||||
if not use_tantivy:
|
||||
if not isinstance(field_names, str):
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on a single field "
|
||||
"at a time. To search over multiple text fields, create a "
|
||||
"separate FTS index for each field."
|
||||
)
|
||||
self._ensure_no_legacy_fts_index()
|
||||
|
||||
if tokenizer_name is None:
|
||||
tokenizer_configs = {
|
||||
"base_tokenizer": base_tokenizer,
|
||||
"language": language,
|
||||
"with_position": with_position,
|
||||
"max_token_length": max_token_length,
|
||||
"lower_case": lower_case,
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
"ngram_min_length": ngram_min_length,
|
||||
"ngram_max_length": ngram_max_length,
|
||||
"prefix_only": prefix_only,
|
||||
}
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
|
||||
config = FTS(
|
||||
**tokenizer_configs,
|
||||
if use_tantivy:
|
||||
raise ValueError(
|
||||
"Tantivy-based FTS has been removed. "
|
||||
"Remove use_tantivy and recreate the index with native FTS."
|
||||
)
|
||||
if ordering_field_names is not None:
|
||||
raise ValueError(
|
||||
"ordering_field_names was only supported by the removed "
|
||||
"Tantivy-based FTS implementation."
|
||||
)
|
||||
if writer_heap_size != 1024 * 1024 * 1024:
|
||||
raise ValueError(
|
||||
"writer_heap_size was only supported by the removed "
|
||||
"Tantivy-based FTS implementation."
|
||||
)
|
||||
if not isinstance(field_names, str):
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on a single field "
|
||||
"at a time. To search over multiple text fields, create a "
|
||||
"separate FTS index for each field."
|
||||
)
|
||||
if "." in field_names:
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on top-level fields. "
|
||||
f"Received nested field path: {field_names!r}."
|
||||
)
|
||||
|
||||
# delete the existing legacy index if it exists
|
||||
if replace:
|
||||
path, fs, exist = self._get_fts_index_path()
|
||||
if exist:
|
||||
fs.delete_dir(path)
|
||||
if tokenizer_name is None:
|
||||
tokenizer_configs = {
|
||||
"base_tokenizer": base_tokenizer,
|
||||
"language": language,
|
||||
"with_position": with_position,
|
||||
"max_token_length": max_token_length,
|
||||
"lower_case": lower_case,
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
"ngram_min_length": ngram_min_length,
|
||||
"ngram_max_length": ngram_max_length,
|
||||
"prefix_only": prefix_only,
|
||||
}
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
|
||||
config = FTS(
|
||||
**tokenizer_configs,
|
||||
)
|
||||
|
||||
try:
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
field_names,
|
||||
@@ -2448,42 +2554,13 @@ class LanceTable(Table):
|
||||
name=name,
|
||||
)
|
||||
)
|
||||
return
|
||||
|
||||
from .fts import create_index, populate_index
|
||||
|
||||
if isinstance(field_names, str):
|
||||
field_names = [field_names]
|
||||
|
||||
if isinstance(ordering_field_names, str):
|
||||
ordering_field_names = [ordering_field_names]
|
||||
|
||||
path, fs, exist = self._get_fts_index_path()
|
||||
if exist:
|
||||
if not replace:
|
||||
raise ValueError("Index already exists. Use replace=True to overwrite.")
|
||||
fs.delete_dir(path)
|
||||
|
||||
if not isinstance(fs, pa_fs.LocalFileSystem):
|
||||
raise NotImplementedError(
|
||||
"Full-text search is only supported on the local filesystem"
|
||||
except (ValueError, RuntimeError) as e:
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
|
||||
if tokenizer_name is None:
|
||||
tokenizer_name = "default"
|
||||
index = create_index(
|
||||
path,
|
||||
field_names,
|
||||
ordering_fields=ordering_field_names,
|
||||
tokenizer_name=tokenizer_name,
|
||||
)
|
||||
populate_index(
|
||||
index,
|
||||
self,
|
||||
field_names,
|
||||
ordering_fields=ordering_field_names,
|
||||
writer_heap_size=writer_heap_size,
|
||||
)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
|
||||
@@ -3813,7 +3890,18 @@ class AsyncTable:
|
||||
*,
|
||||
replace: Optional[bool] = None,
|
||||
config: Optional[
|
||||
Union[IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||
Union[
|
||||
IvfFlat,
|
||||
IvfPq,
|
||||
IvfRq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
FTS,
|
||||
]
|
||||
] = None,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
name: Optional[str] = None,
|
||||
@@ -3860,6 +3948,7 @@ class AsyncTable:
|
||||
IvfRq,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
HnswFlat,
|
||||
BTree,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
@@ -3879,11 +3968,13 @@ class AsyncTable:
|
||||
name=name,
|
||||
train=train,
|
||||
)
|
||||
except ValueError as e:
|
||||
if "not support the requested language" in str(e):
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
help_msg = f"Supported languages: {supported_langs}"
|
||||
add_note(e, help_msg)
|
||||
except (ValueError, RuntimeError) as e:
|
||||
if isinstance(config, FTS):
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
raise e
|
||||
|
||||
async def drop_index(self, name: str) -> None:
|
||||
@@ -5028,6 +5119,7 @@ class IndexStatistics:
|
||||
"IVF_RQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
"BITMAP",
|
||||
|
||||
@@ -24,6 +24,7 @@ VectorIndexType = Literal[
|
||||
"IVF_PQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"IVF_RQ",
|
||||
]
|
||||
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
|
||||
@@ -31,6 +32,7 @@ IndexType = Literal[
|
||||
"IVF_PQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"IVF_HNSW_FLAT",
|
||||
"IVF_SQ",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
@@ -40,4 +42,5 @@ IndexType = Literal[
|
||||
]
|
||||
|
||||
# Tokenizer literals
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BaseTokenizerType = BuiltinTokenizerType | str
|
||||
|
||||
@@ -180,7 +180,7 @@ def test_fts_fuzzy_query():
|
||||
),
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
results = table.search(MatchQuery("foo", "text", fuzziness=1)).to_pandas()
|
||||
assert len(results) == 4
|
||||
@@ -230,7 +230,7 @@ def test_fts_boost_query():
|
||||
),
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("desc", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("desc", replace=True)
|
||||
|
||||
results = table.search(
|
||||
BoostQuery(
|
||||
@@ -265,7 +265,7 @@ def test_fts_boolean_query(tmp_path):
|
||||
],
|
||||
mode="overwrite",
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||
table.create_fts_index("text", replace=True)
|
||||
|
||||
# SHOULD
|
||||
results = table.search(
|
||||
@@ -319,9 +319,7 @@ def test_fts_native():
|
||||
],
|
||||
)
|
||||
|
||||
# passing `use_tantivy=False` to use lance FTS index
|
||||
# `use_tantivy=True` by default
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
table.search("puppy").limit(10).select(["text"]).to_list()
|
||||
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
|
||||
# ...
|
||||
@@ -332,7 +330,6 @@ def test_fts_native():
|
||||
# --8<-- [start:fts_config_folding]
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
language="French",
|
||||
stem=True,
|
||||
ascii_folding=True,
|
||||
@@ -346,7 +343,7 @@ def test_fts_native():
|
||||
table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
|
||||
# --8<-- [end:fts_postfiltering]
|
||||
# --8<-- [start:fts_with_position]
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
# --8<-- [end:fts_with_position]
|
||||
# --8<-- [start:fts_incremental_index]
|
||||
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
|
||||
|
||||
8
python/python/tests/models/jieba/default/dict.txt
Normal file
8
python/python/tests/models/jieba/default/dict.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
我们 98740 r
|
||||
都 202780 d
|
||||
有 423765 v
|
||||
光明 1219 n
|
||||
的 318825 uj
|
||||
前途 1263 n
|
||||
前 62779 f
|
||||
途 857 n
|
||||
4
python/python/tests/models/lindera/ipadic/config.yml
Normal file
4
python/python/tests/models/lindera/ipadic/config.yml
Normal file
@@ -0,0 +1,4 @@
|
||||
segmenter:
|
||||
mode: "normal"
|
||||
dictionary:
|
||||
path: "./python/tests/models/lindera/ipadic/main"
|
||||
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
Binary file not shown.
@@ -15,8 +15,7 @@ import pytest
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_basic(tmp_path, use_tantivy):
|
||||
def test_basic(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
assert db.uri == str(tmp_path)
|
||||
@@ -49,7 +48,7 @@ def test_basic(tmp_path, use_tantivy):
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "foo"
|
||||
|
||||
table.create_fts_index("item", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("item")
|
||||
rs = table.search("bar", query_type="fts").to_pandas()
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "bar"
|
||||
|
||||
@@ -15,7 +15,10 @@
|
||||
# limitations under the License.
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
|
||||
import lancedb as ldb
|
||||
from lancedb.db import DBConnection
|
||||
@@ -36,8 +39,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
from utils import exception_output
|
||||
|
||||
pytest.importorskip("lancedb.fts")
|
||||
tantivy = pytest.importorskip("tantivy")
|
||||
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -92,6 +94,40 @@ def table(tmp_path) -> ldb.table.LanceTable:
|
||||
return table
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def language_model_home(monkeypatch, tmp_path):
|
||||
model_home = tmp_path / "language-models"
|
||||
shutil.copytree(TEST_LANGUAGE_MODEL_HOME, model_home)
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(model_home))
|
||||
return model_home
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lindera_ipadic(language_model_home):
|
||||
model_path = language_model_home / "lindera" / "ipadic"
|
||||
extracted_model = model_path / "main"
|
||||
config_path = model_path / "config.yml"
|
||||
|
||||
if extracted_model.exists():
|
||||
shutil.rmtree(extracted_model)
|
||||
|
||||
with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
|
||||
zip_ref.extractall(model_path)
|
||||
config_path.write_text(
|
||||
"segmenter:\n"
|
||||
' mode: "normal"\n'
|
||||
" dictionary:\n"
|
||||
f' path: "{extracted_model.resolve().as_posix()}"\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
if extracted_model.exists():
|
||||
shutil.rmtree(extracted_model)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
# Use local random state to avoid affecting other tests
|
||||
@@ -144,58 +180,53 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
return table
|
||||
|
||||
|
||||
def test_create_index(tmp_path):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert isinstance(index, tantivy.Index)
|
||||
assert os.path.exists(str(tmp_path / "index"))
|
||||
@pytest.mark.parametrize(
|
||||
("kwargs", "match"),
|
||||
[
|
||||
(
|
||||
{"use_tantivy": True},
|
||||
"Tantivy-based FTS has been removed",
|
||||
),
|
||||
(
|
||||
{"ordering_field_names": ["count"]},
|
||||
"ordering_field_names was only supported",
|
||||
),
|
||||
(
|
||||
{"writer_heap_size": 128},
|
||||
"writer_heap_size was only supported",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_reject_removed_tantivy_parameters(table, kwargs, match):
|
||||
with pytest.raises(ValueError, match=match):
|
||||
table.create_fts_index("text", **kwargs)
|
||||
|
||||
|
||||
def test_create_index_with_stemming(tmp_path, table):
|
||||
index = ldb.fts.create_index(
|
||||
str(tmp_path / "index"), ["text"], tokenizer_name="en_stem"
|
||||
)
|
||||
assert isinstance(index, tantivy.Index)
|
||||
assert os.path.exists(str(tmp_path / "index"))
|
||||
def test_reject_legacy_tantivy_index(table):
|
||||
path, _, _ = table._get_fts_index_path()
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
# Check stemming by running tokenizer on non empty table
|
||||
table.create_fts_index("text", tokenizer_name="en_stem", use_tantivy=True)
|
||||
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||
table.search("puppy").limit(5).to_list()
|
||||
|
||||
with pytest.raises(ValueError, match="Legacy Tantivy FTS index detected"):
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
@pytest.mark.parametrize("with_position", [True, False])
|
||||
def test_create_inverted_index(table, use_tantivy, with_position):
|
||||
if use_tantivy and not with_position:
|
||||
pytest.skip("we don't support building a tantivy index without position")
|
||||
def test_create_inverted_index(table, with_position):
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=use_tantivy,
|
||||
with_position=with_position,
|
||||
name="custom_fts_index",
|
||||
)
|
||||
if not use_tantivy:
|
||||
indices = table.list_indices()
|
||||
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
||||
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
||||
indices = table.list_indices()
|
||||
fts_indices = [i for i in indices if i.index_type == "FTS"]
|
||||
assert any(i.name == "custom_fts_index" for i in fts_indices)
|
||||
|
||||
|
||||
def test_populate_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
|
||||
|
||||
|
||||
def test_search_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||
ldb.fts.populate_index(index, table, ["text"])
|
||||
index.reload()
|
||||
results = ldb.fts.search_index(index, query="puppy", limit=5)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 5 # row_ids
|
||||
assert len(results[1]) == 5 # _score
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_search_fts(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_search_fts(table):
|
||||
table.create_fts_index("text")
|
||||
results = table.search("puppy").select(["id", "text"]).limit(5).to_list()
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
@@ -204,53 +235,52 @@ def test_search_fts(table, use_tantivy):
|
||||
results = table.search("puppy").select(["id", "text"]).to_list()
|
||||
assert len(results) == 10
|
||||
|
||||
if not use_tantivy:
|
||||
# Test with a query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
# Test with a query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test boost query
|
||||
results = (
|
||||
table.search(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
# Test boost query
|
||||
results = (
|
||||
table.search(
|
||||
BoostQuery(
|
||||
MatchQuery("puppy", "text"),
|
||||
MatchQuery("runs", "text"),
|
||||
)
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
|
||||
# Test multi match query
|
||||
table.create_fts_index("text2", use_tantivy=use_tantivy)
|
||||
results = (
|
||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
# Test multi match query
|
||||
table.create_fts_index("text2")
|
||||
results = (
|
||||
table.search(MultiMatchQuery("puppy", ["text", "text2"]))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
# Test boolean query
|
||||
results = (
|
||||
table.search(MatchQuery("puppy", "text") & MatchQuery("runs", "text"))
|
||||
.select(["id", "text"])
|
||||
.limit(5)
|
||||
.to_list()
|
||||
)
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
for r in results:
|
||||
assert "puppy" in r["text"]
|
||||
assert "runs" in r["text"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -318,13 +348,13 @@ async def test_fts_select_async(async_table):
|
||||
|
||||
|
||||
def test_search_fts_phrase_query(table):
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
try:
|
||||
phrase_results = table.search('"puppy runs"').limit(100).to_list()
|
||||
assert False
|
||||
except Exception:
|
||||
pass
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
results = table.search("puppy").limit(100).to_list()
|
||||
|
||||
# Test with quotation marks
|
||||
@@ -375,8 +405,8 @@ async def test_search_fts_phrase_query_async(async_table):
|
||||
|
||||
|
||||
def test_search_fts_specify_column(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text2", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
table.create_fts_index("text2")
|
||||
|
||||
results = table.search("puppy", fts_columns="text").limit(5).to_list()
|
||||
assert len(results) == 5
|
||||
@@ -470,42 +500,8 @@ async def test_search_fts_specify_column_async(async_table):
|
||||
pass
|
||||
|
||||
|
||||
def test_search_ordering_field_index_table(tmp_path, table):
|
||||
table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
|
||||
rows = (
|
||||
table.search("puppy", ordering_field_name="count")
|
||||
.limit(20)
|
||||
.select(["text", "count"])
|
||||
.to_list()
|
||||
)
|
||||
for r in rows:
|
||||
assert "puppy" in r["text"]
|
||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
||||
|
||||
|
||||
def test_search_ordering_field_index(tmp_path, table):
|
||||
index = ldb.fts.create_index(
|
||||
str(tmp_path / "index"), ["text"], ordering_fields=["count"]
|
||||
)
|
||||
|
||||
ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
|
||||
index.reload()
|
||||
results = ldb.fts.search_index(
|
||||
index, query="puppy", limit=5, ordering_field="count"
|
||||
)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 5 # row_ids
|
||||
assert len(results[1]) == 5 # _distance
|
||||
rows = table.to_lance().take(results[0]).to_pylist()
|
||||
|
||||
for r in rows:
|
||||
assert "puppy" in r["text"]
|
||||
assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_create_index_from_table(tmp_path, table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_create_index_from_table(tmp_path, table):
|
||||
table.create_fts_index("text")
|
||||
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
|
||||
assert len(df) <= 5
|
||||
assert "text" in df.columns
|
||||
@@ -525,36 +521,24 @@ def test_create_index_from_table(tmp_path, table, use_tantivy):
|
||||
)
|
||||
|
||||
with pytest.raises(Exception, match="already exists"):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text")
|
||||
|
||||
table.create_fts_index("text", replace=True, use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text", replace=True)
|
||||
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
|
||||
|
||||
|
||||
def test_create_index_multiple_columns(tmp_path, table):
|
||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
||||
df = table.search("puppy").limit(5).to_pandas()
|
||||
assert len(df) == 5
|
||||
assert "text" in df.columns
|
||||
assert "text2" in df.columns
|
||||
|
||||
|
||||
def test_empty_rs(tmp_path, table, mocker):
|
||||
table.create_fts_index(["text", "text2"], use_tantivy=True)
|
||||
mocker.patch("lancedb.fts.search_index", return_value=([], []))
|
||||
df = table.search("puppy").limit(5).to_pandas()
|
||||
assert len(df) == 0
|
||||
with pytest.raises(ValueError, match="Native FTS indexes can only be created"):
|
||||
table.create_fts_index(["text", "text2"])
|
||||
|
||||
|
||||
def test_nested_schema(tmp_path, table):
|
||||
table.create_fts_index("nested.text", use_tantivy=True)
|
||||
rs = table.search("puppy").limit(5).to_list()
|
||||
assert len(rs) == 5
|
||||
with pytest.raises(ValueError, match="top-level fields"):
|
||||
table.create_fts_index("nested.text")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_search_index_with_filter(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
def test_search_index_with_filter(table):
|
||||
table.create_fts_index("text")
|
||||
orig_import = __import__
|
||||
|
||||
def import_mock(name, *args):
|
||||
@@ -584,8 +568,7 @@ def test_search_index_with_filter(table, use_tantivy):
|
||||
assert r["_rowid"] is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_null_input(table, use_tantivy):
|
||||
def test_null_input(table):
|
||||
table.add(
|
||||
[
|
||||
{
|
||||
@@ -598,14 +581,13 @@ def test_null_input(table, use_tantivy):
|
||||
}
|
||||
]
|
||||
)
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
def test_syntax(table):
|
||||
# https://github.com/lancedb/lancedb/issues/769
|
||||
table.create_fts_index("text", use_tantivy=True)
|
||||
with pytest.raises(ValueError, match="Syntax Error"):
|
||||
table.search("they could have been dogs OR").limit(10).to_list()
|
||||
table.create_fts_index("text")
|
||||
table.search("they could have been dogs OR").limit(10).to_list()
|
||||
|
||||
# these should work
|
||||
|
||||
@@ -616,6 +598,7 @@ def test_syntax(table):
|
||||
).to_list()
|
||||
|
||||
# phrase queries
|
||||
table.create_fts_index("text", with_position=True, replace=True)
|
||||
table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list()
|
||||
table.search('"they could have been dogs OR cats"').limit(10).to_list()
|
||||
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
|
||||
@@ -639,7 +622,7 @@ def test_language(mem_db: DBConnection):
|
||||
table = mem_db.create_table("test", data=data)
|
||||
|
||||
with pytest.raises(ValueError) as e:
|
||||
table.create_fts_index("text", use_tantivy=False, language="klingon")
|
||||
table.create_fts_index("text", language="klingon")
|
||||
|
||||
assert exception_output(e) == (
|
||||
"ValueError: LanceDB does not support the requested language: 'klingon'\n"
|
||||
@@ -650,7 +633,6 @@ def test_language(mem_db: DBConnection):
|
||||
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
language="French",
|
||||
stem=True,
|
||||
ascii_folding=True,
|
||||
@@ -690,7 +672,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
}
|
||||
)
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, with_position=True)
|
||||
table.create_fts_index("text", with_position=True)
|
||||
|
||||
res = table.search("lance").limit(5).to_list()
|
||||
assert len(res) == 3
|
||||
@@ -702,7 +684,7 @@ def test_fts_on_list(mem_db: DBConnection):
|
||||
def test_fts_ngram(mem_db: DBConnection):
|
||||
data = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
|
||||
table = mem_db.create_table("test", data=data)
|
||||
table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")
|
||||
table.create_fts_index("text", base_tokenizer="ngram")
|
||||
|
||||
results = table.search("lan", query_type="fts").limit(10).to_list()
|
||||
assert len(results) == 2
|
||||
@@ -721,7 +703,6 @@ def test_fts_ngram(mem_db: DBConnection):
|
||||
# test setting min_ngram_length and prefix_only
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
use_tantivy=False,
|
||||
base_tokenizer="ngram",
|
||||
replace=True,
|
||||
ngram_min_length=2,
|
||||
@@ -742,6 +723,90 @@ def test_fts_ngram(mem_db: DBConnection):
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
|
||||
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
|
||||
data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
|
||||
table = mem_db.create_table("test_jieba", data=data)
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
results = table.search("我们", query_type="fts").limit(10).to_list()
|
||||
assert [row["text"] for row in results] == ["我们都有光明的前途"]
|
||||
|
||||
|
||||
def test_fts_jieba_missing_language_model_note(
|
||||
mem_db: DBConnection, monkeypatch, tmp_path
|
||||
):
|
||||
missing_root = tmp_path / "missing-language-models"
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
|
||||
table = mem_db.create_table(
|
||||
"test_missing_jieba_model",
|
||||
data=pa.table({"text": ["我们都有光明的前途"]}),
|
||||
)
|
||||
|
||||
with pytest.raises((ValueError, RuntimeError)) as e:
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
output = exception_output(e)
|
||||
assert "Invalid directory path:" in output
|
||||
assert "LANCE_LANGUAGE_MODEL_HOME" in output
|
||||
assert "jieba/default" in output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
|
||||
missing_root = tmp_path / "missing-language-models"
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
|
||||
db = await ldb.connect_async(tmp_path / "async-db")
|
||||
table = await db.create_table(
|
||||
"test_missing_jieba_model_async",
|
||||
data=pa.table({"text": ["我们都有光明的前途"]}),
|
||||
)
|
||||
|
||||
with pytest.raises((ValueError, RuntimeError)) as e:
|
||||
await table.create_index(
|
||||
"text",
|
||||
config=FTS(
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
),
|
||||
)
|
||||
|
||||
output = exception_output(e)
|
||||
assert "Invalid directory path:" in output
|
||||
assert "LANCE_LANGUAGE_MODEL_HOME" in output
|
||||
assert "jieba/default" in output
|
||||
|
||||
|
||||
def test_fts_lindera_tokenizer(
|
||||
mem_db: DBConnection, language_model_home, lindera_ipadic
|
||||
):
|
||||
data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
|
||||
table = mem_db.create_table("test_lindera", data=data)
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="lindera/ipadic",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
results = table.search("成田", query_type="fts").limit(10).to_list()
|
||||
assert [row["text"] for row in results] == ["成田国際空港"]
|
||||
|
||||
|
||||
def test_fts_query_to_json():
|
||||
"""Test that FTS query to_json() produces valid JSON strings with exact format."""
|
||||
|
||||
@@ -886,7 +951,7 @@ def test_fts_query_to_json():
|
||||
|
||||
|
||||
def test_fts_fast_search(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text")
|
||||
|
||||
# Insert some unindexed data
|
||||
table.add(
|
||||
|
||||
@@ -28,7 +28,7 @@ def sync_table(tmpdir_factory) -> Table:
|
||||
}
|
||||
)
|
||||
table = db.create_table("test", data)
|
||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
return table
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ def table_with_id(tmpdir_factory) -> Table:
|
||||
}
|
||||
)
|
||||
table = db.create_table("test_with_id", data)
|
||||
table.create_fts_index("text", with_position=False, use_tantivy=False)
|
||||
table.create_fts_index("text", with_position=False)
|
||||
return table
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user