mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
63 Commits
v0.22.1
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b2242886e0 | ||
|
|
199904ab35 | ||
|
|
1fa888615f | ||
|
|
40967f3baa | ||
|
|
0bfc7de32c | ||
|
|
d43880a585 | ||
|
|
59a886958b | ||
|
|
c36f6746d1 | ||
|
|
25ce6d311f | ||
|
|
92a4e46f9f | ||
|
|
845641c480 | ||
|
|
d96404c635 | ||
|
|
02d31ee412 | ||
|
|
308623577d | ||
|
|
8ee3ae378f | ||
|
|
3372a2aae0 | ||
|
|
4cfcd95320 | ||
|
|
a70ff04bc9 | ||
|
|
a9daa18be9 | ||
|
|
3f2e3986e9 | ||
|
|
bf55feb9b6 | ||
|
|
8f8e06a2da | ||
|
|
03eab0f091 | ||
|
|
143184c0ae | ||
|
|
dadb042978 | ||
|
|
5a19cf15a6 | ||
|
|
3dcec724b7 | ||
|
|
86a6bb9fcb | ||
|
|
b59d1007d3 | ||
|
|
56a16b1728 | ||
|
|
b7afed9beb | ||
|
|
5cbbaa2e4a | ||
|
|
1b6bd2498e | ||
|
|
285da9db1d | ||
|
|
ad8306c96b | ||
|
|
3594538509 | ||
|
|
917aabd077 | ||
|
|
5ec12c9971 | ||
|
|
d0ce489b21 | ||
|
|
d7e02c8181 | ||
|
|
70958f6366 | ||
|
|
1ac745eb18 | ||
|
|
1357fe8aa1 | ||
|
|
0d78929893 | ||
|
|
9e2a68541e | ||
|
|
1aa0fd16e7 | ||
|
|
fec2a05629 | ||
|
|
79a1cd60ee | ||
|
|
88807a59a4 | ||
|
|
e0e7e01ea8 | ||
|
|
a416ebc11d | ||
|
|
f941054baf | ||
|
|
1a81c46505 | ||
|
|
82b25a71e9 | ||
|
|
13c613d45f | ||
|
|
e07389a36c | ||
|
|
e7e9e80b1d | ||
|
|
247fb58400 | ||
|
|
504bdc471c | ||
|
|
d617cdef4a | ||
|
|
356d7046fd | ||
|
|
48e5caabda | ||
|
|
d6cc68f671 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.22.1-beta.4"
|
||||
current_version = "0.22.3-beta.2"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
45
.github/actions/create-failure-issue/action.yml
vendored
Normal file
45
.github/actions/create-failure-issue/action.yml
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
name: Create Failure Issue
|
||||
description: Creates a GitHub issue if any jobs in the workflow failed
|
||||
|
||||
inputs:
|
||||
job-results:
|
||||
description: 'JSON string of job results from needs context'
|
||||
required: true
|
||||
workflow-name:
|
||||
description: 'Name of the workflow'
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Check for failures and create issue
|
||||
shell: bash
|
||||
env:
|
||||
JOB_RESULTS: ${{ inputs.job-results }}
|
||||
WORKFLOW_NAME: ${{ inputs.workflow-name }}
|
||||
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
run: |
|
||||
# Check if any job failed
|
||||
if echo "$JOB_RESULTS" | jq -e 'to_entries | any(.value.result == "failure")' > /dev/null; then
|
||||
echo "Detected job failures, creating issue..."
|
||||
|
||||
# Extract failed job names
|
||||
FAILED_JOBS=$(echo "$JOB_RESULTS" | jq -r 'to_entries | map(select(.value.result == "failure")) | map(.key) | join(", ")')
|
||||
|
||||
# Create issue with workflow name, failed jobs, and run URL
|
||||
gh issue create \
|
||||
--title "$WORKFLOW_NAME Failed ($FAILED_JOBS)" \
|
||||
--body "The workflow **$WORKFLOW_NAME** failed during execution.
|
||||
|
||||
**Failed jobs:** $FAILED_JOBS
|
||||
|
||||
**Run URL:** $RUN_URL
|
||||
|
||||
Please investigate the failed jobs and address any issues." \
|
||||
--label "ci"
|
||||
|
||||
echo "Issue created successfully"
|
||||
else
|
||||
echo "No job failures detected, skipping issue creation"
|
||||
fi
|
||||
14
.github/workflows/cargo-publish.yml
vendored
14
.github/workflows/cargo-publish.yml
vendored
@@ -38,3 +38,17 @@ jobs:
|
||||
- name: Publish the package
|
||||
run: |
|
||||
cargo publish -p lancedb --all-features --token ${{ steps.auth.outputs.token }}
|
||||
report-failure:
|
||||
name: Report Workflow Failure
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build]
|
||||
if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/create-failure-issue
|
||||
with:
|
||||
job-results: ${{ toJSON(needs) }}
|
||||
workflow-name: ${{ github.workflow }}
|
||||
|
||||
107
.github/workflows/codex-update-lance-dependency.yml
vendored
Normal file
107
.github/workflows/codex-update-lance-dependency.yml
vendored
Normal file
@@ -0,0 +1,107 @@
|
||||
name: Codex Update Lance Dependency
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
tag:
|
||||
description: "Tag name from Lance"
|
||||
required: true
|
||||
type: string
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "Tag name from Lance"
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
actions: read
|
||||
|
||||
jobs:
|
||||
update:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Show inputs
|
||||
run: |
|
||||
echo "tag = ${{ inputs.tag }}"
|
||||
|
||||
- name: Checkout Repo LanceDB
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: true
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
|
||||
- name: Install Codex CLI
|
||||
run: npm install -g @openai/codex
|
||||
|
||||
- name: Install Rust toolchain
|
||||
uses: dtolnay/rust-toolchain@stable
|
||||
with:
|
||||
toolchain: stable
|
||||
components: clippy, rustfmt
|
||||
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y protobuf-compiler libssl-dev
|
||||
|
||||
- name: Install cargo-info
|
||||
run: cargo install cargo-info
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: python3 -m pip install --upgrade pip packaging
|
||||
|
||||
- name: Configure git user
|
||||
run: |
|
||||
git config user.name "lancedb automation"
|
||||
git config user.email "robot@lancedb.com"
|
||||
|
||||
- name: Configure Codex authentication
|
||||
env:
|
||||
CODEX_TOKEN_B64: ${{ secrets.CODEX_TOKEN }}
|
||||
run: |
|
||||
if [ -z "${CODEX_TOKEN_B64}" ]; then
|
||||
echo "Repository secret CODEX_TOKEN is not defined; skipping Codex execution."
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p ~/.codex
|
||||
echo "${CODEX_TOKEN_B64}" | base64 --decode > ~/.codex/auth.json
|
||||
|
||||
- name: Run Codex to update Lance dependency
|
||||
env:
|
||||
TAG: ${{ inputs.tag }}
|
||||
GITHUB_TOKEN: ${{ secrets.ROBOT_TOKEN }}
|
||||
GH_TOKEN: ${{ secrets.ROBOT_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
VERSION="${TAG#refs/tags/}"
|
||||
VERSION="${VERSION#v}"
|
||||
BRANCH_NAME="codex/update-lance-${VERSION//[^a-zA-Z0-9]/-}"
|
||||
cat <<EOF >/tmp/codex-prompt.txt
|
||||
You are running inside the lancedb repository on a GitHub Actions runner. Update the Lance dependency to version ${VERSION} and prepare a pull request for maintainers to review.
|
||||
|
||||
Follow these steps exactly:
|
||||
1. Use script "ci/set_lance_version.py" to update Lance dependencies. The script already refreshes Cargo metadata, so allow it to finish even if it takes time.
|
||||
2. Run "cargo clippy --workspace --tests --all-features -- -D warnings". If diagnostics appear, fix them yourself and rerun clippy until it exits cleanly. Do not skip any warnings.
|
||||
3. After clippy succeeds, run "cargo fmt --all" to format the workspace.
|
||||
4. Ensure the repository is clean except for intentional changes. Inspect "git status --short" and "git diff" to confirm the dependency update and any required fixes.
|
||||
5. Create and switch to a new branch named "${BRANCH_NAME}" (replace any duplicated hyphens if necessary).
|
||||
6. Stage all relevant files with "git add -A". Commit using the message "chore: update lance dependency to v${VERSION}".
|
||||
7. Push the branch to origin. If the branch already exists, force-push your changes.
|
||||
8. env "GH_TOKEN" is available, use "gh" tools for github related operations like creating pull request.
|
||||
9. Create a pull request targeting "main" with title "chore: update lance dependency to v${VERSION}". In the body, summarize the dependency bump, clippy/fmt verification, and link the triggering tag (${TAG}).
|
||||
10. After creating the PR, display the PR URL, "git status --short", and a concise summary of the commands run and their results.
|
||||
|
||||
Constraints:
|
||||
- Use bash commands; avoid modifying GitHub workflow files other than through the scripted task above.
|
||||
- Do not merge the PR.
|
||||
- If any command fails, diagnose and fix the issue instead of aborting.
|
||||
EOF
|
||||
codex --config shell_environment_policy.ignore_default_excludes=true exec --dangerously-bypass-approvals-and-sandbox "$(cat /tmp/codex-prompt.txt)"
|
||||
3
.github/workflows/docs.yml
vendored
3
.github/workflows/docs.yml
vendored
@@ -56,8 +56,9 @@ jobs:
|
||||
with:
|
||||
node-version: 20
|
||||
cache: 'npm'
|
||||
cache-dependency-path: docs/package-lock.json
|
||||
- name: Install node dependencies
|
||||
working-directory: node
|
||||
working-directory: nodejs
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
|
||||
15
.github/workflows/java-publish.yml
vendored
15
.github/workflows/java-publish.yml
vendored
@@ -43,7 +43,6 @@ jobs:
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
toolchain: "1.81.0"
|
||||
cache-workspaces: "./java/core/lancedb-jni"
|
||||
# Disable full debug symbol generation to speed up CI build and keep memory down
|
||||
# "1" means line tables only, which is useful for panic tracebacks.
|
||||
@@ -112,3 +111,17 @@ jobs:
|
||||
env:
|
||||
SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
|
||||
SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
|
||||
report-failure:
|
||||
name: Report Workflow Failure
|
||||
runs-on: ubuntu-latest
|
||||
needs: [linux-arm64, linux-x86, macos-arm64]
|
||||
if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/create-failure-issue
|
||||
with:
|
||||
job-results: ${{ toJSON(needs) }}
|
||||
workflow-name: ${{ github.workflow }}
|
||||
|
||||
3
.github/workflows/nodejs.yml
vendored
3
.github/workflows/nodejs.yml
vendored
@@ -6,6 +6,7 @@ on:
|
||||
- main
|
||||
pull_request:
|
||||
paths:
|
||||
- Cargo.toml
|
||||
- nodejs/**
|
||||
- .github/workflows/nodejs.yml
|
||||
- docker-compose.yml
|
||||
@@ -116,7 +117,7 @@ jobs:
|
||||
set -e
|
||||
npm ci
|
||||
npm run docs
|
||||
if ! git diff --exit-code -- . ':(exclude)Cargo.lock'; then
|
||||
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
|
||||
echo "Docs need to be updated"
|
||||
echo "Run 'npm run docs', fix any warnings, and commit the changes."
|
||||
exit 1
|
||||
|
||||
14
.github/workflows/npm-publish.yml
vendored
14
.github/workflows/npm-publish.yml
vendored
@@ -365,3 +365,17 @@ jobs:
|
||||
ARGS="$ARGS --tag preview"
|
||||
fi
|
||||
npm publish $ARGS
|
||||
report-failure:
|
||||
name: Report Workflow Failure
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build-lancedb, test-lancedb, publish]
|
||||
if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/create-failure-issue
|
||||
with:
|
||||
job-results: ${{ toJSON(needs) }}
|
||||
workflow-name: ${{ github.workflow }}
|
||||
|
||||
14
.github/workflows/pypi-publish.yml
vendored
14
.github/workflows/pypi-publish.yml
vendored
@@ -173,3 +173,17 @@ jobs:
|
||||
generate_release_notes: false
|
||||
name: Python LanceDB v${{ steps.extract_version.outputs.version }}
|
||||
body: ${{ steps.python_release_notes.outputs.changelog }}
|
||||
report-failure:
|
||||
name: Report Workflow Failure
|
||||
runs-on: ubuntu-latest
|
||||
needs: [linux, mac, windows]
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/create-failure-issue
|
||||
with:
|
||||
job-results: ${{ toJSON(needs) }}
|
||||
workflow-name: ${{ github.workflow }}
|
||||
|
||||
1
.github/workflows/python.yml
vendored
1
.github/workflows/python.yml
vendored
@@ -6,6 +6,7 @@ on:
|
||||
- main
|
||||
pull_request:
|
||||
paths:
|
||||
- Cargo.toml
|
||||
- python/**
|
||||
- .github/workflows/python.yml
|
||||
|
||||
|
||||
13
.github/workflows/rust.yml
vendored
13
.github/workflows/rust.yml
vendored
@@ -96,6 +96,7 @@ jobs:
|
||||
# Need up-to-date compilers for kernels
|
||||
CC: clang-18
|
||||
CXX: clang++-18
|
||||
GH_TOKEN: ${{ secrets.SOPHON_READ_TOKEN }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -117,15 +118,17 @@ jobs:
|
||||
sudo chmod 600 /swapfile
|
||||
sudo mkswap /swapfile
|
||||
sudo swapon /swapfile
|
||||
- name: Start S3 integration test environment
|
||||
working-directory: .
|
||||
run: docker compose up --detach --wait
|
||||
- name: Build
|
||||
run: cargo build --all-features --tests --locked --examples
|
||||
- name: Run tests
|
||||
run: cargo test --all-features --locked
|
||||
- name: Run feature tests
|
||||
run: make -C ./lancedb feature-tests
|
||||
- name: Run examples
|
||||
run: cargo run --example simple --locked
|
||||
- name: Run remote tests
|
||||
# Running this requires access to secrets, so skip if this is
|
||||
# a PR from a fork.
|
||||
if: github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork
|
||||
run: make -C ./lancedb remote-tests
|
||||
|
||||
macos:
|
||||
timeout-minutes: 30
|
||||
|
||||
26
.github/workflows/trigger-vectordb-recipes.yml
vendored
26
.github/workflows/trigger-vectordb-recipes.yml
vendored
@@ -1,26 +0,0 @@
|
||||
name: Trigger vectordb-recipers workflow
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/trigger-vectordb-recipes.yml
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Trigger vectordb-recipes workflow
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
github-token: ${{ secrets.VECTORDB_RECIPES_ACTION_TOKEN }}
|
||||
script: |
|
||||
const result = await github.rest.actions.createWorkflowDispatch({
|
||||
owner: 'lancedb',
|
||||
repo: 'vectordb-recipes',
|
||||
workflow_id: 'examples-test.yml',
|
||||
ref: 'main'
|
||||
});
|
||||
console.log(result);
|
||||
1338
Cargo.lock
generated
1338
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
59
Cargo.toml
59
Cargo.toml
@@ -15,30 +15,37 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.37.0", default-features = false, "features" = ["dynamodb"] }
|
||||
lance-io = { "version" = "=0.37.0", default-features = false }
|
||||
lance-index = "=0.37.0"
|
||||
lance-linalg = "=0.37.0"
|
||||
lance-table = "=0.37.0"
|
||||
lance-testing = "=0.37.0"
|
||||
lance-datafusion = "=0.37.0"
|
||||
lance-encoding = "=0.37.0"
|
||||
lance = { "version" = "=0.38.3-beta.11", default-features = false, "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-core = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datagen = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-file = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.38.3-beta.11", default-features = false, "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-namespace = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=0.38.3-beta.11", "features" = ["dir-aws", "dir-gcp", "dir-azure", "dir-oss", "rest"], "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-arrow = { "version" = "=0.38.3-beta.11", "tag" = "v0.38.3-beta.11", "git" = "https://github.com/lancedb/lance.git" }
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "55.1", optional = false }
|
||||
arrow-array = "55.1"
|
||||
arrow-data = "55.1"
|
||||
arrow-ipc = "55.1"
|
||||
arrow-ord = "55.1"
|
||||
arrow-schema = "55.1"
|
||||
arrow-arith = "55.1"
|
||||
arrow-cast = "55.1"
|
||||
arrow = { version = "56.2", optional = false }
|
||||
arrow-array = "56.2"
|
||||
arrow-data = "56.2"
|
||||
arrow-ipc = "56.2"
|
||||
arrow-ord = "56.2"
|
||||
arrow-schema = "56.2"
|
||||
arrow-select = "56.2"
|
||||
arrow-cast = "56.2"
|
||||
async-trait = "0"
|
||||
datafusion = { version = "49.0", default-features = false }
|
||||
datafusion-catalog = "49.0"
|
||||
datafusion-common = { version = "49.0", default-features = false }
|
||||
datafusion-execution = "49.0"
|
||||
datafusion-expr = "49.0"
|
||||
datafusion-physical-plan = "49.0"
|
||||
datafusion = { version = "50.1", default-features = false }
|
||||
datafusion-catalog = "50.1"
|
||||
datafusion-common = { version = "50.1", default-features = false }
|
||||
datafusion-execution = "50.1"
|
||||
datafusion-expr = "50.1"
|
||||
datafusion-physical-plan = "50.1"
|
||||
env_logger = "0.11"
|
||||
half = { "version" = "2.6.0", default-features = false, features = [
|
||||
"num-traits",
|
||||
@@ -48,18 +55,14 @@ log = "0.4"
|
||||
moka = { version = "0.12", features = ["future"] }
|
||||
object_store = "0.12.0"
|
||||
pin-project = "1.0.7"
|
||||
rand = "0.9"
|
||||
snafu = "0.8"
|
||||
url = "2"
|
||||
num-traits = "0.2"
|
||||
rand = "0.9"
|
||||
regex = "1.10"
|
||||
lazy_static = "1"
|
||||
semver = "1.0.25"
|
||||
crunchy = "0.2.4"
|
||||
# Temporary pins to work around downstream issues
|
||||
# https://github.com/apache/arrow-rs/commit/2fddf85afcd20110ce783ed5b4cdeb82293da30b
|
||||
chrono = "=0.4.41"
|
||||
# https://github.com/RustCrypto/formats/issues/1684
|
||||
base64ct = "=1.6.0"
|
||||
chrono = "0.4"
|
||||
# Workaround for: https://github.com/Lokathor/bytemuck/issues/306
|
||||
bytemuck_derive = ">=1.8.1, <1.9.0"
|
||||
|
||||
4
ci/create_lancedb_test_connection.sh
Executable file
4
ci/create_lancedb_test_connection.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
export RUST_LOG=info
|
||||
exec ./lancedb server --port 0 --sql-port 0 --data-dir "${1}"
|
||||
18
ci/run_with_docker_compose.sh
Executable file
18
ci/run_with_docker_compose.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#
|
||||
# A script for running the given command together with a docker compose environment.
|
||||
#
|
||||
|
||||
# Bring down the docker setup once the command is done running.
|
||||
tear_down() {
|
||||
docker compose -p fixture down
|
||||
}
|
||||
trap tear_down EXIT
|
||||
|
||||
set +xe
|
||||
|
||||
# Clean up any existing docker setup and bring up a new one.
|
||||
docker compose -p fixture up --detach --wait || exit 1
|
||||
|
||||
"${@}"
|
||||
68
ci/run_with_test_connection.sh
Executable file
68
ci/run_with_test_connection.sh
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#
|
||||
# A script for running the given command together with the lancedb cli.
|
||||
#
|
||||
|
||||
die() {
|
||||
echo $?
|
||||
exit 1
|
||||
}
|
||||
|
||||
check_command_exists() {
|
||||
command="${1}"
|
||||
which ${command} &> /dev/null || \
|
||||
die "Unable to locate command: ${command}. Did you install it?"
|
||||
}
|
||||
|
||||
if [[ ! -e ./lancedb ]]; then
|
||||
if [[ -v SOPHON_READ_TOKEN ]]; then
|
||||
INPUT="lancedb-linux-x64"
|
||||
gh release \
|
||||
--repo lancedb/lancedb \
|
||||
download ci-support-binaries \
|
||||
--pattern "${INPUT}" \
|
||||
|| die "failed to fetch cli."
|
||||
check_command_exists openssl
|
||||
openssl enc -aes-256-cbc \
|
||||
-d -pbkdf2 \
|
||||
-pass "env:SOPHON_READ_TOKEN" \
|
||||
-in "${INPUT}" \
|
||||
-out ./lancedb-linux-x64.tar.gz \
|
||||
|| die "openssl failed"
|
||||
TARGET="${INPUT}.tar.gz"
|
||||
else
|
||||
ARCH="x64"
|
||||
if [[ $OSTYPE == 'darwin'* ]]; then
|
||||
UNAME=$(uname -m)
|
||||
if [[ $UNAME == 'arm64' ]]; then
|
||||
ARCH='arm64'
|
||||
fi
|
||||
OSTYPE="macos"
|
||||
elif [[ $OSTYPE == 'linux'* ]]; then
|
||||
if [[ $UNAME == 'aarch64' ]]; then
|
||||
ARCH='arm64'
|
||||
fi
|
||||
OSTYPE="linux"
|
||||
else
|
||||
die "unknown OSTYPE: $OSTYPE"
|
||||
fi
|
||||
|
||||
check_command_exists gh
|
||||
TARGET="lancedb-${OSTYPE}-${ARCH}.tar.gz"
|
||||
gh release \
|
||||
--repo lancedb/sophon \
|
||||
download lancedb-cli-v0.0.3 \
|
||||
--pattern "${TARGET}" \
|
||||
|| die "failed to fetch cli."
|
||||
fi
|
||||
|
||||
check_command_exists tar
|
||||
tar xvf "${TARGET}" || die "tar failed."
|
||||
[[ -e ./lancedb ]] || die "failed to extract lancedb."
|
||||
fi
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
|
||||
export CREATE_LANCEDB_TEST_CONNECTION_SCRIPT="${SCRIPT_DIR}/create_lancedb_test_connection.sh"
|
||||
|
||||
"${@}"
|
||||
@@ -183,10 +183,8 @@ def set_preview_version(version: str):
|
||||
|
||||
def line_updater(line: str) -> str:
|
||||
package_name = line.split("=", maxsplit=1)[0].strip()
|
||||
base_version = version.split("-")[0] # Get the base version without beta suffix
|
||||
|
||||
# Build config in desired order: version, default-features, features, tag, git
|
||||
config = {"version": f"={base_version}"}
|
||||
config = {"version": f"={version}"}
|
||||
|
||||
if extract_default_features(line):
|
||||
config["default-features"] = False
|
||||
|
||||
@@ -70,6 +70,23 @@ plugins:
|
||||
- mkdocs-jupyter
|
||||
- render_swagger:
|
||||
allow_arbitrary_locations: true
|
||||
- redirects:
|
||||
redirect_maps:
|
||||
# Redirect the home page and other top-level markdown files. This enables maximum SEO benefit
|
||||
# other sub-pages are handled by the ingected js in overrides/partials/header.html
|
||||
'index.md': 'https://lancedb.com/docs/'
|
||||
'guides/tables.md': 'https://lancedb.com/docs/tables/'
|
||||
'ann_indexes.md': 'https://lancedb.com/docs/indexing/'
|
||||
'basic.md': 'https://lancedb.com/docs/quickstart/'
|
||||
'faq.md': 'https://lancedb.com/docs/faq/'
|
||||
'embeddings/understanding_embeddings.md': 'https://lancedb.com/docs/embedding/'
|
||||
'integrations.md': 'https://lancedb.com/docs/integrations/'
|
||||
'examples.md': 'https://lancedb.com/docs/tutorials/'
|
||||
'concepts/vector_search.md': 'https://lancedb.com/docs/search/vector-search/'
|
||||
'troubleshooting.md': 'https://lancedb.com/docs/troubleshooting/'
|
||||
'guides/storage.md': 'https://lancedb.com/docs/storage/integrations'
|
||||
|
||||
|
||||
|
||||
markdown_extensions:
|
||||
- admonition
|
||||
|
||||
@@ -19,7 +19,13 @@
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
IN THE SOFTWARE.
|
||||
-->
|
||||
|
||||
<div id="deprecation-banner" style="background-color: #f8d7da; color: #721c24; padding: 1em; text-align: center;">
|
||||
<p style="margin: 0; font-size: 1.1em;">
|
||||
<strong>This documentation site is deprecated.</strong>
|
||||
Please visit our new documentation site at <a href="https://lancedb.com/docs" style="color: #721c24; text-decoration: underline;">
|
||||
lancedb.com/docs</a> for the latest information.
|
||||
</p>
|
||||
</div>
|
||||
{% set class = "md-header" %}
|
||||
{% if "navigation.tabs.sticky" in features %}
|
||||
{% set class = class ~ " md-header--shadow md-header--lifted" %}
|
||||
@@ -150,9 +156,9 @@
|
||||
|
||||
<div style="margin-left: 10px; margin-right: 5px;">
|
||||
<a href="https://discord.com/invite/zMM32dvNtd" target="_blank" rel="noopener noreferrer">
|
||||
<svg fill="#FFFFFF" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 50 50" width="25px" height="25px"><path d="M 41.625 10.769531 C 37.644531 7.566406 31.347656 7.023438 31.078125 7.003906 C 30.660156 6.96875 30.261719 7.203125 30.089844 7.589844 C 30.074219 7.613281 29.9375 7.929688 29.785156 8.421875 C 32.417969 8.867188 35.652344 9.761719 38.578125 11.578125 C 39.046875 11.867188 39.191406 12.484375 38.902344 12.953125 C 38.710938 13.261719 38.386719 13.429688 38.050781 13.429688 C 37.871094 13.429688 37.6875 13.378906 37.523438 13.277344 C 32.492188 10.15625 26.210938 10 25 10 C 23.789063 10 17.503906 10.15625 12.476563 13.277344 C 12.007813 13.570313 11.390625 13.425781 11.101563 12.957031 C 10.808594 12.484375 10.953125 11.871094 11.421875 11.578125 C 14.347656 9.765625 17.582031 8.867188 20.214844 8.425781 C 20.0625 7.929688 19.925781 7.617188 19.914063 7.589844 C 19.738281 7.203125 19.34375 6.960938 18.921875 7.003906 C 18.652344 7.023438 12.355469 7.566406 8.320313 10.8125 C 6.214844 12.761719 2 24.152344 2 34 C 2 34.175781 2.046875 34.34375 2.132813 34.496094 C 5.039063 39.605469 12.972656 40.941406 14.78125 41 C 14.789063 41 14.800781 41 14.8125 41 C 15.132813 41 15.433594 40.847656 15.621094 40.589844 L 17.449219 38.074219 C 12.515625 36.800781 9.996094 34.636719 9.851563 34.507813 C 9.4375 34.144531 9.398438 33.511719 9.765625 33.097656 C 10.128906 32.683594 10.761719 32.644531 11.175781 33.007813 C 11.234375 33.0625 15.875 37 25 37 C 34.140625 37 38.78125 33.046875 38.828125 33.007813 C 39.242188 32.648438 39.871094 32.683594 40.238281 33.101563 C 40.601563 33.515625 40.5625 34.144531 40.148438 34.507813 C 40.003906 34.636719 37.484375 36.800781 32.550781 38.074219 L 34.378906 40.589844 C 34.566406 40.847656 34.867188 41 35.1875 41 C 35.199219 41 35.210938 41 35.21875 41 C 37.027344 40.941406 44.960938 39.605469 47.867188 34.496094 C 47.953125 34.34375 48 34.175781 48 34 C 48 24.152344 43.785156 12.761719 41.625 10.769531 Z M 18.5 30 C 16.566406 30 15 28.210938 15 26 C 15 23.789063 16.566406 22 18.5 22 C 20.433594 22 22 23.789063 22 26 C 22 28.210938 20.433594 30 18.5 30 Z M 31.5 30 C 29.566406 30 28 28.210938 28 26 C 28 23.789063 29.566406 22 31.5 22 C 33.433594 22 35 23.789063 35 26 C 35 28.210938 33.433594 30 31.5 30 Z"/></svg>
|
||||
</a>
|
||||
</div>
|
||||
<svg fill="#FFFFFF" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 50 50" width="25px" height="25px"><path d="M 41.625 10.769531 C 37.644531 7.566406 31.347656 7.023438 31.078125 7.003906 C 30.660156 6.96875 30.261719 7.203125 30.089844 7.589844 C 30.074219 7.613281 29.9375 7.929688 29.785156 8.421875 C 32.417969 8.867188 35.652344 9.761719 38.578125 11.578125 C 39.046875 11.867188 39.191406 12.484375 38.902344 12.953125 C 38.710938 13.261719 38.386719 13.429688 38.050781 13.429688 C 37.871094 13.429688 37.6875 13.378906 37.523438 13.277344 C 32.492188 10.15625 26.210938 10 25 10 C 23.789063 10 17.503906 10.15625 12.476563 13.277344 C 12.007813 13.570313 11.390625 13.425781 11.101563 12.957031 C 10.808594 12.484375 10.953125 11.871094 11.421875 11.578125 C 14.347656 9.765625 17.582031 8.867188 20.214844 8.425781 C 20.0625 7.929688 19.925781 7.617188 19.914063 7.589844 C 19.738281 7.203125 19.34375 6.960938 18.921875 7.003906 C 18.652344 7.023438 12.355469 7.566406 8.320313 10.8125 C 6.214844 12.761719 2 24.152344 2 34 C 2 34.175781 2.046875 34.34375 2.132813 34.496094 C 5.039063 39.605469 12.972656 40.941406 14.78125 41 C 14.789063 41 14.800781 41 14.8125 41 C 15.132813 41 15.433594 40.847656 15.621094 40.589844 L 17.449219 38.074219 C 12.515625 36.800781 9.996094 34.636719 9.851563 34.507813 C 9.4375 34.144531 9.398438 33.511719 9.765625 33.097656 C 10.128906 32.683594 10.761719 32.644531 11.175781 33.007813 C 11.234375 33.0625 15.875 37 25 37 C 34.140625 37 38.78125 33.046875 38.828125 33.007813 C 39.242188 32.648438 39.871094 32.683594 40.238281 33.101563 C 40.601563 33.515625 40.5625 34.144531 40.148438 34.507813 C 40.003906 34.636719 37.484375 36.800781 32.550781 38.074219 L 34.378906 40.589844 C 34.566406 40.847656 34.867188 41 35.1875 41 C 35.199219 41 35.210938 41 35.21875 41 C 37.027344 40.941406 44.960938 39.605469 47.867188 34.496094 C 47.953125 34.34375 48 34.175781 48 34 C 48 24.152344 43.785156 12.761719 41.625 10.769531 Z M 18.5 30 C 16.566406 30 15 28.210938 15 26 C 15 23.789063 16.566406 22 18.5 22 C 20.433594 22 22 23.789063 22 26 C 22 28.210938 20.433594 30 18.5 30 Z M 31.5 30 C 29.566406 30 28 28.210938 28 26 C 28 23.789063 29.566406 22 31.5 22 C 33.433594 22 35 23.789063 35 26 C 35 28.210938 33.433594 30 31.5 30 Z"/></svg>
|
||||
</a>
|
||||
</div>
|
||||
<div style="margin-left: 5px; margin-right: 5px;">
|
||||
<a href="https://twitter.com/lancedb" target="_blank" rel="noopener noreferrer">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0,0,256,256" width="25px" height="25px" fill-rule="nonzero"><g fill-opacity="0" fill="#ffffff" fill-rule="nonzero" stroke="none" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="10" stroke-dasharray="" stroke-dashoffset="0" font-family="none" font-weight="none" font-size="none" text-anchor="none" style="mix-blend-mode: normal"><path d="M0,256v-256h256v256z" id="bgRectangle"></path></g><g fill="#ffffff" fill-rule="nonzero" stroke="none" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="10" stroke-dasharray="" stroke-dashoffset="0" font-family="none" font-weight="none" font-size="none" text-anchor="none" style="mix-blend-mode: normal"><g transform="scale(4,4)"><path d="M57,17.114c-1.32,1.973 -2.991,3.707 -4.916,5.097c0.018,0.423 0.028,0.847 0.028,1.274c0,13.013 -9.902,28.018 -28.016,28.018c-5.562,0 -12.81,-1.948 -15.095,-4.423c0.772,0.092 1.556,0.138 2.35,0.138c4.615,0 8.861,-1.575 12.23,-4.216c-4.309,-0.079 -7.946,-2.928 -9.199,-6.84c1.96,0.308 4.447,-0.17 4.447,-0.17c0,0 -7.7,-1.322 -7.899,-9.779c2.226,1.291 4.46,1.231 4.46,1.231c0,0 -4.441,-2.734 -4.379,-8.195c0.037,-3.221 1.331,-4.953 1.331,-4.953c8.414,10.361 20.298,10.29 20.298,10.29c0,0 -0.255,-1.471 -0.255,-2.243c0,-5.437 4.408,-9.847 9.847,-9.847c2.832,0 5.391,1.196 7.187,3.111c2.245,-0.443 4.353,-1.263 6.255,-2.391c-0.859,3.44 -4.329,5.448 -4.329,5.448c0,0 2.969,-0.329 5.655,-1.55z"></path></g></g></svg>
|
||||
@@ -173,4 +179,77 @@
|
||||
{% include "partials/tabs.html" %}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</header>
|
||||
</header>
|
||||
|
||||
<script>
|
||||
(function() {
|
||||
function checkPathAndRedirect() {
|
||||
var banner = document.getElementById('deprecation-banner');
|
||||
|
||||
if (document.querySelector('meta[http-equiv="refresh"]')) {
|
||||
return; // The redirects plugin is already handling this page.
|
||||
}
|
||||
|
||||
var currentPath = window.location.pathname;
|
||||
|
||||
var cleanPath = currentPath.endsWith('/') && currentPath.length > 1
|
||||
? currentPath.slice(0, -1)
|
||||
: currentPath;
|
||||
|
||||
// These are the ONLY paths that should remain on the old site
|
||||
var apiPaths = [
|
||||
'/lancedb/python',
|
||||
'/lancedb/javascript',
|
||||
'/lancedb/js',
|
||||
'/lancedb/api_reference'
|
||||
];
|
||||
|
||||
var isApiPage = apiPaths.some(function(apiPath) {
|
||||
return cleanPath.startsWith(apiPath);
|
||||
});
|
||||
|
||||
if (isApiPage) {
|
||||
if (banner) {
|
||||
banner.style.display = 'none';
|
||||
}
|
||||
} else {
|
||||
if (banner) {
|
||||
banner.style.display = 'block';
|
||||
}
|
||||
|
||||
// Add noindex meta tag to prevent indexing of old docs for seo
|
||||
var noindexMeta = document.createElement('meta');
|
||||
noindexMeta.setAttribute('name', 'robots');
|
||||
noindexMeta.setAttribute('content', 'noindex, follow');
|
||||
document.head.appendChild(noindexMeta);
|
||||
|
||||
// Add canonical link to point to the new docs to reward new site for seo
|
||||
var canonicalLink = document.createElement('link');
|
||||
canonicalLink.setAttribute('rel', 'canonical');
|
||||
canonicalLink.setAttribute('href', 'https://lancedb.com/docs');
|
||||
document.head.appendChild(canonicalLink);
|
||||
|
||||
window.location.replace('https://lancedb.com/docs');
|
||||
}
|
||||
}
|
||||
|
||||
// Run the check only if doc is ready. This makes sure we catch the initial load
|
||||
// and redirect.
|
||||
if (document.readyState === 'loading') {
|
||||
document.addEventListener('DOMContentLoaded', checkPathAndRedirect);
|
||||
} else {
|
||||
checkPathAndRedirect();
|
||||
}
|
||||
|
||||
// Use an interval to handle subsequent navigation clicks.
|
||||
var lastPath = window.location.pathname;
|
||||
setInterval(function() {
|
||||
if (window.location.pathname !== lastPath) {
|
||||
lastPath = window.location.pathname;
|
||||
checkPathAndRedirect();
|
||||
}
|
||||
}, 2000); // keeping it 2 second to make it easy for user to understand
|
||||
// what's happening
|
||||
|
||||
})();
|
||||
</script>
|
||||
@@ -5,3 +5,4 @@ mkdocstrings[python]==0.25.2
|
||||
griffe
|
||||
mkdocs-render-swagger-plugin
|
||||
pydantic
|
||||
mkdocs-redirects
|
||||
|
||||
@@ -25,6 +25,51 @@ the underlying connection has been closed.
|
||||
|
||||
## Methods
|
||||
|
||||
### cloneTable()
|
||||
|
||||
```ts
|
||||
abstract cloneTable(
|
||||
targetTableName,
|
||||
sourceUri,
|
||||
options?): Promise<Table>
|
||||
```
|
||||
|
||||
Clone a table from a source table.
|
||||
|
||||
A shallow clone creates a new table that shares the underlying data files
|
||||
with the source table but has its own independent manifest. This allows
|
||||
both the source and cloned tables to evolve independently while initially
|
||||
sharing the same data, deletion, and index files.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **targetTableName**: `string`
|
||||
The name of the target table to create.
|
||||
|
||||
* **sourceUri**: `string`
|
||||
The URI of the source table to clone from.
|
||||
|
||||
* **options?**
|
||||
Clone options.
|
||||
|
||||
* **options.isShallow?**: `boolean`
|
||||
Whether to perform a shallow clone (defaults to true).
|
||||
|
||||
* **options.sourceTag?**: `string`
|
||||
The tag of the source table to clone.
|
||||
|
||||
* **options.sourceVersion?**: `number`
|
||||
The version of the source table to clone.
|
||||
|
||||
* **options.targetNamespace?**: `string`[]
|
||||
The namespace for the target table (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Table`](Table.md)>
|
||||
|
||||
***
|
||||
|
||||
### close()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -194,6 +194,37 @@ currently is also a memory intensive operation.
|
||||
|
||||
***
|
||||
|
||||
### ivfRq()
|
||||
|
||||
```ts
|
||||
static ivfRq(options?): Index
|
||||
```
|
||||
|
||||
Create an IvfRq index
|
||||
|
||||
IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
|
||||
and organizes them into IVF partitions.
|
||||
|
||||
The compression scheme is called RabitQ quantization. Each dimension is quantized into a small number of bits.
|
||||
The parameters `num_bits` and `num_partitions` control this process, providing a tradeoff
|
||||
between index size (and thus search speed) and index accuracy.
|
||||
|
||||
The partitioning process is called IVF and the `num_partitions` parameter controls how
|
||||
many groups to create.
|
||||
|
||||
Note that training an IVF RQ index on a large dataset is a slow operation and
|
||||
currently is also a memory intensive operation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`IvfRqOptions`](../interfaces/IvfRqOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
[`Index`](Index.md)
|
||||
|
||||
***
|
||||
|
||||
### labelList()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -52,6 +52,30 @@ the merge result
|
||||
|
||||
***
|
||||
|
||||
### useIndex()
|
||||
|
||||
```ts
|
||||
useIndex(useIndex): MergeInsertBuilder
|
||||
```
|
||||
|
||||
Controls whether to use indexes for the merge operation.
|
||||
|
||||
When set to `true` (the default), the operation will use an index if available
|
||||
on the join key for improved performance. When set to `false`, it forces a full
|
||||
table scan even if an index exists. This can be useful for benchmarking or when
|
||||
the query optimizer chooses a suboptimal path.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **useIndex**: `boolean`
|
||||
Whether to use indices for the merge operation. Defaults to `true`.
|
||||
|
||||
#### Returns
|
||||
|
||||
[`MergeInsertBuilder`](MergeInsertBuilder.md)
|
||||
|
||||
***
|
||||
|
||||
### whenMatchedUpdateAll()
|
||||
|
||||
```ts
|
||||
|
||||
220
docs/src/js/classes/PermutationBuilder.md
Normal file
220
docs/src/js/classes/PermutationBuilder.md
Normal file
@@ -0,0 +1,220 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / PermutationBuilder
|
||||
|
||||
# Class: PermutationBuilder
|
||||
|
||||
A PermutationBuilder for creating data permutations with splits, shuffling, and filtering.
|
||||
|
||||
This class provides a TypeScript wrapper around the native Rust PermutationBuilder,
|
||||
offering methods to configure data splits, shuffling, and filtering before executing
|
||||
the permutation to create a new table.
|
||||
|
||||
## Methods
|
||||
|
||||
### execute()
|
||||
|
||||
```ts
|
||||
execute(): Promise<Table>
|
||||
```
|
||||
|
||||
Execute the permutation and create the destination table.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Table`](Table.md)>
|
||||
|
||||
A Promise that resolves to the new Table instance
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
const permutationTable = await builder.execute();
|
||||
console.log(`Created table: ${permutationTable.name}`);
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### filter()
|
||||
|
||||
```ts
|
||||
filter(filter): PermutationBuilder
|
||||
```
|
||||
|
||||
Configure filtering for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **filter**: `string`
|
||||
SQL filter expression
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PermutationBuilder`](PermutationBuilder.md)
|
||||
|
||||
A new PermutationBuilder instance
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
builder.filter("age > 18 AND status = 'active'");
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### shuffle()
|
||||
|
||||
```ts
|
||||
shuffle(options): PermutationBuilder
|
||||
```
|
||||
|
||||
Configure shuffling for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`ShuffleOptions`](../interfaces/ShuffleOptions.md)
|
||||
Configuration for shuffling
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PermutationBuilder`](PermutationBuilder.md)
|
||||
|
||||
A new PermutationBuilder instance
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
// Basic shuffle
|
||||
builder.shuffle({ seed: 42 });
|
||||
|
||||
// Shuffle with clump size
|
||||
builder.shuffle({ seed: 42, clumpSize: 10 });
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### splitCalculated()
|
||||
|
||||
```ts
|
||||
splitCalculated(calculation): PermutationBuilder
|
||||
```
|
||||
|
||||
Configure calculated splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **calculation**: `string`
|
||||
SQL expression for calculating splits
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PermutationBuilder`](PermutationBuilder.md)
|
||||
|
||||
A new PermutationBuilder instance
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
builder.splitCalculated("user_id % 3");
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### splitHash()
|
||||
|
||||
```ts
|
||||
splitHash(options): PermutationBuilder
|
||||
```
|
||||
|
||||
Configure hash-based splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`SplitHashOptions`](../interfaces/SplitHashOptions.md)
|
||||
Configuration for hash-based splitting
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PermutationBuilder`](PermutationBuilder.md)
|
||||
|
||||
A new PermutationBuilder instance
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
builder.splitHash({
|
||||
columns: ["user_id"],
|
||||
splitWeights: [70, 30],
|
||||
discardWeight: 0
|
||||
});
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### splitRandom()
|
||||
|
||||
```ts
|
||||
splitRandom(options): PermutationBuilder
|
||||
```
|
||||
|
||||
Configure random splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`SplitRandomOptions`](../interfaces/SplitRandomOptions.md)
|
||||
Configuration for random splitting
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PermutationBuilder`](PermutationBuilder.md)
|
||||
|
||||
A new PermutationBuilder instance
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
// Split by ratios
|
||||
builder.splitRandom({ ratios: [0.7, 0.3], seed: 42 });
|
||||
|
||||
// Split by counts
|
||||
builder.splitRandom({ counts: [1000, 500], seed: 42 });
|
||||
|
||||
// Split with fixed size
|
||||
builder.splitRandom({ fixed: 100, seed: 42 });
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### splitSequential()
|
||||
|
||||
```ts
|
||||
splitSequential(options): PermutationBuilder
|
||||
```
|
||||
|
||||
Configure sequential splits for the permutation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options**: [`SplitSequentialOptions`](../interfaces/SplitSequentialOptions.md)
|
||||
Configuration for sequential splitting
|
||||
|
||||
#### Returns
|
||||
|
||||
[`PermutationBuilder`](PermutationBuilder.md)
|
||||
|
||||
A new PermutationBuilder instance
|
||||
|
||||
#### Example
|
||||
|
||||
```ts
|
||||
// Split by ratios
|
||||
builder.splitSequential({ ratios: [0.8, 0.2] });
|
||||
|
||||
// Split by counts
|
||||
builder.splitSequential({ counts: [800, 200] });
|
||||
|
||||
// Split with fixed size
|
||||
builder.splitSequential({ fixed: 1000 });
|
||||
```
|
||||
@@ -343,6 +343,29 @@ This is useful for pagination.
|
||||
|
||||
***
|
||||
|
||||
### outputSchema()
|
||||
|
||||
```ts
|
||||
outputSchema(): Promise<Schema<any>>
|
||||
```
|
||||
|
||||
Returns the schema of the output that will be returned by this query.
|
||||
|
||||
This can be used to inspect the types and names of the columns that will be
|
||||
returned by the query before executing it.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`Schema`<`any`>>
|
||||
|
||||
An Arrow Schema describing the output columns.
|
||||
|
||||
#### Inherited from
|
||||
|
||||
`StandardQueryBase.outputSchema`
|
||||
|
||||
***
|
||||
|
||||
### select()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -140,6 +140,25 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();
|
||||
|
||||
***
|
||||
|
||||
### outputSchema()
|
||||
|
||||
```ts
|
||||
outputSchema(): Promise<Schema<any>>
|
||||
```
|
||||
|
||||
Returns the schema of the output that will be returned by this query.
|
||||
|
||||
This can be used to inspect the types and names of the columns that will be
|
||||
returned by the query before executing it.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`Schema`<`any`>>
|
||||
|
||||
An Arrow Schema describing the output columns.
|
||||
|
||||
***
|
||||
|
||||
### select()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -143,6 +143,29 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();
|
||||
|
||||
***
|
||||
|
||||
### outputSchema()
|
||||
|
||||
```ts
|
||||
outputSchema(): Promise<Schema<any>>
|
||||
```
|
||||
|
||||
Returns the schema of the output that will be returned by this query.
|
||||
|
||||
This can be used to inspect the types and names of the columns that will be
|
||||
returned by the query before executing it.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`Schema`<`any`>>
|
||||
|
||||
An Arrow Schema describing the output columns.
|
||||
|
||||
#### Inherited from
|
||||
|
||||
[`QueryBase`](QueryBase.md).[`outputSchema`](QueryBase.md#outputschema)
|
||||
|
||||
***
|
||||
|
||||
### select()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -498,6 +498,29 @@ This is useful for pagination.
|
||||
|
||||
***
|
||||
|
||||
### outputSchema()
|
||||
|
||||
```ts
|
||||
outputSchema(): Promise<Schema<any>>
|
||||
```
|
||||
|
||||
Returns the schema of the output that will be returned by this query.
|
||||
|
||||
This can be used to inspect the types and names of the columns that will be
|
||||
returned by the query before executing it.
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<`Schema`<`any`>>
|
||||
|
||||
An Arrow Schema describing the output columns.
|
||||
|
||||
#### Inherited from
|
||||
|
||||
`StandardQueryBase.outputSchema`
|
||||
|
||||
***
|
||||
|
||||
### postfilter()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -13,7 +13,7 @@ function makeArrowTable(
|
||||
metadata?): ArrowTable
|
||||
```
|
||||
|
||||
An enhanced version of the makeTable function from Apache Arrow
|
||||
An enhanced version of the apache-arrow makeTable function from Apache Arrow
|
||||
that supports nested fields and embeddings columns.
|
||||
|
||||
(typically you do not need to call this function. It will be called automatically
|
||||
|
||||
34
docs/src/js/functions/permutationBuilder.md
Normal file
34
docs/src/js/functions/permutationBuilder.md
Normal file
@@ -0,0 +1,34 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / permutationBuilder
|
||||
|
||||
# Function: permutationBuilder()
|
||||
|
||||
```ts
|
||||
function permutationBuilder(table): PermutationBuilder
|
||||
```
|
||||
|
||||
Create a permutation builder for the given table.
|
||||
|
||||
## Parameters
|
||||
|
||||
* **table**: [`Table`](../classes/Table.md)
|
||||
The source table to create a permutation from
|
||||
|
||||
## Returns
|
||||
|
||||
[`PermutationBuilder`](../classes/PermutationBuilder.md)
|
||||
|
||||
A PermutationBuilder instance
|
||||
|
||||
## Example
|
||||
|
||||
```ts
|
||||
const builder = permutationBuilder(sourceTable, "training_data")
|
||||
.splitRandom({ ratios: [0.8, 0.2], seed: 42 })
|
||||
.shuffle({ seed: 123 });
|
||||
|
||||
const trainingTable = await builder.execute();
|
||||
```
|
||||
@@ -28,6 +28,7 @@
|
||||
- [MultiMatchQuery](classes/MultiMatchQuery.md)
|
||||
- [NativeJsHeaderProvider](classes/NativeJsHeaderProvider.md)
|
||||
- [OAuthHeaderProvider](classes/OAuthHeaderProvider.md)
|
||||
- [PermutationBuilder](classes/PermutationBuilder.md)
|
||||
- [PhraseQuery](classes/PhraseQuery.md)
|
||||
- [Query](classes/Query.md)
|
||||
- [QueryBase](classes/QueryBase.md)
|
||||
@@ -68,6 +69,7 @@
|
||||
- [IndexStatistics](interfaces/IndexStatistics.md)
|
||||
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
|
||||
- [IvfPqOptions](interfaces/IvfPqOptions.md)
|
||||
- [IvfRqOptions](interfaces/IvfRqOptions.md)
|
||||
- [MergeResult](interfaces/MergeResult.md)
|
||||
- [OpenTableOptions](interfaces/OpenTableOptions.md)
|
||||
- [OptimizeOptions](interfaces/OptimizeOptions.md)
|
||||
@@ -75,9 +77,14 @@
|
||||
- [QueryExecutionOptions](interfaces/QueryExecutionOptions.md)
|
||||
- [RemovalStats](interfaces/RemovalStats.md)
|
||||
- [RetryConfig](interfaces/RetryConfig.md)
|
||||
- [ShuffleOptions](interfaces/ShuffleOptions.md)
|
||||
- [SplitHashOptions](interfaces/SplitHashOptions.md)
|
||||
- [SplitRandomOptions](interfaces/SplitRandomOptions.md)
|
||||
- [SplitSequentialOptions](interfaces/SplitSequentialOptions.md)
|
||||
- [TableNamesOptions](interfaces/TableNamesOptions.md)
|
||||
- [TableStatistics](interfaces/TableStatistics.md)
|
||||
- [TimeoutConfig](interfaces/TimeoutConfig.md)
|
||||
- [TlsConfig](interfaces/TlsConfig.md)
|
||||
- [TokenResponse](interfaces/TokenResponse.md)
|
||||
- [UpdateOptions](interfaces/UpdateOptions.md)
|
||||
- [UpdateResult](interfaces/UpdateResult.md)
|
||||
@@ -101,3 +108,4 @@
|
||||
- [connect](functions/connect.md)
|
||||
- [makeArrowTable](functions/makeArrowTable.md)
|
||||
- [packBits](functions/packBits.md)
|
||||
- [permutationBuilder](functions/permutationBuilder.md)
|
||||
|
||||
@@ -40,6 +40,14 @@ optional timeoutConfig: TimeoutConfig;
|
||||
|
||||
***
|
||||
|
||||
### tlsConfig?
|
||||
|
||||
```ts
|
||||
optional tlsConfig: TlsConfig;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### userAgent?
|
||||
|
||||
```ts
|
||||
|
||||
101
docs/src/js/interfaces/IvfRqOptions.md
Normal file
101
docs/src/js/interfaces/IvfRqOptions.md
Normal file
@@ -0,0 +1,101 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / IvfRqOptions
|
||||
|
||||
# Interface: IvfRqOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### distanceType?
|
||||
|
||||
```ts
|
||||
optional distanceType: "l2" | "cosine" | "dot";
|
||||
```
|
||||
|
||||
Distance type to use to build the index.
|
||||
|
||||
Default value is "l2".
|
||||
|
||||
This is used when training the index to calculate the IVF partitions
|
||||
(vectors are grouped in partitions with similar vectors according to this
|
||||
distance type) and during quantization.
|
||||
|
||||
The distance type used to train an index MUST match the distance type used
|
||||
to search the index. Failure to do so will yield inaccurate results.
|
||||
|
||||
The following distance types are available:
|
||||
|
||||
"l2" - Euclidean distance.
|
||||
"cosine" - Cosine distance.
|
||||
"dot" - Dot product.
|
||||
|
||||
***
|
||||
|
||||
### maxIterations?
|
||||
|
||||
```ts
|
||||
optional maxIterations: number;
|
||||
```
|
||||
|
||||
Max iterations to train IVF kmeans.
|
||||
|
||||
When training an IVF index we use kmeans to calculate the partitions. This parameter
|
||||
controls how many iterations of kmeans to run.
|
||||
|
||||
The default value is 50.
|
||||
|
||||
***
|
||||
|
||||
### numBits?
|
||||
|
||||
```ts
|
||||
optional numBits: number;
|
||||
```
|
||||
|
||||
Number of bits per dimension for residual quantization.
|
||||
|
||||
This value controls how much each residual component is compressed. The more
|
||||
bits, the more accurate the index will be but the slower search. Typical values
|
||||
are small integers; the default is 1 bit per dimension.
|
||||
|
||||
***
|
||||
|
||||
### numPartitions?
|
||||
|
||||
```ts
|
||||
optional numPartitions: number;
|
||||
```
|
||||
|
||||
The number of IVF partitions to create.
|
||||
|
||||
This value should generally scale with the number of rows in the dataset.
|
||||
By default the number of partitions is the square root of the number of
|
||||
rows.
|
||||
|
||||
If this value is too large then the first part of the search (picking the
|
||||
right partition) will be slow. If this value is too small then the second
|
||||
part of the search (searching within a partition) will be slow.
|
||||
|
||||
***
|
||||
|
||||
### sampleRate?
|
||||
|
||||
```ts
|
||||
optional sampleRate: number;
|
||||
```
|
||||
|
||||
The number of vectors, per partition, to sample when training IVF kmeans.
|
||||
|
||||
When an IVF index is trained, we need to calculate partitions. These are groups
|
||||
of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
||||
|
||||
Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
||||
random sample of the data. This parameter controls the size of the sample. The total
|
||||
number of vectors used to train the index is `sample_rate * num_partitions`.
|
||||
|
||||
Increasing this value might improve the quality of the index but in most cases the
|
||||
default should be sufficient.
|
||||
|
||||
The default value is 256.
|
||||
23
docs/src/js/interfaces/ShuffleOptions.md
Normal file
23
docs/src/js/interfaces/ShuffleOptions.md
Normal file
@@ -0,0 +1,23 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / ShuffleOptions
|
||||
|
||||
# Interface: ShuffleOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### clumpSize?
|
||||
|
||||
```ts
|
||||
optional clumpSize: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### seed?
|
||||
|
||||
```ts
|
||||
optional seed: number;
|
||||
```
|
||||
31
docs/src/js/interfaces/SplitHashOptions.md
Normal file
31
docs/src/js/interfaces/SplitHashOptions.md
Normal file
@@ -0,0 +1,31 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / SplitHashOptions
|
||||
|
||||
# Interface: SplitHashOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### columns
|
||||
|
||||
```ts
|
||||
columns: string[];
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### discardWeight?
|
||||
|
||||
```ts
|
||||
optional discardWeight: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### splitWeights
|
||||
|
||||
```ts
|
||||
splitWeights: number[];
|
||||
```
|
||||
39
docs/src/js/interfaces/SplitRandomOptions.md
Normal file
39
docs/src/js/interfaces/SplitRandomOptions.md
Normal file
@@ -0,0 +1,39 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / SplitRandomOptions
|
||||
|
||||
# Interface: SplitRandomOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### counts?
|
||||
|
||||
```ts
|
||||
optional counts: number[];
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### fixed?
|
||||
|
||||
```ts
|
||||
optional fixed: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### ratios?
|
||||
|
||||
```ts
|
||||
optional ratios: number[];
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### seed?
|
||||
|
||||
```ts
|
||||
optional seed: number;
|
||||
```
|
||||
31
docs/src/js/interfaces/SplitSequentialOptions.md
Normal file
31
docs/src/js/interfaces/SplitSequentialOptions.md
Normal file
@@ -0,0 +1,31 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / SplitSequentialOptions
|
||||
|
||||
# Interface: SplitSequentialOptions
|
||||
|
||||
## Properties
|
||||
|
||||
### counts?
|
||||
|
||||
```ts
|
||||
optional counts: number[];
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### fixed?
|
||||
|
||||
```ts
|
||||
optional fixed: number;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### ratios?
|
||||
|
||||
```ts
|
||||
optional ratios: number[];
|
||||
```
|
||||
49
docs/src/js/interfaces/TlsConfig.md
Normal file
49
docs/src/js/interfaces/TlsConfig.md
Normal file
@@ -0,0 +1,49 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / TlsConfig
|
||||
|
||||
# Interface: TlsConfig
|
||||
|
||||
TLS/mTLS configuration for the remote HTTP client.
|
||||
|
||||
## Properties
|
||||
|
||||
### assertHostname?
|
||||
|
||||
```ts
|
||||
optional assertHostname: boolean;
|
||||
```
|
||||
|
||||
Whether to verify the hostname in the server's certificate.
|
||||
|
||||
***
|
||||
|
||||
### certFile?
|
||||
|
||||
```ts
|
||||
optional certFile: string;
|
||||
```
|
||||
|
||||
Path to the client certificate file (PEM format) for mTLS authentication.
|
||||
|
||||
***
|
||||
|
||||
### keyFile?
|
||||
|
||||
```ts
|
||||
optional keyFile: string;
|
||||
```
|
||||
|
||||
Path to the client private key file (PEM format) for mTLS authentication.
|
||||
|
||||
***
|
||||
|
||||
### sslCaCert?
|
||||
|
||||
```ts
|
||||
optional sslCaCert: string;
|
||||
```
|
||||
|
||||
Path to the CA certificate file (PEM format) for server verification.
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.4</version>
|
||||
<version>0.22.3-beta.2</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.4</version>
|
||||
<version>0.22.3-beta.2</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.4</version>
|
||||
<version>0.22.3-beta.2</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.22.1-beta.4"
|
||||
version = "0.22.3-beta.2"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -1,17 +1,5 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import {
|
||||
Bool,
|
||||
Field,
|
||||
Int32,
|
||||
List,
|
||||
Schema,
|
||||
Struct,
|
||||
Uint8,
|
||||
Utf8,
|
||||
} from "apache-arrow";
|
||||
|
||||
import * as arrow15 from "apache-arrow-15";
|
||||
import * as arrow16 from "apache-arrow-16";
|
||||
import * as arrow17 from "apache-arrow-17";
|
||||
@@ -25,11 +13,9 @@ import {
|
||||
fromTableToBuffer,
|
||||
makeArrowTable,
|
||||
makeEmptyTable,
|
||||
tableFromIPC,
|
||||
} from "../lancedb/arrow";
|
||||
import {
|
||||
EmbeddingFunction,
|
||||
FieldOptions,
|
||||
FunctionOptions,
|
||||
} from "../lancedb/embedding/embedding_function";
|
||||
import { EmbeddingFunctionConfig } from "../lancedb/embedding/registry";
|
||||
@@ -1037,35 +1023,35 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(table.getChild("test")!.get(2)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// Test for the undefined values bug fix
|
||||
describe("undefined values handling", () => {
|
||||
it("should handle mixed undefined and actual values", () => {
|
||||
const schema = new Schema([
|
||||
new Field("text", new Utf8(), true), // nullable
|
||||
new Field("number", new Int32(), true), // nullable
|
||||
new Field("bool", new Bool(), true), // nullable
|
||||
]);
|
||||
|
||||
const data = [
|
||||
{ text: undefined, number: 42, bool: true },
|
||||
{ text: "hello", number: undefined, bool: false },
|
||||
{ text: "world", number: 123, bool: undefined },
|
||||
];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
const result = table.toArray();
|
||||
expect(result).toHaveLength(3);
|
||||
expect(result[0].text).toBe(null);
|
||||
expect(result[0].number).toBe(42);
|
||||
expect(result[0].bool).toBe(true);
|
||||
expect(result[1].text).toBe("hello");
|
||||
expect(result[1].number).toBe(null);
|
||||
expect(result[1].bool).toBe(false);
|
||||
expect(result[2].text).toBe("world");
|
||||
expect(result[2].number).toBe(123);
|
||||
expect(result[2].bool).toBe(null);
|
||||
});
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
// Test for the undefined values bug fix
|
||||
describe("undefined values handling", () => {
|
||||
it("should handle mixed undefined and actual values", () => {
|
||||
const schema = new Schema([
|
||||
new Field("text", new Utf8(), true), // nullable
|
||||
new Field("number", new Int32(), true), // nullable
|
||||
new Field("bool", new Bool(), true), // nullable
|
||||
]);
|
||||
|
||||
const data = [
|
||||
{ text: undefined, number: 42, bool: true },
|
||||
{ text: "hello", number: undefined, bool: false },
|
||||
{ text: "world", number: 123, bool: undefined },
|
||||
];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
const result = table.toArray();
|
||||
expect(result).toHaveLength(3);
|
||||
expect(result[0].text).toBe(null);
|
||||
expect(result[0].number).toBe(42);
|
||||
expect(result[0].bool).toBe(true);
|
||||
expect(result[1].text).toBe("hello");
|
||||
expect(result[1].number).toBe(null);
|
||||
expect(result[1].bool).toBe(false);
|
||||
expect(result[2].text).toBe("world");
|
||||
expect(result[2].number).toBe(123);
|
||||
expect(result[2].bool).toBe(null);
|
||||
});
|
||||
});
|
||||
|
||||
227
nodejs/__test__/permutation.test.ts
Normal file
227
nodejs/__test__/permutation.test.ts
Normal file
@@ -0,0 +1,227 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import * as tmp from "tmp";
|
||||
import { Table, connect, permutationBuilder } from "../lancedb";
|
||||
import { makeArrowTable } from "../lancedb/arrow";
|
||||
|
||||
describe("PermutationBuilder", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
let table: Table;
|
||||
|
||||
beforeEach(async () => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
const db = await connect(tmpDir.name);
|
||||
|
||||
// Create test data
|
||||
const data = makeArrowTable(
|
||||
[
|
||||
{ id: 1, value: 10 },
|
||||
{ id: 2, value: 20 },
|
||||
{ id: 3, value: 30 },
|
||||
{ id: 4, value: 40 },
|
||||
{ id: 5, value: 50 },
|
||||
{ id: 6, value: 60 },
|
||||
{ id: 7, value: 70 },
|
||||
{ id: 8, value: 80 },
|
||||
{ id: 9, value: 90 },
|
||||
{ id: 10, value: 100 },
|
||||
],
|
||||
{ vectorColumns: {} },
|
||||
);
|
||||
|
||||
table = await db.createTable("test_table", data);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
tmpDir.removeCallback();
|
||||
});
|
||||
|
||||
test("should create permutation builder", () => {
|
||||
const builder = permutationBuilder(table);
|
||||
expect(builder).toBeDefined();
|
||||
});
|
||||
|
||||
test("should execute basic permutation", async () => {
|
||||
const builder = permutationBuilder(table);
|
||||
const permutationTable = await builder.execute();
|
||||
|
||||
expect(permutationTable).toBeDefined();
|
||||
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
});
|
||||
|
||||
test("should create permutation with random splits", async () => {
|
||||
const builder = permutationBuilder(table).splitRandom({
|
||||
ratios: [1.0],
|
||||
seed: 42,
|
||||
});
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
});
|
||||
|
||||
test("should create permutation with percentage splits", async () => {
|
||||
const builder = permutationBuilder(table).splitRandom({
|
||||
ratios: [0.3, 0.7],
|
||||
seed: 42,
|
||||
});
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
|
||||
// Check split distribution
|
||||
const split0Count = await permutationTable.countRows("split_id = 0");
|
||||
const split1Count = await permutationTable.countRows("split_id = 1");
|
||||
|
||||
expect(split0Count).toBeGreaterThan(0);
|
||||
expect(split1Count).toBeGreaterThan(0);
|
||||
expect(split0Count + split1Count).toBe(10);
|
||||
});
|
||||
|
||||
test("should create permutation with count splits", async () => {
|
||||
const builder = permutationBuilder(table).splitRandom({
|
||||
counts: [3, 7],
|
||||
seed: 42,
|
||||
});
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
|
||||
// Check split distribution
|
||||
const split0Count = await permutationTable.countRows("split_id = 0");
|
||||
const split1Count = await permutationTable.countRows("split_id = 1");
|
||||
|
||||
expect(split0Count).toBe(3);
|
||||
expect(split1Count).toBe(7);
|
||||
});
|
||||
|
||||
test("should create permutation with hash splits", async () => {
|
||||
const builder = permutationBuilder(table).splitHash({
|
||||
columns: ["id"],
|
||||
splitWeights: [50, 50],
|
||||
discardWeight: 0,
|
||||
});
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
|
||||
// Check that splits exist
|
||||
const split0Count = await permutationTable.countRows("split_id = 0");
|
||||
const split1Count = await permutationTable.countRows("split_id = 1");
|
||||
|
||||
expect(split0Count).toBeGreaterThan(0);
|
||||
expect(split1Count).toBeGreaterThan(0);
|
||||
expect(split0Count + split1Count).toBe(10);
|
||||
});
|
||||
|
||||
test("should create permutation with sequential splits", async () => {
|
||||
const builder = permutationBuilder(table).splitSequential({
|
||||
ratios: [0.5, 0.5],
|
||||
});
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
|
||||
// Check split distribution - sequential should give exactly 5 and 5
|
||||
const split0Count = await permutationTable.countRows("split_id = 0");
|
||||
const split1Count = await permutationTable.countRows("split_id = 1");
|
||||
|
||||
expect(split0Count).toBe(5);
|
||||
expect(split1Count).toBe(5);
|
||||
});
|
||||
|
||||
test("should create permutation with calculated splits", async () => {
|
||||
const builder = permutationBuilder(table).splitCalculated("id % 2");
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
|
||||
// Check split distribution
|
||||
const split0Count = await permutationTable.countRows("split_id = 0");
|
||||
const split1Count = await permutationTable.countRows("split_id = 1");
|
||||
|
||||
expect(split0Count).toBeGreaterThan(0);
|
||||
expect(split1Count).toBeGreaterThan(0);
|
||||
expect(split0Count + split1Count).toBe(10);
|
||||
});
|
||||
|
||||
test("should create permutation with shuffle", async () => {
|
||||
const builder = permutationBuilder(table).shuffle({
|
||||
seed: 42,
|
||||
});
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
});
|
||||
|
||||
test("should create permutation with shuffle and clump size", async () => {
|
||||
const builder = permutationBuilder(table).shuffle({
|
||||
seed: 42,
|
||||
clumpSize: 2,
|
||||
});
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(10);
|
||||
});
|
||||
|
||||
test("should create permutation with filter", async () => {
|
||||
const builder = permutationBuilder(table).filter("value > 50");
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(5); // Values 60, 70, 80, 90, 100
|
||||
});
|
||||
|
||||
test("should chain multiple operations", async () => {
|
||||
const builder = permutationBuilder(table)
|
||||
.filter("value <= 80")
|
||||
.splitRandom({ ratios: [0.5, 0.5], seed: 42 })
|
||||
.shuffle({ seed: 123 });
|
||||
|
||||
const permutationTable = await builder.execute();
|
||||
const rowCount = await permutationTable.countRows();
|
||||
expect(rowCount).toBe(8); // Values 10, 20, 30, 40, 50, 60, 70, 80
|
||||
|
||||
// Check split distribution
|
||||
const split0Count = await permutationTable.countRows("split_id = 0");
|
||||
const split1Count = await permutationTable.countRows("split_id = 1");
|
||||
|
||||
expect(split0Count).toBeGreaterThan(0);
|
||||
expect(split1Count).toBeGreaterThan(0);
|
||||
expect(split0Count + split1Count).toBe(8);
|
||||
});
|
||||
|
||||
test("should throw error for invalid split arguments", () => {
|
||||
const builder = permutationBuilder(table);
|
||||
|
||||
// Test no arguments provided
|
||||
expect(() => builder.splitRandom({})).toThrow(
|
||||
"Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
|
||||
);
|
||||
|
||||
// Test multiple arguments provided
|
||||
expect(() =>
|
||||
builder.splitRandom({ ratios: [0.5, 0.5], counts: [3, 7], seed: 42 }),
|
||||
).toThrow("Exactly one of 'ratios', 'counts', or 'fixed' must be provided");
|
||||
});
|
||||
|
||||
test("should throw error when builder is consumed", async () => {
|
||||
const builder = permutationBuilder(table);
|
||||
|
||||
// Execute once
|
||||
await builder.execute();
|
||||
|
||||
// Should throw error on second execution
|
||||
await expect(builder.execute()).rejects.toThrow("Builder already consumed");
|
||||
});
|
||||
});
|
||||
111
nodejs/__test__/query.test.ts
Normal file
111
nodejs/__test__/query.test.ts
Normal file
@@ -0,0 +1,111 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import * as tmp from "tmp";
|
||||
|
||||
import { type Table, connect } from "../lancedb";
|
||||
import {
|
||||
Field,
|
||||
FixedSizeList,
|
||||
Float32,
|
||||
Int64,
|
||||
Schema,
|
||||
Utf8,
|
||||
makeArrowTable,
|
||||
} from "../lancedb/arrow";
|
||||
import { Index } from "../lancedb/indices";
|
||||
|
||||
describe("Query outputSchema", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
let table: Table;
|
||||
|
||||
beforeEach(async () => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
const db = await connect(tmpDir.name);
|
||||
|
||||
// Create table with explicit schema to ensure proper types
|
||||
const schema = new Schema([
|
||||
new Field("a", new Int64(), true),
|
||||
new Field("text", new Utf8(), true),
|
||||
new Field(
|
||||
"vec",
|
||||
new FixedSizeList(2, new Field("item", new Float32())),
|
||||
true,
|
||||
),
|
||||
]);
|
||||
|
||||
const data = makeArrowTable(
|
||||
[
|
||||
{ a: 1n, text: "foo", vec: [1, 2] },
|
||||
{ a: 2n, text: "bar", vec: [3, 4] },
|
||||
{ a: 3n, text: "baz", vec: [5, 6] },
|
||||
],
|
||||
{ schema },
|
||||
);
|
||||
table = await db.createTable("test", data);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
tmpDir.removeCallback();
|
||||
});
|
||||
|
||||
it("should return schema for plain query", async () => {
|
||||
const schema = await table.query().outputSchema();
|
||||
|
||||
expect(schema.fields.length).toBe(3);
|
||||
expect(schema.fields.map((f) => f.name)).toEqual(["a", "text", "vec"]);
|
||||
expect(schema.fields[0].type.toString()).toBe("Int64");
|
||||
expect(schema.fields[1].type.toString()).toBe("Utf8");
|
||||
});
|
||||
|
||||
it("should return schema with dynamic projection", async () => {
|
||||
const schema = await table.query().select({ bl: "a * 2" }).outputSchema();
|
||||
|
||||
expect(schema.fields.length).toBe(1);
|
||||
expect(schema.fields[0].name).toBe("bl");
|
||||
expect(schema.fields[0].type.toString()).toBe("Int64");
|
||||
});
|
||||
|
||||
it("should return schema for vector search with _distance column", async () => {
|
||||
const schema = await table
|
||||
.vectorSearch([1, 2])
|
||||
.select(["a"])
|
||||
.outputSchema();
|
||||
|
||||
expect(schema.fields.length).toBe(2);
|
||||
expect(schema.fields.map((f) => f.name)).toEqual(["a", "_distance"]);
|
||||
expect(schema.fields[0].type.toString()).toBe("Int64");
|
||||
expect(schema.fields[1].type.toString()).toBe("Float32");
|
||||
});
|
||||
|
||||
it("should return schema for FTS search", async () => {
|
||||
await table.createIndex("text", { config: Index.fts() });
|
||||
|
||||
const schema = await table
|
||||
.search("foo", "fts")
|
||||
.select(["a"])
|
||||
.outputSchema();
|
||||
|
||||
// FTS search includes _score column in addition to selected columns
|
||||
expect(schema.fields.length).toBe(2);
|
||||
expect(schema.fields.map((f) => f.name)).toContain("a");
|
||||
expect(schema.fields.map((f) => f.name)).toContain("_score");
|
||||
const aField = schema.fields.find((f) => f.name === "a");
|
||||
expect(aField?.type.toString()).toBe("Int64");
|
||||
});
|
||||
|
||||
it("should return schema for take query", async () => {
|
||||
const schema = await table.takeOffsets([0]).select(["text"]).outputSchema();
|
||||
|
||||
expect(schema.fields.length).toBe(1);
|
||||
expect(schema.fields[0].name).toBe("text");
|
||||
expect(schema.fields[0].type.toString()).toBe("Utf8");
|
||||
});
|
||||
|
||||
it("should return full schema when no select is specified", async () => {
|
||||
const schema = await table.query().outputSchema();
|
||||
|
||||
// Should return all columns
|
||||
expect(schema.fields.length).toBe(3);
|
||||
});
|
||||
});
|
||||
@@ -7,7 +7,6 @@ import {
|
||||
ClientConfig,
|
||||
Connection,
|
||||
ConnectionOptions,
|
||||
NativeJsHeaderProvider,
|
||||
TlsConfig,
|
||||
connect,
|
||||
} from "../lancedb";
|
||||
|
||||
184
nodejs/__test__/sanitize.test.ts
Normal file
184
nodejs/__test__/sanitize.test.ts
Normal file
@@ -0,0 +1,184 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import * as arrow from "../lancedb/arrow";
|
||||
import { sanitizeField, sanitizeType } from "../lancedb/sanitize";
|
||||
|
||||
describe("sanitize", function () {
|
||||
describe("sanitizeType function", function () {
|
||||
it("should handle type objects", function () {
|
||||
const type = new arrow.Int32();
|
||||
const result = sanitizeType(type);
|
||||
|
||||
expect(result.typeId).toBe(arrow.Type.Int);
|
||||
expect((result as arrow.Int).bitWidth).toBe(32);
|
||||
expect((result as arrow.Int).isSigned).toBe(true);
|
||||
|
||||
const floatType = {
|
||||
typeId: 3, // Type.Float = 3
|
||||
precision: 2,
|
||||
toString: () => "Float",
|
||||
isFloat: true,
|
||||
isFixedWidth: true,
|
||||
};
|
||||
|
||||
const floatResult = sanitizeType(floatType);
|
||||
expect(floatResult).toBeInstanceOf(arrow.DataType);
|
||||
expect(floatResult.typeId).toBe(arrow.Type.Float);
|
||||
|
||||
const floatResult2 = sanitizeType({ ...floatType, typeId: () => 3 });
|
||||
expect(floatResult2).toBeInstanceOf(arrow.DataType);
|
||||
expect(floatResult2.typeId).toBe(arrow.Type.Float);
|
||||
});
|
||||
|
||||
const allTypeNameTestCases = [
|
||||
["null", new arrow.Null()],
|
||||
["binary", new arrow.Binary()],
|
||||
["utf8", new arrow.Utf8()],
|
||||
["bool", new arrow.Bool()],
|
||||
["int8", new arrow.Int8()],
|
||||
["int16", new arrow.Int16()],
|
||||
["int32", new arrow.Int32()],
|
||||
["int64", new arrow.Int64()],
|
||||
["uint8", new arrow.Uint8()],
|
||||
["uint16", new arrow.Uint16()],
|
||||
["uint32", new arrow.Uint32()],
|
||||
["uint64", new arrow.Uint64()],
|
||||
["float16", new arrow.Float16()],
|
||||
["float32", new arrow.Float32()],
|
||||
["float64", new arrow.Float64()],
|
||||
["datemillisecond", new arrow.DateMillisecond()],
|
||||
["dateday", new arrow.DateDay()],
|
||||
["timenanosecond", new arrow.TimeNanosecond()],
|
||||
["timemicrosecond", new arrow.TimeMicrosecond()],
|
||||
["timemillisecond", new arrow.TimeMillisecond()],
|
||||
["timesecond", new arrow.TimeSecond()],
|
||||
["intervaldaytime", new arrow.IntervalDayTime()],
|
||||
["intervalyearmonth", new arrow.IntervalYearMonth()],
|
||||
["durationnanosecond", new arrow.DurationNanosecond()],
|
||||
["durationmicrosecond", new arrow.DurationMicrosecond()],
|
||||
["durationmillisecond", new arrow.DurationMillisecond()],
|
||||
["durationsecond", new arrow.DurationSecond()],
|
||||
] as const;
|
||||
|
||||
it.each(allTypeNameTestCases)(
|
||||
'should map type name "%s" to %s',
|
||||
function (name, expected) {
|
||||
const result = sanitizeType(name);
|
||||
expect(result).toBeInstanceOf(expected.constructor);
|
||||
},
|
||||
);
|
||||
|
||||
const caseVariationTestCases = [
|
||||
["NULL", new arrow.Null()],
|
||||
["Utf8", new arrow.Utf8()],
|
||||
["FLOAT32", new arrow.Float32()],
|
||||
["DaTedAy", new arrow.DateDay()],
|
||||
] as const;
|
||||
|
||||
it.each(caseVariationTestCases)(
|
||||
'should be case insensitive for type name "%s" mapped to %s',
|
||||
function (name, expected) {
|
||||
const result = sanitizeType(name);
|
||||
expect(result).toBeInstanceOf(expected.constructor);
|
||||
},
|
||||
);
|
||||
|
||||
it("should throw error for unrecognized type name", function () {
|
||||
expect(() => sanitizeType("invalid_type")).toThrow(
|
||||
"Unrecognized type name in schema: invalid_type",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("sanitizeField function", function () {
|
||||
it("should handle field with string type name", function () {
|
||||
const field = sanitizeField({
|
||||
name: "string_field",
|
||||
type: "utf8",
|
||||
nullable: true,
|
||||
metadata: new Map([["key", "value"]]),
|
||||
});
|
||||
|
||||
expect(field).toBeInstanceOf(arrow.Field);
|
||||
expect(field.name).toBe("string_field");
|
||||
expect(field.type).toBeInstanceOf(arrow.Utf8);
|
||||
expect(field.nullable).toBe(true);
|
||||
expect(field.metadata?.get("key")).toBe("value");
|
||||
});
|
||||
|
||||
it("should handle field with type object", function () {
|
||||
const floatType = {
|
||||
typeId: 3, // Float
|
||||
precision: 32,
|
||||
};
|
||||
|
||||
const field = sanitizeField({
|
||||
name: "float_field",
|
||||
type: floatType,
|
||||
nullable: false,
|
||||
});
|
||||
|
||||
expect(field).toBeInstanceOf(arrow.Field);
|
||||
expect(field.name).toBe("float_field");
|
||||
expect(field.type).toBeInstanceOf(arrow.DataType);
|
||||
expect(field.type.typeId).toBe(arrow.Type.Float);
|
||||
expect((field.type as arrow.Float64).precision).toBe(32);
|
||||
expect(field.nullable).toBe(false);
|
||||
});
|
||||
|
||||
it("should handle field with direct Type instance", function () {
|
||||
const field = sanitizeField({
|
||||
name: "bool_field",
|
||||
type: new arrow.Bool(),
|
||||
nullable: true,
|
||||
});
|
||||
|
||||
expect(field).toBeInstanceOf(arrow.Field);
|
||||
expect(field.name).toBe("bool_field");
|
||||
expect(field.type).toBeInstanceOf(arrow.Bool);
|
||||
expect(field.nullable).toBe(true);
|
||||
});
|
||||
|
||||
it("should throw error for invalid field object", function () {
|
||||
expect(() =>
|
||||
sanitizeField({
|
||||
type: "int32",
|
||||
nullable: true,
|
||||
}),
|
||||
).toThrow(
|
||||
"The field passed in is missing a `type`/`name`/`nullable` property",
|
||||
);
|
||||
|
||||
// Invalid type
|
||||
expect(() =>
|
||||
sanitizeField({
|
||||
name: "invalid",
|
||||
type: { invalid: true },
|
||||
nullable: true,
|
||||
}),
|
||||
).toThrow("Expected a Type to have a typeId property");
|
||||
|
||||
// Invalid nullable
|
||||
expect(() =>
|
||||
sanitizeField({
|
||||
name: "invalid_nullable",
|
||||
type: "int32",
|
||||
nullable: "not a boolean",
|
||||
}),
|
||||
).toThrow("The field passed in had a non-boolean `nullable` property");
|
||||
});
|
||||
|
||||
it("should report error for invalid type name", function () {
|
||||
expect(() =>
|
||||
sanitizeField({
|
||||
name: "invalid_field",
|
||||
type: "invalid_type",
|
||||
nullable: true,
|
||||
}),
|
||||
).toThrow(
|
||||
"Unable to sanitize type for field: invalid_field due to error: Error: Unrecognized type name in schema: invalid_type",
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -10,7 +10,13 @@ import * as arrow16 from "apache-arrow-16";
|
||||
import * as arrow17 from "apache-arrow-17";
|
||||
import * as arrow18 from "apache-arrow-18";
|
||||
|
||||
import { MatchQuery, PhraseQuery, Table, connect } from "../lancedb";
|
||||
import {
|
||||
Connection,
|
||||
MatchQuery,
|
||||
PhraseQuery,
|
||||
Table,
|
||||
connect,
|
||||
} from "../lancedb";
|
||||
import {
|
||||
Table as ArrowTable,
|
||||
Field,
|
||||
@@ -21,6 +27,8 @@ import {
|
||||
Int64,
|
||||
List,
|
||||
Schema,
|
||||
SchemaLike,
|
||||
Type,
|
||||
Uint8,
|
||||
Utf8,
|
||||
makeArrowTable,
|
||||
@@ -39,7 +47,6 @@ import {
|
||||
Operator,
|
||||
instanceOfFullTextQuery,
|
||||
} from "../lancedb/query";
|
||||
import exp = require("constants");
|
||||
|
||||
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
"Given a table",
|
||||
@@ -212,8 +219,7 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
},
|
||||
);
|
||||
|
||||
// TODO: https://github.com/lancedb/lancedb/issues/1832
|
||||
it.skip("should be able to omit nullable fields", async () => {
|
||||
it("should be able to omit nullable fields", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const schema = new arrow.Schema([
|
||||
new arrow.Field(
|
||||
@@ -237,23 +243,36 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
await table.add([data3]);
|
||||
|
||||
let res = await table.query().limit(10).toArray();
|
||||
const resVector = res.map((r) => r.get("vector").toArray());
|
||||
const resVector = res.map((r) =>
|
||||
r.vector ? Array.from(r.vector) : null,
|
||||
);
|
||||
expect(resVector).toEqual([null, data2.vector, data3.vector]);
|
||||
const resItem = res.map((r) => r.get("item").toArray());
|
||||
const resItem = res.map((r) => r.item);
|
||||
expect(resItem).toEqual(["foo", null, "bar"]);
|
||||
const resPrice = res.map((r) => r.get("price").toArray());
|
||||
const resPrice = res.map((r) => r.price);
|
||||
expect(resPrice).toEqual([10.0, 2.0, 3.0]);
|
||||
|
||||
const data4 = { item: "foo" };
|
||||
// We can't omit a column if it's not nullable
|
||||
await expect(table.add([data4])).rejects.toThrow("Invalid user input");
|
||||
await expect(table.add([data4])).rejects.toThrow(
|
||||
"Append with different schema",
|
||||
);
|
||||
|
||||
// But we can alter columns to make them nullable
|
||||
await table.alterColumns([{ path: "price", nullable: true }]);
|
||||
await table.add([data4]);
|
||||
|
||||
res = (await table.query().limit(10).toArray()).map((r) => r.toJSON());
|
||||
expect(res).toEqual([data1, data2, data3, data4]);
|
||||
res = (await table.query().limit(10).toArray()).map((r) => ({
|
||||
...r.toJSON(),
|
||||
vector: r.vector ? Array.from(r.vector) : null,
|
||||
}));
|
||||
// Rust fills missing nullable fields with null
|
||||
expect(res).toEqual([
|
||||
{ ...data1, vector: null },
|
||||
{ ...data2, item: null },
|
||||
data3,
|
||||
{ ...data4, price: null, vector: null },
|
||||
]);
|
||||
});
|
||||
|
||||
it("should be able to insert nullable data for non-nullable fields", async () => {
|
||||
@@ -331,6 +350,43 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
const table = await db.createTable("my_table", data);
|
||||
expect(await table.countRows()).toEqual(2);
|
||||
});
|
||||
|
||||
it("should allow undefined and omitted nullable vector fields", async () => {
|
||||
// Test for the bug: can't pass undefined or omit vector column
|
||||
const db = await connect("memory://");
|
||||
const schema = new arrow.Schema([
|
||||
new arrow.Field("id", new arrow.Int32(), true),
|
||||
new arrow.Field(
|
||||
"vector",
|
||||
new arrow.FixedSizeList(
|
||||
32,
|
||||
new arrow.Field("item", new arrow.Float32(), true),
|
||||
),
|
||||
true, // nullable = true
|
||||
),
|
||||
]);
|
||||
const table = await db.createEmptyTable("test_table", schema);
|
||||
|
||||
// Should not throw error for undefined value
|
||||
await table.add([{ id: 0, vector: undefined }]);
|
||||
|
||||
// Should not throw error for omitted field
|
||||
await table.add([{ id: 1 }]);
|
||||
|
||||
// Should still work for null
|
||||
await table.add([{ id: 2, vector: null }]);
|
||||
|
||||
// Should still work for actual vector
|
||||
const testVector = new Array(32).fill(0.5);
|
||||
await table.add([{ id: 3, vector: testVector }]);
|
||||
expect(await table.countRows()).toEqual(4);
|
||||
|
||||
const res = await table.query().limit(10).toArray();
|
||||
const resVector = res.map((r) =>
|
||||
r.vector ? Array.from(r.vector) : null,
|
||||
);
|
||||
expect(resVector).toEqual([null, null, null, testVector]);
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
@@ -488,6 +544,32 @@ describe("merge insert", () => {
|
||||
.execute(newData, { timeoutMs: 0 }),
|
||||
).rejects.toThrow("merge insert timed out");
|
||||
});
|
||||
|
||||
test("useIndex", async () => {
|
||||
const newData = [
|
||||
{ a: 2, b: "x" },
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
|
||||
// Test with useIndex(true) - should work fine
|
||||
const result1 = await table
|
||||
.mergeInsert("a")
|
||||
.whenNotMatchedInsertAll()
|
||||
.useIndex(true)
|
||||
.execute(newData);
|
||||
|
||||
expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted
|
||||
|
||||
// Test with useIndex(false) - should also work fine
|
||||
const newData2 = [{ a: 5, b: "w" }];
|
||||
const result2 = await table
|
||||
.mergeInsert("a")
|
||||
.whenNotMatchedInsertAll()
|
||||
.useIndex(false)
|
||||
.execute(newData2);
|
||||
|
||||
expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted
|
||||
});
|
||||
});
|
||||
|
||||
describe("When creating an index", () => {
|
||||
@@ -779,6 +861,15 @@ describe("When creating an index", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("should be able to create IVF_RQ", async () => {
|
||||
await tbl.createIndex("vec", {
|
||||
config: Index.ivfRq({
|
||||
numPartitions: 10,
|
||||
numBits: 1,
|
||||
}),
|
||||
});
|
||||
});
|
||||
|
||||
it("should allow me to replace (or not) an existing index", async () => {
|
||||
await tbl.createIndex("id");
|
||||
// Default is replace=true
|
||||
@@ -1429,7 +1520,9 @@ describe("when optimizing a dataset", () => {
|
||||
|
||||
it("delete unverified", async () => {
|
||||
const version = await table.version();
|
||||
const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${version - 1}.manifest`;
|
||||
const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${
|
||||
version - 1
|
||||
}.manifest`;
|
||||
fs.rmSync(versionFile);
|
||||
|
||||
let stats = await table.optimize({ deleteUnverified: false });
|
||||
@@ -1943,3 +2036,52 @@ describe("column name options", () => {
|
||||
expect(results2.length).toBe(10);
|
||||
});
|
||||
});
|
||||
|
||||
describe("when creating an empty table", () => {
|
||||
let con: Connection;
|
||||
beforeEach(async () => {
|
||||
const tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
con = await connect(tmpDir.name);
|
||||
});
|
||||
afterEach(() => {
|
||||
con.close();
|
||||
});
|
||||
|
||||
it("can create an empty table from an arrow Schema", async () => {
|
||||
const schema = new Schema([
|
||||
new Field("id", new Int64()),
|
||||
new Field("vector", new Float64()),
|
||||
]);
|
||||
const table = await con.createEmptyTable("test", schema);
|
||||
const actualSchema = await table.schema();
|
||||
expect(actualSchema.fields[0].type.typeId).toBe(Type.Int);
|
||||
expect((actualSchema.fields[0].type as Int64).bitWidth).toBe(64);
|
||||
expect(actualSchema.fields[1].type.typeId).toBe(Type.Float);
|
||||
expect((actualSchema.fields[1].type as Float64).precision).toBe(2);
|
||||
});
|
||||
|
||||
it("can create an empty table from schema that specifies field types by name", async () => {
|
||||
const schemaLike = {
|
||||
fields: [
|
||||
{
|
||||
name: "id",
|
||||
type: "int64",
|
||||
nullable: true,
|
||||
},
|
||||
{
|
||||
name: "vector",
|
||||
type: "float64",
|
||||
nullable: true,
|
||||
},
|
||||
],
|
||||
metadata: new Map(),
|
||||
names: ["id", "vector"],
|
||||
} satisfies SchemaLike;
|
||||
const table = await con.createEmptyTable("test", schemaLike);
|
||||
const actualSchema = await table.schema();
|
||||
expect(actualSchema.fields[0].type.typeId).toBe(Type.Int);
|
||||
expect((actualSchema.fields[0].type as Int64).bitWidth).toBe(64);
|
||||
expect(actualSchema.fields[1].type.typeId).toBe(Type.Float);
|
||||
expect((actualSchema.fields[1].type as Float64).precision).toBe(2);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
"noUnreachableSuper": "error",
|
||||
"noUnsafeFinally": "error",
|
||||
"noUnsafeOptionalChaining": "error",
|
||||
"noUnusedImports": "error",
|
||||
"noUnusedLabels": "error",
|
||||
"noUnusedVariables": "warn",
|
||||
"useIsNan": "error",
|
||||
|
||||
@@ -41,7 +41,6 @@ import {
|
||||
vectorFromArray as badVectorFromArray,
|
||||
makeBuilder,
|
||||
makeData,
|
||||
makeTable,
|
||||
} from "apache-arrow";
|
||||
import { Buffers } from "apache-arrow/data";
|
||||
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
||||
@@ -74,7 +73,7 @@ export type FieldLike =
|
||||
| {
|
||||
type: string;
|
||||
name: string;
|
||||
nullable?: boolean;
|
||||
nullable: boolean;
|
||||
metadata?: Map<string, string>;
|
||||
};
|
||||
|
||||
@@ -279,7 +278,7 @@ export class MakeArrowTableOptions {
|
||||
}
|
||||
|
||||
/**
|
||||
* An enhanced version of the {@link makeTable} function from Apache Arrow
|
||||
* An enhanced version of the apache-arrow makeTable function from Apache Arrow
|
||||
* that supports nested fields and embeddings columns.
|
||||
*
|
||||
* (typically you do not need to call this function. It will be called automatically
|
||||
@@ -1286,19 +1285,36 @@ function validateSchemaEmbeddings(
|
||||
if (isFixedSizeList(field.type)) {
|
||||
field = sanitizeField(field);
|
||||
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
||||
// Check if there's an embedding function registered for this field
|
||||
let hasEmbeddingFunction = false;
|
||||
|
||||
// Check schema metadata for embedding functions
|
||||
if (schema.metadata.has("embedding_functions")) {
|
||||
const embeddings = JSON.parse(
|
||||
schema.metadata.get("embedding_functions")!,
|
||||
);
|
||||
if (
|
||||
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
|
||||
embeddings.find((f: any) => f["vectorColumn"] === field.name) ===
|
||||
undefined
|
||||
) {
|
||||
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
|
||||
if (embeddings.find((f: any) => f["vectorColumn"] === field.name)) {
|
||||
hasEmbeddingFunction = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check passed embedding function parameter
|
||||
if (embeddings && embeddings.vectorColumn === field.name) {
|
||||
hasEmbeddingFunction = true;
|
||||
}
|
||||
|
||||
// If the field is nullable AND there's no embedding function, allow undefined/omitted values
|
||||
if (field.nullable && !hasEmbeddingFunction) {
|
||||
fields.push(field);
|
||||
} else {
|
||||
// Either not nullable OR has embedding function - require explicit values
|
||||
if (hasEmbeddingFunction) {
|
||||
// Don't add to missingEmbeddingFields since this is expected to be filled by embedding function
|
||||
fields.push(field);
|
||||
} else {
|
||||
missingEmbeddingFields.push(field);
|
||||
}
|
||||
} else {
|
||||
missingEmbeddingFields.push(field);
|
||||
}
|
||||
} else {
|
||||
fields.push(field);
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
import {
|
||||
Data,
|
||||
Schema,
|
||||
SchemaLike,
|
||||
TableLike,
|
||||
fromTableToStreamBuffer,
|
||||
|
||||
@@ -43,6 +43,10 @@ export {
|
||||
DeleteResult,
|
||||
DropColumnsResult,
|
||||
UpdateResult,
|
||||
SplitRandomOptions,
|
||||
SplitHashOptions,
|
||||
SplitSequentialOptions,
|
||||
ShuffleOptions,
|
||||
} from "./native.js";
|
||||
|
||||
export {
|
||||
@@ -85,6 +89,7 @@ export {
|
||||
Index,
|
||||
IndexOptions,
|
||||
IvfPqOptions,
|
||||
IvfRqOptions,
|
||||
IvfFlatOptions,
|
||||
HnswPqOptions,
|
||||
HnswSqOptions,
|
||||
@@ -110,6 +115,7 @@ export {
|
||||
export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";
|
||||
|
||||
export * as embedding from "./embedding";
|
||||
export { permutationBuilder, PermutationBuilder } from "./permutation";
|
||||
export * as rerankers from "./rerankers";
|
||||
export {
|
||||
SchemaLike,
|
||||
|
||||
@@ -112,6 +112,77 @@ export interface IvfPqOptions {
|
||||
sampleRate?: number;
|
||||
}
|
||||
|
||||
export interface IvfRqOptions {
|
||||
/**
|
||||
* The number of IVF partitions to create.
|
||||
*
|
||||
* This value should generally scale with the number of rows in the dataset.
|
||||
* By default the number of partitions is the square root of the number of
|
||||
* rows.
|
||||
*
|
||||
* If this value is too large then the first part of the search (picking the
|
||||
* right partition) will be slow. If this value is too small then the second
|
||||
* part of the search (searching within a partition) will be slow.
|
||||
*/
|
||||
numPartitions?: number;
|
||||
|
||||
/**
|
||||
* Number of bits per dimension for residual quantization.
|
||||
*
|
||||
* This value controls how much each residual component is compressed. The more
|
||||
* bits, the more accurate the index will be but the slower search. Typical values
|
||||
* are small integers; the default is 1 bit per dimension.
|
||||
*/
|
||||
numBits?: number;
|
||||
|
||||
/**
|
||||
* Distance type to use to build the index.
|
||||
*
|
||||
* Default value is "l2".
|
||||
*
|
||||
* This is used when training the index to calculate the IVF partitions
|
||||
* (vectors are grouped in partitions with similar vectors according to this
|
||||
* distance type) and during quantization.
|
||||
*
|
||||
* The distance type used to train an index MUST match the distance type used
|
||||
* to search the index. Failure to do so will yield inaccurate results.
|
||||
*
|
||||
* The following distance types are available:
|
||||
*
|
||||
* "l2" - Euclidean distance.
|
||||
* "cosine" - Cosine distance.
|
||||
* "dot" - Dot product.
|
||||
*/
|
||||
distanceType?: "l2" | "cosine" | "dot";
|
||||
|
||||
/**
|
||||
* Max iterations to train IVF kmeans.
|
||||
*
|
||||
* When training an IVF index we use kmeans to calculate the partitions. This parameter
|
||||
* controls how many iterations of kmeans to run.
|
||||
*
|
||||
* The default value is 50.
|
||||
*/
|
||||
maxIterations?: number;
|
||||
|
||||
/**
|
||||
* The number of vectors, per partition, to sample when training IVF kmeans.
|
||||
*
|
||||
* When an IVF index is trained, we need to calculate partitions. These are groups
|
||||
* of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
||||
*
|
||||
* Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
||||
* random sample of the data. This parameter controls the size of the sample. The total
|
||||
* number of vectors used to train the index is `sample_rate * num_partitions`.
|
||||
*
|
||||
* Increasing this value might improve the quality of the index but in most cases the
|
||||
* default should be sufficient.
|
||||
*
|
||||
* The default value is 256.
|
||||
*/
|
||||
sampleRate?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options to create an `HNSW_PQ` index
|
||||
*/
|
||||
@@ -523,6 +594,35 @@ export class Index {
|
||||
options?.distanceType,
|
||||
options?.numPartitions,
|
||||
options?.numSubVectors,
|
||||
options?.numBits,
|
||||
options?.maxIterations,
|
||||
options?.sampleRate,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an IvfRq index
|
||||
*
|
||||
* IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
|
||||
* and organizes them into IVF partitions.
|
||||
*
|
||||
* The compression scheme is called RabitQ quantization. Each dimension is quantized into a small number of bits.
|
||||
* The parameters `num_bits` and `num_partitions` control this process, providing a tradeoff
|
||||
* between index size (and thus search speed) and index accuracy.
|
||||
*
|
||||
* The partitioning process is called IVF and the `num_partitions` parameter controls how
|
||||
* many groups to create.
|
||||
*
|
||||
* Note that training an IVF RQ index on a large dataset is a slow operation and
|
||||
* currently is also a memory intensive operation.
|
||||
*/
|
||||
static ivfRq(options?: Partial<IvfRqOptions>) {
|
||||
return new Index(
|
||||
LanceDbIndex.ivfRq(
|
||||
options?.distanceType,
|
||||
options?.numPartitions,
|
||||
options?.numBits,
|
||||
options?.maxIterations,
|
||||
options?.sampleRate,
|
||||
),
|
||||
|
||||
@@ -70,6 +70,23 @@ export class MergeInsertBuilder {
|
||||
this.#schema,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Controls whether to use indexes for the merge operation.
|
||||
*
|
||||
* When set to `true` (the default), the operation will use an index if available
|
||||
* on the join key for improved performance. When set to `false`, it forces a full
|
||||
* table scan even if an index exists. This can be useful for benchmarking or when
|
||||
* the query optimizer chooses a suboptimal path.
|
||||
*
|
||||
* @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
|
||||
*/
|
||||
useIndex(useIndex: boolean): MergeInsertBuilder {
|
||||
return new MergeInsertBuilder(
|
||||
this.#native.useIndex(useIndex),
|
||||
this.#schema,
|
||||
);
|
||||
}
|
||||
/**
|
||||
* Executes the merge insert operation
|
||||
*
|
||||
|
||||
183
nodejs/lancedb/permutation.ts
Normal file
183
nodejs/lancedb/permutation.ts
Normal file
@@ -0,0 +1,183 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import {
|
||||
PermutationBuilder as NativePermutationBuilder,
|
||||
Table as NativeTable,
|
||||
ShuffleOptions,
|
||||
SplitHashOptions,
|
||||
SplitRandomOptions,
|
||||
SplitSequentialOptions,
|
||||
permutationBuilder as nativePermutationBuilder,
|
||||
} from "./native.js";
|
||||
import { LocalTable, Table } from "./table";
|
||||
|
||||
/**
|
||||
* A PermutationBuilder for creating data permutations with splits, shuffling, and filtering.
|
||||
*
|
||||
* This class provides a TypeScript wrapper around the native Rust PermutationBuilder,
|
||||
* offering methods to configure data splits, shuffling, and filtering before executing
|
||||
* the permutation to create a new table.
|
||||
*/
|
||||
export class PermutationBuilder {
|
||||
private inner: NativePermutationBuilder;
|
||||
|
||||
/**
|
||||
* @hidden
|
||||
*/
|
||||
constructor(inner: NativePermutationBuilder) {
|
||||
this.inner = inner;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure random splits for the permutation.
|
||||
*
|
||||
* @param options - Configuration for random splitting
|
||||
* @returns A new PermutationBuilder instance
|
||||
* @example
|
||||
* ```ts
|
||||
* // Split by ratios
|
||||
* builder.splitRandom({ ratios: [0.7, 0.3], seed: 42 });
|
||||
*
|
||||
* // Split by counts
|
||||
* builder.splitRandom({ counts: [1000, 500], seed: 42 });
|
||||
*
|
||||
* // Split with fixed size
|
||||
* builder.splitRandom({ fixed: 100, seed: 42 });
|
||||
* ```
|
||||
*/
|
||||
splitRandom(options: SplitRandomOptions): PermutationBuilder {
|
||||
const newInner = this.inner.splitRandom(options);
|
||||
return new PermutationBuilder(newInner);
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure hash-based splits for the permutation.
|
||||
*
|
||||
* @param options - Configuration for hash-based splitting
|
||||
* @returns A new PermutationBuilder instance
|
||||
* @example
|
||||
* ```ts
|
||||
* builder.splitHash({
|
||||
* columns: ["user_id"],
|
||||
* splitWeights: [70, 30],
|
||||
* discardWeight: 0
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
splitHash(options: SplitHashOptions): PermutationBuilder {
|
||||
const newInner = this.inner.splitHash(options);
|
||||
return new PermutationBuilder(newInner);
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure sequential splits for the permutation.
|
||||
*
|
||||
* @param options - Configuration for sequential splitting
|
||||
* @returns A new PermutationBuilder instance
|
||||
* @example
|
||||
* ```ts
|
||||
* // Split by ratios
|
||||
* builder.splitSequential({ ratios: [0.8, 0.2] });
|
||||
*
|
||||
* // Split by counts
|
||||
* builder.splitSequential({ counts: [800, 200] });
|
||||
*
|
||||
* // Split with fixed size
|
||||
* builder.splitSequential({ fixed: 1000 });
|
||||
* ```
|
||||
*/
|
||||
splitSequential(options: SplitSequentialOptions): PermutationBuilder {
|
||||
const newInner = this.inner.splitSequential(options);
|
||||
return new PermutationBuilder(newInner);
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure calculated splits for the permutation.
|
||||
*
|
||||
* @param calculation - SQL expression for calculating splits
|
||||
* @returns A new PermutationBuilder instance
|
||||
* @example
|
||||
* ```ts
|
||||
* builder.splitCalculated("user_id % 3");
|
||||
* ```
|
||||
*/
|
||||
splitCalculated(calculation: string): PermutationBuilder {
|
||||
const newInner = this.inner.splitCalculated(calculation);
|
||||
return new PermutationBuilder(newInner);
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure shuffling for the permutation.
|
||||
*
|
||||
* @param options - Configuration for shuffling
|
||||
* @returns A new PermutationBuilder instance
|
||||
* @example
|
||||
* ```ts
|
||||
* // Basic shuffle
|
||||
* builder.shuffle({ seed: 42 });
|
||||
*
|
||||
* // Shuffle with clump size
|
||||
* builder.shuffle({ seed: 42, clumpSize: 10 });
|
||||
* ```
|
||||
*/
|
||||
shuffle(options: ShuffleOptions): PermutationBuilder {
|
||||
const newInner = this.inner.shuffle(options);
|
||||
return new PermutationBuilder(newInner);
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure filtering for the permutation.
|
||||
*
|
||||
* @param filter - SQL filter expression
|
||||
* @returns A new PermutationBuilder instance
|
||||
* @example
|
||||
* ```ts
|
||||
* builder.filter("age > 18 AND status = 'active'");
|
||||
* ```
|
||||
*/
|
||||
filter(filter: string): PermutationBuilder {
|
||||
const newInner = this.inner.filter(filter);
|
||||
return new PermutationBuilder(newInner);
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute the permutation and create the destination table.
|
||||
*
|
||||
* @returns A Promise that resolves to the new Table instance
|
||||
* @example
|
||||
* ```ts
|
||||
* const permutationTable = await builder.execute();
|
||||
* console.log(`Created table: ${permutationTable.name}`);
|
||||
* ```
|
||||
*/
|
||||
async execute(): Promise<Table> {
|
||||
const nativeTable: NativeTable = await this.inner.execute();
|
||||
return new LocalTable(nativeTable);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a permutation builder for the given table.
|
||||
*
|
||||
* @param table - The source table to create a permutation from
|
||||
* @returns A PermutationBuilder instance
|
||||
* @example
|
||||
* ```ts
|
||||
* const builder = permutationBuilder(sourceTable, "training_data")
|
||||
* .splitRandom({ ratios: [0.8, 0.2], seed: 42 })
|
||||
* .shuffle({ seed: 123 });
|
||||
*
|
||||
* const trainingTable = await builder.execute();
|
||||
* ```
|
||||
*/
|
||||
export function permutationBuilder(table: Table): PermutationBuilder {
|
||||
// Extract the inner native table from the TypeScript wrapper
|
||||
const localTable = table as LocalTable;
|
||||
// Access inner through type assertion since it's private
|
||||
const nativeBuilder = nativePermutationBuilder(
|
||||
// biome-ignore lint/suspicious/noExplicitAny: need access to private variable
|
||||
(localTable as any).inner,
|
||||
);
|
||||
return new PermutationBuilder(nativeBuilder);
|
||||
}
|
||||
@@ -326,6 +326,25 @@ export class QueryBase<
|
||||
return this.inner.analyzePlan();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the schema of the output that will be returned by this query.
|
||||
*
|
||||
* This can be used to inspect the types and names of the columns that will be
|
||||
* returned by the query before executing it.
|
||||
*
|
||||
* @returns An Arrow Schema describing the output columns.
|
||||
*/
|
||||
async outputSchema(): Promise<import("./arrow").Schema> {
|
||||
let schemaBuffer: Buffer;
|
||||
if (this.inner instanceof Promise) {
|
||||
schemaBuffer = await this.inner.then((inner) => inner.outputSchema());
|
||||
} else {
|
||||
schemaBuffer = await this.inner.outputSchema();
|
||||
}
|
||||
const schema = tableFromIPC(schemaBuffer).schema;
|
||||
return schema;
|
||||
}
|
||||
}
|
||||
|
||||
export class StandardQueryBase<
|
||||
|
||||
@@ -326,6 +326,9 @@ export function sanitizeDictionary(typeLike: object) {
|
||||
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
export function sanitizeType(typeLike: unknown): DataType<any> {
|
||||
if (typeof typeLike === "string") {
|
||||
return dataTypeFromName(typeLike);
|
||||
}
|
||||
if (typeof typeLike !== "object" || typeLike === null) {
|
||||
throw Error("Expected a Type but object was null/undefined");
|
||||
}
|
||||
@@ -447,7 +450,7 @@ export function sanitizeType(typeLike: unknown): DataType<any> {
|
||||
case Type.DurationSecond:
|
||||
return new DurationSecond();
|
||||
default:
|
||||
throw new Error("Unrecoginized type id in schema: " + typeId);
|
||||
throw new Error("Unrecognized type id in schema: " + typeId);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -467,7 +470,15 @@ export function sanitizeField(fieldLike: unknown): Field {
|
||||
"The field passed in is missing a `type`/`name`/`nullable` property",
|
||||
);
|
||||
}
|
||||
const type = sanitizeType(fieldLike.type);
|
||||
let type: DataType;
|
||||
try {
|
||||
type = sanitizeType(fieldLike.type);
|
||||
} catch (error: unknown) {
|
||||
throw Error(
|
||||
`Unable to sanitize type for field: ${fieldLike.name} due to error: ${error}`,
|
||||
{ cause: error },
|
||||
);
|
||||
}
|
||||
const name = fieldLike.name;
|
||||
if (!(typeof name === "string")) {
|
||||
throw Error("The field passed in had a non-string `name` property");
|
||||
@@ -581,3 +592,46 @@ function sanitizeData(
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
const constructorsByTypeName = {
|
||||
null: () => new Null(),
|
||||
binary: () => new Binary(),
|
||||
utf8: () => new Utf8(),
|
||||
bool: () => new Bool(),
|
||||
int8: () => new Int8(),
|
||||
int16: () => new Int16(),
|
||||
int32: () => new Int32(),
|
||||
int64: () => new Int64(),
|
||||
uint8: () => new Uint8(),
|
||||
uint16: () => new Uint16(),
|
||||
uint32: () => new Uint32(),
|
||||
uint64: () => new Uint64(),
|
||||
float16: () => new Float16(),
|
||||
float32: () => new Float32(),
|
||||
float64: () => new Float64(),
|
||||
datemillisecond: () => new DateMillisecond(),
|
||||
dateday: () => new DateDay(),
|
||||
timenanosecond: () => new TimeNanosecond(),
|
||||
timemicrosecond: () => new TimeMicrosecond(),
|
||||
timemillisecond: () => new TimeMillisecond(),
|
||||
timesecond: () => new TimeSecond(),
|
||||
intervaldaytime: () => new IntervalDayTime(),
|
||||
intervalyearmonth: () => new IntervalYearMonth(),
|
||||
durationnanosecond: () => new DurationNanosecond(),
|
||||
durationmicrosecond: () => new DurationMicrosecond(),
|
||||
durationmillisecond: () => new DurationMillisecond(),
|
||||
durationsecond: () => new DurationSecond(),
|
||||
} as const;
|
||||
|
||||
type MappableTypeName = keyof typeof constructorsByTypeName;
|
||||
|
||||
export function dataTypeFromName(typeName: string): DataType {
|
||||
const normalizedTypeName = typeName.toLowerCase() as MappableTypeName;
|
||||
const _constructor = constructorsByTypeName[normalizedTypeName];
|
||||
|
||||
if (!_constructor) {
|
||||
throw new Error("Unrecognized type name in schema: " + typeName);
|
||||
}
|
||||
|
||||
return _constructor();
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.1-beta.3",
|
||||
"version": "0.22.3-beta.2",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.1-beta.3",
|
||||
"version": "0.22.3-beta.2",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.22.1-beta.4",
|
||||
"version": "0.22.3-beta.2",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::sync::Mutex;
|
||||
use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder};
|
||||
use lancedb::index::vector::{
|
||||
IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
|
||||
IvfRqIndexBuilder,
|
||||
};
|
||||
use lancedb::index::Index as LanceDbIndex;
|
||||
use napi_derive::napi;
|
||||
@@ -65,6 +66,36 @@ impl Index {
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn ivf_rq(
|
||||
distance_type: Option<String>,
|
||||
num_partitions: Option<u32>,
|
||||
num_bits: Option<u32>,
|
||||
max_iterations: Option<u32>,
|
||||
sample_rate: Option<u32>,
|
||||
) -> napi::Result<Self> {
|
||||
let mut ivf_rq_builder = IvfRqIndexBuilder::default();
|
||||
if let Some(distance_type) = distance_type {
|
||||
let distance_type = parse_distance_type(distance_type)?;
|
||||
ivf_rq_builder = ivf_rq_builder.distance_type(distance_type);
|
||||
}
|
||||
if let Some(num_partitions) = num_partitions {
|
||||
ivf_rq_builder = ivf_rq_builder.num_partitions(num_partitions);
|
||||
}
|
||||
if let Some(num_bits) = num_bits {
|
||||
ivf_rq_builder = ivf_rq_builder.num_bits(num_bits);
|
||||
}
|
||||
if let Some(max_iterations) = max_iterations {
|
||||
ivf_rq_builder = ivf_rq_builder.max_iterations(max_iterations);
|
||||
}
|
||||
if let Some(sample_rate) = sample_rate {
|
||||
ivf_rq_builder = ivf_rq_builder.sample_rate(sample_rate);
|
||||
}
|
||||
Ok(Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::IvfRq(ivf_rq_builder))),
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn ivf_flat(
|
||||
distance_type: Option<String>,
|
||||
|
||||
@@ -12,6 +12,7 @@ mod header;
|
||||
mod index;
|
||||
mod iterator;
|
||||
pub mod merge;
|
||||
pub mod permutation;
|
||||
mod query;
|
||||
pub mod remote;
|
||||
mod rerankers;
|
||||
|
||||
@@ -43,6 +43,13 @@ impl NativeMergeInsertBuilder {
|
||||
self.inner.timeout(Duration::from_millis(timeout as u64));
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn use_index(&self, use_index: bool) -> Self {
|
||||
let mut this = self.clone();
|
||||
this.inner.use_index(use_index);
|
||||
this
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
|
||||
let data = ipc_file_to_batches(buf.to_vec())
|
||||
|
||||
214
nodejs/src/permutation.rs
Normal file
214
nodejs/src/permutation.rs
Normal file
@@ -0,0 +1,214 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use crate::{error::NapiErrorExt, table::Table};
|
||||
use lancedb::dataloader::{
|
||||
permutation::builder::{PermutationBuilder as LancePermutationBuilder, ShuffleStrategy},
|
||||
permutation::split::{SplitSizes, SplitStrategy},
|
||||
};
|
||||
use napi_derive::napi;
|
||||
|
||||
#[napi(object)]
|
||||
pub struct SplitRandomOptions {
|
||||
pub ratios: Option<Vec<f64>>,
|
||||
pub counts: Option<Vec<i64>>,
|
||||
pub fixed: Option<i64>,
|
||||
pub seed: Option<i64>,
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct SplitHashOptions {
|
||||
pub columns: Vec<String>,
|
||||
pub split_weights: Vec<i64>,
|
||||
pub discard_weight: Option<i64>,
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct SplitSequentialOptions {
|
||||
pub ratios: Option<Vec<f64>>,
|
||||
pub counts: Option<Vec<i64>>,
|
||||
pub fixed: Option<i64>,
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct ShuffleOptions {
|
||||
pub seed: Option<i64>,
|
||||
pub clump_size: Option<i64>,
|
||||
}
|
||||
|
||||
pub struct PermutationBuilderState {
|
||||
pub builder: Option<LancePermutationBuilder>,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub struct PermutationBuilder {
|
||||
state: Arc<Mutex<PermutationBuilderState>>,
|
||||
}
|
||||
|
||||
impl PermutationBuilder {
|
||||
pub fn new(builder: LancePermutationBuilder) -> Self {
|
||||
Self {
|
||||
state: Arc::new(Mutex::new(PermutationBuilderState {
|
||||
builder: Some(builder),
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PermutationBuilder {
|
||||
fn modify(
|
||||
&self,
|
||||
func: impl FnOnce(LancePermutationBuilder) -> LancePermutationBuilder,
|
||||
) -> napi::Result<Self> {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
let builder = state
|
||||
.builder
|
||||
.take()
|
||||
.ok_or_else(|| napi::Error::from_reason("Builder already consumed"))?;
|
||||
state.builder = Some(func(builder));
|
||||
Ok(Self {
|
||||
state: self.state.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[napi]
|
||||
impl PermutationBuilder {
|
||||
/// Configure random splits
|
||||
#[napi]
|
||||
pub fn split_random(&self, options: SplitRandomOptions) -> napi::Result<Self> {
|
||||
// Check that exactly one split type is provided
|
||||
let split_args_count = [
|
||||
options.ratios.is_some(),
|
||||
options.counts.is_some(),
|
||||
options.fixed.is_some(),
|
||||
]
|
||||
.iter()
|
||||
.filter(|&&x| x)
|
||||
.count();
|
||||
|
||||
if split_args_count != 1 {
|
||||
return Err(napi::Error::from_reason(
|
||||
"Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
|
||||
));
|
||||
}
|
||||
|
||||
let sizes = if let Some(ratios) = options.ratios {
|
||||
SplitSizes::Percentages(ratios)
|
||||
} else if let Some(counts) = options.counts {
|
||||
SplitSizes::Counts(counts.into_iter().map(|c| c as u64).collect())
|
||||
} else if let Some(fixed) = options.fixed {
|
||||
SplitSizes::Fixed(fixed as u64)
|
||||
} else {
|
||||
unreachable!("One of the split arguments must be provided");
|
||||
};
|
||||
|
||||
let seed = options.seed.map(|s| s as u64);
|
||||
|
||||
self.modify(|builder| builder.with_split_strategy(SplitStrategy::Random { seed, sizes }))
|
||||
}
|
||||
|
||||
/// Configure hash-based splits
|
||||
#[napi]
|
||||
pub fn split_hash(&self, options: SplitHashOptions) -> napi::Result<Self> {
|
||||
let split_weights = options
|
||||
.split_weights
|
||||
.into_iter()
|
||||
.map(|w| w as u64)
|
||||
.collect();
|
||||
let discard_weight = options.discard_weight.unwrap_or(0) as u64;
|
||||
|
||||
self.modify(|builder| {
|
||||
builder.with_split_strategy(SplitStrategy::Hash {
|
||||
columns: options.columns,
|
||||
split_weights,
|
||||
discard_weight,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// Configure sequential splits
|
||||
#[napi]
|
||||
pub fn split_sequential(&self, options: SplitSequentialOptions) -> napi::Result<Self> {
|
||||
// Check that exactly one split type is provided
|
||||
let split_args_count = [
|
||||
options.ratios.is_some(),
|
||||
options.counts.is_some(),
|
||||
options.fixed.is_some(),
|
||||
]
|
||||
.iter()
|
||||
.filter(|&&x| x)
|
||||
.count();
|
||||
|
||||
if split_args_count != 1 {
|
||||
return Err(napi::Error::from_reason(
|
||||
"Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
|
||||
));
|
||||
}
|
||||
|
||||
let sizes = if let Some(ratios) = options.ratios {
|
||||
SplitSizes::Percentages(ratios)
|
||||
} else if let Some(counts) = options.counts {
|
||||
SplitSizes::Counts(counts.into_iter().map(|c| c as u64).collect())
|
||||
} else if let Some(fixed) = options.fixed {
|
||||
SplitSizes::Fixed(fixed as u64)
|
||||
} else {
|
||||
unreachable!("One of the split arguments must be provided");
|
||||
};
|
||||
|
||||
self.modify(|builder| builder.with_split_strategy(SplitStrategy::Sequential { sizes }))
|
||||
}
|
||||
|
||||
/// Configure calculated splits
|
||||
#[napi]
|
||||
pub fn split_calculated(&self, calculation: String) -> napi::Result<Self> {
|
||||
self.modify(|builder| {
|
||||
builder.with_split_strategy(SplitStrategy::Calculated { calculation })
|
||||
})
|
||||
}
|
||||
|
||||
/// Configure shuffling
|
||||
#[napi]
|
||||
pub fn shuffle(&self, options: ShuffleOptions) -> napi::Result<Self> {
|
||||
let seed = options.seed.map(|s| s as u64);
|
||||
let clump_size = options.clump_size.map(|c| c as u64);
|
||||
|
||||
self.modify(|builder| {
|
||||
builder.with_shuffle_strategy(ShuffleStrategy::Random { seed, clump_size })
|
||||
})
|
||||
}
|
||||
|
||||
/// Configure filtering
|
||||
#[napi]
|
||||
pub fn filter(&self, filter: String) -> napi::Result<Self> {
|
||||
self.modify(|builder| builder.with_filter(filter))
|
||||
}
|
||||
|
||||
/// Execute the permutation builder and create the table
|
||||
#[napi]
|
||||
pub async fn execute(&self) -> napi::Result<Table> {
|
||||
let builder = {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state
|
||||
.builder
|
||||
.take()
|
||||
.ok_or_else(|| napi::Error::from_reason("Builder already consumed"))?
|
||||
};
|
||||
|
||||
let table = builder.build().await.default_error()?;
|
||||
Ok(Table::new(table))
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a permutation builder for the given table
|
||||
#[napi]
|
||||
pub fn permutation_builder(table: &crate::table::Table) -> napi::Result<PermutationBuilder> {
|
||||
use lancedb::dataloader::permutation::builder::PermutationBuilder as LancePermutationBuilder;
|
||||
|
||||
let inner_table = table.inner_ref()?.clone();
|
||||
let inner_builder = LancePermutationBuilder::new(inner_table);
|
||||
|
||||
Ok(PermutationBuilder::new(inner_builder))
|
||||
}
|
||||
@@ -22,7 +22,7 @@ use crate::error::NapiErrorExt;
|
||||
use crate::iterator::RecordBatchIterator;
|
||||
use crate::rerankers::Reranker;
|
||||
use crate::rerankers::RerankerCallbacks;
|
||||
use crate::util::parse_distance_type;
|
||||
use crate::util::{parse_distance_type, schema_to_buffer};
|
||||
|
||||
#[napi]
|
||||
pub struct Query {
|
||||
@@ -88,6 +88,12 @@ impl Query {
|
||||
self.inner = self.inner.clone().with_row_id();
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn output_schema(&self) -> napi::Result<Buffer> {
|
||||
let schema = self.inner.output_schema().await.default_error()?;
|
||||
schema_to_buffer(&schema)
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(
|
||||
&self,
|
||||
@@ -273,6 +279,12 @@ impl VectorQuery {
|
||||
.rerank(Arc::new(Reranker::new(callbacks)));
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn output_schema(&self) -> napi::Result<Buffer> {
|
||||
let schema = self.inner.output_schema().await.default_error()?;
|
||||
schema_to_buffer(&schema)
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(
|
||||
&self,
|
||||
@@ -346,6 +358,12 @@ impl TakeQuery {
|
||||
self.inner = self.inner.clone().with_row_id();
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn output_schema(&self) -> napi::Result<Buffer> {
|
||||
let schema = self.inner.output_schema().await.default_error()?;
|
||||
schema_to_buffer(&schema)
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(
|
||||
&self,
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use arrow_ipc::writer::FileWriter;
|
||||
use lancedb::ipc::ipc_file_to_batches;
|
||||
use lancedb::table::{
|
||||
AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration, NewColumnTransform,
|
||||
@@ -16,6 +15,7 @@ use crate::error::NapiErrorExt;
|
||||
use crate::index::Index;
|
||||
use crate::merge::NativeMergeInsertBuilder;
|
||||
use crate::query::{Query, TakeQuery, VectorQuery};
|
||||
use crate::util::schema_to_buffer;
|
||||
|
||||
#[napi]
|
||||
pub struct Table {
|
||||
@@ -26,7 +26,7 @@ pub struct Table {
|
||||
}
|
||||
|
||||
impl Table {
|
||||
fn inner_ref(&self) -> napi::Result<&LanceDbTable> {
|
||||
pub(crate) fn inner_ref(&self) -> napi::Result<&LanceDbTable> {
|
||||
self.inner
|
||||
.as_ref()
|
||||
.ok_or_else(|| napi::Error::from_reason(format!("Table {} is closed", self.name)))
|
||||
@@ -64,14 +64,7 @@ impl Table {
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn schema(&self) -> napi::Result<Buffer> {
|
||||
let schema = self.inner_ref()?.schema().await.default_error()?;
|
||||
let mut writer = FileWriter::try_new(vec![], &schema)
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to create IPC file: {}", e)))?;
|
||||
writer
|
||||
.finish()
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to finish IPC file: {}", e)))?;
|
||||
Ok(Buffer::from(writer.into_inner().map_err(|e| {
|
||||
napi::Error::from_reason(format!("Failed to get IPC file: {}", e))
|
||||
})?))
|
||||
schema_to_buffer(&schema)
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use arrow_ipc::writer::FileWriter;
|
||||
use arrow_schema::Schema;
|
||||
use lancedb::DistanceType;
|
||||
use napi::bindgen_prelude::Buffer;
|
||||
|
||||
pub fn parse_distance_type(distance_type: impl AsRef<str>) -> napi::Result<DistanceType> {
|
||||
match distance_type.as_ref().to_lowercase().as_str() {
|
||||
@@ -15,3 +18,15 @@ pub fn parse_distance_type(distance_type: impl AsRef<str>) -> napi::Result<Dista
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert an Arrow Schema to an Arrow IPC file buffer
|
||||
pub fn schema_to_buffer(schema: &Schema) -> napi::Result<Buffer> {
|
||||
let mut writer = FileWriter::try_new(vec![], schema)
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to create IPC file: {}", e)))?;
|
||||
writer
|
||||
.finish()
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to finish IPC file: {}", e)))?;
|
||||
Ok(Buffer::from(writer.into_inner().map_err(|e| {
|
||||
napi::Error::from_reason(format!("Failed to get IPC file: {}", e))
|
||||
})?))
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.25.1"
|
||||
current_version = "0.25.3-beta.3"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
@@ -24,6 +24,19 @@ commit = true
|
||||
message = "Bump version: {current_version} → {new_version}"
|
||||
commit_args = ""
|
||||
|
||||
# Update Cargo.lock after version bump
|
||||
pre_commit_hooks = [
|
||||
"""
|
||||
cd python && cargo update -p lancedb-python
|
||||
if git diff --quiet ../Cargo.lock; then
|
||||
echo "Cargo.lock unchanged"
|
||||
else
|
||||
git add ../Cargo.lock
|
||||
echo "Updated and staged Cargo.lock"
|
||||
fi
|
||||
""",
|
||||
]
|
||||
|
||||
[tool.bumpversion.parts.pre_l]
|
||||
values = ["beta", "final"]
|
||||
optional_value = "final"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.25.1"
|
||||
version = "0.25.3-beta.3"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
@@ -14,12 +14,12 @@ name = "_lancedb"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
arrow = { version = "55.1", features = ["pyarrow"] }
|
||||
arrow = { version = "56.2", features = ["pyarrow"] }
|
||||
async-trait = "0.1"
|
||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||
env_logger.workspace = true
|
||||
pyo3 = { version = "0.24", features = ["extension-module", "abi3-py39"] }
|
||||
pyo3-async-runtimes = { version = "0.24", features = [
|
||||
pyo3 = { version = "0.25", features = ["extension-module", "abi3-py39"] }
|
||||
pyo3-async-runtimes = { version = "0.25", features = [
|
||||
"attributes",
|
||||
"tokio-runtime",
|
||||
] }
|
||||
@@ -28,7 +28,7 @@ futures.workspace = true
|
||||
tokio = { version = "1.40", features = ["sync"] }
|
||||
|
||||
[build-dependencies]
|
||||
pyo3-build-config = { version = "0.24", features = [
|
||||
pyo3-build-config = { version = "0.25", features = [
|
||||
"extension-module",
|
||||
"abi3-py39",
|
||||
] }
|
||||
|
||||
@@ -5,12 +5,12 @@ dynamic = ["version"]
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"numpy",
|
||||
"overrides>=0.7",
|
||||
"overrides>=0.7; python_version<'3.12'",
|
||||
"packaging",
|
||||
"pyarrow>=16",
|
||||
"pydantic>=1.10",
|
||||
"tqdm>=4.27.0",
|
||||
"lance-namespace==0.0.6"
|
||||
"lance-namespace>=0.0.16"
|
||||
]
|
||||
description = "lancedb"
|
||||
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
||||
|
||||
@@ -123,6 +123,8 @@ class Table:
|
||||
@property
|
||||
def tags(self) -> Tags: ...
|
||||
def query(self) -> Query: ...
|
||||
def take_offsets(self, offsets: list[int]) -> TakeQuery: ...
|
||||
def take_row_ids(self, row_ids: list[int]) -> TakeQuery: ...
|
||||
def vector_search(self) -> VectorQuery: ...
|
||||
|
||||
class Tags:
|
||||
@@ -133,6 +135,7 @@ class Tags:
|
||||
async def update(self, tag: str, version: int): ...
|
||||
|
||||
class IndexConfig:
|
||||
name: str
|
||||
index_type: str
|
||||
columns: List[str]
|
||||
|
||||
@@ -164,6 +167,7 @@ class Query:
|
||||
def postfilter(self): ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
|
||||
def nearest_to_text(self, query: dict) -> FTSQuery: ...
|
||||
async def output_schema(self) -> pa.Schema: ...
|
||||
async def execute(
|
||||
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
|
||||
) -> RecordBatchStream: ...
|
||||
@@ -171,6 +175,13 @@ class Query:
|
||||
async def analyze_plan(self) -> str: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
|
||||
class TakeQuery:
|
||||
def select(self, columns: List[str]): ...
|
||||
def with_row_id(self): ...
|
||||
async def output_schema(self) -> pa.Schema: ...
|
||||
async def execute(self) -> RecordBatchStream: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
|
||||
class FTSQuery:
|
||||
def where(self, filter: str): ...
|
||||
def select(self, columns: List[str]): ...
|
||||
@@ -182,12 +193,14 @@ class FTSQuery:
|
||||
def get_query(self) -> str: ...
|
||||
def add_query_vector(self, query_vec: pa.Array) -> None: ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
|
||||
async def output_schema(self) -> pa.Schema: ...
|
||||
async def execute(
|
||||
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
|
||||
) -> RecordBatchStream: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
|
||||
class VectorQuery:
|
||||
async def output_schema(self) -> pa.Schema: ...
|
||||
async def execute(self) -> RecordBatchStream: ...
|
||||
def where(self, filter: str): ...
|
||||
def select(self, columns: List[str]): ...
|
||||
@@ -295,3 +308,34 @@ class AlterColumnsResult:
|
||||
|
||||
class DropColumnsResult:
|
||||
version: int
|
||||
|
||||
class AsyncPermutationBuilder:
|
||||
def select(self, projections: Dict[str, str]) -> "AsyncPermutationBuilder": ...
|
||||
def split_random(
|
||||
self,
|
||||
*,
|
||||
ratios: Optional[List[float]] = None,
|
||||
counts: Optional[List[int]] = None,
|
||||
fixed: Optional[int] = None,
|
||||
seed: Optional[int] = None,
|
||||
) -> "AsyncPermutationBuilder": ...
|
||||
def split_hash(
|
||||
self, columns: List[str], split_weights: List[int], *, discard_weight: int = 0
|
||||
) -> "AsyncPermutationBuilder": ...
|
||||
def split_sequential(
|
||||
self,
|
||||
*,
|
||||
ratios: Optional[List[float]] = None,
|
||||
counts: Optional[List[int]] = None,
|
||||
fixed: Optional[int] = None,
|
||||
) -> "AsyncPermutationBuilder": ...
|
||||
def split_calculated(self, calculation: str) -> "AsyncPermutationBuilder": ...
|
||||
def shuffle(
|
||||
self, seed: Optional[int], clump_size: Optional[int]
|
||||
) -> "AsyncPermutationBuilder": ...
|
||||
def filter(self, filter: str) -> "AsyncPermutationBuilder": ...
|
||||
async def execute(self) -> Table: ...
|
||||
|
||||
def async_permutation_builder(
|
||||
table: Table, dest_table_name: str
|
||||
) -> AsyncPermutationBuilder: ...
|
||||
|
||||
@@ -5,11 +5,20 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import abstractmethod
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Literal, Optional, Union
|
||||
|
||||
if sys.version_info >= (3, 12):
|
||||
from typing import override
|
||||
|
||||
class EnforceOverrides:
|
||||
pass
|
||||
else:
|
||||
from overrides import EnforceOverrides, override # type: ignore
|
||||
|
||||
from lancedb.embeddings.registry import EmbeddingFunctionRegistry
|
||||
from overrides import EnforceOverrides, override # type: ignore
|
||||
|
||||
from lancedb.common import data_to_reader, sanitize_uri, validate_schema
|
||||
from lancedb.background_loop import LOOP
|
||||
@@ -32,7 +41,6 @@ import deprecation
|
||||
if TYPE_CHECKING:
|
||||
import pyarrow as pa
|
||||
from .pydantic import LanceModel
|
||||
from datetime import timedelta
|
||||
|
||||
from ._lancedb import Connection as LanceDbConnection
|
||||
from .common import DATA, URI
|
||||
@@ -444,7 +452,12 @@ class LanceDBConnection(DBConnection):
|
||||
read_consistency_interval: Optional[timedelta] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
session: Optional[Session] = None,
|
||||
_inner: Optional[LanceDbConnection] = None,
|
||||
):
|
||||
if _inner is not None:
|
||||
self._conn = _inner
|
||||
return
|
||||
|
||||
if not isinstance(uri, Path):
|
||||
scheme = get_uri_scheme(uri)
|
||||
is_local = isinstance(uri, Path) or scheme == "file"
|
||||
@@ -453,11 +466,6 @@ class LanceDBConnection(DBConnection):
|
||||
uri = Path(uri)
|
||||
uri = uri.expanduser().absolute()
|
||||
Path(uri).mkdir(parents=True, exist_ok=True)
|
||||
self._uri = str(uri)
|
||||
self._entered = False
|
||||
self.read_consistency_interval = read_consistency_interval
|
||||
self.storage_options = storage_options
|
||||
self.session = session
|
||||
|
||||
if read_consistency_interval is not None:
|
||||
read_consistency_interval_secs = read_consistency_interval.total_seconds()
|
||||
@@ -476,10 +484,32 @@ class LanceDBConnection(DBConnection):
|
||||
session,
|
||||
)
|
||||
|
||||
# TODO: It would be nice if we didn't store self.storage_options but it is
|
||||
# currently used by the LanceTable.to_lance method. This doesn't _really_
|
||||
# work because some paths like LanceDBConnection.from_inner will lose the
|
||||
# storage_options. Also, this class really shouldn't be holding any state
|
||||
# beyond _conn.
|
||||
self.storage_options = storage_options
|
||||
self._conn = AsyncConnection(LOOP.run(do_connect()))
|
||||
|
||||
@property
|
||||
def read_consistency_interval(self) -> Optional[timedelta]:
|
||||
return LOOP.run(self._conn.get_read_consistency_interval())
|
||||
|
||||
@property
|
||||
def session(self) -> Optional[Session]:
|
||||
return self._conn.session
|
||||
|
||||
@property
|
||||
def uri(self) -> str:
|
||||
return self._conn.uri
|
||||
|
||||
@classmethod
|
||||
def from_inner(cls, inner: LanceDbConnection):
|
||||
return cls(None, _inner=inner)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
val = f"{self.__class__.__name__}(uri={self._uri!r}"
|
||||
val = f"{self.__class__.__name__}(uri={self._conn.uri!r}"
|
||||
if self.read_consistency_interval is not None:
|
||||
val += f", read_consistency_interval={repr(self.read_consistency_interval)}"
|
||||
val += ")"
|
||||
@@ -489,6 +519,10 @@ class LanceDBConnection(DBConnection):
|
||||
conn = AsyncConnection(await lancedb_connect(self.uri))
|
||||
return await conn.table_names(start_after=start_after, limit=limit)
|
||||
|
||||
@property
|
||||
def _inner(self) -> LanceDbConnection:
|
||||
return self._conn._inner
|
||||
|
||||
@override
|
||||
def list_namespaces(
|
||||
self,
|
||||
@@ -848,6 +882,13 @@ class AsyncConnection(object):
|
||||
def uri(self) -> str:
|
||||
return self._inner.uri
|
||||
|
||||
async def get_read_consistency_interval(self) -> Optional[timedelta]:
|
||||
interval_secs = await self._inner.get_read_consistency_interval()
|
||||
if interval_secs is not None:
|
||||
return timedelta(seconds=interval_secs)
|
||||
else:
|
||||
return None
|
||||
|
||||
async def list_namespaces(
|
||||
self,
|
||||
namespace: List[str] = [],
|
||||
|
||||
@@ -3,9 +3,11 @@
|
||||
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import List, Union, Optional, Any
|
||||
from logging import warning
|
||||
from typing import List, Union, Optional, Any, Callable
|
||||
import numpy as np
|
||||
import io
|
||||
import warnings
|
||||
|
||||
from ..util import attempt_import_or_raise
|
||||
from .base import EmbeddingFunction
|
||||
@@ -19,35 +21,52 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
An embedding function that uses the ColPali engine for
|
||||
multimodal multi-vector embeddings.
|
||||
|
||||
This embedding function supports ColQwen2.5 models, producing multivector outputs
|
||||
for both text and image inputs. The output embeddings are lists of vectors, each
|
||||
vector being 128-dimensional by default, represented as List[List[float]].
|
||||
This embedding function supports ColPali models, producing multivector outputs
|
||||
for both text and image inputs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_name : str
|
||||
The name of the model to use (e.g., "Metric-AI/ColQwen2.5-3b-multilingual-v1.0")
|
||||
Supports models based on these engines:
|
||||
- ColPali: "vidore/colpali-v1.3" and others
|
||||
- ColQwen2.5: "Metric-AI/ColQwen2.5-3b-multilingual-v1.0" and others
|
||||
- ColQwen2: "vidore/colqwen2-v1.0" and others
|
||||
- ColSmol: "vidore/colSmol-256M" and others
|
||||
|
||||
device : str
|
||||
The device for inference (default "cuda:0").
|
||||
The device for inference (default "auto").
|
||||
dtype : str
|
||||
Data type for model weights (default "bfloat16").
|
||||
use_token_pooling : bool
|
||||
Whether to use token pooling to reduce embedding size (default True).
|
||||
DEPRECATED. Whether to use token pooling. Use `pooling_strategy` instead.
|
||||
pooling_strategy : str, optional
|
||||
The token pooling strategy to use, by default "hierarchical".
|
||||
- "hierarchical": Progressively pools tokens to reduce sequence length.
|
||||
- "lambda": A simpler pooling that uses a custom `pooling_func`.
|
||||
pooling_func: typing.Callable, optional
|
||||
A function to use for pooling when `pooling_strategy` is "lambda".
|
||||
pool_factor : int
|
||||
Factor to reduce sequence length if token pooling is enabled (default 2).
|
||||
quantization_config : Optional[BitsAndBytesConfig]
|
||||
Quantization configuration for the model. (default None, bitsandbytes needed)
|
||||
batch_size : int
|
||||
Batch size for processing inputs (default 2).
|
||||
offload_folder: str, optional
|
||||
Folder to offload model weights if using CPU offloading (default None). This is
|
||||
useful for large models that do not fit in memory.
|
||||
"""
|
||||
|
||||
model_name: str = "Metric-AI/ColQwen2.5-3b-multilingual-v1.0"
|
||||
device: str = "auto"
|
||||
dtype: str = "bfloat16"
|
||||
use_token_pooling: bool = True
|
||||
pooling_strategy: Optional[str] = "hierarchical"
|
||||
pooling_func: Optional[Any] = None
|
||||
pool_factor: int = 2
|
||||
quantization_config: Optional[Any] = None
|
||||
batch_size: int = 2
|
||||
offload_folder: Optional[str] = None
|
||||
|
||||
_model = None
|
||||
_processor = None
|
||||
@@ -56,15 +75,43 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
torch = attempt_import_or_raise("torch", "torch")
|
||||
|
||||
if not self.use_token_pooling:
|
||||
warnings.warn(
|
||||
"use_token_pooling is deprecated, use pooling_strategy=None instead",
|
||||
DeprecationWarning,
|
||||
)
|
||||
self.pooling_strategy = None
|
||||
|
||||
if self.pooling_strategy == "lambda" and self.pooling_func is None:
|
||||
raise ValueError(
|
||||
"pooling_func must be provided when pooling_strategy is 'lambda'"
|
||||
)
|
||||
|
||||
device = self.device
|
||||
if device == "auto":
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
elif torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
else:
|
||||
device = "cpu"
|
||||
|
||||
dtype = self.dtype
|
||||
if device == "mps" and dtype == "bfloat16":
|
||||
dtype = "float32" # Avoid NaNs on MPS
|
||||
|
||||
(
|
||||
self._model,
|
||||
self._processor,
|
||||
self._token_pooler,
|
||||
) = self._load_model(
|
||||
self.model_name,
|
||||
self.dtype,
|
||||
self.device,
|
||||
self.use_token_pooling,
|
||||
dtype,
|
||||
device,
|
||||
self.pooling_strategy,
|
||||
self.pooling_func,
|
||||
self.quantization_config,
|
||||
)
|
||||
|
||||
@@ -74,16 +121,26 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
model_name: str,
|
||||
dtype: str,
|
||||
device: str,
|
||||
use_token_pooling: bool,
|
||||
pooling_strategy: Optional[str],
|
||||
pooling_func: Optional[Callable],
|
||||
quantization_config: Optional[Any],
|
||||
):
|
||||
"""
|
||||
Initialize and cache the ColPali model, processor, and token pooler.
|
||||
"""
|
||||
if device.startswith("mps"):
|
||||
# warn some torch ops in late interaction architecture result in nans on mps
|
||||
warning(
|
||||
"MPS device detected. Some operations may result in NaNs. "
|
||||
"If you encounter issues, consider using 'cpu' or 'cuda' devices."
|
||||
)
|
||||
torch = attempt_import_or_raise("torch", "torch")
|
||||
transformers = attempt_import_or_raise("transformers", "transformers")
|
||||
colpali_engine = attempt_import_or_raise("colpali_engine", "colpali_engine")
|
||||
from colpali_engine.compression.token_pooling import HierarchicalTokenPooler
|
||||
from colpali_engine.compression.token_pooling import (
|
||||
HierarchicalTokenPooler,
|
||||
LambdaTokenPooler,
|
||||
)
|
||||
|
||||
if quantization_config is not None:
|
||||
if not isinstance(quantization_config, transformers.BitsAndBytesConfig):
|
||||
@@ -98,21 +155,45 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
else:
|
||||
torch_dtype = torch.float32
|
||||
|
||||
model = colpali_engine.models.ColQwen2_5.from_pretrained(
|
||||
model_class, processor_class = None, None
|
||||
model_name_lower = model_name.lower()
|
||||
if "colqwen2.5" in model_name_lower:
|
||||
model_class = colpali_engine.models.ColQwen2_5
|
||||
processor_class = colpali_engine.models.ColQwen2_5_Processor
|
||||
elif "colsmol" in model_name_lower or "colidefics3" in model_name_lower:
|
||||
model_class = colpali_engine.models.ColIdefics3
|
||||
processor_class = colpali_engine.models.ColIdefics3Processor
|
||||
elif "colqwen" in model_name_lower:
|
||||
model_class = colpali_engine.models.ColQwen2
|
||||
processor_class = colpali_engine.models.ColQwen2Processor
|
||||
elif "colpali" in model_name_lower:
|
||||
model_class = colpali_engine.models.ColPali
|
||||
processor_class = colpali_engine.models.ColPaliProcessor
|
||||
|
||||
if model_class is None:
|
||||
raise ValueError(f"Unsupported model: {model_name}")
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device,
|
||||
quantization_config=quantization_config
|
||||
if quantization_config is not None
|
||||
else None,
|
||||
attn_implementation="flash_attention_2"
|
||||
if is_flash_attn_2_available()
|
||||
else None,
|
||||
low_cpu_mem_usage=True,
|
||||
).eval()
|
||||
processor = colpali_engine.models.ColQwen2_5_Processor.from_pretrained(
|
||||
model_name
|
||||
)
|
||||
token_pooler = HierarchicalTokenPooler() if use_token_pooling else None
|
||||
model = model.to(device)
|
||||
model = model.to(torch_dtype) # Force cast after moving to device
|
||||
processor = processor_class.from_pretrained(model_name)
|
||||
|
||||
token_pooler = None
|
||||
if pooling_strategy == "hierarchical":
|
||||
token_pooler = HierarchicalTokenPooler()
|
||||
elif pooling_strategy == "lambda":
|
||||
token_pooler = LambdaTokenPooler(pool_func=pooling_func)
|
||||
|
||||
return model, processor, token_pooler
|
||||
|
||||
def ndims(self):
|
||||
@@ -128,7 +209,7 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
with torch.no_grad():
|
||||
query_embeddings = self._model(**batch_queries)
|
||||
|
||||
if self.use_token_pooling and self._token_pooler is not None:
|
||||
if self.pooling_strategy and self._token_pooler is not None:
|
||||
query_embeddings = self._token_pooler.pool_embeddings(
|
||||
query_embeddings,
|
||||
pool_factor=self.pool_factor,
|
||||
@@ -145,13 +226,20 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
Use token pooling if enabled.
|
||||
"""
|
||||
torch = attempt_import_or_raise("torch", "torch")
|
||||
if self.use_token_pooling and self._token_pooler is not None:
|
||||
embeddings = self._token_pooler.pool_embeddings(
|
||||
embeddings,
|
||||
pool_factor=self.pool_factor,
|
||||
padding=True,
|
||||
padding_side=self._processor.tokenizer.padding_side,
|
||||
)
|
||||
if self.pooling_strategy and self._token_pooler is not None:
|
||||
if self.pooling_strategy == "hierarchical":
|
||||
embeddings = self._token_pooler.pool_embeddings(
|
||||
embeddings,
|
||||
pool_factor=self.pool_factor,
|
||||
padding=True,
|
||||
padding_side=self._processor.tokenizer.padding_side,
|
||||
)
|
||||
elif self.pooling_strategy == "lambda":
|
||||
embeddings = self._token_pooler.pool_embeddings(
|
||||
embeddings,
|
||||
padding=True,
|
||||
padding_side=self._processor.tokenizer.padding_side,
|
||||
)
|
||||
|
||||
if isinstance(embeddings, torch.Tensor):
|
||||
tensors = embeddings.detach().cpu()
|
||||
@@ -179,6 +267,7 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
)
|
||||
with torch.no_grad():
|
||||
query_embeddings = self._model(**batch_queries)
|
||||
query_embeddings = torch.nan_to_num(query_embeddings)
|
||||
all_embeddings.extend(self._process_embeddings(query_embeddings))
|
||||
return all_embeddings
|
||||
|
||||
@@ -225,6 +314,7 @@ class ColPaliEmbeddings(EmbeddingFunction):
|
||||
)
|
||||
with torch.no_grad():
|
||||
image_embeddings = self._model(**batch_images)
|
||||
image_embeddings = torch.nan_to_num(image_embeddings)
|
||||
all_embeddings.extend(self._process_embeddings(image_embeddings))
|
||||
return all_embeddings
|
||||
|
||||
|
||||
@@ -605,9 +605,53 @@ class IvfPq:
|
||||
target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class IvfRq:
|
||||
"""Describes an IVF RQ Index
|
||||
|
||||
IVF-RQ (Residual Quantization) stores a compressed copy of each vector using
|
||||
residual quantization and organizes them into IVF partitions. Parameters
|
||||
largely mirror IVF-PQ for consistency.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
distance_type: str, default "l2"
|
||||
Distance metric used to train the index and for quantization.
|
||||
|
||||
The following distance types are available:
|
||||
|
||||
"l2" - Euclidean distance.
|
||||
"cosine" - Cosine distance.
|
||||
"dot" - Dot product.
|
||||
|
||||
num_partitions: int, default sqrt(num_rows)
|
||||
Number of IVF partitions to create.
|
||||
|
||||
num_bits: int, default 1
|
||||
Number of bits to encode each dimension.
|
||||
|
||||
max_iterations: int, default 50
|
||||
Max iterations to train kmeans when computing IVF partitions.
|
||||
|
||||
sample_rate: int, default 256
|
||||
Controls the number of training vectors: sample_rate * num_partitions.
|
||||
|
||||
target_partition_size, default is 8192
|
||||
Target size of each partition.
|
||||
"""
|
||||
|
||||
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
||||
num_partitions: Optional[int] = None
|
||||
num_bits: int = 1
|
||||
max_iterations: int = 50
|
||||
sample_rate: int = 256
|
||||
target_partition_size: Optional[int] = None
|
||||
|
||||
|
||||
__all__ = [
|
||||
"BTree",
|
||||
"IvfPq",
|
||||
"IvfRq",
|
||||
"IvfFlat",
|
||||
"HnswPq",
|
||||
"HnswSq",
|
||||
|
||||
@@ -33,6 +33,7 @@ class LanceMergeInsertBuilder(object):
|
||||
self._when_not_matched_by_source_delete = False
|
||||
self._when_not_matched_by_source_condition = None
|
||||
self._timeout = None
|
||||
self._use_index = True
|
||||
|
||||
def when_matched_update_all(
|
||||
self, *, where: Optional[str] = None
|
||||
@@ -78,6 +79,23 @@ class LanceMergeInsertBuilder(object):
|
||||
self._when_not_matched_by_source_condition = condition
|
||||
return self
|
||||
|
||||
def use_index(self, use_index: bool) -> LanceMergeInsertBuilder:
|
||||
"""
|
||||
Controls whether to use indexes for the merge operation.
|
||||
|
||||
When set to `True` (the default), the operation will use an index if available
|
||||
on the join key for improved performance. When set to `False`, it forces a full
|
||||
table scan even if an index exists. This can be useful for benchmarking or when
|
||||
the query optimizer chooses a suboptimal path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
use_index: bool
|
||||
Whether to use indices for the merge operation. Defaults to `True`.
|
||||
"""
|
||||
self._use_index = use_index
|
||||
return self
|
||||
|
||||
def execute(
|
||||
self,
|
||||
new_data: DATA,
|
||||
|
||||
@@ -12,13 +12,18 @@ from __future__ import annotations
|
||||
|
||||
from typing import Dict, Iterable, List, Optional, Union
|
||||
import os
|
||||
import sys
|
||||
|
||||
if sys.version_info >= (3, 12):
|
||||
from typing import override
|
||||
else:
|
||||
from overrides import override
|
||||
|
||||
from lancedb.db import DBConnection
|
||||
from lancedb.table import LanceTable, Table
|
||||
from lancedb.util import validate_table_name
|
||||
from lancedb.common import validate_schema
|
||||
from lancedb.table import sanitize_create_table
|
||||
from overrides import override
|
||||
|
||||
from lance_namespace import LanceNamespace, connect as namespace_connect
|
||||
from lance_namespace_urllib3_client.models import (
|
||||
|
||||
72
python/python/lancedb/permutation.py
Normal file
72
python/python/lancedb/permutation.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
from ._lancedb import async_permutation_builder
|
||||
from .table import LanceTable
|
||||
from .background_loop import LOOP
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class PermutationBuilder:
|
||||
def __init__(self, table: LanceTable):
|
||||
self._async = async_permutation_builder(table)
|
||||
|
||||
def select(self, projections: dict[str, str]) -> "PermutationBuilder":
|
||||
self._async.select(projections)
|
||||
return self
|
||||
|
||||
def split_random(
|
||||
self,
|
||||
*,
|
||||
ratios: Optional[list[float]] = None,
|
||||
counts: Optional[list[int]] = None,
|
||||
fixed: Optional[int] = None,
|
||||
seed: Optional[int] = None,
|
||||
) -> "PermutationBuilder":
|
||||
self._async.split_random(ratios=ratios, counts=counts, fixed=fixed, seed=seed)
|
||||
return self
|
||||
|
||||
def split_hash(
|
||||
self,
|
||||
columns: list[str],
|
||||
split_weights: list[int],
|
||||
*,
|
||||
discard_weight: Optional[int] = None,
|
||||
) -> "PermutationBuilder":
|
||||
self._async.split_hash(columns, split_weights, discard_weight=discard_weight)
|
||||
return self
|
||||
|
||||
def split_sequential(
|
||||
self,
|
||||
*,
|
||||
ratios: Optional[list[float]] = None,
|
||||
counts: Optional[list[int]] = None,
|
||||
fixed: Optional[int] = None,
|
||||
) -> "PermutationBuilder":
|
||||
self._async.split_sequential(ratios=ratios, counts=counts, fixed=fixed)
|
||||
return self
|
||||
|
||||
def split_calculated(self, calculation: str) -> "PermutationBuilder":
|
||||
self._async.split_calculated(calculation)
|
||||
return self
|
||||
|
||||
def shuffle(
|
||||
self, *, seed: Optional[int] = None, clump_size: Optional[int] = None
|
||||
) -> "PermutationBuilder":
|
||||
self._async.shuffle(seed=seed, clump_size=clump_size)
|
||||
return self
|
||||
|
||||
def filter(self, filter: str) -> "PermutationBuilder":
|
||||
self._async.filter(filter)
|
||||
return self
|
||||
|
||||
def execute(self) -> LanceTable:
|
||||
async def do_execute():
|
||||
inner_tbl = await self._async.execute()
|
||||
return LanceTable.from_inner(inner_tbl)
|
||||
|
||||
return LOOP.run(do_execute())
|
||||
|
||||
|
||||
def permutation_builder(table: LanceTable) -> PermutationBuilder:
|
||||
return PermutationBuilder(table)
|
||||
@@ -1237,6 +1237,14 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
self._refine_factor = refine_factor
|
||||
return self
|
||||
|
||||
def output_schema(self) -> pa.Schema:
|
||||
"""
|
||||
Return the output schema for the query
|
||||
|
||||
This does not execute the query.
|
||||
"""
|
||||
return self._table._output_schema(self.to_query_object())
|
||||
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
"""
|
||||
Execute the query and return the results as an
|
||||
@@ -1452,6 +1460,14 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
offset=self._offset,
|
||||
)
|
||||
|
||||
def output_schema(self) -> pa.Schema:
|
||||
"""
|
||||
Return the output schema for the query
|
||||
|
||||
This does not execute the query.
|
||||
"""
|
||||
return self._table._output_schema(self.to_query_object())
|
||||
|
||||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
path, fs, exist = self._table._get_fts_index_path()
|
||||
if exist:
|
||||
@@ -1595,6 +1611,10 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||
offset=self._offset,
|
||||
)
|
||||
|
||||
def output_schema(self) -> pa.Schema:
|
||||
query = self.to_query_object()
|
||||
return self._table._output_schema(query)
|
||||
|
||||
def to_batches(
|
||||
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
|
||||
) -> pa.RecordBatchReader:
|
||||
@@ -2238,6 +2258,14 @@ class AsyncQueryBase(object):
|
||||
)
|
||||
)
|
||||
|
||||
async def output_schema(self) -> pa.Schema:
|
||||
"""
|
||||
Return the output schema for the query
|
||||
|
||||
This does not execute the query.
|
||||
"""
|
||||
return await self._inner.output_schema()
|
||||
|
||||
async def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
|
||||
"""
|
||||
Execute the query and collect the results into an Apache Arrow Table.
|
||||
@@ -3193,6 +3221,14 @@ class BaseQueryBuilder(object):
|
||||
self._inner.with_row_id()
|
||||
return self
|
||||
|
||||
def output_schema(self) -> pa.Schema:
|
||||
"""
|
||||
Return the output schema for the query
|
||||
|
||||
This does not execute the query.
|
||||
"""
|
||||
return LOOP.run(self._inner.output_schema())
|
||||
|
||||
def to_batches(
|
||||
self,
|
||||
*,
|
||||
|
||||
@@ -5,15 +5,20 @@
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import sys
|
||||
from typing import Any, Dict, Iterable, List, Optional, Union
|
||||
from urllib.parse import urlparse
|
||||
import warnings
|
||||
|
||||
if sys.version_info >= (3, 12):
|
||||
from typing import override
|
||||
else:
|
||||
from overrides import override
|
||||
|
||||
# Remove this import to fix circular dependency
|
||||
# from lancedb import connect_async
|
||||
from lancedb.remote import ClientConfig
|
||||
import pyarrow as pa
|
||||
from overrides import override
|
||||
|
||||
from ..common import DATA
|
||||
from ..db import DBConnection, LOOP
|
||||
|
||||
@@ -114,7 +114,7 @@ class RemoteTable(Table):
|
||||
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
|
||||
*,
|
||||
replace: bool = False,
|
||||
wait_timeout: timedelta = None,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
name: Optional[str] = None,
|
||||
):
|
||||
"""Creates a scalar index
|
||||
@@ -153,7 +153,7 @@ class RemoteTable(Table):
|
||||
column: str,
|
||||
*,
|
||||
replace: bool = False,
|
||||
wait_timeout: timedelta = None,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
with_position: bool = False,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
@@ -436,6 +436,9 @@ class RemoteTable(Table):
|
||||
def _analyze_plan(self, query: Query) -> str:
|
||||
return LOOP.run(self._table._analyze_plan(query))
|
||||
|
||||
def _output_schema(self, query: Query) -> pa.Schema:
|
||||
return LOOP.run(self._table._output_schema(query))
|
||||
|
||||
def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
|
||||
"""Returns a [`LanceMergeInsertBuilder`][lancedb.merge.LanceMergeInsertBuilder]
|
||||
that can be used to create a "merge insert" operation.
|
||||
|
||||
@@ -44,7 +44,7 @@ import numpy as np
|
||||
|
||||
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
||||
from .index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
|
||||
from .index import BTree, IvfFlat, IvfPq, Bitmap, IvfRq, LabelList, HnswPq, HnswSq, FTS
|
||||
from .merge import LanceMergeInsertBuilder
|
||||
from .pydantic import LanceModel, model_to_dict
|
||||
from .query import (
|
||||
@@ -74,6 +74,7 @@ from .index import lang_mapping
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .db import LanceDBConnection
|
||||
from ._lancedb import (
|
||||
Table as LanceDBTable,
|
||||
OptimizeStats,
|
||||
@@ -88,7 +89,6 @@ if TYPE_CHECKING:
|
||||
MergeResult,
|
||||
UpdateResult,
|
||||
)
|
||||
from .db import LanceDBConnection
|
||||
from .index import IndexConfig
|
||||
import pandas
|
||||
import PIL
|
||||
@@ -1248,6 +1248,9 @@ class Table(ABC):
|
||||
@abstractmethod
|
||||
def _analyze_plan(self, query: Query) -> str: ...
|
||||
|
||||
@abstractmethod
|
||||
def _output_schema(self, query: Query) -> pa.Schema: ...
|
||||
|
||||
@abstractmethod
|
||||
def _do_merge(
|
||||
self,
|
||||
@@ -1707,22 +1710,38 @@ class LanceTable(Table):
|
||||
namespace: List[str] = [],
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
_async: AsyncTable = None,
|
||||
):
|
||||
self._conn = connection
|
||||
self._namespace = namespace
|
||||
self._table = LOOP.run(
|
||||
connection._conn.open_table(
|
||||
name,
|
||||
namespace=namespace,
|
||||
storage_options=storage_options,
|
||||
index_cache_size=index_cache_size,
|
||||
if _async is not None:
|
||||
self._table = _async
|
||||
else:
|
||||
self._table = LOOP.run(
|
||||
connection._conn.open_table(
|
||||
name,
|
||||
namespace=namespace,
|
||||
storage_options=storage_options,
|
||||
index_cache_size=index_cache_size,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return self._table.name
|
||||
|
||||
@classmethod
|
||||
def from_inner(cls, tbl: LanceDBTable):
|
||||
from .db import LanceDBConnection
|
||||
|
||||
async_tbl = AsyncTable(tbl)
|
||||
conn = LanceDBConnection.from_inner(tbl.database())
|
||||
return cls(
|
||||
conn,
|
||||
async_tbl.name,
|
||||
_async=async_tbl,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def open(cls, db, name, *, namespace: List[str] = [], **kwargs):
|
||||
tbl = cls(db, name, namespace=namespace, **kwargs)
|
||||
@@ -1991,7 +2010,7 @@ class LanceTable(Table):
|
||||
index_cache_size: Optional[int] = None,
|
||||
num_bits: int = 8,
|
||||
index_type: Literal[
|
||||
"IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
|
||||
"IVF_FLAT", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
|
||||
] = "IVF_PQ",
|
||||
max_iterations: int = 50,
|
||||
sample_rate: int = 256,
|
||||
@@ -2039,6 +2058,15 @@ class LanceTable(Table):
|
||||
sample_rate=sample_rate,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
elif index_type == "IVF_RQ":
|
||||
config = IvfRq(
|
||||
distance_type=metric,
|
||||
num_partitions=num_partitions,
|
||||
num_bits=num_bits,
|
||||
max_iterations=max_iterations,
|
||||
sample_rate=sample_rate,
|
||||
target_partition_size=target_partition_size,
|
||||
)
|
||||
elif index_type == "IVF_HNSW_PQ":
|
||||
config = HnswPq(
|
||||
distance_type=metric,
|
||||
@@ -2736,6 +2764,9 @@ class LanceTable(Table):
|
||||
def _analyze_plan(self, query: Query) -> str:
|
||||
return LOOP.run(self._table._analyze_plan(query))
|
||||
|
||||
def _output_schema(self, query: Query) -> pa.Schema:
|
||||
return LOOP.run(self._table._output_schema(query))
|
||||
|
||||
def _do_merge(
|
||||
self,
|
||||
merge: LanceMergeInsertBuilder,
|
||||
@@ -2747,6 +2778,10 @@ class LanceTable(Table):
|
||||
self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)
|
||||
)
|
||||
|
||||
@property
|
||||
def _inner(self) -> LanceDBTable:
|
||||
return self._table._inner
|
||||
|
||||
@deprecation.deprecated(
|
||||
deprecated_in="0.21.0",
|
||||
current_version=__version__,
|
||||
@@ -3330,7 +3365,7 @@ class AsyncTable:
|
||||
*,
|
||||
replace: Optional[bool] = None,
|
||||
config: Optional[
|
||||
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||
Union[IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||
] = None,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
name: Optional[str] = None,
|
||||
@@ -3369,11 +3404,12 @@ class AsyncTable:
|
||||
"""
|
||||
if config is not None:
|
||||
if not isinstance(
|
||||
config, (IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS)
|
||||
config,
|
||||
(IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS),
|
||||
):
|
||||
raise TypeError(
|
||||
"config must be an instance of IvfPq, HnswPq, HnswSq, BTree,"
|
||||
" Bitmap, LabelList, or FTS"
|
||||
"config must be an instance of IvfPq, IvfRq, HnswPq, HnswSq, BTree,"
|
||||
" Bitmap, LabelList, or FTS, but got " + str(type(config))
|
||||
)
|
||||
try:
|
||||
await self._inner.create_index(
|
||||
@@ -3888,6 +3924,10 @@ class AsyncTable:
|
||||
async_query = self._sync_query_to_async(query)
|
||||
return await async_query.analyze_plan()
|
||||
|
||||
async def _output_schema(self, query: Query) -> pa.Schema:
|
||||
async_query = self._sync_query_to_async(query)
|
||||
return await async_query.output_schema()
|
||||
|
||||
async def _do_merge(
|
||||
self,
|
||||
merge: LanceMergeInsertBuilder,
|
||||
@@ -3920,6 +3960,7 @@ class AsyncTable:
|
||||
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
|
||||
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
|
||||
timeout=merge._timeout,
|
||||
use_index=merge._use_index,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -18,10 +18,17 @@ AddMode = Literal["append", "overwrite"]
|
||||
CreateMode = Literal["create", "overwrite"]
|
||||
|
||||
# Index type literals
|
||||
VectorIndexType = Literal["IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"]
|
||||
VectorIndexType = Literal["IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ", "IVF_RQ"]
|
||||
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
|
||||
IndexType = Literal[
|
||||
"IVF_PQ", "IVF_HNSW_PQ", "IVF_HNSW_SQ", "FTS", "BTREE", "BITMAP", "LABEL_LIST"
|
||||
"IVF_PQ",
|
||||
"IVF_HNSW_PQ",
|
||||
"IVF_HNSW_SQ",
|
||||
"FTS",
|
||||
"BTREE",
|
||||
"BITMAP",
|
||||
"LABEL_LIST",
|
||||
"IVF_RQ",
|
||||
]
|
||||
|
||||
# Tokenizer literals
|
||||
|
||||
@@ -656,6 +656,106 @@ def test_colpali(tmp_path):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
importlib.util.find_spec("colpali_engine") is None,
|
||||
reason="colpali_engine not installed",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[
|
||||
"vidore/colSmol-256M",
|
||||
"vidore/colqwen2.5-v0.2",
|
||||
"vidore/colpali-v1.3",
|
||||
"vidore/colqwen2-v1.0",
|
||||
],
|
||||
)
|
||||
def test_colpali_models(tmp_path, model_name):
|
||||
import requests
|
||||
from lancedb.pydantic import LanceModel
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
registry = get_registry()
|
||||
func = registry.get("colpali").create(model_name=model_name)
|
||||
|
||||
class MediaItems(LanceModel):
|
||||
text: str
|
||||
image_uri: str = func.SourceField()
|
||||
image_bytes: bytes = func.SourceField()
|
||||
image_vectors: MultiVector(func.ndims()) = func.VectorField()
|
||||
|
||||
table = db.create_table(f"media_{model_name.replace('/', '_')}", schema=MediaItems)
|
||||
|
||||
texts = [
|
||||
"a cute cat playing with yarn",
|
||||
]
|
||||
|
||||
uris = [
|
||||
"http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
|
||||
]
|
||||
|
||||
image_bytes = [requests.get(uri).content for uri in uris]
|
||||
|
||||
table.add(
|
||||
pd.DataFrame({"text": texts, "image_uri": uris, "image_bytes": image_bytes})
|
||||
)
|
||||
|
||||
image_results = (
|
||||
table.search("fluffy companion", vector_column_name="image_vectors")
|
||||
.limit(1)
|
||||
.to_pydantic(MediaItems)[0]
|
||||
)
|
||||
assert "cat" in image_results.text.lower() or "puppy" in image_results.text.lower()
|
||||
|
||||
first_row = table.to_arrow().to_pylist()[0]
|
||||
assert len(first_row["image_vectors"]) > 1, "Should have multiple image vectors"
|
||||
assert len(first_row["image_vectors"][0]) == func.ndims(), (
|
||||
"Vector dimension mismatch"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
importlib.util.find_spec("colpali_engine") is None,
|
||||
reason="colpali_engine not installed",
|
||||
)
|
||||
def test_colpali_pooling(tmp_path):
|
||||
registry = get_registry()
|
||||
model_name = "vidore/colSmol-256M"
|
||||
test_sentence = "a test sentence for pooling"
|
||||
|
||||
# 1. Get embeddings with no pooling
|
||||
func_no_pool = registry.get("colpali").create(
|
||||
model_name=model_name, pooling_strategy=None
|
||||
)
|
||||
unpooled_embeddings = func_no_pool.generate_text_embeddings([test_sentence])[0]
|
||||
original_length = len(unpooled_embeddings)
|
||||
assert original_length > 1
|
||||
|
||||
# 2. Test hierarchical pooling
|
||||
func_hierarchical = registry.get("colpali").create(
|
||||
model_name=model_name, pooling_strategy="hierarchical", pool_factor=2
|
||||
)
|
||||
hierarchical_embeddings = func_hierarchical.generate_text_embeddings(
|
||||
[test_sentence]
|
||||
)[0]
|
||||
expected_hierarchical_length = (original_length + 1) // 2
|
||||
assert len(hierarchical_embeddings) == expected_hierarchical_length
|
||||
|
||||
# 3. Test lambda pooling
|
||||
def simple_pool_func(tensor):
|
||||
return tensor[::2]
|
||||
|
||||
func_lambda = registry.get("colpali").create(
|
||||
model_name=model_name,
|
||||
pooling_strategy="lambda",
|
||||
pooling_func=simple_pool_func,
|
||||
)
|
||||
lambda_embeddings = func_lambda.generate_text_embeddings([test_sentence])[0]
|
||||
expected_lambda_length = (original_length + 1) // 2
|
||||
assert len(lambda_embeddings) == expected_lambda_length
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_siglip(tmp_path, test_images, query_image_bytes):
|
||||
from PIL import Image
|
||||
|
||||
@@ -8,7 +8,17 @@ import pyarrow as pa
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from lancedb import AsyncConnection, AsyncTable, connect_async
|
||||
from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
|
||||
from lancedb.index import (
|
||||
BTree,
|
||||
IvfFlat,
|
||||
IvfPq,
|
||||
IvfRq,
|
||||
Bitmap,
|
||||
LabelList,
|
||||
HnswPq,
|
||||
HnswSq,
|
||||
FTS,
|
||||
)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
@@ -35,6 +45,8 @@ async def some_table(db_async):
|
||||
"tags": [
|
||||
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
||||
],
|
||||
"is_active": [random.choice([True, False]) for _ in range(NROWS)],
|
||||
"data": [random.randbytes(random.randint(0, 128)) for _ in range(NROWS)],
|
||||
}
|
||||
)
|
||||
return await db_async.create_table(
|
||||
@@ -99,10 +111,17 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
await some_table.create_index("id", config=Bitmap())
|
||||
await some_table.create_index("is_active", config=Bitmap())
|
||||
await some_table.create_index("data", config=Bitmap())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]'
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert len(indices) == 3
|
||||
assert indices[0].index_type == "Bitmap"
|
||||
assert indices[0].columns == ["id"]
|
||||
assert indices[1].index_type == "Bitmap"
|
||||
assert indices[1].columns == ["is_active"]
|
||||
assert indices[2].index_type == "Bitmap"
|
||||
assert indices[2].columns == ["data"]
|
||||
|
||||
index_name = indices[0].name
|
||||
stats = await some_table.index_stats(index_name)
|
||||
assert stats.index_type == "BITMAP"
|
||||
@@ -111,6 +130,11 @@ async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
|
||||
assert (
|
||||
"ScalarIndexQuery"
|
||||
in await some_table.query().where("is_active = TRUE").explain_plan()
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_label_list_index(some_table: AsyncTable):
|
||||
@@ -181,6 +205,16 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
|
||||
assert stats.loss >= 0.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_ivfrq_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=IvfRq(num_bits=1))
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "IvfRq"
|
||||
assert indices[0].columns == ["vector"]
|
||||
assert indices[0].name == "vector_idx"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_hnswpq_index(some_table: AsyncTable):
|
||||
await some_table.create_index("vector", config=HnswPq(num_partitions=10))
|
||||
|
||||
@@ -59,6 +59,14 @@ class TempNamespace(LanceNamespace):
|
||||
root
|
||||
] # Reference to shared namespaces
|
||||
|
||||
def namespace_id(self) -> str:
|
||||
"""Return a human-readable unique identifier for this namespace instance.
|
||||
|
||||
Returns:
|
||||
A unique identifier string based on the root directory
|
||||
"""
|
||||
return f"TempNamespace {{ root: '{self.config.root}' }}"
|
||||
|
||||
def list_tables(self, request: ListTablesRequest) -> ListTablesResponse:
|
||||
"""List all tables in the namespace."""
|
||||
if not request.id:
|
||||
|
||||
462
python/python/tests/test_permutation.py
Normal file
462
python/python/tests/test_permutation.py
Normal file
@@ -0,0 +1,462 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
|
||||
from lancedb.permutation import permutation_builder
|
||||
|
||||
|
||||
def test_split_random_ratios(mem_db):
|
||||
"""Test random splitting with ratios."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||
)
|
||||
permutation_tbl = permutation_builder(tbl).split_random(ratios=[0.3, 0.7]).execute()
|
||||
|
||||
# Check that the table was created and has data
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
# Check that split_id column exists and has correct values
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
split_ids = data["split_id"]
|
||||
assert set(split_ids) == {0, 1}
|
||||
|
||||
# Check approximate split sizes (allowing for rounding)
|
||||
split_0_count = split_ids.count(0)
|
||||
split_1_count = split_ids.count(1)
|
||||
assert 25 <= split_0_count <= 35 # ~30% ± tolerance
|
||||
assert 65 <= split_1_count <= 75 # ~70% ± tolerance
|
||||
|
||||
|
||||
def test_split_random_counts(mem_db):
|
||||
"""Test random splitting with absolute counts."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||
)
|
||||
permutation_tbl = permutation_builder(tbl).split_random(counts=[20, 30]).execute()
|
||||
|
||||
# Check that we have exactly the requested counts
|
||||
assert permutation_tbl.count_rows() == 50
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
split_ids = data["split_id"]
|
||||
assert split_ids.count(0) == 20
|
||||
assert split_ids.count(1) == 30
|
||||
|
||||
|
||||
def test_split_random_fixed(mem_db):
|
||||
"""Test random splitting with fixed number of splits."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||
)
|
||||
permutation_tbl = permutation_builder(tbl).split_random(fixed=4).execute()
|
||||
|
||||
# Check that we have 4 splits with 25 rows each
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
split_ids = data["split_id"]
|
||||
assert set(split_ids) == {0, 1, 2, 3}
|
||||
|
||||
for split_id in range(4):
|
||||
assert split_ids.count(split_id) == 25
|
||||
|
||||
|
||||
def test_split_random_with_seed(mem_db):
|
||||
"""Test that seeded random splits are reproducible."""
|
||||
tbl = mem_db.create_table("test_table", pa.table({"x": range(50), "y": range(50)}))
|
||||
|
||||
# Create two identical permutations with same seed
|
||||
perm1 = permutation_builder(tbl).split_random(ratios=[0.6, 0.4], seed=42).execute()
|
||||
|
||||
perm2 = permutation_builder(tbl).split_random(ratios=[0.6, 0.4], seed=42).execute()
|
||||
|
||||
# Results should be identical
|
||||
data1 = perm1.search(None).to_arrow().to_pydict()
|
||||
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||
|
||||
assert data1["row_id"] == data2["row_id"]
|
||||
assert data1["split_id"] == data2["split_id"]
|
||||
|
||||
|
||||
def test_split_hash(mem_db):
|
||||
"""Test hash-based splitting."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table",
|
||||
pa.table(
|
||||
{
|
||||
"id": range(100),
|
||||
"category": (["A", "B", "C"] * 34)[:100], # Repeating pattern
|
||||
"value": range(100),
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.split_hash(["category"], [1, 1], discard_weight=0)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Should have all 100 rows (no discard)
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
split_ids = data["split_id"]
|
||||
assert set(split_ids) == {0, 1}
|
||||
|
||||
# Verify that each split has roughly 50 rows (allowing for hash variance)
|
||||
split_0_count = split_ids.count(0)
|
||||
split_1_count = split_ids.count(1)
|
||||
assert 30 <= split_0_count <= 70 # ~50 ± 20 tolerance for hash distribution
|
||||
assert 30 <= split_1_count <= 70 # ~50 ± 20 tolerance for hash distribution
|
||||
|
||||
# Hash splits should be deterministic - same category should go to same split
|
||||
# Let's verify by creating another permutation and checking consistency
|
||||
perm2 = (
|
||||
permutation_builder(tbl)
|
||||
.split_hash(["category"], [1, 1], discard_weight=0)
|
||||
.execute()
|
||||
)
|
||||
|
||||
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||
assert data["split_id"] == data2["split_id"] # Should be identical
|
||||
|
||||
|
||||
def test_split_hash_with_discard(mem_db):
|
||||
"""Test hash-based splitting with discard weight."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table",
|
||||
pa.table({"id": range(100), "category": ["A", "B"] * 50, "value": range(100)}),
|
||||
)
|
||||
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.split_hash(["category"], [1, 1], discard_weight=2) # Should discard ~50%
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Should have fewer than 100 rows due to discard
|
||||
row_count = permutation_tbl.count_rows()
|
||||
assert row_count < 100
|
||||
assert row_count > 0 # But not empty
|
||||
|
||||
|
||||
def test_split_sequential(mem_db):
|
||||
"""Test sequential splitting."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||
)
|
||||
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl).split_sequential(counts=[30, 40]).execute()
|
||||
)
|
||||
|
||||
assert permutation_tbl.count_rows() == 70
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
row_ids = data["row_id"]
|
||||
split_ids = data["split_id"]
|
||||
|
||||
# Sequential should maintain order
|
||||
assert row_ids == sorted(row_ids)
|
||||
|
||||
# First 30 should be split 0, next 40 should be split 1
|
||||
assert split_ids[:30] == [0] * 30
|
||||
assert split_ids[30:] == [1] * 40
|
||||
|
||||
|
||||
def test_split_calculated(mem_db):
|
||||
"""Test calculated splitting."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(100), "value": range(100)})
|
||||
)
|
||||
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.split_calculated("id % 3") # Split based on id modulo 3
|
||||
.execute()
|
||||
)
|
||||
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
row_ids = data["row_id"]
|
||||
split_ids = data["split_id"]
|
||||
|
||||
# Verify the calculation: each row's split_id should equal row_id % 3
|
||||
for i, (row_id, split_id) in enumerate(zip(row_ids, split_ids)):
|
||||
assert split_id == row_id % 3
|
||||
|
||||
|
||||
def test_split_error_cases(mem_db):
|
||||
"""Test error handling for invalid split parameters."""
|
||||
tbl = mem_db.create_table("test_table", pa.table({"x": range(10), "y": range(10)}))
|
||||
|
||||
# Test split_random with no parameters
|
||||
with pytest.raises(Exception):
|
||||
permutation_builder(tbl).split_random().execute()
|
||||
|
||||
# Test split_random with multiple parameters
|
||||
with pytest.raises(Exception):
|
||||
permutation_builder(tbl).split_random(
|
||||
ratios=[0.5, 0.5], counts=[5, 5]
|
||||
).execute()
|
||||
|
||||
# Test split_sequential with no parameters
|
||||
with pytest.raises(Exception):
|
||||
permutation_builder(tbl).split_sequential().execute()
|
||||
|
||||
# Test split_sequential with multiple parameters
|
||||
with pytest.raises(Exception):
|
||||
permutation_builder(tbl).split_sequential(ratios=[0.5, 0.5], fixed=2).execute()
|
||||
|
||||
|
||||
def test_shuffle_no_seed(mem_db):
|
||||
"""Test shuffling without a seed."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(100), "value": range(100)})
|
||||
)
|
||||
|
||||
# Create a permutation with shuffling (no seed)
|
||||
permutation_tbl = permutation_builder(tbl).shuffle().execute()
|
||||
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
row_ids = data["row_id"]
|
||||
|
||||
# Row IDs should not be in sequential order due to shuffling
|
||||
# This is probabilistic but with 100 rows, it's extremely unlikely they'd stay
|
||||
# in order
|
||||
assert row_ids != list(range(100))
|
||||
|
||||
|
||||
def test_shuffle_with_seed(mem_db):
|
||||
"""Test that shuffling with a seed is reproducible."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(50), "value": range(50)})
|
||||
)
|
||||
|
||||
# Create two identical permutations with same shuffle seed
|
||||
perm1 = permutation_builder(tbl).shuffle(seed=42).execute()
|
||||
|
||||
perm2 = permutation_builder(tbl).shuffle(seed=42).execute()
|
||||
|
||||
# Results should be identical due to same seed
|
||||
data1 = perm1.search(None).to_arrow().to_pydict()
|
||||
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||
|
||||
assert data1["row_id"] == data2["row_id"]
|
||||
assert data1["split_id"] == data2["split_id"]
|
||||
|
||||
|
||||
def test_shuffle_with_clump_size(mem_db):
|
||||
"""Test shuffling with clump size."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(100), "value": range(100)})
|
||||
)
|
||||
|
||||
# Create a permutation with shuffling using clumps
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.shuffle(clump_size=10) # 10-row clumps
|
||||
.execute()
|
||||
)
|
||||
|
||||
assert permutation_tbl.count_rows() == 100
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
row_ids = data["row_id"]
|
||||
|
||||
for i in range(10):
|
||||
start = row_ids[i * 10]
|
||||
assert row_ids[i * 10 : (i + 1) * 10] == list(range(start, start + 10))
|
||||
|
||||
|
||||
def test_shuffle_different_seeds(mem_db):
|
||||
"""Test that different seeds produce different shuffle orders."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(50), "value": range(50)})
|
||||
)
|
||||
|
||||
# Create two permutations with different shuffle seeds
|
||||
perm1 = permutation_builder(tbl).split_random(fixed=2).shuffle(seed=42).execute()
|
||||
|
||||
perm2 = permutation_builder(tbl).split_random(fixed=2).shuffle(seed=123).execute()
|
||||
|
||||
# Results should be different due to different seeds
|
||||
data1 = perm1.search(None).to_arrow().to_pydict()
|
||||
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||
|
||||
# Row order should be different
|
||||
assert data1["row_id"] != data2["row_id"]
|
||||
|
||||
|
||||
def test_shuffle_combined_with_splits(mem_db):
|
||||
"""Test shuffling combined with different split strategies."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table",
|
||||
pa.table(
|
||||
{
|
||||
"id": range(100),
|
||||
"category": (["A", "B", "C"] * 34)[:100],
|
||||
"value": range(100),
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
# Test shuffle with random splits
|
||||
perm_random = (
|
||||
permutation_builder(tbl)
|
||||
.split_random(ratios=[0.6, 0.4], seed=42)
|
||||
.shuffle(seed=123, clump_size=None)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Test shuffle with hash splits
|
||||
perm_hash = (
|
||||
permutation_builder(tbl)
|
||||
.split_hash(["category"], [1, 1], discard_weight=0)
|
||||
.shuffle(seed=456, clump_size=5)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Test shuffle with sequential splits
|
||||
perm_sequential = (
|
||||
permutation_builder(tbl)
|
||||
.split_sequential(counts=[40, 35])
|
||||
.shuffle(seed=789, clump_size=None)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Verify all permutations work and have expected properties
|
||||
assert perm_random.count_rows() == 100
|
||||
assert perm_hash.count_rows() == 100
|
||||
assert perm_sequential.count_rows() == 75
|
||||
|
||||
# Verify shuffle affected the order
|
||||
data_random = perm_random.search(None).to_arrow().to_pydict()
|
||||
data_sequential = perm_sequential.search(None).to_arrow().to_pydict()
|
||||
|
||||
assert data_random["row_id"] != list(range(100))
|
||||
assert data_sequential["row_id"] != list(range(75))
|
||||
|
||||
|
||||
def test_no_shuffle_maintains_order(mem_db):
|
||||
"""Test that not calling shuffle maintains the original order."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(50), "value": range(50)})
|
||||
)
|
||||
|
||||
# Create permutation without shuffle (should maintain some order)
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.split_sequential(counts=[25, 25]) # Sequential maintains order
|
||||
.execute()
|
||||
)
|
||||
|
||||
assert permutation_tbl.count_rows() == 50
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
row_ids = data["row_id"]
|
||||
|
||||
# With sequential splits and no shuffle, should maintain order
|
||||
assert row_ids == list(range(50))
|
||||
|
||||
|
||||
def test_filter_basic(mem_db):
|
||||
"""Test basic filtering functionality."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(100), "value": range(100, 200)})
|
||||
)
|
||||
|
||||
# Filter to only include rows where id < 50
|
||||
permutation_tbl = permutation_builder(tbl).filter("id < 50").execute()
|
||||
|
||||
assert permutation_tbl.count_rows() == 50
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
row_ids = data["row_id"]
|
||||
|
||||
# All row_ids should be less than 50
|
||||
assert all(row_id < 50 for row_id in row_ids)
|
||||
|
||||
|
||||
def test_filter_with_splits(mem_db):
|
||||
"""Test filtering combined with split strategies."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table",
|
||||
pa.table(
|
||||
{
|
||||
"id": range(100),
|
||||
"category": (["A", "B", "C"] * 34)[:100],
|
||||
"value": range(100),
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
# Filter to only category A and B, then split
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.filter("category IN ('A', 'B')")
|
||||
.split_random(ratios=[0.5, 0.5])
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Should have fewer than 100 rows due to filtering
|
||||
row_count = permutation_tbl.count_rows()
|
||||
assert row_count == 67
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
categories = data["category"]
|
||||
|
||||
# All categories should be A or B
|
||||
assert all(cat in ["A", "B"] for cat in categories)
|
||||
|
||||
|
||||
def test_filter_with_shuffle(mem_db):
|
||||
"""Test filtering combined with shuffling."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table",
|
||||
pa.table(
|
||||
{
|
||||
"id": range(100),
|
||||
"category": (["A", "B", "C", "D"] * 25)[:100],
|
||||
"value": range(100),
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
# Filter and shuffle
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.filter("category IN ('A', 'C')")
|
||||
.shuffle(seed=42)
|
||||
.execute()
|
||||
)
|
||||
|
||||
row_count = permutation_tbl.count_rows()
|
||||
assert row_count == 50 # Should have 50 rows (A and C categories)
|
||||
|
||||
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||
row_ids = data["row_id"]
|
||||
|
||||
assert row_ids != sorted(row_ids)
|
||||
|
||||
|
||||
def test_filter_empty_result(mem_db):
|
||||
"""Test filtering that results in empty set."""
|
||||
tbl = mem_db.create_table(
|
||||
"test_table", pa.table({"id": range(10), "value": range(10)})
|
||||
)
|
||||
|
||||
# Filter that matches nothing
|
||||
permutation_tbl = (
|
||||
permutation_builder(tbl)
|
||||
.filter("value > 100") # No values > 100 in our data
|
||||
.execute()
|
||||
)
|
||||
|
||||
assert permutation_tbl.count_rows() == 0
|
||||
@@ -1298,6 +1298,79 @@ async def test_query_serialization_async(table_async: AsyncTable):
|
||||
)
|
||||
|
||||
|
||||
def test_query_schema(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table(
|
||||
"test",
|
||||
pa.table(
|
||||
{
|
||||
"a": [1, 2, 3],
|
||||
"text": ["a", "b", "c"],
|
||||
"vec": pa.array(
|
||||
[[1, 2], [3, 4], [5, 6]], pa.list_(pa.float32(), list_size=2)
|
||||
),
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
assert tbl.search(None).output_schema() == pa.schema(
|
||||
{
|
||||
"a": pa.int64(),
|
||||
"text": pa.string(),
|
||||
"vec": pa.list_(pa.float32(), list_size=2),
|
||||
}
|
||||
)
|
||||
assert tbl.search(None).select({"bl": "a * 2"}).output_schema() == pa.schema(
|
||||
{"bl": pa.int64()}
|
||||
)
|
||||
assert tbl.search([1, 2]).select(["a"]).output_schema() == pa.schema(
|
||||
{"a": pa.int64(), "_distance": pa.float32()}
|
||||
)
|
||||
assert tbl.search("blah").select(["a"]).output_schema() == pa.schema(
|
||||
{"a": pa.int64()}
|
||||
)
|
||||
assert tbl.take_offsets([0]).select(["text"]).output_schema() == pa.schema(
|
||||
{"text": pa.string()}
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_query_schema_async(tmp_path):
|
||||
db = await lancedb.connect_async(tmp_path)
|
||||
tbl = await db.create_table(
|
||||
"test",
|
||||
pa.table(
|
||||
{
|
||||
"a": [1, 2, 3],
|
||||
"text": ["a", "b", "c"],
|
||||
"vec": pa.array(
|
||||
[[1, 2], [3, 4], [5, 6]], pa.list_(pa.float32(), list_size=2)
|
||||
),
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
assert await tbl.query().output_schema() == pa.schema(
|
||||
{
|
||||
"a": pa.int64(),
|
||||
"text": pa.string(),
|
||||
"vec": pa.list_(pa.float32(), list_size=2),
|
||||
}
|
||||
)
|
||||
assert await tbl.query().select({"bl": "a * 2"}).output_schema() == pa.schema(
|
||||
{"bl": pa.int64()}
|
||||
)
|
||||
assert await tbl.vector_search([1, 2]).select(["a"]).output_schema() == pa.schema(
|
||||
{"a": pa.int64(), "_distance": pa.float32()}
|
||||
)
|
||||
assert await (await tbl.search("blah")).select(["a"]).output_schema() == pa.schema(
|
||||
{"a": pa.int64()}
|
||||
)
|
||||
assert await tbl.take_offsets([0]).select(["text"]).output_schema() == pa.schema(
|
||||
{"text": pa.string()}
|
||||
)
|
||||
|
||||
|
||||
def test_query_timeout(tmp_path):
|
||||
# Use local directory instead of memory:// to add a bit of latency to
|
||||
# operations so a timeout of zero will trigger exceptions.
|
||||
|
||||
@@ -4,7 +4,10 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
|
||||
use lancedb::{connection::Connection as LanceConnection, database::CreateTableMode};
|
||||
use lancedb::{
|
||||
connection::Connection as LanceConnection,
|
||||
database::{CreateTableMode, ReadConsistency},
|
||||
};
|
||||
use pyo3::{
|
||||
exceptions::{PyRuntimeError, PyValueError},
|
||||
pyclass, pyfunction, pymethods, Bound, FromPyObject, Py, PyAny, PyRef, PyResult, Python,
|
||||
@@ -23,7 +26,7 @@ impl Connection {
|
||||
Self { inner: Some(inner) }
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> PyResult<&LanceConnection> {
|
||||
pub(crate) fn get_inner(&self) -> PyResult<&LanceConnection> {
|
||||
self.inner
|
||||
.as_ref()
|
||||
.ok_or_else(|| PyRuntimeError::new_err("Connection is closed"))
|
||||
@@ -63,6 +66,18 @@ impl Connection {
|
||||
self.get_inner().map(|inner| inner.uri().to_string())
|
||||
}
|
||||
|
||||
#[pyo3(signature = ())]
|
||||
pub fn get_read_consistency_interval(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
Ok(match inner.read_consistency().await.infer_error()? {
|
||||
ReadConsistency::Manual => None,
|
||||
ReadConsistency::Eventual(duration) => Some(duration.as_secs_f64()),
|
||||
ReadConsistency::Strong => Some(0.0_f64),
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (namespace=vec![], start_after=None, limit=None))]
|
||||
pub fn table_names(
|
||||
self_: PyRef<'_, Self>,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use lancedb::index::vector::IvfFlatIndexBuilder;
|
||||
use lancedb::index::vector::{IvfFlatIndexBuilder, IvfRqIndexBuilder};
|
||||
use lancedb::index::{
|
||||
scalar::{BTreeIndexBuilder, FtsIndexBuilder},
|
||||
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
|
||||
@@ -87,6 +87,22 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
}
|
||||
Ok(LanceDbIndex::IvfPq(ivf_pq_builder))
|
||||
},
|
||||
"IvfRq" => {
|
||||
let params = source.extract::<IvfRqParams>()?;
|
||||
let distance_type = parse_distance_type(params.distance_type)?;
|
||||
let mut ivf_rq_builder = IvfRqIndexBuilder::default()
|
||||
.distance_type(distance_type)
|
||||
.max_iterations(params.max_iterations)
|
||||
.sample_rate(params.sample_rate)
|
||||
.num_bits(params.num_bits);
|
||||
if let Some(num_partitions) = params.num_partitions {
|
||||
ivf_rq_builder = ivf_rq_builder.num_partitions(num_partitions);
|
||||
}
|
||||
if let Some(target_partition_size) = params.target_partition_size {
|
||||
ivf_rq_builder = ivf_rq_builder.target_partition_size(target_partition_size);
|
||||
}
|
||||
Ok(LanceDbIndex::IvfRq(ivf_rq_builder))
|
||||
},
|
||||
"HnswPq" => {
|
||||
let params = source.extract::<IvfHnswPqParams>()?;
|
||||
let distance_type = parse_distance_type(params.distance_type)?;
|
||||
@@ -170,6 +186,16 @@ struct IvfPqParams {
|
||||
target_partition_size: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
struct IvfRqParams {
|
||||
distance_type: String,
|
||||
num_partitions: Option<u32>,
|
||||
num_bits: u32,
|
||||
max_iterations: u32,
|
||||
sample_rate: u32,
|
||||
target_partition_size: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
struct IvfHnswPqParams {
|
||||
distance_type: String,
|
||||
|
||||
@@ -5,6 +5,7 @@ use arrow::RecordBatchStream;
|
||||
use connection::{connect, Connection};
|
||||
use env_logger::Env;
|
||||
use index::IndexConfig;
|
||||
use permutation::PyAsyncPermutationBuilder;
|
||||
use pyo3::{
|
||||
pymodule,
|
||||
types::{PyModule, PyModuleMethods},
|
||||
@@ -22,6 +23,7 @@ pub mod connection;
|
||||
pub mod error;
|
||||
pub mod header;
|
||||
pub mod index;
|
||||
pub mod permutation;
|
||||
pub mod query;
|
||||
pub mod session;
|
||||
pub mod table;
|
||||
@@ -49,7 +51,9 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
m.add_class::<DeleteResult>()?;
|
||||
m.add_class::<DropColumnsResult>()?;
|
||||
m.add_class::<UpdateResult>()?;
|
||||
m.add_class::<PyAsyncPermutationBuilder>()?;
|
||||
m.add_function(wrap_pyfunction!(connect, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
|
||||
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
|
||||
Ok(())
|
||||
|
||||
170
python/src/permutation.rs
Normal file
170
python/src/permutation.rs
Normal file
@@ -0,0 +1,170 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use crate::{error::PythonErrorExt, table::Table};
|
||||
use lancedb::dataloader::{
|
||||
permutation::builder::{PermutationBuilder as LancePermutationBuilder, ShuffleStrategy},
|
||||
permutation::split::{SplitSizes, SplitStrategy},
|
||||
};
|
||||
use pyo3::{
|
||||
exceptions::PyRuntimeError, pyclass, pymethods, types::PyAnyMethods, Bound, PyAny, PyRefMut,
|
||||
PyResult,
|
||||
};
|
||||
use pyo3_async_runtimes::tokio::future_into_py;
|
||||
|
||||
/// Create a permutation builder for the given table
|
||||
#[pyo3::pyfunction]
|
||||
pub fn async_permutation_builder(table: Bound<'_, PyAny>) -> PyResult<PyAsyncPermutationBuilder> {
|
||||
let table = table.getattr("_inner")?.downcast_into::<Table>()?;
|
||||
let inner_table = table.borrow().inner_ref()?.clone();
|
||||
let inner_builder = LancePermutationBuilder::new(inner_table);
|
||||
|
||||
Ok(PyAsyncPermutationBuilder {
|
||||
state: Arc::new(Mutex::new(PyAsyncPermutationBuilderState {
|
||||
builder: Some(inner_builder),
|
||||
})),
|
||||
})
|
||||
}
|
||||
|
||||
struct PyAsyncPermutationBuilderState {
|
||||
builder: Option<LancePermutationBuilder>,
|
||||
}
|
||||
|
||||
#[pyclass(name = "AsyncPermutationBuilder")]
|
||||
pub struct PyAsyncPermutationBuilder {
|
||||
state: Arc<Mutex<PyAsyncPermutationBuilderState>>,
|
||||
}
|
||||
|
||||
impl PyAsyncPermutationBuilder {
|
||||
fn modify(
|
||||
&self,
|
||||
func: impl FnOnce(LancePermutationBuilder) -> LancePermutationBuilder,
|
||||
) -> PyResult<Self> {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
let builder = state
|
||||
.builder
|
||||
.take()
|
||||
.ok_or_else(|| PyRuntimeError::new_err("Builder already consumed"))?;
|
||||
state.builder = Some(func(builder));
|
||||
Ok(Self {
|
||||
state: self.state.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl PyAsyncPermutationBuilder {
|
||||
#[pyo3(signature = (*, ratios=None, counts=None, fixed=None, seed=None))]
|
||||
pub fn split_random(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
ratios: Option<Vec<f64>>,
|
||||
counts: Option<Vec<u64>>,
|
||||
fixed: Option<u64>,
|
||||
seed: Option<u64>,
|
||||
) -> PyResult<Self> {
|
||||
// Check that exactly one split type is provided
|
||||
let split_args_count = [ratios.is_some(), counts.is_some(), fixed.is_some()]
|
||||
.iter()
|
||||
.filter(|&&x| x)
|
||||
.count();
|
||||
|
||||
if split_args_count != 1 {
|
||||
return Err(pyo3::exceptions::PyValueError::new_err(
|
||||
"Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
|
||||
));
|
||||
}
|
||||
|
||||
let sizes = if let Some(ratios) = ratios {
|
||||
SplitSizes::Percentages(ratios)
|
||||
} else if let Some(counts) = counts {
|
||||
SplitSizes::Counts(counts)
|
||||
} else if let Some(fixed) = fixed {
|
||||
SplitSizes::Fixed(fixed)
|
||||
} else {
|
||||
unreachable!("One of the split arguments must be provided");
|
||||
};
|
||||
|
||||
slf.modify(|builder| builder.with_split_strategy(SplitStrategy::Random { seed, sizes }))
|
||||
}
|
||||
|
||||
#[pyo3(signature = (columns, split_weights, *, discard_weight=0))]
|
||||
pub fn split_hash(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
columns: Vec<String>,
|
||||
split_weights: Vec<u64>,
|
||||
discard_weight: u64,
|
||||
) -> PyResult<Self> {
|
||||
slf.modify(|builder| {
|
||||
builder.with_split_strategy(SplitStrategy::Hash {
|
||||
columns,
|
||||
split_weights,
|
||||
discard_weight,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (*, ratios=None, counts=None, fixed=None))]
|
||||
pub fn split_sequential(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
ratios: Option<Vec<f64>>,
|
||||
counts: Option<Vec<u64>>,
|
||||
fixed: Option<u64>,
|
||||
) -> PyResult<Self> {
|
||||
// Check that exactly one split type is provided
|
||||
let split_args_count = [ratios.is_some(), counts.is_some(), fixed.is_some()]
|
||||
.iter()
|
||||
.filter(|&&x| x)
|
||||
.count();
|
||||
|
||||
if split_args_count != 1 {
|
||||
return Err(pyo3::exceptions::PyValueError::new_err(
|
||||
"Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
|
||||
));
|
||||
}
|
||||
|
||||
let sizes = if let Some(ratios) = ratios {
|
||||
SplitSizes::Percentages(ratios)
|
||||
} else if let Some(counts) = counts {
|
||||
SplitSizes::Counts(counts)
|
||||
} else if let Some(fixed) = fixed {
|
||||
SplitSizes::Fixed(fixed)
|
||||
} else {
|
||||
unreachable!("One of the split arguments must be provided");
|
||||
};
|
||||
|
||||
slf.modify(|builder| builder.with_split_strategy(SplitStrategy::Sequential { sizes }))
|
||||
}
|
||||
|
||||
pub fn split_calculated(slf: PyRefMut<'_, Self>, calculation: String) -> PyResult<Self> {
|
||||
slf.modify(|builder| builder.with_split_strategy(SplitStrategy::Calculated { calculation }))
|
||||
}
|
||||
|
||||
pub fn shuffle(
|
||||
slf: PyRefMut<'_, Self>,
|
||||
seed: Option<u64>,
|
||||
clump_size: Option<u64>,
|
||||
) -> PyResult<Self> {
|
||||
slf.modify(|builder| {
|
||||
builder.with_shuffle_strategy(ShuffleStrategy::Random { seed, clump_size })
|
||||
})
|
||||
}
|
||||
|
||||
pub fn filter(slf: PyRefMut<'_, Self>, filter: String) -> PyResult<Self> {
|
||||
slf.modify(|builder| builder.with_filter(filter))
|
||||
}
|
||||
|
||||
pub fn execute(slf: PyRefMut<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let mut state = slf.state.lock().unwrap();
|
||||
let builder = state
|
||||
.builder
|
||||
.take()
|
||||
.ok_or_else(|| PyRuntimeError::new_err("Builder already consumed"))?;
|
||||
|
||||
future_into_py(slf.py(), async move {
|
||||
let table = builder.build().await.infer_error()?;
|
||||
Ok(Table::new(table))
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -9,6 +9,7 @@ use arrow::array::Array;
|
||||
use arrow::array::ArrayData;
|
||||
use arrow::pyarrow::FromPyArrow;
|
||||
use arrow::pyarrow::IntoPyArrow;
|
||||
use arrow::pyarrow::ToPyArrow;
|
||||
use lancedb::index::scalar::{
|
||||
BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
|
||||
Operator, PhraseQuery,
|
||||
@@ -30,6 +31,7 @@ use pyo3::IntoPyObject;
|
||||
use pyo3::PyAny;
|
||||
use pyo3::PyRef;
|
||||
use pyo3::PyResult;
|
||||
use pyo3::Python;
|
||||
use pyo3::{exceptions::PyRuntimeError, FromPyObject};
|
||||
use pyo3::{
|
||||
exceptions::{PyNotImplementedError, PyValueError},
|
||||
@@ -445,6 +447,15 @@ impl Query {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = ())]
|
||||
pub fn output_schema(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let schema = inner.output_schema().await.infer_error()?;
|
||||
Python::with_gil(|py| schema.to_pyarrow(py))
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None, timeout=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
@@ -515,6 +526,15 @@ impl TakeQuery {
|
||||
self.inner = self.inner.clone().with_row_id();
|
||||
}
|
||||
|
||||
#[pyo3(signature = ())]
|
||||
pub fn output_schema(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let schema = inner.output_schema().await.infer_error()?;
|
||||
Python::with_gil(|py| schema.to_pyarrow(py))
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None, timeout=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
@@ -601,6 +621,15 @@ impl FTSQuery {
|
||||
self.inner = self.inner.clone().postfilter();
|
||||
}
|
||||
|
||||
#[pyo3(signature = ())]
|
||||
pub fn output_schema(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let schema = inner.output_schema().await.infer_error()?;
|
||||
Python::with_gil(|py| schema.to_pyarrow(py))
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None, timeout=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
@@ -771,6 +800,15 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().bypass_vector_index()
|
||||
}
|
||||
|
||||
#[pyo3(signature = ())]
|
||||
pub fn output_schema(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let schema = inner.output_schema().await.infer_error()?;
|
||||
Python::with_gil(|py| schema.to_pyarrow(py))
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None, timeout=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user