Mirror of https://github.com/lancedb/lancedb.git (synced 2026-01-03 10:22:56 +00:00)

Compare commits: python-v0. ... python-v0. (61 commits)
Commits in this comparison (SHA1):

143184c0ae, dadb042978, 5a19cf15a6, 3dcec724b7, 86a6bb9fcb, b59d1007d3, 56a16b1728, b7afed9beb,
5cbbaa2e4a, 1b6bd2498e, 285da9db1d, ad8306c96b, 3594538509, 917aabd077, 5ec12c9971, d0ce489b21,
d7e02c8181, 70958f6366, 1ac745eb18, 1357fe8aa1, 0d78929893, 9e2a68541e, 1aa0fd16e7, fec2a05629,
79a1cd60ee, 88807a59a4, e0e7e01ea8, a416ebc11d, f941054baf, 1a81c46505, 82b25a71e9, 13c613d45f,
e07389a36c, e7e9e80b1d, 247fb58400, 504bdc471c, d617cdef4a, 356d7046fd, 48e5caabda, d6cc68f671,
55eacfa685, 222e3264ab, 13505026cb, b0800b4b71, 1befebf614, 1ab60fae7f, e921c90c1b, 05a4ea646a,
ebbeeff4e0, 407ca53f92, ff71d7e552, 2261eb95a0, 5b397e410b, b5a39bffec, 5e1e9add07, 97e9938dfe,
1d4b92e01e, 4c9fc3044b, 0ebc8d45a8, f7d78c3420, 6ea6884260

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.22.1-beta.0"
+current_version = "0.22.2"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.

.github/actions/create-failure-issue/action.yml (new file, 45 lines)
@@ -0,0 +1,45 @@
name: Create Failure Issue
description: Creates a GitHub issue if any jobs in the workflow failed

inputs:
  job-results:
    description: 'JSON string of job results from needs context'
    required: true
  workflow-name:
    description: 'Name of the workflow'
    required: true

runs:
  using: composite
  steps:
    - name: Check for failures and create issue
      shell: bash
      env:
        JOB_RESULTS: ${{ inputs.job-results }}
        WORKFLOW_NAME: ${{ inputs.workflow-name }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        GH_TOKEN: ${{ github.token }}
      run: |
        # Check if any job failed
        if echo "$JOB_RESULTS" | jq -e 'to_entries | any(.value.result == "failure")' > /dev/null; then
          echo "Detected job failures, creating issue..."

          # Extract failed job names
          FAILED_JOBS=$(echo "$JOB_RESULTS" | jq -r 'to_entries | map(select(.value.result == "failure")) | map(.key) | join(", ")')

          # Create issue with workflow name, failed jobs, and run URL
          gh issue create \
            --title "$WORKFLOW_NAME Failed ($FAILED_JOBS)" \
            --body "The workflow **$WORKFLOW_NAME** failed during execution.

        **Failed jobs:** $FAILED_JOBS

        **Run URL:** $RUN_URL

        Please investigate the failed jobs and address any issues." \
            --label "ci"

          echo "Issue created successfully"
        else
          echo "No job failures detected, skipping issue creation"
        fi

.github/workflows/cargo-publish.yml (14 lines changed)
@@ -38,3 +38,17 @@ jobs:
       - name: Publish the package
         run: |
           cargo publish -p lancedb --all-features --token ${{ steps.auth.outputs.token }}
+  report-failure:
+    name: Report Workflow Failure
+    runs-on: ubuntu-latest
+    needs: [build]
+    if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
+    permissions:
+      contents: read
+      issues: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/create-failure-issue
+        with:
+          job-results: ${{ toJSON(needs) }}
+          workflow-name: ${{ github.workflow }}

.github/workflows/docs.yml (3 lines changed)
@@ -56,8 +56,9 @@ jobs:
         with:
           node-version: 20
           cache: 'npm'
+          cache-dependency-path: docs/package-lock.json
       - name: Install node dependencies
-        working-directory: node
+        working-directory: nodejs
         run: |
           sudo apt update
           sudo apt install -y protobuf-compiler libssl-dev

.github/workflows/docs_test.yml (4 lines changed)
@@ -24,7 +24,8 @@ env:
 jobs:
   test-python:
     name: Test doc python code
-    runs-on: ubuntu-24.04
+    runs-on: warp-ubuntu-2204-x64-8x
+    timeout-minutes: 60
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -48,7 +49,6 @@ jobs:
         uses: swatinem/rust-cache@v2
       - name: Build Python
         working-directory: docs/test
-        timeout-minutes: 60
         run:
           python -m pip install --extra-index-url https://pypi.fury.io/lancedb/ -r requirements.txt
       - name: Create test files

.github/workflows/java-publish.yml (15 lines changed)
@@ -43,7 +43,6 @@ jobs:
       - uses: Swatinem/rust-cache@v2
       - uses: actions-rust-lang/setup-rust-toolchain@v1
         with:
-          toolchain: "1.81.0"
           cache-workspaces: "./java/core/lancedb-jni"
       # Disable full debug symbol generation to speed up CI build and keep memory down
       # "1" means line tables only, which is useful for panic tracebacks.
@@ -112,3 +111,17 @@ jobs:
         env:
           SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
           SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
+  report-failure:
+    name: Report Workflow Failure
+    runs-on: ubuntu-latest
+    needs: [linux-arm64, linux-x86, macos-arm64]
+    if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
+    permissions:
+      contents: read
+      issues: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/create-failure-issue
+        with:
+          job-results: ${{ toJSON(needs) }}
+          workflow-name: ${{ github.workflow }}

.github/workflows/nodejs.yml (3 lines changed)
@@ -6,6 +6,7 @@ on:
       - main
   pull_request:
     paths:
+      - Cargo.toml
       - nodejs/**
       - .github/workflows/nodejs.yml
       - docker-compose.yml
@@ -116,7 +117,7 @@ jobs:
           set -e
           npm ci
           npm run docs
-          if ! git diff --exit-code -- . ':(exclude)Cargo.lock'; then
+          if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
             echo "Docs need to be updated"
             echo "Run 'npm run docs', fix any warnings, and commit the changes."
             exit 1

.github/workflows/npm-publish.yml (14 lines changed)
@@ -365,3 +365,17 @@ jobs:
             ARGS="$ARGS --tag preview"
           fi
           npm publish $ARGS
+  report-failure:
+    name: Report Workflow Failure
+    runs-on: ubuntu-latest
+    needs: [build-lancedb, test-lancedb, publish]
+    if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
+    permissions:
+      contents: read
+      issues: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/create-failure-issue
+        with:
+          job-results: ${{ toJSON(needs) }}
+          workflow-name: ${{ github.workflow }}

.github/workflows/pypi-publish.yml (18 lines changed)
@@ -56,7 +56,7 @@ jobs:
       pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
       fury_token: ${{ secrets.FURY_TOKEN }}
   mac:
-    timeout-minutes: 60
+    timeout-minutes: 90
     runs-on: ${{ matrix.config.runner }}
     strategy:
       matrix:
@@ -64,7 +64,7 @@ jobs:
         - target: x86_64-apple-darwin
           runner: macos-13
         - target: aarch64-apple-darwin
-          runner: macos-14
+          runner: warp-macos-14-arm64-6x
     env:
       MACOSX_DEPLOYMENT_TARGET: 10.15
     steps:
@@ -173,3 +173,17 @@ jobs:
         generate_release_notes: false
         name: Python LanceDB v${{ steps.extract_version.outputs.version }}
         body: ${{ steps.python_release_notes.outputs.changelog }}
+  report-failure:
+    name: Report Workflow Failure
+    runs-on: ubuntu-latest
+    needs: [linux, mac, windows]
+    permissions:
+      contents: read
+      issues: write
+    if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/create-failure-issue
+        with:
+          job-results: ${{ toJSON(needs) }}
+          workflow-name: ${{ github.workflow }}

.github/workflows/python.yml (1 line changed)
@@ -6,6 +6,7 @@ on:
       - main
   pull_request:
     paths:
+      - Cargo.toml
      - python/**
      - .github/workflows/python.yml

.github/workflows/rust.yml (13 lines changed)
@@ -96,6 +96,7 @@ jobs:
       # Need up-to-date compilers for kernels
       CC: clang-18
       CXX: clang++-18
+      GH_TOKEN: ${{ secrets.SOPHON_READ_TOKEN }}
     steps:
       - uses: actions/checkout@v4
         with:
@@ -117,15 +118,17 @@ jobs:
           sudo chmod 600 /swapfile
           sudo mkswap /swapfile
           sudo swapon /swapfile
-      - name: Start S3 integration test environment
-        working-directory: .
-        run: docker compose up --detach --wait
       - name: Build
         run: cargo build --all-features --tests --locked --examples
-      - name: Run tests
-        run: cargo test --all-features --locked
+      - name: Run feature tests
+        run: make -C ./lancedb feature-tests
       - name: Run examples
         run: cargo run --example simple --locked
+      - name: Run remote tests
+        # Running this requires access to secrets, so skip if this is
+        # a PR from a fork.
+        if: github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork
+        run: make -C ./lancedb remote-tests
+
   macos:
     timeout-minutes: 30

.github/workflows/trigger-vectordb-recipes.yml (file deleted, 26 lines)
@@ -1,26 +0,0 @@
name: Trigger vectordb-recipers workflow
on:
  push:
    branches: [ main ]
  pull_request:
    paths:
      - .github/workflows/trigger-vectordb-recipes.yml
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Trigger vectordb-recipes workflow
        uses: actions/github-script@v6
        with:
          github-token: ${{ secrets.VECTORDB_RECIPES_ACTION_TOKEN }}
          script: |
            const result = await github.rest.actions.createWorkflowDispatch({
              owner: 'lancedb',
              repo: 'vectordb-recipes',
              workflow_id: 'examples-test.yml',
              ref: 'main'
            });
            console.log(result);

Cargo.lock (generated, 1307 lines changed): file diff suppressed because it is too large.

Cargo.toml (68 lines changed)
@@ -15,30 +15,34 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"
 
 [workspace.dependencies]
-lance = { "version" = "=0.35.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
-lance-io = { "version" = "=0.35.0", default-features = false, "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
-lance-index = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
-lance-linalg = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
-lance-table = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
-lance-testing = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
-lance-datafusion = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
-lance-encoding = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
+lance = { "version" = "=0.38.2", default-features = false, "features" = ["dynamodb"], "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-core = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-datagen = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-file = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-io = { "version" = "=0.38.2", default-features = false, "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-index = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-linalg = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-table = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-testing = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-datafusion = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-encoding = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-namespace = "0.0.18"
+ahash = "0.8"
 # Note that this one does not include pyarrow
-arrow = { version = "55.1", optional = false }
-arrow-array = "55.1"
-arrow-data = "55.1"
-arrow-ipc = "55.1"
-arrow-ord = "55.1"
-arrow-schema = "55.1"
-arrow-arith = "55.1"
-arrow-cast = "55.1"
+arrow = { version = "56.2", optional = false }
+arrow-array = "56.2"
+arrow-data = "56.2"
+arrow-ipc = "56.2"
+arrow-ord = "56.2"
+arrow-schema = "56.2"
+arrow-cast = "56.2"
 async-trait = "0"
-datafusion = { version = "49.0", default-features = false }
-datafusion-catalog = "49.0"
-datafusion-common = { version = "49.0", default-features = false }
-datafusion-execution = "49.0"
-datafusion-expr = "49.0"
-datafusion-physical-plan = "49.0"
+datafusion = { version = "50.1", default-features = false }
+datafusion-catalog = "50.1"
+datafusion-common = { version = "50.1", default-features = false }
+datafusion-execution = "50.1"
+datafusion-expr = "50.1"
+datafusion-physical-plan = "50.1"
 env_logger = "0.11"
 half = { "version" = "2.6.0", default-features = false, features = [
     "num-traits",
@@ -48,18 +52,26 @@ log = "0.4"
 moka = { version = "0.12", features = ["future"] }
 object_store = "0.12.0"
 pin-project = "1.0.7"
+rand = "0.9"
 snafu = "0.8"
 url = "2"
 num-traits = "0.2"
-rand = "0.9"
 regex = "1.10"
 lazy_static = "1"
 semver = "1.0.25"
 crunchy = "0.2.4"
-# Temporary pins to work around downstream issues
-# https://github.com/apache/arrow-rs/commit/2fddf85afcd20110ce783ed5b4cdeb82293da30b
-chrono = "=0.4.41"
-# https://github.com/RustCrypto/formats/issues/1684
-base64ct = "=1.6.0"
+chrono = "0.4"
 # Workaround for: https://github.com/Lokathor/bytemuck/issues/306
 bytemuck_derive = ">=1.8.1, <1.9.0"
+
+# This is only needed when we reference preview releases of lance
+# Force to use the same lance version as the rest of the project to avoid duplicate dependencies
+[patch.crates-io]
+lance = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-io = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-index = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-linalg = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-table = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-testing = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-datafusion = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-encoding = { "version" = "=0.38.2", "tag" = "v0.38.3-beta.2", "git" = "https://github.com/lancedb/lance.git" }

ci/create_lancedb_test_connection.sh (new executable file, 4 lines)
@@ -0,0 +1,4 @@
#!/usr/bin/env bash

export RUST_LOG=info
exec ./lancedb server --port 0 --sql-port 0 --data-dir "${1}"

ci/run_with_docker_compose.sh (new executable file, 18 lines)
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

#
# A script for running the given command together with a docker compose environment.
#

# Bring down the docker setup once the command is done running.
tear_down() {
  docker compose -p fixture down
}
trap tear_down EXIT

set +xe

# Clean up any existing docker setup and bring up a new one.
docker compose -p fixture up --detach --wait || exit 1

"${@}"

ci/run_with_test_connection.sh (new executable file, 68 lines)
@@ -0,0 +1,68 @@
#!/usr/bin/env bash

#
# A script for running the given command together with the lancedb cli.
#

die() {
  echo $?
  exit 1
}

check_command_exists() {
  command="${1}"
  which ${command} &> /dev/null || \
    die "Unable to locate command: ${command}. Did you install it?"
}

if [[ ! -e ./lancedb ]]; then
  if [[ -v SOPHON_READ_TOKEN ]]; then
    INPUT="lancedb-linux-x64"
    gh release \
      --repo lancedb/lancedb \
      download ci-support-binaries \
      --pattern "${INPUT}" \
      || die "failed to fetch cli."
    check_command_exists openssl
    openssl enc -aes-256-cbc \
      -d -pbkdf2 \
      -pass "env:SOPHON_READ_TOKEN" \
      -in "${INPUT}" \
      -out ./lancedb-linux-x64.tar.gz \
      || die "openssl failed"
    TARGET="${INPUT}.tar.gz"
  else
    ARCH="x64"
    if [[ $OSTYPE == 'darwin'* ]]; then
      UNAME=$(uname -m)
      if [[ $UNAME == 'arm64' ]]; then
        ARCH='arm64'
      fi
      OSTYPE="macos"
    elif [[ $OSTYPE == 'linux'* ]]; then
      if [[ $UNAME == 'aarch64' ]]; then
        ARCH='arm64'
      fi
      OSTYPE="linux"
    else
      die "unknown OSTYPE: $OSTYPE"
    fi

    check_command_exists gh
    TARGET="lancedb-${OSTYPE}-${ARCH}.tar.gz"
    gh release \
      --repo lancedb/sophon \
      download lancedb-cli-v0.0.3 \
      --pattern "${TARGET}" \
      || die "failed to fetch cli."
  fi

  check_command_exists tar
  tar xvf "${TARGET}" || die "tar failed."
  [[ -e ./lancedb ]] || die "failed to extract lancedb."
fi

SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
export CREATE_LANCEDB_TEST_CONNECTION_SCRIPT="${SCRIPT_DIR}/create_lancedb_test_connection.sh"

"${@}"

@@ -1,4 +1,5 @@
 import argparse
+import re
 import sys
 import json
 
@@ -18,8 +19,12 @@ def run_command(command: str) -> str:
 
 def get_latest_stable_version() -> str:
     version_line = run_command("cargo info lance | grep '^version:'")
-    version = version_line.split(" ")[1].strip()
-    return version
+    # Example output: "version: 0.35.0 (latest 0.37.0)"
+    match = re.search(r'\(latest ([0-9.]+)\)', version_line)
+    if match:
+        return match.group(1)
+    # Fallback: use the first version after 'version:'
+    return version_line.split("version:")[1].split()[0].strip()
 
 
 def get_latest_preview_version() -> str:
@@ -112,7 +117,7 @@ def update_cargo_toml(line_updater):
     lance_line = ""
     is_parsing_lance_line = False
     for line in lines:
-        if line.startswith("lance"):
+        if line.startswith("lance") and not line.startswith("lance-namespace"):
             # Check if this is a single-line or multi-line entry
             # Single-line entries either:
             # 1. End with } (complete inline table)

@@ -70,6 +70,23 @@ plugins:
   - mkdocs-jupyter
   - render_swagger:
       allow_arbitrary_locations: true
+  - redirects:
+      redirect_maps:
+        # Redirect the home page and other top-level markdown files. This enables maximum SEO benefit
+        # other sub-pages are handled by the ingected js in overrides/partials/header.html
+        'index.md': 'https://lancedb.com/docs/'
+        'guides/tables.md': 'https://lancedb.com/docs/tables/'
+        'ann_indexes.md': 'https://lancedb.com/docs/indexing/'
+        'basic.md': 'https://lancedb.com/docs/quickstart/'
+        'faq.md': 'https://lancedb.com/docs/faq/'
+        'embeddings/understanding_embeddings.md': 'https://lancedb.com/docs/embedding/'
+        'integrations.md': 'https://lancedb.com/docs/integrations/'
+        'examples.md': 'https://lancedb.com/docs/tutorials/'
+        'concepts/vector_search.md': 'https://lancedb.com/docs/search/vector-search/'
+        'troubleshooting.md': 'https://lancedb.com/docs/troubleshooting/'
+        'guides/storage.md': 'https://lancedb.com/docs/storage/integrations'
+
+
 
 markdown_extensions:
   - admonition

@@ -19,7 +19,13 @@
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 IN THE SOFTWARE.
 -->
+<div id="deprecation-banner" style="background-color: #f8d7da; color: #721c24; padding: 1em; text-align: center;">
+  <p style="margin: 0; font-size: 1.1em;">
+    <strong>This documentation site is deprecated.</strong>
+    Please visit our new documentation site at <a href="https://lancedb.com/docs" style="color: #721c24; text-decoration: underline;">
+    lancedb.com/docs</a> for the latest information.
+  </p>
+</div>
 {% set class = "md-header" %}
 {% if "navigation.tabs.sticky" in features %}
   {% set class = class ~ " md-header--shadow md-header--lifted" %}
@@ -150,9 +156,9 @@

<div style="margin-left: 10px; margin-right: 5px;">
<a href="https://discord.com/invite/zMM32dvNtd" target="_blank" rel="noopener noreferrer">
<svg fill="#FFFFFF" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 50 50" width="25px" height="25px"><path d="M 41.625 10.769531 C 37.644531 7.566406 31.347656 7.023438 31.078125 7.003906 C 30.660156 6.96875 30.261719 7.203125 30.089844 7.589844 C 30.074219 7.613281 29.9375 7.929688 29.785156 8.421875 C 32.417969 8.867188 35.652344 9.761719 38.578125 11.578125 C 39.046875 11.867188 39.191406 12.484375 38.902344 12.953125 C 38.710938 13.261719 38.386719 13.429688 38.050781 13.429688 C 37.871094 13.429688 37.6875 13.378906 37.523438 13.277344 C 32.492188 10.15625 26.210938 10 25 10 C 23.789063 10 17.503906 10.15625 12.476563 13.277344 C 12.007813 13.570313 11.390625 13.425781 11.101563 12.957031 C 10.808594 12.484375 10.953125 11.871094 11.421875 11.578125 C 14.347656 9.765625 17.582031 8.867188 20.214844 8.425781 C 20.0625 7.929688 19.925781 7.617188 19.914063 7.589844 C 19.738281 7.203125 19.34375 6.960938 18.921875 7.003906 C 18.652344 7.023438 12.355469 7.566406 8.320313 10.8125 C 6.214844 12.761719 2 24.152344 2 34 C 2 34.175781 2.046875 34.34375 2.132813 34.496094 C 5.039063 39.605469 12.972656 40.941406 14.78125 41 C 14.789063 41 14.800781 41 14.8125 41 C 15.132813 41 15.433594 40.847656 15.621094 40.589844 L 17.449219 38.074219 C 12.515625 36.800781 9.996094 34.636719 9.851563 34.507813 C 9.4375 34.144531 9.398438 33.511719 9.765625 33.097656 C 10.128906 32.683594 10.761719 32.644531 11.175781 33.007813 C 11.234375 33.0625 15.875 37 25 37 C 34.140625 37 38.78125 33.046875 38.828125 33.007813 C 39.242188 32.648438 39.871094 32.683594 40.238281 33.101563 C 40.601563 33.515625 40.5625 34.144531 40.148438 34.507813 C 40.003906 34.636719 37.484375 36.800781 32.550781 38.074219 L 34.378906 40.589844 C 34.566406 40.847656 34.867188 41 35.1875 41 C 35.199219 41 35.210938 41 35.21875 41 C 37.027344 40.941406 44.960938 39.605469 47.867188 34.496094 C 47.953125 34.34375 48 34.175781 48 34 C 48 24.152344 43.785156 12.761719 41.625 10.769531 Z M 18.5 30 C 16.566406 30 15 28.210938 15 26 C 15 23.789063 16.566406 22 18.5 22 C 20.433594 22 22 23.789063 22 26 C 22 28.210938 20.433594 30 18.5 30 Z M 31.5 30 C 29.566406 30 28 28.210938 28 26 C 28 23.789063 29.566406 22 31.5 22 C 33.433594 22 35 23.789063 35 26 C 35 28.210938 33.433594 30 31.5 30 Z"/></svg>
</a>
</div>
<div style="margin-left: 5px; margin-right: 5px;">
<a href="https://twitter.com/lancedb" target="_blank" rel="noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0,0,256,256" width="25px" height="25px" fill-rule="nonzero"><g fill-opacity="0" fill="#ffffff" fill-rule="nonzero" stroke="none" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="10" stroke-dasharray="" stroke-dashoffset="0" font-family="none" font-weight="none" font-size="none" text-anchor="none" style="mix-blend-mode: normal"><path d="M0,256v-256h256v256z" id="bgRectangle"></path></g><g fill="#ffffff" fill-rule="nonzero" stroke="none" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="10" stroke-dasharray="" stroke-dashoffset="0" font-family="none" font-weight="none" font-size="none" text-anchor="none" style="mix-blend-mode: normal"><g transform="scale(4,4)"><path d="M57,17.114c-1.32,1.973 -2.991,3.707 -4.916,5.097c0.018,0.423 0.028,0.847 0.028,1.274c0,13.013 -9.902,28.018 -28.016,28.018c-5.562,0 -12.81,-1.948 -15.095,-4.423c0.772,0.092 1.556,0.138 2.35,0.138c4.615,0 8.861,-1.575 12.23,-4.216c-4.309,-0.079 -7.946,-2.928 -9.199,-6.84c1.96,0.308 4.447,-0.17 4.447,-0.17c0,0 -7.7,-1.322 -7.899,-9.779c2.226,1.291 4.46,1.231 4.46,1.231c0,0 -4.441,-2.734 -4.379,-8.195c0.037,-3.221 1.331,-4.953 1.331,-4.953c8.414,10.361 20.298,10.29 20.298,10.29c0,0 -0.255,-1.471 -0.255,-2.243c0,-5.437 4.408,-9.847 9.847,-9.847c2.832,0 5.391,1.196 7.187,3.111c2.245,-0.443 4.353,-1.263 6.255,-2.391c-0.859,3.44 -4.329,5.448 -4.329,5.448c0,0 2.969,-0.329 5.655,-1.55z"></path></g></g></svg>
@@ -173,4 +179,77 @@
 {% include "partials/tabs.html" %}
 {% endif %}
 {% endif %}
 </header>
+
+<script>
+  (function() {
+    function checkPathAndRedirect() {
+      var banner = document.getElementById('deprecation-banner');
+
+      if (document.querySelector('meta[http-equiv="refresh"]')) {
+        return; // The redirects plugin is already handling this page.
+      }
+
+      var currentPath = window.location.pathname;
+
+      var cleanPath = currentPath.endsWith('/') && currentPath.length > 1
+        ? currentPath.slice(0, -1)
+        : currentPath;
+
+      // These are the ONLY paths that should remain on the old site
+      var apiPaths = [
+        '/lancedb/python',
+        '/lancedb/javascript',
+        '/lancedb/js',
+        '/lancedb/api_reference'
+      ];
+
+      var isApiPage = apiPaths.some(function(apiPath) {
+        return cleanPath.startsWith(apiPath);
+      });
+
+      if (isApiPage) {
+        if (banner) {
+          banner.style.display = 'none';
+        }
+      } else {
+        if (banner) {
+          banner.style.display = 'block';
+        }
+
+        // Add noindex meta tag to prevent indexing of old docs for seo
+        var noindexMeta = document.createElement('meta');
+        noindexMeta.setAttribute('name', 'robots');
+        noindexMeta.setAttribute('content', 'noindex, follow');
+        document.head.appendChild(noindexMeta);
+
+        // Add canonical link to point to the new docs to reward new site for seo
+        var canonicalLink = document.createElement('link');
+        canonicalLink.setAttribute('rel', 'canonical');
+        canonicalLink.setAttribute('href', 'https://lancedb.com/docs');
+        document.head.appendChild(canonicalLink);
+
+        window.location.replace('https://lancedb.com/docs');
+      }
+    }
+
+    // Run the check only if doc is ready. This makes sure we catch the initial load
+    // and redirect.
+    if (document.readyState === 'loading') {
+      document.addEventListener('DOMContentLoaded', checkPathAndRedirect);
+    } else {
+      checkPathAndRedirect();
+    }
+
+    // Use an interval to handle subsequent navigation clicks.
+    var lastPath = window.location.pathname;
+    setInterval(function() {
+      if (window.location.pathname !== lastPath) {
+        lastPath = window.location.pathname;
+        checkPathAndRedirect();
+      }
+    }, 2000); // keeping it 2 second to make it easy for user to understand
+    // what's happening
+
+  })();
+</script>

@@ -5,3 +5,4 @@ mkdocstrings[python]==0.25.2
 griffe
 mkdocs-render-swagger-plugin
 pydantic
+mkdocs-redirects

@@ -25,6 +25,51 @@ the underlying connection has been closed.
 
 ## Methods
 
+### cloneTable()
+
+```ts
+abstract cloneTable(
+   targetTableName,
+   sourceUri,
+   options?): Promise<Table>
+```
+
+Clone a table from a source table.
+
+A shallow clone creates a new table that shares the underlying data files
+with the source table but has its own independent manifest. This allows
+both the source and cloned tables to evolve independently while initially
+sharing the same data, deletion, and index files.
+
+#### Parameters
+
+* **targetTableName**: `string`
+    The name of the target table to create.
+
+* **sourceUri**: `string`
+    The URI of the source table to clone from.
+
+* **options?**
+    Clone options.
+
+* **options.isShallow?**: `boolean`
+    Whether to perform a shallow clone (defaults to true).
+
+* **options.sourceTag?**: `string`
+    The tag of the source table to clone.
+
+* **options.sourceVersion?**: `number`
+    The version of the source table to clone.
+
+* **options.targetNamespace?**: `string`[]
+    The namespace for the target table (defaults to root namespace).
+
+#### Returns
+
+`Promise`<[`Table`](Table.md)>
+
+***
+
 ### close()
 
 ```ts
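
For orientation, a minimal usage sketch of the `cloneTable()` API documented above; the connection handle, table name, source URI, and version number are illustrative assumptions, not values taken from this changeset.

```ts
// Hypothetical example: shallow-clone an existing table (the default mode).
// `db` is assumed to be an already-open lancedb Connection.
const cloned = await db.cloneTable("reviews_clone", "s3://my-bucket/reviews.lance", {
  isShallow: true,    // share data, deletion, and index files with the source
  sourceVersion: 42,  // optionally pin the source version to clone from
});
console.log(await cloned.countRows());
```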

@@ -194,6 +194,37 @@ currently is also a memory intensive operation.
 
 ***
 
+### ivfRq()
+
+```ts
+static ivfRq(options?): Index
+```
+
+Create an IvfRq index
+
+IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
+and organizes them into IVF partitions.
+
+The compression scheme is called RabitQ quantization. Each dimension is quantized into a small number of bits.
+The parameters `num_bits` and `num_partitions` control this process, providing a tradeoff
+between index size (and thus search speed) and index accuracy.
+
+The partitioning process is called IVF and the `num_partitions` parameter controls how
+many groups to create.
+
+Note that training an IVF RQ index on a large dataset is a slow operation and
+currently is also a memory intensive operation.
+
+#### Parameters
+
+* **options?**: `Partial`<[`IvfRqOptions`](../interfaces/IvfRqOptions.md)>
+
+#### Returns
+
+[`Index`](Index.md)
+
+***
+
 ### labelList()
 
 ```ts
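
A hedged sketch of building the new IVF-RQ index on a vector column follows; the database path, table, column name, and the camel-cased option names (`numPartitions`, `numBits`) are assumptions made for illustration rather than values confirmed by this diff.

```ts
import * as lancedb from "@lancedb/lancedb";
import { Index } from "@lancedb/lancedb";

// Hypothetical example: create an IVF-RQ index on the "vector" column.
// numPartitions/numBits mirror the num_partitions/num_bits knobs described above.
const db = await lancedb.connect("./data/sample-lancedb");
const table = await db.openTable("my_vectors");
await table.createIndex("vector", {
  config: Index.ivfRq({ numPartitions: 256, numBits: 1 }),
});
```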

@@ -52,6 +52,30 @@ the merge result
 
 ***
 
+### useIndex()
+
+```ts
+useIndex(useIndex): MergeInsertBuilder
+```
+
+Controls whether to use indexes for the merge operation.
+
+When set to `true` (the default), the operation will use an index if available
+on the join key for improved performance. When set to `false`, it forces a full
+table scan even if an index exists. This can be useful for benchmarking or when
+the query optimizer chooses a suboptimal path.
+
+#### Parameters
+
+* **useIndex**: `boolean`
+    Whether to use indices for the merge operation. Defaults to `true`.
+
+#### Returns
+
+[`MergeInsertBuilder`](MergeInsertBuilder.md)
+
+***
+
 ### whenMatchedUpdateAll()
 
 ```ts
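
A short sketch of the new `useIndex()` toggle inside a merge-insert chain; the table handle, key column, and row data are illustrative assumptions.

```ts
// Hypothetical example: upsert rows keyed by "id" while forcing a full-table
// scan instead of the join-key index (useful for benchmarking comparisons).
const newRows = [{ id: 1, vector: [0.1, 0.2], label: "a" }];
await table
  .mergeInsert("id")
  .useIndex(false)
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute(newRows);
```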

docs/src/js/classes/PermutationBuilder.md (new file, 220 lines)
@@ -0,0 +1,220 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / PermutationBuilder

# Class: PermutationBuilder

A PermutationBuilder for creating data permutations with splits, shuffling, and filtering.

This class provides a TypeScript wrapper around the native Rust PermutationBuilder,
offering methods to configure data splits, shuffling, and filtering before executing
the permutation to create a new table.

## Methods

### execute()

```ts
execute(): Promise<Table>
```

Execute the permutation and create the destination table.

#### Returns

`Promise`<[`Table`](Table.md)>

A Promise that resolves to the new Table instance

#### Example

```ts
const permutationTable = await builder.execute();
console.log(`Created table: ${permutationTable.name}`);
```

***

### filter()

```ts
filter(filter): PermutationBuilder
```

Configure filtering for the permutation.

#### Parameters

* **filter**: `string`
    SQL filter expression

#### Returns

[`PermutationBuilder`](PermutationBuilder.md)

A new PermutationBuilder instance

#### Example

```ts
builder.filter("age > 18 AND status = 'active'");
```

***

### shuffle()

```ts
shuffle(options): PermutationBuilder
```

Configure shuffling for the permutation.

#### Parameters

* **options**: [`ShuffleOptions`](../interfaces/ShuffleOptions.md)
    Configuration for shuffling

#### Returns

[`PermutationBuilder`](PermutationBuilder.md)

A new PermutationBuilder instance

#### Example

```ts
// Basic shuffle
builder.shuffle({ seed: 42 });

// Shuffle with clump size
builder.shuffle({ seed: 42, clumpSize: 10 });
```

***

### splitCalculated()

```ts
splitCalculated(calculation): PermutationBuilder
```

Configure calculated splits for the permutation.

#### Parameters

* **calculation**: `string`
    SQL expression for calculating splits

#### Returns

[`PermutationBuilder`](PermutationBuilder.md)

A new PermutationBuilder instance

#### Example

```ts
builder.splitCalculated("user_id % 3");
```

***

### splitHash()

```ts
splitHash(options): PermutationBuilder
```

Configure hash-based splits for the permutation.

#### Parameters

* **options**: [`SplitHashOptions`](../interfaces/SplitHashOptions.md)
    Configuration for hash-based splitting

#### Returns

[`PermutationBuilder`](PermutationBuilder.md)

A new PermutationBuilder instance

#### Example

```ts
builder.splitHash({
  columns: ["user_id"],
  splitWeights: [70, 30],
  discardWeight: 0
});
```

***

### splitRandom()

```ts
splitRandom(options): PermutationBuilder
```

Configure random splits for the permutation.

#### Parameters

* **options**: [`SplitRandomOptions`](../interfaces/SplitRandomOptions.md)
    Configuration for random splitting

#### Returns

[`PermutationBuilder`](PermutationBuilder.md)

A new PermutationBuilder instance

#### Example

```ts
// Split by ratios
builder.splitRandom({ ratios: [0.7, 0.3], seed: 42 });

// Split by counts
builder.splitRandom({ counts: [1000, 500], seed: 42 });

// Split with fixed size
builder.splitRandom({ fixed: 100, seed: 42 });
```

***

### splitSequential()

```ts
splitSequential(options): PermutationBuilder
```

Configure sequential splits for the permutation.

#### Parameters

* **options**: [`SplitSequentialOptions`](../interfaces/SplitSequentialOptions.md)
    Configuration for sequential splitting

#### Returns

[`PermutationBuilder`](PermutationBuilder.md)

A new PermutationBuilder instance

#### Example

```ts
// Split by ratios
builder.splitSequential({ ratios: [0.8, 0.2] });

// Split by counts
builder.splitSequential({ counts: [800, 200] });

// Split with fixed size
builder.splitSequential({ fixed: 1000 });
```

@@ -13,7 +13,7 @@ function makeArrowTable(
   metadata?): ArrowTable
 ```
 
-An enhanced version of the makeTable function from Apache Arrow
+An enhanced version of the apache-arrow makeTable function from Apache Arrow
 that supports nested fields and embeddings columns.
 
 (typically you do not need to call this function. It will be called automatically

docs/src/js/functions/permutationBuilder.md (new file, 37 lines)
@@ -0,0 +1,37 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / permutationBuilder

# Function: permutationBuilder()

```ts
function permutationBuilder(table, destTableName): PermutationBuilder
```

Create a permutation builder for the given table.

## Parameters

* **table**: [`Table`](../classes/Table.md)
    The source table to create a permutation from

* **destTableName**: `string`
    The name for the destination permutation table

## Returns

[`PermutationBuilder`](../classes/PermutationBuilder.md)

A PermutationBuilder instance

## Example

```ts
const builder = permutationBuilder(sourceTable, "training_data")
  .splitRandom({ ratios: [0.8, 0.2], seed: 42 })
  .shuffle({ seed: 123 });

const trainingTable = await builder.execute();
```

@@ -28,6 +28,7 @@
 - [MultiMatchQuery](classes/MultiMatchQuery.md)
 - [NativeJsHeaderProvider](classes/NativeJsHeaderProvider.md)
 - [OAuthHeaderProvider](classes/OAuthHeaderProvider.md)
+- [PermutationBuilder](classes/PermutationBuilder.md)
 - [PhraseQuery](classes/PhraseQuery.md)
 - [Query](classes/Query.md)
 - [QueryBase](classes/QueryBase.md)
@@ -68,6 +69,7 @@
 - [IndexStatistics](interfaces/IndexStatistics.md)
 - [IvfFlatOptions](interfaces/IvfFlatOptions.md)
 - [IvfPqOptions](interfaces/IvfPqOptions.md)
+- [IvfRqOptions](interfaces/IvfRqOptions.md)
 - [MergeResult](interfaces/MergeResult.md)
 - [OpenTableOptions](interfaces/OpenTableOptions.md)
 - [OptimizeOptions](interfaces/OptimizeOptions.md)
@@ -75,9 +77,14 @@
 - [QueryExecutionOptions](interfaces/QueryExecutionOptions.md)
 - [RemovalStats](interfaces/RemovalStats.md)
 - [RetryConfig](interfaces/RetryConfig.md)
+- [ShuffleOptions](interfaces/ShuffleOptions.md)
+- [SplitHashOptions](interfaces/SplitHashOptions.md)
+- [SplitRandomOptions](interfaces/SplitRandomOptions.md)
+- [SplitSequentialOptions](interfaces/SplitSequentialOptions.md)
 - [TableNamesOptions](interfaces/TableNamesOptions.md)
 - [TableStatistics](interfaces/TableStatistics.md)
 - [TimeoutConfig](interfaces/TimeoutConfig.md)
+- [TlsConfig](interfaces/TlsConfig.md)
 - [TokenResponse](interfaces/TokenResponse.md)
 - [UpdateOptions](interfaces/UpdateOptions.md)
 - [UpdateResult](interfaces/UpdateResult.md)
@@ -101,3 +108,4 @@
 - [connect](functions/connect.md)
 - [makeArrowTable](functions/makeArrowTable.md)
 - [packBits](functions/packBits.md)
+- [permutationBuilder](functions/permutationBuilder.md)

@@ -40,6 +40,14 @@ optional timeoutConfig: TimeoutConfig;
 
 ***
 
+### tlsConfig?
+
+```ts
+optional tlsConfig: TlsConfig;
+```
+
+***
+
 ### userAgent?
 
 ```ts

docs/src/js/interfaces/ShuffleOptions.md (new file, 23 lines)
@@ -0,0 +1,23 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / ShuffleOptions

# Interface: ShuffleOptions

## Properties

### clumpSize?

```ts
optional clumpSize: number;
```

***

### seed?

```ts
optional seed: number;
```

docs/src/js/interfaces/SplitHashOptions.md (new file, 31 lines)
@@ -0,0 +1,31 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / SplitHashOptions

# Interface: SplitHashOptions

## Properties

### columns

```ts
columns: string[];
```

***

### discardWeight?

```ts
optional discardWeight: number;
```

***

### splitWeights

```ts
splitWeights: number[];
```
docs/src/js/interfaces/SplitRandomOptions.md (new file, 39 lines)
@@ -0,0 +1,39 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / SplitRandomOptions

# Interface: SplitRandomOptions

## Properties

### counts?

```ts
optional counts: number[];
```

***

### fixed?

```ts
optional fixed: number;
```

***

### ratios?

```ts
optional ratios: number[];
```

***

### seed?

```ts
optional seed: number;
```
docs/src/js/interfaces/SplitSequentialOptions.md (new file, 31 lines)
@@ -0,0 +1,31 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / SplitSequentialOptions

# Interface: SplitSequentialOptions

## Properties

### counts?

```ts
optional counts: number[];
```

***

### fixed?

```ts
optional fixed: number;
```

***

### ratios?

```ts
optional ratios: number[];
```
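The split and shuffle option interfaces documented above are consumed by the new `permutationBuilder` API that this changeset also exports from the package root. A minimal sketch of how they fit together, assuming a table already exists; the database path and table name are illustrative:

```ts
import { connect, permutationBuilder } from "@lancedb/lancedb";

// Sketch only: build a shuffled 80/20 permutation of an existing table.
const db = await connect("data/sample-lancedb"); // illustrative path
const table = await db.openTable("my_table"); // illustrative table name

const permutation = await permutationBuilder(table, "my_table_permutation")
  .splitRandom({ ratios: [0.8, 0.2], seed: 42 }) // SplitRandomOptions
  .shuffle({ seed: 7, clumpSize: 2 }) // ShuffleOptions
  .execute();

// Rows are assigned a split_id column that can be filtered on.
console.log(await permutation.countRows("split_id = 0"));
```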
docs/src/js/interfaces/TlsConfig.md (new file, 49 lines)
@@ -0,0 +1,49 @@
[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / TlsConfig

# Interface: TlsConfig

TLS/mTLS configuration for the remote HTTP client.

## Properties

### assertHostname?

```ts
optional assertHostname: boolean;
```

Whether to verify the hostname in the server's certificate.

***

### certFile?

```ts
optional certFile: string;
```

Path to the client certificate file (PEM format) for mTLS authentication.

***

### keyFile?

```ts
optional keyFile: string;
```

Path to the client private key file (PEM format) for mTLS authentication.

***

### sslCaCert?

```ts
optional sslCaCert: string;
```

Path to the CA certificate file (PEM format) for server verification.
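For orientation, a hedged sketch of how this interface is meant to be wired in: the `tlsConfig` field added to the client configuration earlier in this changeset carries a `TlsConfig` when opening a remote connection. The URI, credentials, and certificate paths below are placeholders:

```ts
import { connect } from "@lancedb/lancedb";

// Sketch only: mTLS configuration for a remote connection.
const db = await connect("db://my-database", {
  apiKey: process.env.LANCEDB_API_KEY, // placeholder credentials
  clientConfig: {
    tlsConfig: {
      certFile: "/etc/ssl/client.pem", // client certificate for mTLS
      keyFile: "/etc/ssl/client.key", // client private key for mTLS
      sslCaCert: "/etc/ssl/ca.pem", // CA bundle for server verification
      assertHostname: true, // verify the hostname in the server certificate
    },
  },
});
```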
@@ -16,6 +16,7 @@ pub trait JNIEnvExt {
     fn get_integers(&mut self, obj: &JObject) -> Result<Vec<i32>>;
 
     /// Get strings from Java List<String> object.
+    #[allow(dead_code)]
     fn get_strings(&mut self, obj: &JObject) -> Result<Vec<String>>;
 
     /// Get strings from Java String[] object.
@@ -6,6 +6,7 @@ use jni::JNIEnv;
 
 use crate::Result;
 
+#[allow(dead_code)]
 pub trait FromJObject<T> {
     fn extract(&self) -> Result<T>;
 }
@@ -39,6 +40,7 @@ impl FromJObject<f64> for JObject<'_> {
     }
 }
 
+#[allow(dead_code)]
 pub trait FromJString {
     fn extract(&self, env: &mut JNIEnv) -> Result<String>;
 }
@@ -66,6 +68,7 @@ pub trait JMapExt {
     fn get_f64(&self, env: &mut JNIEnv, key: &str) -> Result<Option<f64>>;
 }
 
+#[allow(dead_code)]
fn get_map_value<T>(env: &mut JNIEnv, map: &JMap, key: &str) -> Result<Option<T>>
where
    for<'a> JObject<'a>: FromJObject<T>,
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.22.1-beta.0</version>
+    <version>0.22.2-final.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.22.1-beta.0</version>
+    <version>0.22.2-final.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -6,7 +6,7 @@
 
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.22.1-beta.0</version>
+  <version>0.22.2-final.0</version>
   <packaging>pom</packaging>
   <name>${project.artifactId}</name>
   <description>LanceDB Java SDK Parent POM</description>
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.22.1-beta.0"
+version = "0.22.2"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
@@ -1,17 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The LanceDB Authors
-
-import {
-  Bool,
-  Field,
-  Int32,
-  List,
-  Schema,
-  Struct,
-  Uint8,
-  Utf8,
-} from "apache-arrow";
-
 import * as arrow15 from "apache-arrow-15";
 import * as arrow16 from "apache-arrow-16";
 import * as arrow17 from "apache-arrow-17";
@@ -25,11 +13,9 @@ import {
   fromTableToBuffer,
   makeArrowTable,
   makeEmptyTable,
-  tableFromIPC,
 } from "../lancedb/arrow";
 import {
   EmbeddingFunction,
-  FieldOptions,
   FunctionOptions,
 } from "../lancedb/embedding/embedding_function";
 import { EmbeddingFunctionConfig } from "../lancedb/embedding/registry";
@@ -1008,5 +994,64 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
        expect(result).toEqual(null);
      });
    });
+
+    describe("boolean null handling", function () {
+      it("should handle null values in nullable boolean fields", () => {
+        const { makeArrowTable } = require("../lancedb/arrow");
+        const schema = new Schema([new Field("test", new arrow.Bool(), true)]);
+
+        // Test with all null values
+        const data = [{ test: null }];
+        const table = makeArrowTable(data, { schema });
+
+        expect(table.numRows).toBe(1);
+        expect(table.schema.names).toEqual(["test"]);
+        expect(table.getChild("test")!.get(0)).toBeNull();
+      });
+
+      it("should handle mixed null and non-null boolean values", () => {
+        const { makeArrowTable } = require("../lancedb/arrow");
+        const schema = new Schema([new Field("test", new Bool(), true)]);
+
+        // Test with mixed values
+        const data = [{ test: true }, { test: null }, { test: false }];
+        const table = makeArrowTable(data, { schema });
+
+        expect(table.numRows).toBe(3);
+        expect(table.getChild("test")!.get(0)).toBe(true);
+        expect(table.getChild("test")!.get(1)).toBeNull();
+        expect(table.getChild("test")!.get(2)).toBe(false);
+      });
+    });
+
+    // Test for the undefined values bug fix
+    describe("undefined values handling", () => {
+      it("should handle mixed undefined and actual values", () => {
+        const schema = new Schema([
+          new Field("text", new Utf8(), true), // nullable
+          new Field("number", new Int32(), true), // nullable
+          new Field("bool", new Bool(), true), // nullable
+        ]);
+
+        const data = [
+          { text: undefined, number: 42, bool: true },
+          { text: "hello", number: undefined, bool: false },
+          { text: "world", number: 123, bool: undefined },
+        ];
+        const table = makeArrowTable(data, { schema });
+
+        const result = table.toArray();
+        expect(result).toHaveLength(3);
+        expect(result[0].text).toBe(null);
+        expect(result[0].number).toBe(42);
+        expect(result[0].bool).toBe(true);
+        expect(result[1].text).toBe("hello");
+        expect(result[1].number).toBe(null);
+        expect(result[1].bool).toBe(false);
+        expect(result[2].text).toBe("world");
+        expect(result[2].number).toBe(123);
+        expect(result[2].bool).toBe(null);
+      });
+    });
  },
);
@@ -203,3 +203,106 @@ describe("given a connection", () => {
     });
   });
 });
+
+describe("clone table functionality", () => {
+  let tmpDir: tmp.DirResult;
+  let db: Connection;
+  beforeEach(async () => {
+    tmpDir = tmp.dirSync({ unsafeCleanup: true });
+    db = await connect(tmpDir.name);
+  });
+  afterEach(() => tmpDir.removeCallback());
+
+  it("should clone a table with latest version (default behavior)", async () => {
+    // Create source table with some data
+    const data = [
+      { id: 1, text: "hello", vector: [1.0, 2.0] },
+      { id: 2, text: "world", vector: [3.0, 4.0] },
+    ];
+    const sourceTable = await db.createTable("source", data);
+
+    // Add more data to create a new version
+    const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
+    await sourceTable.add(moreData);
+
+    // Clone the table (should get latest version with 3 rows)
+    const sourceUri = `${tmpDir.name}/source.lance`;
+    const clonedTable = await db.cloneTable("cloned", sourceUri);
+
+    // Verify cloned table has all 3 rows
+    expect(await clonedTable.countRows()).toBe(3);
+    expect((await db.tableNames()).includes("cloned")).toBe(true);
+  });
+
+  it("should clone a table from a specific version", async () => {
+    // Create source table with initial data
+    const data = [
+      { id: 1, text: "hello", vector: [1.0, 2.0] },
+      { id: 2, text: "world", vector: [3.0, 4.0] },
+    ];
+    const sourceTable = await db.createTable("source", data);
+
+    // Get the initial version
+    const initialVersion = await sourceTable.version();
+
+    // Add more data to create a new version
+    const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
+    await sourceTable.add(moreData);
+
+    // Verify source now has 3 rows
+    expect(await sourceTable.countRows()).toBe(3);
+
+    // Clone from the initial version (should have only 2 rows)
+    const sourceUri = `${tmpDir.name}/source.lance`;
+    const clonedTable = await db.cloneTable("cloned", sourceUri, {
+      sourceVersion: initialVersion,
+    });
+
+    // Verify cloned table has only the initial 2 rows
+    expect(await clonedTable.countRows()).toBe(2);
+  });
+
+  it("should clone a table from a tagged version", async () => {
+    // Create source table with initial data
+    const data = [
+      { id: 1, text: "hello", vector: [1.0, 2.0] },
+      { id: 2, text: "world", vector: [3.0, 4.0] },
+    ];
+    const sourceTable = await db.createTable("source", data);
+
+    // Create a tag for the current version
+    const tags = await sourceTable.tags();
+    await tags.create("v1.0", await sourceTable.version());
+
+    // Add more data after the tag
+    const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
+    await sourceTable.add(moreData);
+
+    // Verify source now has 3 rows
+    expect(await sourceTable.countRows()).toBe(3);
+
+    // Clone from the tagged version (should have only 2 rows)
+    const sourceUri = `${tmpDir.name}/source.lance`;
+    const clonedTable = await db.cloneTable("cloned", sourceUri, {
+      sourceTag: "v1.0",
+    });
+
+    // Verify cloned table has only the tagged version's 2 rows
+    expect(await clonedTable.countRows()).toBe(2);
+  });
+
+  it("should fail when attempting deep clone", async () => {
+    // Create source table with some data
+    const data = [
+      { id: 1, text: "hello", vector: [1.0, 2.0] },
+      { id: 2, text: "world", vector: [3.0, 4.0] },
+    ];
+    await db.createTable("source", data);
+
+    // Try to create a deep clone (should fail)
+    const sourceUri = `${tmpDir.name}/source.lance`;
+    await expect(
+      db.cloneTable("cloned", sourceUri, { isShallow: false }),
+    ).rejects.toThrow("Deep clone is not yet implemented");
+  });
+});
@@ -256,6 +256,60 @@ describe("embedding functions", () => {
     expect(actual).toHaveProperty("text");
   });
+
+  it("should handle undefined vector field with embedding function correctly", async () => {
+    @register("undefined_test")
+    class MockEmbeddingFunction extends EmbeddingFunction<string> {
+      ndims() {
+        return 3;
+      }
+      embeddingDataType(): Float {
+        return new Float32();
+      }
+      async computeQueryEmbeddings(_data: string) {
+        return [1, 2, 3];
+      }
+      async computeSourceEmbeddings(data: string[]) {
+        return Array.from({ length: data.length }).fill([
+          1, 2, 3,
+        ]) as number[][];
+      }
+    }
+    const func = getRegistry()
+      .get<MockEmbeddingFunction>("undefined_test")!
+      .create();
+    const schema = new Schema([
+      new Field("text", new Utf8(), true),
+      new Field(
+        "vector",
+        new FixedSizeList(3, new Field("item", new Float32(), true)),
+        true,
+      ),
+    ]);
+
+    const db = await connect(tmpDir.name);
+    const table = await db.createEmptyTable("test_undefined", schema, {
+      embeddingFunction: {
+        function: func,
+        sourceColumn: "text",
+        vectorColumn: "vector",
+      },
+    });
+
+    // Test that undefined, null, and omitted vector fields all work
+    await table.add([{ text: "test1", vector: undefined }]);
+    await table.add([{ text: "test2", vector: null }]);
+    await table.add([{ text: "test3" }]);
+
+    const rows = await table.query().toArray();
+    expect(rows.length).toBe(3);
+
+    // All rows should have vectors computed by the embedding function
+    for (const row of rows) {
+      expect(row.vector).toBeDefined();
+      expect(JSON.parse(JSON.stringify(row.vector))).toEqual([1, 2, 3]);
+    }
+  });
+
   test.each([new Float16(), new Float32(), new Float64()])(
     "should be able to provide manual embeddings with multiple float datatype",
     async (floatType) => {
nodejs/__test__/permutation.test.ts (new file, 234 lines)
@@ -0,0 +1,234 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import * as tmp from "tmp";
import { Table, connect, permutationBuilder } from "../lancedb";
import { makeArrowTable } from "../lancedb/arrow";

describe("PermutationBuilder", () => {
  let tmpDir: tmp.DirResult;
  let table: Table;

  beforeEach(async () => {
    tmpDir = tmp.dirSync({ unsafeCleanup: true });
    const db = await connect(tmpDir.name);

    // Create test data
    const data = makeArrowTable(
      [
        { id: 1, value: 10 },
        { id: 2, value: 20 },
        { id: 3, value: 30 },
        { id: 4, value: 40 },
        { id: 5, value: 50 },
        { id: 6, value: 60 },
        { id: 7, value: 70 },
        { id: 8, value: 80 },
        { id: 9, value: 90 },
        { id: 10, value: 100 },
      ],
      { vectorColumns: {} },
    );

    table = await db.createTable("test_table", data);
  });

  afterEach(() => {
    tmpDir.removeCallback();
  });

  test("should create permutation builder", () => {
    const builder = permutationBuilder(table, "permutation_table");
    expect(builder).toBeDefined();
  });

  test("should execute basic permutation", async () => {
    const builder = permutationBuilder(table, "permutation_table");
    const permutationTable = await builder.execute();

    expect(permutationTable).toBeDefined();
    expect(permutationTable.name).toBe("permutation_table");

    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);
  });

  test("should create permutation with random splits", async () => {
    const builder = permutationBuilder(table, "permutation_table").splitRandom({
      ratios: [1.0],
      seed: 42,
    });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);
  });

  test("should create permutation with percentage splits", async () => {
    const builder = permutationBuilder(table, "permutation_table").splitRandom({
      ratios: [0.3, 0.7],
      seed: 42,
    });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);

    // Check split distribution
    const split0Count = await permutationTable.countRows("split_id = 0");
    const split1Count = await permutationTable.countRows("split_id = 1");

    expect(split0Count).toBeGreaterThan(0);
    expect(split1Count).toBeGreaterThan(0);
    expect(split0Count + split1Count).toBe(10);
  });

  test("should create permutation with count splits", async () => {
    const builder = permutationBuilder(table, "permutation_table").splitRandom({
      counts: [3, 7],
      seed: 42,
    });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);

    // Check split distribution
    const split0Count = await permutationTable.countRows("split_id = 0");
    const split1Count = await permutationTable.countRows("split_id = 1");

    expect(split0Count).toBe(3);
    expect(split1Count).toBe(7);
  });

  test("should create permutation with hash splits", async () => {
    const builder = permutationBuilder(table, "permutation_table").splitHash({
      columns: ["id"],
      splitWeights: [50, 50],
      discardWeight: 0,
    });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);

    // Check that splits exist
    const split0Count = await permutationTable.countRows("split_id = 0");
    const split1Count = await permutationTable.countRows("split_id = 1");

    expect(split0Count).toBeGreaterThan(0);
    expect(split1Count).toBeGreaterThan(0);
    expect(split0Count + split1Count).toBe(10);
  });

  test("should create permutation with sequential splits", async () => {
    const builder = permutationBuilder(
      table,
      "permutation_table",
    ).splitSequential({ ratios: [0.5, 0.5] });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);

    // Check split distribution - sequential should give exactly 5 and 5
    const split0Count = await permutationTable.countRows("split_id = 0");
    const split1Count = await permutationTable.countRows("split_id = 1");

    expect(split0Count).toBe(5);
    expect(split1Count).toBe(5);
  });

  test("should create permutation with calculated splits", async () => {
    const builder = permutationBuilder(
      table,
      "permutation_table",
    ).splitCalculated("id % 2");

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);

    // Check split distribution
    const split0Count = await permutationTable.countRows("split_id = 0");
    const split1Count = await permutationTable.countRows("split_id = 1");

    expect(split0Count).toBeGreaterThan(0);
    expect(split1Count).toBeGreaterThan(0);
    expect(split0Count + split1Count).toBe(10);
  });

  test("should create permutation with shuffle", async () => {
    const builder = permutationBuilder(table, "permutation_table").shuffle({
      seed: 42,
    });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);
  });

  test("should create permutation with shuffle and clump size", async () => {
    const builder = permutationBuilder(table, "permutation_table").shuffle({
      seed: 42,
      clumpSize: 2,
    });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(10);
  });

  test("should create permutation with filter", async () => {
    const builder = permutationBuilder(table, "permutation_table").filter(
      "value > 50",
    );

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(5); // Values 60, 70, 80, 90, 100
  });

  test("should chain multiple operations", async () => {
    const builder = permutationBuilder(table, "permutation_table")
      .filter("value <= 80")
      .splitRandom({ ratios: [0.5, 0.5], seed: 42 })
      .shuffle({ seed: 123 });

    const permutationTable = await builder.execute();
    const rowCount = await permutationTable.countRows();
    expect(rowCount).toBe(8); // Values 10, 20, 30, 40, 50, 60, 70, 80

    // Check split distribution
    const split0Count = await permutationTable.countRows("split_id = 0");
    const split1Count = await permutationTable.countRows("split_id = 1");

    expect(split0Count).toBeGreaterThan(0);
    expect(split1Count).toBeGreaterThan(0);
    expect(split0Count + split1Count).toBe(8);
  });

  test("should throw error for invalid split arguments", () => {
    const builder = permutationBuilder(table, "permutation_table");

    // Test no arguments provided
    expect(() => builder.splitRandom({})).toThrow(
      "Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
    );

    // Test multiple arguments provided
    expect(() =>
      builder.splitRandom({ ratios: [0.5, 0.5], counts: [3, 7], seed: 42 }),
    ).toThrow("Exactly one of 'ratios', 'counts', or 'fixed' must be provided");
  });

  test("should throw error when builder is consumed", async () => {
    const builder = permutationBuilder(table, "permutation_table");

    // Execute once
    await builder.execute();

    // Should throw error on second execution
    await expect(builder.execute()).rejects.toThrow("Builder already consumed");
  });
});
@@ -7,7 +7,6 @@ import {
   ClientConfig,
   Connection,
   ConnectionOptions,
-  NativeJsHeaderProvider,
   TlsConfig,
   connect,
 } from "../lancedb";
nodejs/__test__/sanitize.test.ts (new file, 184 lines)
@@ -0,0 +1,184 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import * as arrow from "../lancedb/arrow";
import { sanitizeField, sanitizeType } from "../lancedb/sanitize";

describe("sanitize", function () {
  describe("sanitizeType function", function () {
    it("should handle type objects", function () {
      const type = new arrow.Int32();
      const result = sanitizeType(type);

      expect(result.typeId).toBe(arrow.Type.Int);
      expect((result as arrow.Int).bitWidth).toBe(32);
      expect((result as arrow.Int).isSigned).toBe(true);

      const floatType = {
        typeId: 3, // Type.Float = 3
        precision: 2,
        toString: () => "Float",
        isFloat: true,
        isFixedWidth: true,
      };

      const floatResult = sanitizeType(floatType);
      expect(floatResult).toBeInstanceOf(arrow.DataType);
      expect(floatResult.typeId).toBe(arrow.Type.Float);

      const floatResult2 = sanitizeType({ ...floatType, typeId: () => 3 });
      expect(floatResult2).toBeInstanceOf(arrow.DataType);
      expect(floatResult2.typeId).toBe(arrow.Type.Float);
    });

    const allTypeNameTestCases = [
      ["null", new arrow.Null()],
      ["binary", new arrow.Binary()],
      ["utf8", new arrow.Utf8()],
      ["bool", new arrow.Bool()],
      ["int8", new arrow.Int8()],
      ["int16", new arrow.Int16()],
      ["int32", new arrow.Int32()],
      ["int64", new arrow.Int64()],
      ["uint8", new arrow.Uint8()],
      ["uint16", new arrow.Uint16()],
      ["uint32", new arrow.Uint32()],
      ["uint64", new arrow.Uint64()],
      ["float16", new arrow.Float16()],
      ["float32", new arrow.Float32()],
      ["float64", new arrow.Float64()],
      ["datemillisecond", new arrow.DateMillisecond()],
      ["dateday", new arrow.DateDay()],
      ["timenanosecond", new arrow.TimeNanosecond()],
      ["timemicrosecond", new arrow.TimeMicrosecond()],
      ["timemillisecond", new arrow.TimeMillisecond()],
      ["timesecond", new arrow.TimeSecond()],
      ["intervaldaytime", new arrow.IntervalDayTime()],
      ["intervalyearmonth", new arrow.IntervalYearMonth()],
      ["durationnanosecond", new arrow.DurationNanosecond()],
      ["durationmicrosecond", new arrow.DurationMicrosecond()],
      ["durationmillisecond", new arrow.DurationMillisecond()],
      ["durationsecond", new arrow.DurationSecond()],
    ] as const;

    it.each(allTypeNameTestCases)(
      'should map type name "%s" to %s',
      function (name, expected) {
        const result = sanitizeType(name);
        expect(result).toBeInstanceOf(expected.constructor);
      },
    );

    const caseVariationTestCases = [
      ["NULL", new arrow.Null()],
      ["Utf8", new arrow.Utf8()],
      ["FLOAT32", new arrow.Float32()],
      ["DaTedAy", new arrow.DateDay()],
    ] as const;

    it.each(caseVariationTestCases)(
      'should be case insensitive for type name "%s" mapped to %s',
      function (name, expected) {
        const result = sanitizeType(name);
        expect(result).toBeInstanceOf(expected.constructor);
      },
    );

    it("should throw error for unrecognized type name", function () {
      expect(() => sanitizeType("invalid_type")).toThrow(
        "Unrecognized type name in schema: invalid_type",
      );
    });
  });

  describe("sanitizeField function", function () {
    it("should handle field with string type name", function () {
      const field = sanitizeField({
        name: "string_field",
        type: "utf8",
        nullable: true,
        metadata: new Map([["key", "value"]]),
      });

      expect(field).toBeInstanceOf(arrow.Field);
      expect(field.name).toBe("string_field");
      expect(field.type).toBeInstanceOf(arrow.Utf8);
      expect(field.nullable).toBe(true);
      expect(field.metadata?.get("key")).toBe("value");
    });

    it("should handle field with type object", function () {
      const floatType = {
        typeId: 3, // Float
        precision: 32,
      };

      const field = sanitizeField({
        name: "float_field",
        type: floatType,
        nullable: false,
      });

      expect(field).toBeInstanceOf(arrow.Field);
      expect(field.name).toBe("float_field");
      expect(field.type).toBeInstanceOf(arrow.DataType);
      expect(field.type.typeId).toBe(arrow.Type.Float);
      expect((field.type as arrow.Float64).precision).toBe(32);
      expect(field.nullable).toBe(false);
    });

    it("should handle field with direct Type instance", function () {
      const field = sanitizeField({
        name: "bool_field",
        type: new arrow.Bool(),
        nullable: true,
      });

      expect(field).toBeInstanceOf(arrow.Field);
      expect(field.name).toBe("bool_field");
      expect(field.type).toBeInstanceOf(arrow.Bool);
      expect(field.nullable).toBe(true);
    });

    it("should throw error for invalid field object", function () {
      expect(() =>
        sanitizeField({
          type: "int32",
          nullable: true,
        }),
      ).toThrow(
        "The field passed in is missing a `type`/`name`/`nullable` property",
      );

      // Invalid type
      expect(() =>
        sanitizeField({
          name: "invalid",
          type: { invalid: true },
          nullable: true,
        }),
      ).toThrow("Expected a Type to have a typeId property");

      // Invalid nullable
      expect(() =>
        sanitizeField({
          name: "invalid_nullable",
          type: "int32",
          nullable: "not a boolean",
        }),
      ).toThrow("The field passed in had a non-boolean `nullable` property");
    });

    it("should report error for invalid type name", function () {
      expect(() =>
        sanitizeField({
          name: "invalid_field",
          type: "invalid_type",
          nullable: true,
        }),
      ).toThrow(
        "Unable to sanitize type for field: invalid_field due to error: Error: Unrecognized type name in schema: invalid_type",
      );
    });
  });
});
@@ -10,7 +10,13 @@ import * as arrow16 from "apache-arrow-16";
 import * as arrow17 from "apache-arrow-17";
 import * as arrow18 from "apache-arrow-18";
 
-import { MatchQuery, PhraseQuery, Table, connect } from "../lancedb";
+import {
+  Connection,
+  MatchQuery,
+  PhraseQuery,
+  Table,
+  connect,
+} from "../lancedb";
 import {
   Table as ArrowTable,
   Field,
@@ -21,6 +27,8 @@ import {
   Int64,
   List,
   Schema,
+  SchemaLike,
+  Type,
   Uint8,
   Utf8,
   makeArrowTable,
@@ -39,7 +47,6 @@ import {
   Operator,
   instanceOfFullTextQuery,
 } from "../lancedb/query";
-import exp = require("constants");
 
 describe.each([arrow15, arrow16, arrow17, arrow18])(
   "Given a table",
@@ -212,8 +219,7 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      },
    );

-    // TODO: https://github.com/lancedb/lancedb/issues/1832
-    it.skip("should be able to omit nullable fields", async () => {
+    it("should be able to omit nullable fields", async () => {
      const db = await connect(tmpDir.name);
      const schema = new arrow.Schema([
        new arrow.Field(
@@ -237,23 +243,36 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      await table.add([data3]);

      let res = await table.query().limit(10).toArray();
-      const resVector = res.map((r) => r.get("vector").toArray());
+      const resVector = res.map((r) =>
+        r.vector ? Array.from(r.vector) : null,
+      );
      expect(resVector).toEqual([null, data2.vector, data3.vector]);
-      const resItem = res.map((r) => r.get("item").toArray());
+      const resItem = res.map((r) => r.item);
      expect(resItem).toEqual(["foo", null, "bar"]);
-      const resPrice = res.map((r) => r.get("price").toArray());
+      const resPrice = res.map((r) => r.price);
      expect(resPrice).toEqual([10.0, 2.0, 3.0]);

      const data4 = { item: "foo" };
      // We can't omit a column if it's not nullable
-      await expect(table.add([data4])).rejects.toThrow("Invalid user input");
+      await expect(table.add([data4])).rejects.toThrow(
+        "Append with different schema",
+      );

      // But we can alter columns to make them nullable
      await table.alterColumns([{ path: "price", nullable: true }]);
      await table.add([data4]);

-      res = (await table.query().limit(10).toArray()).map((r) => r.toJSON());
-      expect(res).toEqual([data1, data2, data3, data4]);
+      res = (await table.query().limit(10).toArray()).map((r) => ({
+        ...r.toJSON(),
+        vector: r.vector ? Array.from(r.vector) : null,
+      }));
+      // Rust fills missing nullable fields with null
+      expect(res).toEqual([
+        { ...data1, vector: null },
+        { ...data2, item: null },
+        data3,
+        { ...data4, price: null, vector: null },
+      ]);
    });

    it("should be able to insert nullable data for non-nullable fields", async () => {
@@ -331,6 +350,43 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      const table = await db.createTable("my_table", data);
      expect(await table.countRows()).toEqual(2);
    });
+
+    it("should allow undefined and omitted nullable vector fields", async () => {
+      // Test for the bug: can't pass undefined or omit vector column
+      const db = await connect("memory://");
+      const schema = new arrow.Schema([
+        new arrow.Field("id", new arrow.Int32(), true),
+        new arrow.Field(
+          "vector",
+          new arrow.FixedSizeList(
+            32,
+            new arrow.Field("item", new arrow.Float32(), true),
+          ),
+          true, // nullable = true
+        ),
+      ]);
+      const table = await db.createEmptyTable("test_table", schema);
+
+      // Should not throw error for undefined value
+      await table.add([{ id: 0, vector: undefined }]);
+
+      // Should not throw error for omitted field
+      await table.add([{ id: 1 }]);
+
+      // Should still work for null
+      await table.add([{ id: 2, vector: null }]);
+
+      // Should still work for actual vector
+      const testVector = new Array(32).fill(0.5);
+      await table.add([{ id: 3, vector: testVector }]);
+      expect(await table.countRows()).toEqual(4);
+
+      const res = await table.query().limit(10).toArray();
+      const resVector = res.map((r) =>
+        r.vector ? Array.from(r.vector) : null,
+      );
+      expect(resVector).toEqual([null, null, null, testVector]);
+    });
  },
);

@@ -488,6 +544,32 @@ describe("merge insert", () => {
        .execute(newData, { timeoutMs: 0 }),
    ).rejects.toThrow("merge insert timed out");
  });
+
+  test("useIndex", async () => {
+    const newData = [
+      { a: 2, b: "x" },
+      { a: 4, b: "z" },
+    ];
+
+    // Test with useIndex(true) - should work fine
+    const result1 = await table
+      .mergeInsert("a")
+      .whenNotMatchedInsertAll()
+      .useIndex(true)
+      .execute(newData);
+
+    expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted
+
+    // Test with useIndex(false) - should also work fine
+    const newData2 = [{ a: 5, b: "w" }];
+    const result2 = await table
+      .mergeInsert("a")
+      .whenNotMatchedInsertAll()
+      .useIndex(false)
+      .execute(newData2);
+
+    expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted
+  });
});

describe("When creating an index", () => {
@@ -779,6 +861,15 @@ describe("When creating an index", () => {
    });
  });

+  it("should be able to create IVF_RQ", async () => {
+    await tbl.createIndex("vec", {
+      config: Index.ivfRq({
+        numPartitions: 10,
+        numBits: 1,
+      }),
+    });
+  });
+
  it("should allow me to replace (or not) an existing index", async () => {
    await tbl.createIndex("id");
    // Default is replace=true
@@ -1429,7 +1520,9 @@ describe("when optimizing a dataset", () => {

  it("delete unverified", async () => {
    const version = await table.version();
-    const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${version - 1}.manifest`;
+    const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${
+      version - 1
+    }.manifest`;
    fs.rmSync(versionFile);

    let stats = await table.optimize({ deleteUnverified: false });
@@ -1943,3 +2036,52 @@ describe("column name options", () => {
    expect(results2.length).toBe(10);
  });
});
+
+describe("when creating an empty table", () => {
+  let con: Connection;
+  beforeEach(async () => {
+    const tmpDir = tmp.dirSync({ unsafeCleanup: true });
+    con = await connect(tmpDir.name);
+  });
+  afterEach(() => {
+    con.close();
+  });
+
+  it("can create an empty table from an arrow Schema", async () => {
+    const schema = new Schema([
+      new Field("id", new Int64()),
+      new Field("vector", new Float64()),
+    ]);
+    const table = await con.createEmptyTable("test", schema);
+    const actualSchema = await table.schema();
+    expect(actualSchema.fields[0].type.typeId).toBe(Type.Int);
+    expect((actualSchema.fields[0].type as Int64).bitWidth).toBe(64);
+    expect(actualSchema.fields[1].type.typeId).toBe(Type.Float);
+    expect((actualSchema.fields[1].type as Float64).precision).toBe(2);
+  });
+
+  it("can create an empty table from schema that specifies field types by name", async () => {
+    const schemaLike = {
+      fields: [
+        {
+          name: "id",
+          type: "int64",
+          nullable: true,
+        },
+        {
+          name: "vector",
+          type: "float64",
+          nullable: true,
+        },
+      ],
+      metadata: new Map(),
+      names: ["id", "vector"],
+    } satisfies SchemaLike;
+    const table = await con.createEmptyTable("test", schemaLike);
+    const actualSchema = await table.schema();
+    expect(actualSchema.fields[0].type.typeId).toBe(Type.Int);
+    expect((actualSchema.fields[0].type as Int64).bitWidth).toBe(64);
+    expect(actualSchema.fields[1].type.typeId).toBe(Type.Float);
+    expect((actualSchema.fields[1].type as Float64).precision).toBe(2);
+  });
+});
@@ -48,6 +48,7 @@
        "noUnreachableSuper": "error",
        "noUnsafeFinally": "error",
        "noUnsafeOptionalChaining": "error",
+        "noUnusedImports": "error",
        "noUnusedLabels": "error",
        "noUnusedVariables": "warn",
        "useIsNan": "error",
@@ -41,7 +41,6 @@ import {
  vectorFromArray as badVectorFromArray,
  makeBuilder,
  makeData,
-  makeTable,
} from "apache-arrow";
import { Buffers } from "apache-arrow/data";
import { type EmbeddingFunction } from "./embedding/embedding_function";
@@ -74,7 +73,7 @@ export type FieldLike =
  | {
      type: string;
      name: string;
-      nullable?: boolean;
+      nullable: boolean;
      metadata?: Map<string, string>;
    };

@@ -279,7 +278,7 @@ export class MakeArrowTableOptions {
}

/**
- * An enhanced version of the {@link makeTable} function from Apache Arrow
+ * An enhanced version of the apache-arrow makeTable function from Apache Arrow
 * that supports nested fields and embeddings columns.
 *
 * (typically you do not need to call this function. It will be called automatically
@@ -512,7 +511,11 @@ function* rowPathsAndValues(
    if (isObject(value)) {
      yield* rowPathsAndValues(value, [...basePath, key]);
    } else {
-      yield [[...basePath, key], value];
+      // Skip undefined values - they should be treated the same as missing fields
+      // for embedding function purposes
+      if (value !== undefined) {
+        yield [[...basePath, key], value];
+      }
    }
  }
}
@@ -701,7 +704,7 @@ function transposeData(
      }
      return current;
    });
-    return makeVector(values, field.type);
+    return makeVector(values, field.type, undefined, field.nullable);
  }
}

@@ -748,9 +751,30 @@ function makeVector(
  values: unknown[],
  type?: DataType,
  stringAsDictionary?: boolean,
+  nullable?: boolean,
  // biome-ignore lint/suspicious/noExplicitAny: skip
): Vector<any> {
  if (type !== undefined) {
+    // Convert undefined values to null for nullable fields
+    if (nullable) {
+      values = values.map((v) => (v === undefined ? null : v));
+    }
+
+    // workaround for: https://github.com/apache/arrow-js/issues/68
+    if (DataType.isBool(type)) {
+      const hasNonNullValue = values.some((v) => v !== null && v !== undefined);
+      if (!hasNonNullValue) {
+        const nullBitmap = new Uint8Array(Math.ceil(values.length / 8));
+        const data = makeData({
+          type: type,
+          length: values.length,
+          nullCount: values.length,
+          nullBitmap,
+        });
+        return arrowMakeVector(data);
+      }
+    }
+
    // No need for inference, let Arrow create it
    if (type instanceof Int) {
      if (DataType.isInt(type) && type.bitWidth === 64) {
@@ -875,7 +899,12 @@ async function applyEmbeddingsFromMetadata(
  for (const field of schema.fields) {
    if (!(field.name in columns)) {
      const nullValues = new Array(table.numRows).fill(null);
-      columns[field.name] = makeVector(nullValues, field.type);
+      columns[field.name] = makeVector(
+        nullValues,
+        field.type,
+        undefined,
+        field.nullable,
+      );
    }
  }

@@ -939,7 +968,12 @@ async function applyEmbeddings<T>(
  } else if (schema != null) {
    const destField = schema.fields.find((f) => f.name === destColumn);
    if (destField != null) {
-      newColumns[destColumn] = makeVector([], destField.type);
+      newColumns[destColumn] = makeVector(
+        [],
+        destField.type,
+        undefined,
+        destField.nullable,
+      );
    } else {
      throw new Error(
        `Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,
@@ -1251,19 +1285,36 @@ function validateSchemaEmbeddings(
    if (isFixedSizeList(field.type)) {
      field = sanitizeField(field);
      if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
+        // Check if there's an embedding function registered for this field
+        let hasEmbeddingFunction = false;
+
+        // Check schema metadata for embedding functions
        if (schema.metadata.has("embedding_functions")) {
          const embeddings = JSON.parse(
            schema.metadata.get("embedding_functions")!,
          );
-          if (
-            // biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
-            embeddings.find((f: any) => f["vectorColumn"] === field.name) ===
-              undefined
-          ) {
+          // biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
+          if (embeddings.find((f: any) => f["vectorColumn"] === field.name)) {
+            hasEmbeddingFunction = true;
+          }
+        }
+
+        // Check passed embedding function parameter
+        if (embeddings && embeddings.vectorColumn === field.name) {
+          hasEmbeddingFunction = true;
+        }
+
+        // If the field is nullable AND there's no embedding function, allow undefined/omitted values
+        if (field.nullable && !hasEmbeddingFunction) {
+          fields.push(field);
+        } else {
+          // Either not nullable OR has embedding function - require explicit values
+          if (hasEmbeddingFunction) {
+            // Don't add to missingEmbeddingFields since this is expected to be filled by embedding function
+            fields.push(field);
+          } else {
            missingEmbeddingFields.push(field);
          }
-        } else {
-          missingEmbeddingFields.push(field);
        }
      } else {
        fields.push(field);
@@ -3,7 +3,6 @@

import {
  Data,
-  Schema,
  SchemaLike,
  TableLike,
  fromTableToStreamBuffer,
@@ -268,6 +267,33 @@ export abstract class Connection {
   * @param {string[]} namespace The namespace to drop tables from (defaults to root namespace).
   */
  abstract dropAllTables(namespace?: string[]): Promise<void>;
+
+  /**
+   * Clone a table from a source table.
+   *
+   * A shallow clone creates a new table that shares the underlying data files
+   * with the source table but has its own independent manifest. This allows
+   * both the source and cloned tables to evolve independently while initially
+   * sharing the same data, deletion, and index files.
+   *
+   * @param {string} targetTableName - The name of the target table to create.
+   * @param {string} sourceUri - The URI of the source table to clone from.
+   * @param {object} options - Clone options.
+   * @param {string[]} options.targetNamespace - The namespace for the target table (defaults to root namespace).
+   * @param {number} options.sourceVersion - The version of the source table to clone.
+   * @param {string} options.sourceTag - The tag of the source table to clone.
+   * @param {boolean} options.isShallow - Whether to perform a shallow clone (defaults to true).
+   */
+  abstract cloneTable(
+    targetTableName: string,
+    sourceUri: string,
+    options?: {
+      targetNamespace?: string[];
+      sourceVersion?: number;
+      sourceTag?: string;
+      isShallow?: boolean;
+    },
+  ): Promise<Table>;
}

/** @hideconstructor */
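A brief usage sketch of the API declared above, mirroring the calls exercised in the connection tests earlier in this changeset (the target name, source URI, and tag are illustrative):

```ts
// Sketch only: shallow-clone an existing table, optionally pinning a version or tag.
const cloned = await db.cloneTable(
  "events_clone", // target table name (illustrative)
  "/data/my-db/events.lance", // URI of the source table (illustrative)
  { sourceTag: "v1.0" }, // or { sourceVersion: 3 }; isShallow defaults to true
);
console.log(await cloned.countRows());
```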
@@ -332,6 +358,28 @@ export class LocalConnection extends Connection {
     return new LocalTable(innerTable);
   }

+  async cloneTable(
+    targetTableName: string,
+    sourceUri: string,
+    options?: {
+      targetNamespace?: string[];
+      sourceVersion?: number;
+      sourceTag?: string;
+      isShallow?: boolean;
+    },
+  ): Promise<Table> {
+    const innerTable = await this.inner.cloneTable(
+      targetTableName,
+      sourceUri,
+      options?.targetNamespace ?? [],
+      options?.sourceVersion ?? null,
+      options?.sourceTag ?? null,
+      options?.isShallow ?? true,
+    );
+
+    return new LocalTable(innerTable);
+  }
+
   private getStorageOptions(
     options?: Partial<CreateTableOptions>,
   ): Record<string, string> | undefined {
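Taken together, the two hunks above add cloneTable to the abstract Connection and to its local implementation. A minimal usage sketch, assuming a local database and an existing source table URI (the path and table names are illustrative):

import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("/tmp/sample-db");
// Shallow clone: the new table shares data files with the source but gets its
// own manifest, so the two tables can evolve independently from here on.
const clone = await db.cloneTable("my_table_clone", "/tmp/sample-db/source.lance", {
  isShallow: true,
});
console.log(await clone.countRows());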
@@ -43,6 +43,10 @@ export {
   DeleteResult,
   DropColumnsResult,
   UpdateResult,
+  SplitRandomOptions,
+  SplitHashOptions,
+  SplitSequentialOptions,
+  ShuffleOptions,
 } from "./native.js";

 export {
@@ -85,6 +89,7 @@ export {
   Index,
   IndexOptions,
   IvfPqOptions,
+  IvfRqOptions,
   IvfFlatOptions,
   HnswPqOptions,
   HnswSqOptions,
@@ -110,6 +115,7 @@ export {
 export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";

 export * as embedding from "./embedding";
+export { permutationBuilder, PermutationBuilder } from "./permutation";
 export * as rerankers from "./rerankers";
 export {
   SchemaLike,
@@ -112,6 +112,77 @@ export interface IvfPqOptions {
   sampleRate?: number;
 }

+export interface IvfRqOptions {
+  /**
+   * The number of IVF partitions to create.
+   *
+   * This value should generally scale with the number of rows in the dataset.
+   * By default the number of partitions is the square root of the number of
+   * rows.
+   *
+   * If this value is too large then the first part of the search (picking the
+   * right partition) will be slow. If this value is too small then the second
+   * part of the search (searching within a partition) will be slow.
+   */
+  numPartitions?: number;
+
+  /**
+   * Number of bits per dimension for residual quantization.
+   *
+   * This value controls how much each residual component is compressed. The more
+   * bits, the more accurate the index will be but the slower search. Typical values
+   * are small integers; the default is 1 bit per dimension.
+   */
+  numBits?: number;
+
+  /**
+   * Distance type to use to build the index.
+   *
+   * Default value is "l2".
+   *
+   * This is used when training the index to calculate the IVF partitions
+   * (vectors are grouped in partitions with similar vectors according to this
+   * distance type) and during quantization.
+   *
+   * The distance type used to train an index MUST match the distance type used
+   * to search the index. Failure to do so will yield inaccurate results.
+   *
+   * The following distance types are available:
+   *
+   * "l2" - Euclidean distance.
+   * "cosine" - Cosine distance.
+   * "dot" - Dot product.
+   */
+  distanceType?: "l2" | "cosine" | "dot";
+
+  /**
+   * Max iterations to train IVF kmeans.
+   *
+   * When training an IVF index we use kmeans to calculate the partitions. This parameter
+   * controls how many iterations of kmeans to run.
+   *
+   * The default value is 50.
+   */
+  maxIterations?: number;
+
+  /**
+   * The number of vectors, per partition, to sample when training IVF kmeans.
+   *
+   * When an IVF index is trained, we need to calculate partitions. These are groups
+   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
+   *
+   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
+   * random sample of the data. This parameter controls the size of the sample. The total
+   * number of vectors used to train the index is `sample_rate * num_partitions`.
+   *
+   * Increasing this value might improve the quality of the index but in most cases the
+   * default should be sufficient.
+   *
+   * The default value is 256.
+   */
+  sampleRate?: number;
+}
+
 /**
  * Options to create an `HNSW_PQ` index
  */
@@ -523,6 +594,35 @@ export class Index {
         options?.distanceType,
         options?.numPartitions,
         options?.numSubVectors,
+        options?.numBits,
+        options?.maxIterations,
+        options?.sampleRate,
+      ),
+    );
+  }
+
+  /**
+   * Create an IvfRq index
+   *
+   * IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
+   * and organizes them into IVF partitions.
+   *
+   * The compression scheme is called RabitQ quantization. Each dimension is quantized into a small number of bits.
+   * The parameters `num_bits` and `num_partitions` control this process, providing a tradeoff
+   * between index size (and thus search speed) and index accuracy.
+   *
+   * The partitioning process is called IVF and the `num_partitions` parameter controls how
+   * many groups to create.
+   *
+   * Note that training an IVF RQ index on a large dataset is a slow operation and
+   * currently is also a memory intensive operation.
+   */
+  static ivfRq(options?: Partial<IvfRqOptions>) {
+    return new Index(
+      LanceDbIndex.ivfRq(
+        options?.distanceType,
+        options?.numPartitions,
+        options?.numBits,
         options?.maxIterations,
         options?.sampleRate,
       ),
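A hedged sketch of creating the new index type through the public API, assuming a table with a vector column named "vector" (the table and column names are illustrative):

import * as lancedb from "@lancedb/lancedb";
import { Index } from "@lancedb/lancedb";

const db = await lancedb.connect("/tmp/sample-db");
const table = await db.openTable("embeddings");
// numBits trades index size against accuracy; numPartitions follows the same
// guidance as IVF-PQ (roughly the square root of the row count).
await table.createIndex("vector", {
  config: Index.ivfRq({ numPartitions: 256, numBits: 1, distanceType: "cosine" }),
});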
@@ -70,6 +70,23 @@ export class MergeInsertBuilder {
       this.#schema,
     );
   }

+  /**
+   * Controls whether to use indexes for the merge operation.
+   *
+   * When set to `true` (the default), the operation will use an index if available
+   * on the join key for improved performance. When set to `false`, it forces a full
+   * table scan even if an index exists. This can be useful for benchmarking or when
+   * the query optimizer chooses a suboptimal path.
+   *
+   * @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
+   */
+  useIndex(useIndex: boolean): MergeInsertBuilder {
+    return new MergeInsertBuilder(
+      this.#native.useIndex(useIndex),
+      this.#schema,
+    );
+  }
   /**
    * Executes the merge insert operation
    *
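As a usage sketch (assuming a table keyed on an "id" column; the data is illustrative), the new option slots into the existing merge-insert chain:

const newRows = [{ id: 1, value: "updated" }];
// Force a full table scan for this merge, even if "id" has a scalar index.
await table
  .mergeInsert("id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .useIndex(false)
  .execute(newRows);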
nodejs/lancedb/permutation.ts (new file, 188 lines)
@@ -0,0 +1,188 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import {
  PermutationBuilder as NativePermutationBuilder,
  Table as NativeTable,
  ShuffleOptions,
  SplitHashOptions,
  SplitRandomOptions,
  SplitSequentialOptions,
  permutationBuilder as nativePermutationBuilder,
} from "./native.js";
import { LocalTable, Table } from "./table";

/**
 * A PermutationBuilder for creating data permutations with splits, shuffling, and filtering.
 *
 * This class provides a TypeScript wrapper around the native Rust PermutationBuilder,
 * offering methods to configure data splits, shuffling, and filtering before executing
 * the permutation to create a new table.
 */
export class PermutationBuilder {
  private inner: NativePermutationBuilder;

  /**
   * @hidden
   */
  constructor(inner: NativePermutationBuilder) {
    this.inner = inner;
  }

  /**
   * Configure random splits for the permutation.
   *
   * @param options - Configuration for random splitting
   * @returns A new PermutationBuilder instance
   * @example
   * ```ts
   * // Split by ratios
   * builder.splitRandom({ ratios: [0.7, 0.3], seed: 42 });
   *
   * // Split by counts
   * builder.splitRandom({ counts: [1000, 500], seed: 42 });
   *
   * // Split with fixed size
   * builder.splitRandom({ fixed: 100, seed: 42 });
   * ```
   */
  splitRandom(options: SplitRandomOptions): PermutationBuilder {
    const newInner = this.inner.splitRandom(options);
    return new PermutationBuilder(newInner);
  }

  /**
   * Configure hash-based splits for the permutation.
   *
   * @param options - Configuration for hash-based splitting
   * @returns A new PermutationBuilder instance
   * @example
   * ```ts
   * builder.splitHash({
   *   columns: ["user_id"],
   *   splitWeights: [70, 30],
   *   discardWeight: 0
   * });
   * ```
   */
  splitHash(options: SplitHashOptions): PermutationBuilder {
    const newInner = this.inner.splitHash(options);
    return new PermutationBuilder(newInner);
  }

  /**
   * Configure sequential splits for the permutation.
   *
   * @param options - Configuration for sequential splitting
   * @returns A new PermutationBuilder instance
   * @example
   * ```ts
   * // Split by ratios
   * builder.splitSequential({ ratios: [0.8, 0.2] });
   *
   * // Split by counts
   * builder.splitSequential({ counts: [800, 200] });
   *
   * // Split with fixed size
   * builder.splitSequential({ fixed: 1000 });
   * ```
   */
  splitSequential(options: SplitSequentialOptions): PermutationBuilder {
    const newInner = this.inner.splitSequential(options);
    return new PermutationBuilder(newInner);
  }

  /**
   * Configure calculated splits for the permutation.
   *
   * @param calculation - SQL expression for calculating splits
   * @returns A new PermutationBuilder instance
   * @example
   * ```ts
   * builder.splitCalculated("user_id % 3");
   * ```
   */
  splitCalculated(calculation: string): PermutationBuilder {
    const newInner = this.inner.splitCalculated(calculation);
    return new PermutationBuilder(newInner);
  }

  /**
   * Configure shuffling for the permutation.
   *
   * @param options - Configuration for shuffling
   * @returns A new PermutationBuilder instance
   * @example
   * ```ts
   * // Basic shuffle
   * builder.shuffle({ seed: 42 });
   *
   * // Shuffle with clump size
   * builder.shuffle({ seed: 42, clumpSize: 10 });
   * ```
   */
  shuffle(options: ShuffleOptions): PermutationBuilder {
    const newInner = this.inner.shuffle(options);
    return new PermutationBuilder(newInner);
  }

  /**
   * Configure filtering for the permutation.
   *
   * @param filter - SQL filter expression
   * @returns A new PermutationBuilder instance
   * @example
   * ```ts
   * builder.filter("age > 18 AND status = 'active'");
   * ```
   */
  filter(filter: string): PermutationBuilder {
    const newInner = this.inner.filter(filter);
    return new PermutationBuilder(newInner);
  }

  /**
   * Execute the permutation and create the destination table.
   *
   * @returns A Promise that resolves to the new Table instance
   * @example
   * ```ts
   * const permutationTable = await builder.execute();
   * console.log(`Created table: ${permutationTable.name}`);
   * ```
   */
  async execute(): Promise<Table> {
    const nativeTable: NativeTable = await this.inner.execute();
    return new LocalTable(nativeTable);
  }
}

/**
 * Create a permutation builder for the given table.
 *
 * @param table - The source table to create a permutation from
 * @param destTableName - The name for the destination permutation table
 * @returns A PermutationBuilder instance
 * @example
 * ```ts
 * const builder = permutationBuilder(sourceTable, "training_data")
 *   .splitRandom({ ratios: [0.8, 0.2], seed: 42 })
 *   .shuffle({ seed: 123 });
 *
 * const trainingTable = await builder.execute();
 * ```
 */
export function permutationBuilder(
  table: Table,
  destTableName: string,
): PermutationBuilder {
  // Extract the inner native table from the TypeScript wrapper
  const localTable = table as LocalTable;
  // Access inner through type assertion since it's private
  const nativeBuilder = nativePermutationBuilder(
    // biome-ignore lint/suspicious/noExplicitAny: need access to private variable
    (localTable as any).inner,
    destTableName,
  );
  return new PermutationBuilder(nativeBuilder);
}
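Pulling the new wrapper together, a short end-to-end sketch beyond the file's own examples, assuming an existing "events" table with "status" and "user_id" columns (all names are illustrative):

import * as lancedb from "@lancedb/lancedb";
import { permutationBuilder } from "@lancedb/lancedb";

const db = await lancedb.connect("/tmp/sample-db");
const events = await db.openTable("events");
// Keep only active users, split them 70/30 by a hash of user_id, and
// materialize the result as a new table named "events_split".
const splitTable = await permutationBuilder(events, "events_split")
  .filter("status = 'active'")
  .splitHash({ columns: ["user_id"], splitWeights: [70, 30], discardWeight: 0 })
  .execute();
console.log(await splitTable.countRows());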
@@ -326,6 +326,9 @@ export function sanitizeDictionary(typeLike: object) {

 // biome-ignore lint/suspicious/noExplicitAny: skip
 export function sanitizeType(typeLike: unknown): DataType<any> {
+  if (typeof typeLike === "string") {
+    return dataTypeFromName(typeLike);
+  }
   if (typeof typeLike !== "object" || typeLike === null) {
     throw Error("Expected a Type but object was null/undefined");
   }
@@ -447,7 +450,7 @@ export function sanitizeType(typeLike: unknown): DataType<any> {
     case Type.DurationSecond:
       return new DurationSecond();
     default:
-      throw new Error("Unrecoginized type id in schema: " + typeId);
+      throw new Error("Unrecognized type id in schema: " + typeId);
   }
 }

@@ -467,7 +470,15 @@ export function sanitizeField(fieldLike: unknown): Field {
       "The field passed in is missing a `type`/`name`/`nullable` property",
     );
   }
-  const type = sanitizeType(fieldLike.type);
+  let type: DataType;
+  try {
+    type = sanitizeType(fieldLike.type);
+  } catch (error: unknown) {
+    throw Error(
+      `Unable to sanitize type for field: ${fieldLike.name} due to error: ${error}`,
+      { cause: error },
+    );
+  }
   const name = fieldLike.name;
   if (!(typeof name === "string")) {
     throw Error("The field passed in had a non-string `name` property");
@@ -581,3 +592,46 @@ function sanitizeData(
     },
   );
 }
+
+const constructorsByTypeName = {
+  null: () => new Null(),
+  binary: () => new Binary(),
+  utf8: () => new Utf8(),
+  bool: () => new Bool(),
+  int8: () => new Int8(),
+  int16: () => new Int16(),
+  int32: () => new Int32(),
+  int64: () => new Int64(),
+  uint8: () => new Uint8(),
+  uint16: () => new Uint16(),
+  uint32: () => new Uint32(),
+  uint64: () => new Uint64(),
+  float16: () => new Float16(),
+  float32: () => new Float32(),
+  float64: () => new Float64(),
+  datemillisecond: () => new DateMillisecond(),
+  dateday: () => new DateDay(),
+  timenanosecond: () => new TimeNanosecond(),
+  timemicrosecond: () => new TimeMicrosecond(),
+  timemillisecond: () => new TimeMillisecond(),
+  timesecond: () => new TimeSecond(),
+  intervaldaytime: () => new IntervalDayTime(),
+  intervalyearmonth: () => new IntervalYearMonth(),
+  durationnanosecond: () => new DurationNanosecond(),
+  durationmicrosecond: () => new DurationMicrosecond(),
+  durationmillisecond: () => new DurationMillisecond(),
+  durationsecond: () => new DurationSecond(),
+} as const;
+
+type MappableTypeName = keyof typeof constructorsByTypeName;
+
+export function dataTypeFromName(typeName: string): DataType {
+  const normalizedTypeName = typeName.toLowerCase() as MappableTypeName;
+  const _constructor = constructorsByTypeName[normalizedTypeName];
+
+  if (!_constructor) {
+    throw new Error("Unrecognized type name in schema: " + typeName);
+  }
+
+  return _constructor();
+}
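With sanitizeType now accepting plain strings, a schema-like field can spell its type by name and dataTypeFromName resolves it case-insensitively. A small sketch (field names are illustrative, and such objects are only useful where the library already accepts a schema-like value):

// Each "type" below would be resolved via dataTypeFromName, e.g. "Utf8" -> new Utf8().
const schemaLike = {
  fields: [
    { name: "id", type: "int64", nullable: false },
    { name: "title", type: "Utf8", nullable: true },
    { name: "score", type: "float32", nullable: true },
  ],
};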
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": [
     "win32"
   ],

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",
nodejs/package-lock.json (generated, 4 changed lines)
@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.22.1-beta.0",
+      "version": "0.22.2",
       "cpu": [
         "x64",
         "arm64"

@@ -11,7 +11,7 @@
     "ann"
   ],
   "private": false,
-  "version": "0.22.1-beta.0",
+  "version": "0.22.2",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
@@ -213,6 +213,36 @@ impl Connection {
         Ok(Table::new(tbl))
     }

+    #[napi(catch_unwind)]
+    pub async fn clone_table(
+        &self,
+        target_table_name: String,
+        source_uri: String,
+        target_namespace: Vec<String>,
+        source_version: Option<i64>,
+        source_tag: Option<String>,
+        is_shallow: bool,
+    ) -> napi::Result<Table> {
+        let mut builder = self
+            .get_inner()?
+            .clone_table(&target_table_name, &source_uri);
+
+        builder = builder.target_namespace(target_namespace);
+
+        if let Some(version) = source_version {
+            builder = builder.source_version(version as u64);
+        }
+
+        if let Some(tag) = source_tag {
+            builder = builder.source_tag(tag);
+        }
+
+        builder = builder.is_shallow(is_shallow);
+
+        let tbl = builder.execute().await.default_error()?;
+        Ok(Table::new(tbl))
+    }
+
     /// Drop table with the name. Or raise an error if the table does not exist.
     #[napi(catch_unwind)]
     pub async fn drop_table(&self, name: String, namespace: Vec<String>) -> napi::Result<()> {
@@ -6,6 +6,7 @@ use std::sync::Mutex;
 use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder};
 use lancedb::index::vector::{
     IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
+    IvfRqIndexBuilder,
 };
 use lancedb::index::Index as LanceDbIndex;
 use napi_derive::napi;
@@ -65,6 +66,36 @@ impl Index {
         })
     }

+    #[napi(factory)]
+    pub fn ivf_rq(
+        distance_type: Option<String>,
+        num_partitions: Option<u32>,
+        num_bits: Option<u32>,
+        max_iterations: Option<u32>,
+        sample_rate: Option<u32>,
+    ) -> napi::Result<Self> {
+        let mut ivf_rq_builder = IvfRqIndexBuilder::default();
+        if let Some(distance_type) = distance_type {
+            let distance_type = parse_distance_type(distance_type)?;
+            ivf_rq_builder = ivf_rq_builder.distance_type(distance_type);
+        }
+        if let Some(num_partitions) = num_partitions {
+            ivf_rq_builder = ivf_rq_builder.num_partitions(num_partitions);
+        }
+        if let Some(num_bits) = num_bits {
+            ivf_rq_builder = ivf_rq_builder.num_bits(num_bits);
+        }
+        if let Some(max_iterations) = max_iterations {
+            ivf_rq_builder = ivf_rq_builder.max_iterations(max_iterations);
+        }
+        if let Some(sample_rate) = sample_rate {
+            ivf_rq_builder = ivf_rq_builder.sample_rate(sample_rate);
+        }
+        Ok(Self {
+            inner: Mutex::new(Some(LanceDbIndex::IvfRq(ivf_rq_builder))),
+        })
+    }
+
     #[napi(factory)]
     pub fn ivf_flat(
         distance_type: Option<String>,
@@ -12,6 +12,7 @@ mod header;
 mod index;
 mod iterator;
 pub mod merge;
+pub mod permutation;
 mod query;
 pub mod remote;
 mod rerankers;
@@ -43,6 +43,13 @@ impl NativeMergeInsertBuilder {
         self.inner.timeout(Duration::from_millis(timeout as u64));
     }

+    #[napi]
+    pub fn use_index(&self, use_index: bool) -> Self {
+        let mut this = self.clone();
+        this.inner.use_index(use_index);
+        this
+    }
+
     #[napi(catch_unwind)]
     pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
         let data = ipc_file_to_batches(buf.to_vec())
nodejs/src/permutation.rs (new file, 222 lines)
@@ -0,0 +1,222 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

use std::sync::{Arc, Mutex};

use crate::{error::NapiErrorExt, table::Table};
use lancedb::dataloader::{
    permutation::{PermutationBuilder as LancePermutationBuilder, ShuffleStrategy},
    split::{SplitSizes, SplitStrategy},
};
use napi_derive::napi;

#[napi(object)]
pub struct SplitRandomOptions {
    pub ratios: Option<Vec<f64>>,
    pub counts: Option<Vec<i64>>,
    pub fixed: Option<i64>,
    pub seed: Option<i64>,
}

#[napi(object)]
pub struct SplitHashOptions {
    pub columns: Vec<String>,
    pub split_weights: Vec<i64>,
    pub discard_weight: Option<i64>,
}

#[napi(object)]
pub struct SplitSequentialOptions {
    pub ratios: Option<Vec<f64>>,
    pub counts: Option<Vec<i64>>,
    pub fixed: Option<i64>,
}

#[napi(object)]
pub struct ShuffleOptions {
    pub seed: Option<i64>,
    pub clump_size: Option<i64>,
}

pub struct PermutationBuilderState {
    pub builder: Option<LancePermutationBuilder>,
    pub dest_table_name: String,
}

#[napi]
pub struct PermutationBuilder {
    state: Arc<Mutex<PermutationBuilderState>>,
}

impl PermutationBuilder {
    pub fn new(builder: LancePermutationBuilder, dest_table_name: String) -> Self {
        Self {
            state: Arc::new(Mutex::new(PermutationBuilderState {
                builder: Some(builder),
                dest_table_name,
            })),
        }
    }
}

impl PermutationBuilder {
    fn modify(
        &self,
        func: impl FnOnce(LancePermutationBuilder) -> LancePermutationBuilder,
    ) -> napi::Result<Self> {
        let mut state = self.state.lock().unwrap();
        let builder = state
            .builder
            .take()
            .ok_or_else(|| napi::Error::from_reason("Builder already consumed"))?;
        state.builder = Some(func(builder));
        Ok(Self {
            state: self.state.clone(),
        })
    }
}

#[napi]
impl PermutationBuilder {
    /// Configure random splits
    #[napi]
    pub fn split_random(&self, options: SplitRandomOptions) -> napi::Result<Self> {
        // Check that exactly one split type is provided
        let split_args_count = [
            options.ratios.is_some(),
            options.counts.is_some(),
            options.fixed.is_some(),
        ]
        .iter()
        .filter(|&&x| x)
        .count();

        if split_args_count != 1 {
            return Err(napi::Error::from_reason(
                "Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
            ));
        }

        let sizes = if let Some(ratios) = options.ratios {
            SplitSizes::Percentages(ratios)
        } else if let Some(counts) = options.counts {
            SplitSizes::Counts(counts.into_iter().map(|c| c as u64).collect())
        } else if let Some(fixed) = options.fixed {
            SplitSizes::Fixed(fixed as u64)
        } else {
            unreachable!("One of the split arguments must be provided");
        };

        let seed = options.seed.map(|s| s as u64);

        self.modify(|builder| builder.with_split_strategy(SplitStrategy::Random { seed, sizes }))
    }

    /// Configure hash-based splits
    #[napi]
    pub fn split_hash(&self, options: SplitHashOptions) -> napi::Result<Self> {
        let split_weights = options
            .split_weights
            .into_iter()
            .map(|w| w as u64)
            .collect();
        let discard_weight = options.discard_weight.unwrap_or(0) as u64;

        self.modify(|builder| {
            builder.with_split_strategy(SplitStrategy::Hash {
                columns: options.columns,
                split_weights,
                discard_weight,
            })
        })
    }

    /// Configure sequential splits
    #[napi]
    pub fn split_sequential(&self, options: SplitSequentialOptions) -> napi::Result<Self> {
        // Check that exactly one split type is provided
        let split_args_count = [
            options.ratios.is_some(),
            options.counts.is_some(),
            options.fixed.is_some(),
        ]
        .iter()
        .filter(|&&x| x)
        .count();

        if split_args_count != 1 {
            return Err(napi::Error::from_reason(
                "Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
            ));
        }

        let sizes = if let Some(ratios) = options.ratios {
            SplitSizes::Percentages(ratios)
        } else if let Some(counts) = options.counts {
            SplitSizes::Counts(counts.into_iter().map(|c| c as u64).collect())
        } else if let Some(fixed) = options.fixed {
            SplitSizes::Fixed(fixed as u64)
        } else {
            unreachable!("One of the split arguments must be provided");
        };

        self.modify(|builder| builder.with_split_strategy(SplitStrategy::Sequential { sizes }))
    }

    /// Configure calculated splits
    #[napi]
    pub fn split_calculated(&self, calculation: String) -> napi::Result<Self> {
        self.modify(|builder| {
            builder.with_split_strategy(SplitStrategy::Calculated { calculation })
        })
    }

    /// Configure shuffling
    #[napi]
    pub fn shuffle(&self, options: ShuffleOptions) -> napi::Result<Self> {
        let seed = options.seed.map(|s| s as u64);
        let clump_size = options.clump_size.map(|c| c as u64);

        self.modify(|builder| {
            builder.with_shuffle_strategy(ShuffleStrategy::Random { seed, clump_size })
        })
    }

    /// Configure filtering
    #[napi]
    pub fn filter(&self, filter: String) -> napi::Result<Self> {
        self.modify(|builder| builder.with_filter(filter))
    }

    /// Execute the permutation builder and create the table
    #[napi]
    pub async fn execute(&self) -> napi::Result<Table> {
        let (builder, dest_table_name) = {
            let mut state = self.state.lock().unwrap();
            let builder = state
                .builder
                .take()
                .ok_or_else(|| napi::Error::from_reason("Builder already consumed"))?;

            let dest_table_name = std::mem::take(&mut state.dest_table_name);
            (builder, dest_table_name)
        };

        let table = builder.build(&dest_table_name).await.default_error()?;
        Ok(Table::new(table))
    }
}

/// Create a permutation builder for the given table
#[napi]
pub fn permutation_builder(
    table: &crate::table::Table,
    dest_table_name: String,
) -> napi::Result<PermutationBuilder> {
    use lancedb::dataloader::permutation::PermutationBuilder as LancePermutationBuilder;

    let inner_table = table.inner_ref()?.clone();
    let inner_builder = LancePermutationBuilder::new(inner_table);

    Ok(PermutationBuilder::new(inner_builder, dest_table_name))
}
@@ -26,7 +26,7 @@ pub struct Table {
 }

 impl Table {
-    fn inner_ref(&self) -> napi::Result<&LanceDbTable> {
+    pub(crate) fn inner_ref(&self) -> napi::Result<&LanceDbTable> {
         self.inner
             .as_ref()
             .ok_or_else(|| napi::Error::from_reason(format!("Table {} is closed", self.name)))
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.25.1-beta.1"
+current_version = "0.25.3-beta.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
@@ -24,6 +24,19 @@ commit = true
 message = "Bump version: {current_version} → {new_version}"
 commit_args = ""

+# Update Cargo.lock after version bump
+pre_commit_hooks = [
+    """
+    cd python && cargo update -p lancedb-python
+    if git diff --quiet ../Cargo.lock; then
+        echo "Cargo.lock unchanged"
+    else
+        git add ../Cargo.lock
+        echo "Updated and staged Cargo.lock"
+    fi
+    """,
+]
+
 [tool.bumpversion.parts.pre_l]
 values = ["beta", "final"]
 optional_value = "final"
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.25.1-beta.1"
+version = "0.25.3-beta.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -14,12 +14,12 @@ name = "_lancedb"
 crate-type = ["cdylib"]

 [dependencies]
-arrow = { version = "55.1", features = ["pyarrow"] }
+arrow = { version = "56.2", features = ["pyarrow"] }
 async-trait = "0.1"
 lancedb = { path = "../rust/lancedb", default-features = false }
 env_logger.workspace = true
-pyo3 = { version = "0.24", features = ["extension-module", "abi3-py39"] }
+pyo3 = { version = "0.25", features = ["extension-module", "abi3-py39"] }
-pyo3-async-runtimes = { version = "0.24", features = [
+pyo3-async-runtimes = { version = "0.25", features = [
     "attributes",
     "tokio-runtime",
 ] }
@@ -28,7 +28,7 @@ futures.workspace = true
 tokio = { version = "1.40", features = ["sync"] }

 [build-dependencies]
-pyo3-build-config = { version = "0.24", features = [
+pyo3-build-config = { version = "0.25", features = [
     "extension-module",
     "abi3-py39",
 ] }
@@ -5,12 +5,12 @@ dynamic = ["version"]
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"deprecation",
|
"deprecation",
|
||||||
"numpy",
|
"numpy",
|
||||||
"overrides>=0.7",
|
"overrides>=0.7; python_version<'3.12'",
|
||||||
"packaging",
|
"packaging",
|
||||||
"pyarrow>=16",
|
"pyarrow>=16",
|
||||||
"pydantic>=1.10",
|
"pydantic>=1.10",
|
||||||
"tqdm>=4.27.0",
|
"tqdm>=4.27.0",
|
||||||
"lance-namespace==0.0.6"
|
"lance-namespace>=0.0.16"
|
||||||
]
|
]
|
||||||
description = "lancedb"
|
description = "lancedb"
|
||||||
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
||||||
|
|||||||
@@ -60,6 +60,15 @@ class Connection(object):
         storage_options: Optional[Dict[str, str]] = None,
         index_cache_size: Optional[int] = None,
     ) -> Table: ...
+    async def clone_table(
+        self,
+        target_table_name: str,
+        source_uri: str,
+        target_namespace: List[str] = [],
+        source_version: Optional[int] = None,
+        source_tag: Optional[str] = None,
+        is_shallow: bool = True,
+    ) -> Table: ...
     async def rename_table(
         self,
         cur_name: str,
@@ -124,6 +133,7 @@ class Tags:
     async def update(self, tag: str, version: int): ...

 class IndexConfig:
+    name: str
     index_type: str
     columns: List[str]

@@ -286,3 +296,34 @@ class AlterColumnsResult:

 class DropColumnsResult:
     version: int
+
+class AsyncPermutationBuilder:
+    def select(self, projections: Dict[str, str]) -> "AsyncPermutationBuilder": ...
+    def split_random(
+        self,
+        *,
+        ratios: Optional[List[float]] = None,
+        counts: Optional[List[int]] = None,
+        fixed: Optional[int] = None,
+        seed: Optional[int] = None,
+    ) -> "AsyncPermutationBuilder": ...
+    def split_hash(
+        self, columns: List[str], split_weights: List[int], *, discard_weight: int = 0
+    ) -> "AsyncPermutationBuilder": ...
+    def split_sequential(
+        self,
+        *,
+        ratios: Optional[List[float]] = None,
+        counts: Optional[List[int]] = None,
+        fixed: Optional[int] = None,
+    ) -> "AsyncPermutationBuilder": ...
+    def split_calculated(self, calculation: str) -> "AsyncPermutationBuilder": ...
+    def shuffle(
+        self, seed: Optional[int], clump_size: Optional[int]
+    ) -> "AsyncPermutationBuilder": ...
+    def filter(self, filter: str) -> "AsyncPermutationBuilder": ...
+    async def execute(self) -> Table: ...
+
+def async_permutation_builder(
+    table: Table, dest_table_name: str
+) -> AsyncPermutationBuilder: ...
@@ -5,11 +5,20 @@
 from __future__ import annotations

 from abc import abstractmethod
+from datetime import timedelta
 from pathlib import Path
+import sys
 from typing import TYPE_CHECKING, Dict, Iterable, List, Literal, Optional, Union

+if sys.version_info >= (3, 12):
+    from typing import override
+
+    class EnforceOverrides:
+        pass
+else:
+    from overrides import EnforceOverrides, override  # type: ignore
+
 from lancedb.embeddings.registry import EmbeddingFunctionRegistry
-from overrides import EnforceOverrides, override  # type: ignore

 from lancedb.common import data_to_reader, sanitize_uri, validate_schema
 from lancedb.background_loop import LOOP
@@ -32,7 +41,6 @@ import deprecation
 if TYPE_CHECKING:
     import pyarrow as pa
     from .pydantic import LanceModel
-    from datetime import timedelta

 from ._lancedb import Connection as LanceDbConnection
 from .common import DATA, URI
@@ -444,7 +452,12 @@ class LanceDBConnection(DBConnection):
         read_consistency_interval: Optional[timedelta] = None,
         storage_options: Optional[Dict[str, str]] = None,
         session: Optional[Session] = None,
+        _inner: Optional[LanceDbConnection] = None,
     ):
+        if _inner is not None:
+            self._conn = _inner
+            return
+
         if not isinstance(uri, Path):
             scheme = get_uri_scheme(uri)
         is_local = isinstance(uri, Path) or scheme == "file"
@@ -453,11 +466,6 @@ class LanceDBConnection(DBConnection):
             uri = Path(uri)
             uri = uri.expanduser().absolute()
             Path(uri).mkdir(parents=True, exist_ok=True)
-        self._uri = str(uri)
-        self._entered = False
-        self.read_consistency_interval = read_consistency_interval
-        self.storage_options = storage_options
-        self.session = session

         if read_consistency_interval is not None:
             read_consistency_interval_secs = read_consistency_interval.total_seconds()
@@ -476,10 +484,32 @@ class LanceDBConnection(DBConnection):
             session,
         )
+
+        # TODO: It would be nice if we didn't store self.storage_options but it is
+        # currently used by the LanceTable.to_lance method. This doesn't _really_
+        # work because some paths like LanceDBConnection.from_inner will lose the
+        # storage_options. Also, this class really shouldn't be holding any state
+        # beyond _conn.
+        self.storage_options = storage_options
         self._conn = AsyncConnection(LOOP.run(do_connect()))

+    @property
+    def read_consistency_interval(self) -> Optional[timedelta]:
+        return LOOP.run(self._conn.get_read_consistency_interval())
+
+    @property
+    def session(self) -> Optional[Session]:
+        return self._conn.session
+
+    @property
+    def uri(self) -> str:
+        return self._conn.uri
+
+    @classmethod
+    def from_inner(cls, inner: LanceDbConnection):
+        return cls(None, _inner=inner)
+
     def __repr__(self) -> str:
-        val = f"{self.__class__.__name__}(uri={self._uri!r}"
+        val = f"{self.__class__.__name__}(uri={self._conn.uri!r}"
         if self.read_consistency_interval is not None:
             val += f", read_consistency_interval={repr(self.read_consistency_interval)}"
         val += ")"
@@ -489,6 +519,10 @@ class LanceDBConnection(DBConnection):
             conn = AsyncConnection(await lancedb_connect(self.uri))
             return await conn.table_names(start_after=start_after, limit=limit)

+    @property
+    def _inner(self) -> LanceDbConnection:
+        return self._conn._inner
+
     @override
     def list_namespaces(
         self,
@@ -665,6 +699,60 @@ class LanceDBConnection(DBConnection):
             index_cache_size=index_cache_size,
         )

+    def clone_table(
+        self,
+        target_table_name: str,
+        source_uri: str,
+        *,
+        target_namespace: List[str] = [],
+        source_version: Optional[int] = None,
+        source_tag: Optional[str] = None,
+        is_shallow: bool = True,
+    ) -> LanceTable:
+        """Clone a table from a source table.
+
+        A shallow clone creates a new table that shares the underlying data files
+        with the source table but has its own independent manifest. This allows
+        both the source and cloned tables to evolve independently while initially
+        sharing the same data, deletion, and index files.
+
+        Parameters
+        ----------
+        target_table_name: str
+            The name of the target table to create.
+        source_uri: str
+            The URI of the source table to clone from.
+        target_namespace: List[str], optional
+            The namespace for the target table.
+            None or empty list represents root namespace.
+        source_version: int, optional
+            The version of the source table to clone.
+        source_tag: str, optional
+            The tag of the source table to clone.
+        is_shallow: bool, default True
+            Whether to perform a shallow clone (True) or deep clone (False).
+            Currently only shallow clone is supported.
+
+        Returns
+        -------
+        A LanceTable object representing the cloned table.
+        """
+        LOOP.run(
+            self._conn.clone_table(
+                target_table_name,
+                source_uri,
+                target_namespace=target_namespace,
+                source_version=source_version,
+                source_tag=source_tag,
+                is_shallow=is_shallow,
+            )
+        )
+        return LanceTable.open(
+            self,
+            target_table_name,
+            namespace=target_namespace,
+        )
+
     @override
     def drop_table(
         self,
@@ -794,6 +882,13 @@ class AsyncConnection(object):
     def uri(self) -> str:
         return self._inner.uri

+    async def get_read_consistency_interval(self) -> Optional[timedelta]:
+        interval_secs = await self._inner.get_read_consistency_interval()
+        if interval_secs is not None:
+            return timedelta(seconds=interval_secs)
+        else:
+            return None
+
     async def list_namespaces(
         self,
         namespace: List[str] = [],
@@ -1136,6 +1231,54 @@ class AsyncConnection(object):
         )
         return AsyncTable(table)

+    async def clone_table(
+        self,
+        target_table_name: str,
+        source_uri: str,
+        *,
+        target_namespace: List[str] = [],
+        source_version: Optional[int] = None,
+        source_tag: Optional[str] = None,
+        is_shallow: bool = True,
+    ) -> AsyncTable:
+        """Clone a table from a source table.
+
+        A shallow clone creates a new table that shares the underlying data files
+        with the source table but has its own independent manifest. This allows
+        both the source and cloned tables to evolve independently while initially
+        sharing the same data, deletion, and index files.
+
+        Parameters
+        ----------
+        target_table_name: str
+            The name of the target table to create.
+        source_uri: str
+            The URI of the source table to clone from.
+        target_namespace: List[str], optional
+            The namespace for the target table.
+            None or empty list represents root namespace.
+        source_version: int, optional
+            The version of the source table to clone.
+        source_tag: str, optional
+            The tag of the source table to clone.
+        is_shallow: bool, default True
+            Whether to perform a shallow clone (True) or deep clone (False).
+            Currently only shallow clone is supported.
+
+        Returns
+        -------
+        An AsyncTable object representing the cloned table.
+        """
+        table = await self._inner.clone_table(
+            target_table_name,
+            source_uri,
+            target_namespace=target_namespace,
+            source_version=source_version,
+            source_tag=source_tag,
+            is_shallow=is_shallow,
+        )
+        return AsyncTable(table)
+
     async def rename_table(
         self,
         cur_name: str,
@@ -122,7 +122,7 @@ class EmbeddingFunctionRegistry:
             obj["vector_column"]: EmbeddingFunctionConfig(
                 vector_column=obj["vector_column"],
                 source_column=obj["source_column"],
-                function=self.get(obj["name"])(**obj["model"]),
+                function=self.get(obj["name"]).create(**obj["model"]),
             )
             for obj in raw_list
         }
@@ -251,6 +251,13 @@ class HnswPq:
|
|||||||
results. In most cases, there is no benefit to setting this higher than 500.
|
results. In most cases, there is no benefit to setting this higher than 500.
|
||||||
This value should be set to a value that is not less than `ef` in the
|
This value should be set to a value that is not less than `ef` in the
|
||||||
search phase.
|
search phase.
|
||||||
|
|
||||||
|
target_partition_size, default is 1,048,576
|
||||||
|
|
||||||
|
The target size of each partition.
|
||||||
|
|
||||||
|
This value controls the tradeoff between search performance and accuracy.
|
||||||
|
faster search but less accurate results as higher value.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
||||||
@@ -261,6 +268,7 @@ class HnswPq:
|
|||||||
sample_rate: int = 256
|
sample_rate: int = 256
|
||||||
m: int = 20
|
m: int = 20
|
||||||
ef_construction: int = 300
|
ef_construction: int = 300
|
||||||
|
target_partition_size: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -351,6 +359,12 @@ class HnswSq:
|
|||||||
This value should be set to a value that is not less than `ef` in the search
|
This value should be set to a value that is not less than `ef` in the search
|
||||||
phase.
|
phase.
|
||||||
|
|
||||||
|
target_partition_size, default is 1,048,576
|
||||||
|
|
||||||
|
The target size of each partition.
|
||||||
|
|
||||||
|
This value controls the tradeoff between search performance and accuracy.
|
||||||
|
faster search but less accurate results as higher value.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
||||||
@@ -359,6 +373,7 @@ class HnswSq:
|
|||||||
sample_rate: int = 256
|
sample_rate: int = 256
|
||||||
m: int = 20
|
m: int = 20
|
||||||
ef_construction: int = 300
|
ef_construction: int = 300
|
||||||
|
target_partition_size: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -444,12 +459,20 @@ class IvfFlat:
|
|||||||
cases the default should be sufficient.
|
cases the default should be sufficient.
|
||||||
|
|
||||||
The default value is 256.
|
The default value is 256.
|
||||||
|
|
||||||
|
target_partition_size, default is 8192
|
||||||
|
|
||||||
|
The target size of each partition.
|
||||||
|
|
||||||
|
This value controls the tradeoff between search performance and accuracy.
|
||||||
|
faster search but less accurate results as higher value.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
distance_type: Literal["l2", "cosine", "dot", "hamming"] = "l2"
|
distance_type: Literal["l2", "cosine", "dot", "hamming"] = "l2"
|
||||||
num_partitions: Optional[int] = None
|
num_partitions: Optional[int] = None
|
||||||
max_iterations: int = 50
|
max_iterations: int = 50
|
||||||
sample_rate: int = 256
|
sample_rate: int = 256
|
||||||
|
target_partition_size: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -564,6 +587,13 @@ class IvfPq:
|
|||||||
cases the default should be sufficient.
|
cases the default should be sufficient.
|
||||||
|
|
||||||
The default value is 256.
|
The default value is 256.
|
||||||
|
|
||||||
|
target_partition_size, default is 8192
|
||||||
|
|
||||||
|
The target size of each partition.
|
||||||
|
|
||||||
|
This value controls the tradeoff between search performance and accuracy.
|
||||||
|
faster search but less accurate results as higher value.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
||||||
@@ -572,11 +602,56 @@ class IvfPq:
|
|||||||
num_bits: int = 8
|
num_bits: int = 8
|
||||||
max_iterations: int = 50
|
max_iterations: int = 50
|
||||||
sample_rate: int = 256
|
sample_rate: int = 256
|
||||||
|
target_partition_size: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class IvfRq:
|
||||||
|
"""Describes an IVF RQ Index
|
||||||
|
|
||||||
|
IVF-RQ (Residual Quantization) stores a compressed copy of each vector using
|
||||||
|
residual quantization and organizes the vectors into IVF partitions. Parameters
|
||||||
|
largely mirror IVF-PQ for consistency.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
distance_type: str, default "l2"
|
||||||
|
Distance metric used to train the index and for quantization.
|
||||||
|
|
||||||
|
The following distance types are available:
|
||||||
|
|
||||||
|
"l2" - Euclidean distance.
|
||||||
|
"cosine" - Cosine distance.
|
||||||
|
"dot" - Dot product.
|
||||||
|
|
||||||
|
num_partitions: int, default sqrt(num_rows)
|
||||||
|
Number of IVF partitions to create.
|
||||||
|
|
||||||
|
num_bits: int, default 1
|
||||||
|
Number of bits to encode each dimension.
|
||||||
|
|
||||||
|
max_iterations: int, default 50
|
||||||
|
Max iterations to train kmeans when computing IVF partitions.
|
||||||
|
|
||||||
|
sample_rate: int, default 256
|
||||||
|
Controls the number of training vectors: sample_rate * num_partitions.
|
||||||
|
|
||||||
|
target_partition_size: int, default 8192
|
||||||
|
Target size of each partition.
|
||||||
|
"""
|
||||||
|
|
||||||
|
distance_type: Literal["l2", "cosine", "dot"] = "l2"
|
||||||
|
num_partitions: Optional[int] = None
|
||||||
|
num_bits: int = 1
|
||||||
|
max_iterations: int = 50
|
||||||
|
sample_rate: int = 256
|
||||||
|
target_partition_size: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"BTree",
|
"BTree",
|
||||||
"IvfPq",
|
"IvfPq",
|
||||||
|
"IvfRq",
|
||||||
"IvfFlat",
|
"IvfFlat",
|
||||||
"HnswPq",
|
"HnswPq",
|
||||||
"HnswSq",
|
"HnswSq",
|
||||||
|
|||||||
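The new IvfRq config mirrors IvfPq, and the test added later in this change exercises it through the async create_index API. A minimal sketch, assuming an open async table with a "vector" column; the sync call is an equally hedged sketch of the index_type="IVF_RQ" route added below:

from lancedb.index import IvfRq

# Async path: pass the config object directly (mirrors the new test below).
await table.create_index("vector", config=IvfRq(num_bits=1))

# Sync path: LanceTable.create_index now also routes index_type="IVF_RQ" to IvfRq,
# forwarding num_bits, sample_rate, and target_partition_size.
sync_table.create_index(index_type="IVF_RQ", num_bits=1)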
@@ -33,6 +33,7 @@ class LanceMergeInsertBuilder(object):
|
|||||||
self._when_not_matched_by_source_delete = False
|
self._when_not_matched_by_source_delete = False
|
||||||
self._when_not_matched_by_source_condition = None
|
self._when_not_matched_by_source_condition = None
|
||||||
self._timeout = None
|
self._timeout = None
|
||||||
|
self._use_index = True
|
||||||
|
|
||||||
def when_matched_update_all(
|
def when_matched_update_all(
|
||||||
self, *, where: Optional[str] = None
|
self, *, where: Optional[str] = None
|
||||||
@@ -78,6 +79,23 @@ class LanceMergeInsertBuilder(object):
|
|||||||
self._when_not_matched_by_source_condition = condition
|
self._when_not_matched_by_source_condition = condition
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def use_index(self, use_index: bool) -> LanceMergeInsertBuilder:
|
||||||
|
"""
|
||||||
|
Controls whether to use indexes for the merge operation.
|
||||||
|
|
||||||
|
When set to `True` (the default), the operation will use an index if available
|
||||||
|
on the join key for improved performance. When set to `False`, it forces a full
|
||||||
|
table scan even if an index exists. This can be useful for benchmarking or when
|
||||||
|
the query optimizer chooses a suboptimal path.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
use_index: bool
|
||||||
|
Whether to use indices for the merge operation. Defaults to `True`.
|
||||||
|
"""
|
||||||
|
self._use_index = use_index
|
||||||
|
return self
|
||||||
|
|
||||||
def execute(
|
def execute(
|
||||||
self,
|
self,
|
||||||
new_data: DATA,
|
new_data: DATA,
|
||||||
|
|||||||
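The use_index toggle documented above slots into the existing merge-insert builder chain. A hedged sketch, where table, the "id" join key, and new_rows are placeholders:

# Force a full-table scan for the merge, e.g. to benchmark against the indexed path.
(
    table.merge_insert("id")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .use_index(False)  # default is True; False bypasses any index on the join key
    .execute(new_rows)
)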
@@ -12,13 +12,18 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Dict, Iterable, List, Optional, Union
|
from typing import Dict, Iterable, List, Optional, Union
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
if sys.version_info >= (3, 12):
|
||||||
|
from typing import override
|
||||||
|
else:
|
||||||
|
from overrides import override
|
||||||
|
|
||||||
from lancedb.db import DBConnection
|
from lancedb.db import DBConnection
|
||||||
from lancedb.table import LanceTable, Table
|
from lancedb.table import LanceTable, Table
|
||||||
from lancedb.util import validate_table_name
|
from lancedb.util import validate_table_name
|
||||||
from lancedb.common import validate_schema
|
from lancedb.common import validate_schema
|
||||||
from lancedb.table import sanitize_create_table
|
from lancedb.table import sanitize_create_table
|
||||||
from overrides import override
|
|
||||||
|
|
||||||
from lance_namespace import LanceNamespace, connect as namespace_connect
|
from lance_namespace import LanceNamespace, connect as namespace_connect
|
||||||
from lance_namespace_urllib3_client.models import (
|
from lance_namespace_urllib3_client.models import (
|
||||||
|
|||||||
python/python/lancedb/permutation.py (new file, 72 lines)
@@ -0,0 +1,72 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
from ._lancedb import async_permutation_builder
|
||||||
|
from .table import LanceTable
|
||||||
|
from .background_loop import LOOP
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
class PermutationBuilder:
|
||||||
|
def __init__(self, table: LanceTable, dest_table_name: str):
|
||||||
|
self._async = async_permutation_builder(table, dest_table_name)
|
||||||
|
|
||||||
|
def select(self, projections: dict[str, str]) -> "PermutationBuilder":
|
||||||
|
self._async.select(projections)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def split_random(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
ratios: Optional[list[float]] = None,
|
||||||
|
counts: Optional[list[int]] = None,
|
||||||
|
fixed: Optional[int] = None,
|
||||||
|
seed: Optional[int] = None,
|
||||||
|
) -> "PermutationBuilder":
|
||||||
|
self._async.split_random(ratios=ratios, counts=counts, fixed=fixed, seed=seed)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def split_hash(
|
||||||
|
self,
|
||||||
|
columns: list[str],
|
||||||
|
split_weights: list[int],
|
||||||
|
*,
|
||||||
|
discard_weight: Optional[int] = None,
|
||||||
|
) -> "PermutationBuilder":
|
||||||
|
self._async.split_hash(columns, split_weights, discard_weight=discard_weight)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def split_sequential(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
ratios: Optional[list[float]] = None,
|
||||||
|
counts: Optional[list[int]] = None,
|
||||||
|
fixed: Optional[int] = None,
|
||||||
|
) -> "PermutationBuilder":
|
||||||
|
self._async.split_sequential(ratios=ratios, counts=counts, fixed=fixed)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def split_calculated(self, calculation: str) -> "PermutationBuilder":
|
||||||
|
self._async.split_calculated(calculation)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def shuffle(
|
||||||
|
self, *, seed: Optional[int] = None, clump_size: Optional[int] = None
|
||||||
|
) -> "PermutationBuilder":
|
||||||
|
self._async.shuffle(seed=seed, clump_size=clump_size)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def filter(self, filter: str) -> "PermutationBuilder":
|
||||||
|
self._async.filter(filter)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def execute(self) -> LanceTable:
|
||||||
|
async def do_execute():
|
||||||
|
inner_tbl = await self._async.execute()
|
||||||
|
return LanceTable.from_inner(inner_tbl)
|
||||||
|
|
||||||
|
return LOOP.run(do_execute())
|
||||||
|
|
||||||
|
|
||||||
|
def permutation_builder(table: LanceTable, dest_table_name: str) -> PermutationBuilder:
|
||||||
|
return PermutationBuilder(table, dest_table_name)
|
||||||
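The builder above is exercised end to end by the new test_permutation.py further down. A short sketch of the intended chaining, with the destination name and split parameters chosen only for illustration:

from lancedb.permutation import permutation_builder

# Create a shuffled 80/20 permutation of `table` as a new table.
perm_tbl = (
    permutation_builder(table, "train_eval_permutation")
    .split_random(ratios=[0.8, 0.2], seed=42)
    .shuffle(seed=42)
    .execute()
)
assert perm_tbl.count_rows() == table.count_rows()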
@@ -5,15 +5,20 @@
|
|||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
import logging
|
import logging
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
import sys
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Union
|
from typing import Any, Dict, Iterable, List, Optional, Union
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
if sys.version_info >= (3, 12):
|
||||||
|
from typing import override
|
||||||
|
else:
|
||||||
|
from overrides import override
|
||||||
|
|
||||||
# Remove this import to fix circular dependency
|
# Remove this import to fix circular dependency
|
||||||
# from lancedb import connect_async
|
# from lancedb import connect_async
|
||||||
from lancedb.remote import ClientConfig
|
from lancedb.remote import ClientConfig
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
from overrides import override
|
|
||||||
|
|
||||||
from ..common import DATA
|
from ..common import DATA
|
||||||
from ..db import DBConnection, LOOP
|
from ..db import DBConnection, LOOP
|
||||||
@@ -212,6 +217,53 @@ class RemoteDBConnection(DBConnection):
|
|||||||
table = LOOP.run(self._conn.open_table(name, namespace=namespace))
|
table = LOOP.run(self._conn.open_table(name, namespace=namespace))
|
||||||
return RemoteTable(table, self.db_name)
|
return RemoteTable(table, self.db_name)
|
||||||
|
|
||||||
|
def clone_table(
|
||||||
|
self,
|
||||||
|
target_table_name: str,
|
||||||
|
source_uri: str,
|
||||||
|
*,
|
||||||
|
target_namespace: List[str] = [],
|
||||||
|
source_version: Optional[int] = None,
|
||||||
|
source_tag: Optional[str] = None,
|
||||||
|
is_shallow: bool = True,
|
||||||
|
) -> Table:
|
||||||
|
"""Clone a table from a source table.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
target_table_name: str
|
||||||
|
The name of the target table to create.
|
||||||
|
source_uri: str
|
||||||
|
The URI of the source table to clone from.
|
||||||
|
target_namespace: List[str], optional
|
||||||
|
The namespace for the target table.
|
||||||
|
None or empty list represents root namespace.
|
||||||
|
source_version: int, optional
|
||||||
|
The version of the source table to clone.
|
||||||
|
source_tag: str, optional
|
||||||
|
The tag of the source table to clone.
|
||||||
|
is_shallow: bool, default True
|
||||||
|
Whether to perform a shallow clone (True) or deep clone (False).
|
||||||
|
Currently only shallow clone is supported.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
A RemoteTable object representing the cloned table.
|
||||||
|
"""
|
||||||
|
from .table import RemoteTable
|
||||||
|
|
||||||
|
table = LOOP.run(
|
||||||
|
self._conn.clone_table(
|
||||||
|
target_table_name,
|
||||||
|
source_uri,
|
||||||
|
target_namespace=target_namespace,
|
||||||
|
source_version=source_version,
|
||||||
|
source_tag=source_tag,
|
||||||
|
is_shallow=is_shallow,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return RemoteTable(table, self.db_name)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def create_table(
|
def create_table(
|
||||||
self,
|
self,
|
||||||
|
|||||||
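clone_table on the remote connection mirrors the local behaviour shown in the new tests later in this change. A sketch, where the target name, source URI, and tag are placeholders:

# Shallow-clone a table from an existing Lance dataset URI at a tagged version.
cloned = db.clone_table(
    "articles_snapshot",
    "s3://my-bucket/datasets/articles.lance",
    source_tag="v1.0",  # or source_version=<int>; omit both to clone the latest
    is_shallow=True,    # deep clones are not supported yet
)
print(cloned.count_rows())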
@@ -114,7 +114,7 @@ class RemoteTable(Table):
|
|||||||
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
|
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
|
||||||
*,
|
*,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
wait_timeout: timedelta = None,
|
wait_timeout: Optional[timedelta] = None,
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Creates a scalar index
|
"""Creates a scalar index
|
||||||
@@ -153,7 +153,7 @@ class RemoteTable(Table):
|
|||||||
column: str,
|
column: str,
|
||||||
*,
|
*,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
wait_timeout: timedelta = None,
|
wait_timeout: Optional[timedelta] = None,
|
||||||
with_position: bool = False,
|
with_position: bool = False,
|
||||||
# tokenizer configs:
|
# tokenizer configs:
|
||||||
base_tokenizer: str = "simple",
|
base_tokenizer: str = "simple",
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from .linear_combination import LinearCombinationReranker
|
|||||||
from .openai import OpenaiReranker
|
from .openai import OpenaiReranker
|
||||||
from .jinaai import JinaReranker
|
from .jinaai import JinaReranker
|
||||||
from .rrf import RRFReranker
|
from .rrf import RRFReranker
|
||||||
|
from .mrr import MRRReranker
|
||||||
from .answerdotai import AnswerdotaiRerankers
|
from .answerdotai import AnswerdotaiRerankers
|
||||||
from .voyageai import VoyageAIReranker
|
from .voyageai import VoyageAIReranker
|
||||||
|
|
||||||
@@ -23,4 +24,5 @@ __all__ = [
|
|||||||
"RRFReranker",
|
"RRFReranker",
|
||||||
"AnswerdotaiRerankers",
|
"AnswerdotaiRerankers",
|
||||||
"VoyageAIReranker",
|
"VoyageAIReranker",
|
||||||
|
"MRRReranker",
|
||||||
]
|
]
|
||||||
|
|||||||
python/python/lancedb/rerankers/mrr.py (new file, 169 lines)
@@ -0,0 +1,169 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
|
||||||
|
from typing import Union, List, TYPE_CHECKING
|
||||||
|
import pyarrow as pa
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from .base import Reranker
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from ..table import LanceVectorQueryBuilder
|
||||||
|
|
||||||
|
|
||||||
|
class MRRReranker(Reranker):
|
||||||
|
"""
|
||||||
|
Reranks the results using the Mean Reciprocal Rank (MRR) algorithm based
|
||||||
|
on the ranks of results from the vector and FTS searches.
|
||||||
|
Algorithm reference - https://en.wikipedia.org/wiki/Mean_reciprocal_rank
|
||||||
|
|
||||||
|
MRR calculates the average of reciprocal ranks across different search results.
|
||||||
|
For each document, it computes the reciprocal of its rank in each system,
|
||||||
|
then takes the mean of these reciprocal ranks as the final score.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
weight_vector : float, default 0.5
|
||||||
|
Weight for vector search results (0.0 to 1.0)
|
||||||
|
weight_fts : float, default 0.5
|
||||||
|
Weight for FTS search results (0.0 to 1.0)
|
||||||
|
Note: weight_vector + weight_fts should equal 1.0
|
||||||
|
return_score : str, default "relevance"
|
||||||
|
Options are "relevance" or "all"
|
||||||
|
The type of score to return. If "relevance", will return only the relevance
|
||||||
|
score. If "all", will return all scores from the vector and FTS search along
|
||||||
|
with the relevance score.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
weight_vector: float = 0.5,
|
||||||
|
weight_fts: float = 0.5,
|
||||||
|
return_score="relevance",
|
||||||
|
):
|
||||||
|
if not (0.0 <= weight_vector <= 1.0):
|
||||||
|
raise ValueError("weight_vector must be between 0.0 and 1.0")
|
||||||
|
if not (0.0 <= weight_fts <= 1.0):
|
||||||
|
raise ValueError("weight_fts must be between 0.0 and 1.0")
|
||||||
|
if abs(weight_vector + weight_fts - 1.0) > 1e-6:
|
||||||
|
raise ValueError("weight_vector + weight_fts must equal 1.0")
|
||||||
|
|
||||||
|
super().__init__(return_score)
|
||||||
|
self.weight_vector = weight_vector
|
||||||
|
self.weight_fts = weight_fts
|
||||||
|
|
||||||
|
def rerank_hybrid(
|
||||||
|
self,
|
||||||
|
query: str, # noqa: F821
|
||||||
|
vector_results: pa.Table,
|
||||||
|
fts_results: pa.Table,
|
||||||
|
):
|
||||||
|
vector_ids = vector_results["_rowid"].to_pylist() if vector_results else []
|
||||||
|
fts_ids = fts_results["_rowid"].to_pylist() if fts_results else []
|
||||||
|
|
||||||
|
# Maps result_id to list of (type, reciprocal_rank)
|
||||||
|
mrr_score_map = defaultdict(list)
|
||||||
|
|
||||||
|
if vector_ids:
|
||||||
|
for rank, result_id in enumerate(vector_ids, 1):
|
||||||
|
reciprocal_rank = 1.0 / rank
|
||||||
|
mrr_score_map[result_id].append(("vector", reciprocal_rank))
|
||||||
|
|
||||||
|
if fts_ids:
|
||||||
|
for rank, result_id in enumerate(fts_ids, 1):
|
||||||
|
reciprocal_rank = 1.0 / rank
|
||||||
|
mrr_score_map[result_id].append(("fts", reciprocal_rank))
|
||||||
|
|
||||||
|
final_mrr_scores = {}
|
||||||
|
for result_id, scores in mrr_score_map.items():
|
||||||
|
vector_rr = 0.0
|
||||||
|
fts_rr = 0.0
|
||||||
|
|
||||||
|
for score_type, reciprocal_rank in scores:
|
||||||
|
if score_type == "vector":
|
||||||
|
vector_rr = reciprocal_rank
|
||||||
|
elif score_type == "fts":
|
||||||
|
fts_rr = reciprocal_rank
|
||||||
|
|
||||||
|
# If a document doesn't appear, its reciprocal rank is 0
|
||||||
|
weighted_mrr = self.weight_vector * vector_rr + self.weight_fts * fts_rr
|
||||||
|
final_mrr_scores[result_id] = weighted_mrr
|
||||||
|
|
||||||
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
|
combined_row_ids = combined_results["_rowid"].to_pylist()
|
||||||
|
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||||
|
combined_results = combined_results.append_column(
|
||||||
|
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||||
|
)
|
||||||
|
combined_results = combined_results.sort_by(
|
||||||
|
[("_relevance_score", "descending")]
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.score == "relevance":
|
||||||
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
|
|
||||||
|
return combined_results
|
||||||
|
|
||||||
|
def rerank_multivector(
|
||||||
|
self,
|
||||||
|
vector_results: Union[List[pa.Table], List["LanceVectorQueryBuilder"]],
|
||||||
|
query: str = None,
|
||||||
|
deduplicate: bool = True, # noqa: F821
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Reranks the results from multiple vector searches using MRR algorithm.
|
||||||
|
Each vector search result is treated as a separate ranking system,
|
||||||
|
and MRR calculates the mean of reciprocal ranks across all systems.
|
||||||
|
This cannot reuse rerank_hybrid because MRR semantics require treating
|
||||||
|
each vector result as a separate ranking system.
|
||||||
|
"""
|
||||||
|
if not all(isinstance(v, type(vector_results[0])) for v in vector_results):
|
||||||
|
raise ValueError(
|
||||||
|
"All elements in vector_results should be of the same type"
|
||||||
|
)
|
||||||
|
|
||||||
|
# avoid circular import
|
||||||
|
if type(vector_results[0]).__name__ == "LanceVectorQueryBuilder":
|
||||||
|
vector_results = [result.to_arrow() for result in vector_results]
|
||||||
|
elif not isinstance(vector_results[0], pa.Table):
|
||||||
|
raise ValueError(
|
||||||
|
"vector_results should be a list of pa.Table or LanceVectorQueryBuilder"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not all("_rowid" in result.column_names for result in vector_results):
|
||||||
|
raise ValueError(
|
||||||
|
"'_rowid' is required for deduplication. \
|
||||||
|
add _rowid to search results like this: \
|
||||||
|
`search().with_row_id(True)`"
|
||||||
|
)
|
||||||
|
|
||||||
|
mrr_score_map = defaultdict(list)
|
||||||
|
|
||||||
|
for result_table in vector_results:
|
||||||
|
result_ids = result_table["_rowid"].to_pylist()
|
||||||
|
for rank, result_id in enumerate(result_ids, 1):
|
||||||
|
reciprocal_rank = 1.0 / rank
|
||||||
|
mrr_score_map[result_id].append(reciprocal_rank)
|
||||||
|
|
||||||
|
final_mrr_scores = {}
|
||||||
|
for result_id, reciprocal_ranks in mrr_score_map.items():
|
||||||
|
mean_rr = np.mean(reciprocal_ranks)
|
||||||
|
final_mrr_scores[result_id] = mean_rr
|
||||||
|
|
||||||
|
combined = pa.concat_tables(vector_results, **self._concat_tables_args)
|
||||||
|
combined = self._deduplicate(combined)
|
||||||
|
|
||||||
|
combined_row_ids = combined["_rowid"].to_pylist()
|
||||||
|
|
||||||
|
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||||
|
combined = combined.append_column(
|
||||||
|
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||||
|
)
|
||||||
|
combined = combined.sort_by([("_relevance_score", "descending")])
|
||||||
|
|
||||||
|
if self.score == "relevance":
|
||||||
|
combined = self._keep_relevance_score(combined)
|
||||||
|
|
||||||
|
return combined
|
||||||
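MRRReranker plugs into the same rerank hook as the other rerankers in this package. A sketch of hybrid-search usage, assuming a table that already has both a vector index and an FTS index; the query text and weights are illustrative:

from lancedb.rerankers import MRRReranker

reranker = MRRReranker(weight_vector=0.7, weight_fts=0.3, return_score="relevance")
results = (
    table.search("solar power storage", query_type="hybrid")
    .rerank(reranker=reranker)
    .limit(10)
    .to_pandas()
)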
@@ -44,7 +44,7 @@ import numpy as np
|
|||||||
|
|
||||||
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||||
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
||||||
from .index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
|
from .index import BTree, IvfFlat, IvfPq, Bitmap, IvfRq, LabelList, HnswPq, HnswSq, FTS
|
||||||
from .merge import LanceMergeInsertBuilder
|
from .merge import LanceMergeInsertBuilder
|
||||||
from .pydantic import LanceModel, model_to_dict
|
from .pydantic import LanceModel, model_to_dict
|
||||||
from .query import (
|
from .query import (
|
||||||
@@ -74,6 +74,7 @@ from .index import lang_mapping
|
|||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from .db import LanceDBConnection
|
||||||
from ._lancedb import (
|
from ._lancedb import (
|
||||||
Table as LanceDBTable,
|
Table as LanceDBTable,
|
||||||
OptimizeStats,
|
OptimizeStats,
|
||||||
@@ -88,7 +89,6 @@ if TYPE_CHECKING:
|
|||||||
MergeResult,
|
MergeResult,
|
||||||
UpdateResult,
|
UpdateResult,
|
||||||
)
|
)
|
||||||
from .db import LanceDBConnection
|
|
||||||
from .index import IndexConfig
|
from .index import IndexConfig
|
||||||
import pandas
|
import pandas
|
||||||
import PIL
|
import PIL
|
||||||
@@ -691,6 +691,7 @@ class Table(ABC):
|
|||||||
ef_construction: int = 300,
|
ef_construction: int = 300,
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
train: bool = True,
|
train: bool = True,
|
||||||
|
target_partition_size: Optional[int] = None,
|
||||||
):
|
):
|
||||||
"""Create an index on the table.
|
"""Create an index on the table.
|
||||||
|
|
||||||
@@ -1469,10 +1470,7 @@ class Table(ABC):
|
|||||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||||
then these files will be deleted regardless of their age.
|
then these files will be deleted regardless of their age.
|
||||||
retrain: bool, default False
|
retrain: bool, default False
|
||||||
If True, retrain the vector indices, this would refine the IVF clustering
|
This parameter is no longer used and is deprecated.
|
||||||
and quantization, which may improve the search accuracy. It's faster than
|
|
||||||
re-creating the index from scratch, so it's recommended to try this first,
|
|
||||||
when the data distribution has changed significantly.
|
|
||||||
|
|
||||||
Experimental API
|
Experimental API
|
||||||
----------------
|
----------------
|
||||||
@@ -1709,22 +1707,38 @@ class LanceTable(Table):
|
|||||||
namespace: List[str] = [],
|
namespace: List[str] = [],
|
||||||
storage_options: Optional[Dict[str, str]] = None,
|
storage_options: Optional[Dict[str, str]] = None,
|
||||||
index_cache_size: Optional[int] = None,
|
index_cache_size: Optional[int] = None,
|
||||||
|
_async: AsyncTable = None,
|
||||||
):
|
):
|
||||||
self._conn = connection
|
self._conn = connection
|
||||||
self._namespace = namespace
|
self._namespace = namespace
|
||||||
-        self._table = LOOP.run(
-            connection._conn.open_table(
-                name,
-                namespace=namespace,
-                storage_options=storage_options,
-                index_cache_size=index_cache_size,
-            )
-        )
+        if _async is not None:
+            self._table = _async
+        else:
+            self._table = LOOP.run(
+                connection._conn.open_table(
+                    name,
+                    namespace=namespace,
+                    storage_options=storage_options,
+                    index_cache_size=index_cache_size,
+                )
+            )
|
|
||||||
@property
|
@property
|
||||||
def name(self) -> str:
|
def name(self) -> str:
|
||||||
return self._table.name
|
return self._table.name
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_inner(cls, tbl: LanceDBTable):
|
||||||
|
from .db import LanceDBConnection
|
||||||
|
|
||||||
|
async_tbl = AsyncTable(tbl)
|
||||||
|
conn = LanceDBConnection.from_inner(tbl.database())
|
||||||
|
return cls(
|
||||||
|
conn,
|
||||||
|
async_tbl.name,
|
||||||
|
_async=async_tbl,
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def open(cls, db, name, *, namespace: List[str] = [], **kwargs):
|
def open(cls, db, name, *, namespace: List[str] = [], **kwargs):
|
||||||
tbl = cls(db, name, namespace=namespace, **kwargs)
|
tbl = cls(db, name, namespace=namespace, **kwargs)
|
||||||
@@ -1993,7 +2007,7 @@ class LanceTable(Table):
|
|||||||
index_cache_size: Optional[int] = None,
|
index_cache_size: Optional[int] = None,
|
||||||
num_bits: int = 8,
|
num_bits: int = 8,
|
||||||
index_type: Literal[
|
index_type: Literal[
|
||||||
"IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
|
"IVF_FLAT", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
|
||||||
] = "IVF_PQ",
|
] = "IVF_PQ",
|
||||||
max_iterations: int = 50,
|
max_iterations: int = 50,
|
||||||
sample_rate: int = 256,
|
sample_rate: int = 256,
|
||||||
@@ -2002,6 +2016,7 @@ class LanceTable(Table):
|
|||||||
*,
|
*,
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
train: bool = True,
|
train: bool = True,
|
||||||
|
target_partition_size: Optional[int] = None,
|
||||||
):
|
):
|
||||||
"""Create an index on the table."""
|
"""Create an index on the table."""
|
||||||
if accelerator is not None:
|
if accelerator is not None:
|
||||||
@@ -2018,6 +2033,7 @@ class LanceTable(Table):
|
|||||||
num_bits=num_bits,
|
num_bits=num_bits,
|
||||||
m=m,
|
m=m,
|
||||||
ef_construction=ef_construction,
|
ef_construction=ef_construction,
|
||||||
|
target_partition_size=target_partition_size,
|
||||||
)
|
)
|
||||||
self.checkout_latest()
|
self.checkout_latest()
|
||||||
return
|
return
|
||||||
@@ -2027,6 +2043,7 @@ class LanceTable(Table):
|
|||||||
num_partitions=num_partitions,
|
num_partitions=num_partitions,
|
||||||
max_iterations=max_iterations,
|
max_iterations=max_iterations,
|
||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
|
target_partition_size=target_partition_size,
|
||||||
)
|
)
|
||||||
elif index_type == "IVF_PQ":
|
elif index_type == "IVF_PQ":
|
||||||
config = IvfPq(
|
config = IvfPq(
|
||||||
@@ -2036,6 +2053,16 @@ class LanceTable(Table):
|
|||||||
num_bits=num_bits,
|
num_bits=num_bits,
|
||||||
max_iterations=max_iterations,
|
max_iterations=max_iterations,
|
||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
|
target_partition_size=target_partition_size,
|
||||||
|
)
|
||||||
|
elif index_type == "IVF_RQ":
|
||||||
|
config = IvfRq(
|
||||||
|
distance_type=metric,
|
||||||
|
num_partitions=num_partitions,
|
||||||
|
num_bits=num_bits,
|
||||||
|
max_iterations=max_iterations,
|
||||||
|
sample_rate=sample_rate,
|
||||||
|
target_partition_size=target_partition_size,
|
||||||
)
|
)
|
||||||
elif index_type == "IVF_HNSW_PQ":
|
elif index_type == "IVF_HNSW_PQ":
|
||||||
config = HnswPq(
|
config = HnswPq(
|
||||||
@@ -2047,6 +2074,7 @@ class LanceTable(Table):
|
|||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
m=m,
|
m=m,
|
||||||
ef_construction=ef_construction,
|
ef_construction=ef_construction,
|
||||||
|
target_partition_size=target_partition_size,
|
||||||
)
|
)
|
||||||
elif index_type == "IVF_HNSW_SQ":
|
elif index_type == "IVF_HNSW_SQ":
|
||||||
config = HnswSq(
|
config = HnswSq(
|
||||||
@@ -2056,6 +2084,7 @@ class LanceTable(Table):
|
|||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
m=m,
|
m=m,
|
||||||
ef_construction=ef_construction,
|
ef_construction=ef_construction,
|
||||||
|
target_partition_size=target_partition_size,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown index type {index_type}")
|
raise ValueError(f"Unknown index type {index_type}")
|
||||||
@@ -2743,6 +2772,10 @@ class LanceTable(Table):
|
|||||||
self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)
|
self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _inner(self) -> LanceDBTable:
|
||||||
|
return self._table._inner
|
||||||
|
|
||||||
@deprecation.deprecated(
|
@deprecation.deprecated(
|
||||||
deprecated_in="0.21.0",
|
deprecated_in="0.21.0",
|
||||||
current_version=__version__,
|
current_version=__version__,
|
||||||
@@ -2828,10 +2861,7 @@ class LanceTable(Table):
|
|||||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||||
then these files will be deleted regardless of their age.
|
then these files will be deleted regardless of their age.
|
||||||
retrain: bool, default False
|
retrain: bool, default False
|
||||||
If True, retrain the vector indices, this would refine the IVF clustering
|
This parameter is no longer used and is deprecated.
|
||||||
and quantization, which may improve the search accuracy. It's faster than
|
|
||||||
re-creating the index from scratch, so it's recommended to try this first,
|
|
||||||
when the data distribution has changed significantly.
|
|
||||||
|
|
||||||
Experimental API
|
Experimental API
|
||||||
----------------
|
----------------
|
||||||
@@ -3329,7 +3359,7 @@ class AsyncTable:
|
|||||||
*,
|
*,
|
||||||
replace: Optional[bool] = None,
|
replace: Optional[bool] = None,
|
||||||
config: Optional[
|
config: Optional[
|
||||||
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
Union[IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||||
] = None,
|
] = None,
|
||||||
wait_timeout: Optional[timedelta] = None,
|
wait_timeout: Optional[timedelta] = None,
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
@@ -3368,11 +3398,12 @@ class AsyncTable:
|
|||||||
"""
|
"""
|
||||||
if config is not None:
|
if config is not None:
|
||||||
if not isinstance(
|
if not isinstance(
|
||||||
config, (IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS)
|
config,
|
||||||
|
(IvfFlat, IvfPq, IvfRq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS),
|
||||||
):
|
):
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
"config must be an instance of IvfPq, HnswPq, HnswSq, BTree,"
|
"config must be an instance of IvfPq, IvfRq, HnswPq, HnswSq, BTree,"
|
||||||
" Bitmap, LabelList, or FTS"
|
" Bitmap, LabelList, or FTS, but got " + str(type(config))
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
await self._inner.create_index(
|
await self._inner.create_index(
|
||||||
@@ -3919,6 +3950,7 @@ class AsyncTable:
|
|||||||
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
|
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
|
||||||
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
|
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
|
||||||
timeout=merge._timeout,
|
timeout=merge._timeout,
|
||||||
|
use_index=merge._use_index,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -4291,10 +4323,7 @@ class AsyncTable:
|
|||||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||||
then these files will be deleted regardless of their age.
|
then these files will be deleted regardless of their age.
|
||||||
retrain: bool, default False
|
retrain: bool, default False
|
||||||
If True, retrain the vector indices, this would refine the IVF clustering
|
This parameter is no longer used and is deprecated.
|
||||||
and quantization, which may improve the search accuracy. It's faster than
|
|
||||||
re-creating the index from scratch, so it's recommended to try this first,
|
|
||||||
when the data distribution has changed significantly.
|
|
||||||
|
|
||||||
Experimental API
|
Experimental API
|
||||||
----------------
|
----------------
|
||||||
@@ -4317,10 +4346,19 @@ class AsyncTable:
|
|||||||
cleanup_since_ms: Optional[int] = None
|
cleanup_since_ms: Optional[int] = None
|
||||||
if cleanup_older_than is not None:
|
if cleanup_older_than is not None:
|
||||||
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
|
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
|
||||||
|
|
||||||
|
if retrain:
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
warnings.warn(
|
||||||
|
"The 'retrain' parameter is deprecated and will be removed in a "
|
||||||
|
"future version.",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
|
||||||
return await self._inner.optimize(
|
return await self._inner.optimize(
|
||||||
cleanup_since_ms=cleanup_since_ms,
|
cleanup_since_ms=cleanup_since_ms,
|
||||||
delete_unverified=delete_unverified,
|
delete_unverified=delete_unverified,
|
||||||
retrain=retrain,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
async def list_indices(self) -> Iterable[IndexConfig]:
|
async def list_indices(self) -> Iterable[IndexConfig]:
|
||||||
|
|||||||
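With retrain deprecated, optimize reduces to the compaction and cleanup options. A sketch for the async table, where the 7-day window simply mirrors the documented default:

from datetime import timedelta

# Compact fragments and prune versions older than a week; no retrain flag needed.
stats = await table.optimize(cleanup_older_than=timedelta(days=7))
print(stats)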
@@ -18,10 +18,17 @@ AddMode = Literal["append", "overwrite"]
|
|||||||
CreateMode = Literal["create", "overwrite"]
|
CreateMode = Literal["create", "overwrite"]
|
||||||
|
|
||||||
# Index type literals
|
# Index type literals
|
||||||
VectorIndexType = Literal["IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"]
|
VectorIndexType = Literal["IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ", "IVF_RQ"]
|
||||||
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
|
ScalarIndexType = Literal["BTREE", "BITMAP", "LABEL_LIST"]
|
||||||
IndexType = Literal[
|
IndexType = Literal[
|
||||||
"IVF_PQ", "IVF_HNSW_PQ", "IVF_HNSW_SQ", "FTS", "BTREE", "BITMAP", "LABEL_LIST"
|
"IVF_PQ",
|
||||||
|
"IVF_HNSW_PQ",
|
||||||
|
"IVF_HNSW_SQ",
|
||||||
|
"FTS",
|
||||||
|
"BTREE",
|
||||||
|
"BITMAP",
|
||||||
|
"LABEL_LIST",
|
||||||
|
"IVF_RQ",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Tokenizer literals
|
# Tokenizer literals
|
||||||
|
|||||||
@@ -747,15 +747,16 @@ def test_local_namespace_operations(tmp_path):
|
|||||||
# Create a local database connection
|
# Create a local database connection
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
-    # Test list_namespaces returns empty list
+    # Test list_namespaces returns empty list for root namespace
     namespaces = list(db.list_namespaces())
     assert namespaces == []
 
-    # Test list_namespaces with parameters still returns empty list
-    namespaces_with_params = list(
-        db.list_namespaces(namespace=["test"], page_token="token", limit=5)
-    )
-    assert namespaces_with_params == []
+    # Test list_namespaces with non-empty namespace raises NotImplementedError
+    with pytest.raises(
+        NotImplementedError,
+        match="Namespace operations are not supported for listing database",
+    ):
+        list(db.list_namespaces(namespace=["test"]))
|
|
||||||
|
|
||||||
def test_local_create_namespace_not_supported(tmp_path):
|
def test_local_create_namespace_not_supported(tmp_path):
|
||||||
@@ -830,3 +831,119 @@ def test_local_table_operations_with_namespace_raise_error(tmp_path):
|
|||||||
# Test table_names without namespace - should work normally
|
# Test table_names without namespace - should work normally
|
||||||
tables_root = list(db.table_names())
|
tables_root = list(db.table_names())
|
||||||
assert "test_table" in tables_root
|
assert "test_table" in tables_root
|
||||||
|
|
||||||
|
|
||||||
|
def test_clone_table_latest_version(tmp_path):
|
||||||
|
"""Test cloning a table with the latest version (default behavior)"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
|
# Create source table with some data
|
||||||
|
data = [
|
||||||
|
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||||
|
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||||
|
]
|
||||||
|
source_table = db.create_table("source", data=data)
|
||||||
|
|
||||||
|
# Add more data to create a new version
|
||||||
|
more_data = [{"id": 3, "text": "test", "vector": [5.0, 6.0]}]
|
||||||
|
source_table.add(more_data)
|
||||||
|
|
||||||
|
# Clone the table (should get latest version with 3 rows)
|
||||||
|
source_uri = os.path.join(tmp_path, "source.lance")
|
||||||
|
cloned_table = db.clone_table("cloned", source_uri)
|
||||||
|
|
||||||
|
# Verify cloned table has all 3 rows
|
||||||
|
assert cloned_table.count_rows() == 3
|
||||||
|
assert "cloned" in db.table_names()
|
||||||
|
|
||||||
|
# Verify data matches
|
||||||
|
cloned_data = cloned_table.to_pandas()
|
||||||
|
assert len(cloned_data) == 3
|
||||||
|
assert set(cloned_data["id"].tolist()) == {1, 2, 3}
|
||||||
|
|
||||||
|
|
||||||
|
def test_clone_table_specific_version(tmp_path):
|
||||||
|
"""Test cloning a table from a specific version"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
|
# Create source table with initial data
|
||||||
|
data = [
|
||||||
|
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||||
|
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||||
|
]
|
||||||
|
source_table = db.create_table("source", data=data)
|
||||||
|
|
||||||
|
# Get the initial version
|
||||||
|
initial_version = source_table.version
|
||||||
|
|
||||||
|
# Add more data to create a new version
|
||||||
|
more_data = [{"id": 3, "text": "test", "vector": [5.0, 6.0]}]
|
||||||
|
source_table.add(more_data)
|
||||||
|
|
||||||
|
# Verify source now has 3 rows
|
||||||
|
assert source_table.count_rows() == 3
|
||||||
|
|
||||||
|
# Clone from the initial version (should have only 2 rows)
|
||||||
|
source_uri = os.path.join(tmp_path, "source.lance")
|
||||||
|
cloned_table = db.clone_table("cloned", source_uri, source_version=initial_version)
|
||||||
|
|
||||||
|
# Verify cloned table has only the initial 2 rows
|
||||||
|
assert cloned_table.count_rows() == 2
|
||||||
|
cloned_data = cloned_table.to_pandas()
|
||||||
|
assert set(cloned_data["id"].tolist()) == {1, 2}
|
||||||
|
|
||||||
|
|
||||||
|
def test_clone_table_with_tag(tmp_path):
|
||||||
|
"""Test cloning a table from a tagged version"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
|
# Create source table with initial data
|
||||||
|
data = [
|
||||||
|
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||||
|
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||||
|
]
|
||||||
|
source_table = db.create_table("source", data=data)
|
||||||
|
|
||||||
|
# Create a tag for the current version
|
||||||
|
source_table.tags.create("v1.0", source_table.version)
|
||||||
|
|
||||||
|
# Add more data after the tag
|
||||||
|
more_data = [{"id": 3, "text": "test", "vector": [5.0, 6.0]}]
|
||||||
|
source_table.add(more_data)
|
||||||
|
|
||||||
|
# Verify source now has 3 rows
|
||||||
|
assert source_table.count_rows() == 3
|
||||||
|
|
||||||
|
# Clone from the tagged version (should have only 2 rows)
|
||||||
|
source_uri = os.path.join(tmp_path, "source.lance")
|
||||||
|
cloned_table = db.clone_table("cloned", source_uri, source_tag="v1.0")
|
||||||
|
|
||||||
|
# Verify cloned table has only the tagged version's 2 rows
|
||||||
|
assert cloned_table.count_rows() == 2
|
||||||
|
cloned_data = cloned_table.to_pandas()
|
||||||
|
assert set(cloned_data["id"].tolist()) == {1, 2}
|
||||||
|
|
||||||
|
|
||||||
|
def test_clone_table_deep_clone_fails(tmp_path):
|
||||||
|
"""Test that deep clone raises an unsupported error"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
|
||||||
|
# Create source table with some data
|
||||||
|
data = [
|
||||||
|
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||||
|
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||||
|
]
|
||||||
|
db.create_table("source", data=data)
|
||||||
|
|
||||||
|
# Try to create a deep clone (should fail)
|
||||||
|
source_uri = os.path.join(tmp_path, "source.lance")
|
||||||
|
with pytest.raises(Exception, match="Deep clone is not yet implemented"):
|
||||||
|
db.clone_table("cloned", source_uri, is_shallow=False)
|
||||||
|
|||||||
@@ -114,6 +114,63 @@ def test_embedding_function_variables():
|
|||||||
assert func.safe_model_dump()["secret_key"] == "$var:secret"
|
assert func.safe_model_dump()["secret_key"] == "$var:secret"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_functions_with_variables():
|
||||||
|
@register("variable-parsing-test")
|
||||||
|
class VariableParsingFunction(TextEmbeddingFunction):
|
||||||
|
api_key: str
|
||||||
|
base_url: Optional[str] = None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def sensitive_keys():
|
||||||
|
return ["api_key"]
|
||||||
|
|
||||||
|
def ndims(self):
|
||||||
|
return 10
|
||||||
|
|
||||||
|
def generate_embeddings(self, texts):
|
||||||
|
# Mock implementation that just returns random embeddings
|
||||||
|
# In real usage, this would use the api_key to call an API
|
||||||
|
return [np.random.rand(self.ndims()).tolist() for _ in texts]
|
||||||
|
|
||||||
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
|
||||||
|
registry.set_var("test_api_key", "sk-test-key-12345")
|
||||||
|
registry.set_var("test_base_url", "https://api.example.com")
|
||||||
|
|
||||||
|
conf = EmbeddingFunctionConfig(
|
||||||
|
source_column="text",
|
||||||
|
vector_column="vector",
|
||||||
|
function=registry.get("variable-parsing-test").create(
|
||||||
|
api_key="$var:test_api_key", base_url="$var:test_base_url"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
metadata = registry.get_table_metadata([conf])
|
||||||
|
|
||||||
|
# Create a mock arrow table with the metadata
|
||||||
|
schema = pa.schema(
|
||||||
|
[pa.field("text", pa.string()), pa.field("vector", pa.list_(pa.float32(), 10))]
|
||||||
|
)
|
||||||
|
table = pa.table({"text": [], "vector": []}, schema=schema)
|
||||||
|
table = table.replace_schema_metadata(metadata)
|
||||||
|
|
||||||
|
ds = lance.write_dataset(table, "memory://")
|
||||||
|
|
||||||
|
configs = registry.parse_functions(ds.schema.metadata)
|
||||||
|
|
||||||
|
assert "vector" in configs
|
||||||
|
parsed_func = configs["vector"].function
|
||||||
|
|
||||||
|
assert parsed_func.api_key == "sk-test-key-12345"
|
||||||
|
assert parsed_func.base_url == "https://api.example.com"
|
||||||
|
|
||||||
|
embeddings = parsed_func.generate_embeddings(["test text"])
|
||||||
|
assert len(embeddings) == 1
|
||||||
|
assert len(embeddings[0]) == 10
|
||||||
|
|
||||||
|
assert parsed_func.safe_model_dump()["api_key"] == "$var:test_api_key"
|
||||||
|
|
||||||
|
|
||||||
def test_embedding_with_bad_results(tmp_path):
|
def test_embedding_with_bad_results(tmp_path):
|
||||||
@register("null-embedding")
|
@register("null-embedding")
|
||||||
class NullEmbeddingFunction(TextEmbeddingFunction):
|
class NullEmbeddingFunction(TextEmbeddingFunction):
|
||||||
|
|||||||
@@ -8,7 +8,17 @@ import pyarrow as pa
|
|||||||
import pytest
|
import pytest
|
||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
from lancedb import AsyncConnection, AsyncTable, connect_async
|
from lancedb import AsyncConnection, AsyncTable, connect_async
|
||||||
from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
|
from lancedb.index import (
|
||||||
|
BTree,
|
||||||
|
IvfFlat,
|
||||||
|
IvfPq,
|
||||||
|
IvfRq,
|
||||||
|
Bitmap,
|
||||||
|
LabelList,
|
||||||
|
HnswPq,
|
||||||
|
HnswSq,
|
||||||
|
FTS,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
@pytest_asyncio.fixture
|
||||||
@@ -35,6 +45,8 @@ async def some_table(db_async):
|
|||||||
"tags": [
|
"tags": [
|
||||||
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
||||||
],
|
],
|
||||||
|
"is_active": [random.choice([True, False]) for _ in range(NROWS)],
|
||||||
|
"data": [random.randbytes(random.randint(0, 128)) for _ in range(NROWS)],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return await db_async.create_table(
|
return await db_async.create_table(
|
||||||
@@ -99,10 +111,17 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
 async def test_create_bitmap_index(some_table: AsyncTable):
     await some_table.create_index("id", config=Bitmap())
+    await some_table.create_index("is_active", config=Bitmap())
+    await some_table.create_index("data", config=Bitmap())
     indices = await some_table.list_indices()
-    assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]'
-    indices = await some_table.list_indices()
-    assert len(indices) == 1
+    assert len(indices) == 3
+    assert indices[0].index_type == "Bitmap"
+    assert indices[0].columns == ["id"]
+    assert indices[1].index_type == "Bitmap"
+    assert indices[1].columns == ["is_active"]
+    assert indices[2].index_type == "Bitmap"
+    assert indices[2].columns == ["data"]
|
|
||||||
index_name = indices[0].name
|
index_name = indices[0].name
|
||||||
stats = await some_table.index_stats(index_name)
|
stats = await some_table.index_stats(index_name)
|
||||||
assert stats.index_type == "BITMAP"
|
assert stats.index_type == "BITMAP"
|
||||||
@@ -111,6 +130,11 @@ async def test_create_bitmap_index(some_table: AsyncTable):
|
|||||||
assert stats.num_unindexed_rows == 0
|
assert stats.num_unindexed_rows == 0
|
||||||
assert stats.num_indices == 1
|
assert stats.num_indices == 1
|
||||||
|
|
||||||
|
assert (
|
||||||
|
"ScalarIndexQuery"
|
||||||
|
in await some_table.query().where("is_active = TRUE").explain_plan()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_create_label_list_index(some_table: AsyncTable):
|
async def test_create_label_list_index(some_table: AsyncTable):
|
||||||
@@ -181,6 +205,16 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
|
|||||||
assert stats.loss >= 0.0
|
assert stats.loss >= 0.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_create_ivfrq_index(some_table: AsyncTable):
|
||||||
|
await some_table.create_index("vector", config=IvfRq(num_bits=1))
|
||||||
|
indices = await some_table.list_indices()
|
||||||
|
assert len(indices) == 1
|
||||||
|
assert indices[0].index_type == "IvfRq"
|
||||||
|
assert indices[0].columns == ["vector"]
|
||||||
|
assert indices[0].name == "vector_idx"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_create_hnswpq_index(some_table: AsyncTable):
|
async def test_create_hnswpq_index(some_table: AsyncTable):
|
||||||
await some_table.create_index("vector", config=HnswPq(num_partitions=10))
|
await some_table.create_index("vector", config=HnswPq(num_partitions=10))
|
||||||
|
|||||||
python/python/tests/test_permutation.py (new file, 496 lines)
@@ -0,0 +1,496 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from lancedb.permutation import permutation_builder
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_random_ratios(mem_db):
|
||||||
|
"""Test random splitting with ratios."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||||
|
)
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.split_random(ratios=[0.3, 0.7])
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check that the table was created and has data
|
||||||
|
assert permutation_tbl.count_rows() == 100
|
||||||
|
|
||||||
|
# Check that split_id column exists and has correct values
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
split_ids = data["split_id"]
|
||||||
|
assert set(split_ids) == {0, 1}
|
||||||
|
|
||||||
|
# Check approximate split sizes (allowing for rounding)
|
||||||
|
split_0_count = split_ids.count(0)
|
||||||
|
split_1_count = split_ids.count(1)
|
||||||
|
assert 25 <= split_0_count <= 35 # ~30% ± tolerance
|
||||||
|
assert 65 <= split_1_count <= 75 # ~70% ± tolerance
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_random_counts(mem_db):
|
||||||
|
"""Test random splitting with absolute counts."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||||
|
)
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.split_random(counts=[20, 30])
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check that we have exactly the requested counts
|
||||||
|
assert permutation_tbl.count_rows() == 50
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
split_ids = data["split_id"]
|
||||||
|
assert split_ids.count(0) == 20
|
||||||
|
assert split_ids.count(1) == 30
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_random_fixed(mem_db):
|
||||||
|
"""Test random splitting with fixed number of splits."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||||
|
)
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation").split_random(fixed=4).execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check that we have 4 splits with 25 rows each
|
||||||
|
assert permutation_tbl.count_rows() == 100
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
split_ids = data["split_id"]
|
||||||
|
assert set(split_ids) == {0, 1, 2, 3}
|
||||||
|
|
||||||
|
for split_id in range(4):
|
||||||
|
assert split_ids.count(split_id) == 25
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_random_with_seed(mem_db):
|
||||||
|
"""Test that seeded random splits are reproducible."""
|
||||||
|
tbl = mem_db.create_table("test_table", pa.table({"x": range(50), "y": range(50)}))
|
||||||
|
|
||||||
|
# Create two identical permutations with same seed
|
||||||
|
perm1 = (
|
||||||
|
permutation_builder(tbl, "perm1")
|
||||||
|
.split_random(ratios=[0.6, 0.4], seed=42)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
perm2 = (
|
||||||
|
permutation_builder(tbl, "perm2")
|
||||||
|
.split_random(ratios=[0.6, 0.4], seed=42)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Results should be identical
|
||||||
|
data1 = perm1.search(None).to_arrow().to_pydict()
|
||||||
|
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||||
|
|
||||||
|
assert data1["row_id"] == data2["row_id"]
|
||||||
|
assert data1["split_id"] == data2["split_id"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_hash(mem_db):
|
||||||
|
"""Test hash-based splitting."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table",
|
||||||
|
pa.table(
|
||||||
|
{
|
||||||
|
"id": range(100),
|
||||||
|
"category": (["A", "B", "C"] * 34)[:100], # Repeating pattern
|
||||||
|
"value": range(100),
|
||||||
|
}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.split_hash(["category"], [1, 1], discard_weight=0)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Should have all 100 rows (no discard)
|
||||||
|
assert permutation_tbl.count_rows() == 100
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
split_ids = data["split_id"]
|
||||||
|
assert set(split_ids) == {0, 1}
|
||||||
|
|
||||||
|
# Verify that each split has roughly 50 rows (allowing for hash variance)
|
||||||
|
split_0_count = split_ids.count(0)
|
||||||
|
split_1_count = split_ids.count(1)
|
||||||
|
assert 30 <= split_0_count <= 70 # ~50 ± 20 tolerance for hash distribution
|
||||||
|
assert 30 <= split_1_count <= 70 # ~50 ± 20 tolerance for hash distribution
|
||||||
|
|
||||||
|
# Hash splits should be deterministic - same category should go to same split
|
||||||
|
# Let's verify by creating another permutation and checking consistency
|
||||||
|
perm2 = (
|
||||||
|
permutation_builder(tbl, "test_permutation2")
|
||||||
|
.split_hash(["category"], [1, 1], discard_weight=0)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||||
|
assert data["split_id"] == data2["split_id"] # Should be identical
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_hash_with_discard(mem_db):
|
||||||
|
"""Test hash-based splitting with discard weight."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table",
|
||||||
|
pa.table({"id": range(100), "category": ["A", "B"] * 50, "value": range(100)}),
|
||||||
|
)
|
||||||
|
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.split_hash(["category"], [1, 1], discard_weight=2) # Should discard ~50%
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Should have fewer than 100 rows due to discard
|
||||||
|
row_count = permutation_tbl.count_rows()
|
||||||
|
assert row_count < 100
|
||||||
|
assert row_count > 0 # But not empty
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_sequential(mem_db):
|
||||||
|
"""Test sequential splitting."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"x": range(100), "y": range(100)})
|
||||||
|
)
|
||||||
|
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.split_sequential(counts=[30, 40])
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert permutation_tbl.count_rows() == 70
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
row_ids = data["row_id"]
|
||||||
|
split_ids = data["split_id"]
|
||||||
|
|
||||||
|
# Sequential should maintain order
|
||||||
|
assert row_ids == sorted(row_ids)
|
||||||
|
|
||||||
|
# First 30 should be split 0, next 40 should be split 1
|
||||||
|
assert split_ids[:30] == [0] * 30
|
||||||
|
assert split_ids[30:] == [1] * 40
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_calculated(mem_db):
|
||||||
|
"""Test calculated splitting."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(100), "value": range(100)})
|
||||||
|
)
|
||||||
|
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.split_calculated("id % 3") # Split based on id modulo 3
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert permutation_tbl.count_rows() == 100
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
row_ids = data["row_id"]
|
||||||
|
split_ids = data["split_id"]
|
||||||
|
|
||||||
|
# Verify the calculation: each row's split_id should equal row_id % 3
|
||||||
|
for row_id, split_id in zip(row_ids, split_ids):
|
||||||
|
assert split_id == row_id % 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_error_cases(mem_db):
|
||||||
|
"""Test error handling for invalid split parameters."""
|
||||||
|
tbl = mem_db.create_table("test_table", pa.table({"x": range(10), "y": range(10)}))
|
||||||
|
|
||||||
|
# Test split_random with no parameters
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
permutation_builder(tbl, "error1").split_random().execute()
|
||||||
|
|
||||||
|
# Test split_random with multiple parameters
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
permutation_builder(tbl, "error2").split_random(
|
||||||
|
ratios=[0.5, 0.5], counts=[5, 5]
|
||||||
|
).execute()
|
||||||
|
|
||||||
|
# Test split_sequential with no parameters
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
permutation_builder(tbl, "error3").split_sequential().execute()
|
||||||
|
|
||||||
|
# Test split_sequential with multiple parameters
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
permutation_builder(tbl, "error4").split_sequential(
|
||||||
|
ratios=[0.5, 0.5], fixed=2
|
||||||
|
).execute()
|
||||||
|
|
||||||
|
|
||||||
|
def test_shuffle_no_seed(mem_db):
|
||||||
|
"""Test shuffling without a seed."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(100), "value": range(100)})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create a permutation with shuffling (no seed)
|
||||||
|
permutation_tbl = permutation_builder(tbl, "test_permutation").shuffle().execute()
|
||||||
|
|
||||||
|
assert permutation_tbl.count_rows() == 100
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
row_ids = data["row_id"]
|
||||||
|
|
||||||
|
# Row IDs should not be in sequential order due to shuffling
|
||||||
|
# This is probabilistic but with 100 rows, it's extremely unlikely they'd stay
|
||||||
|
# in order
|
||||||
|
assert row_ids != list(range(100))
|
||||||
|
|
||||||
|
|
||||||
|
def test_shuffle_with_seed(mem_db):
|
||||||
|
"""Test that shuffling with a seed is reproducible."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(50), "value": range(50)})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create two identical permutations with same shuffle seed
|
||||||
|
perm1 = permutation_builder(tbl, "perm1").shuffle(seed=42).execute()
|
||||||
|
|
||||||
|
perm2 = permutation_builder(tbl, "perm2").shuffle(seed=42).execute()
|
||||||
|
|
||||||
|
# Results should be identical due to same seed
|
||||||
|
data1 = perm1.search(None).to_arrow().to_pydict()
|
||||||
|
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||||
|
|
||||||
|
assert data1["row_id"] == data2["row_id"]
|
||||||
|
assert data1["split_id"] == data2["split_id"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_shuffle_with_clump_size(mem_db):
|
||||||
|
"""Test shuffling with clump size."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(100), "value": range(100)})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create a permutation with shuffling using clumps
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.shuffle(clump_size=10) # 10-row clumps
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert permutation_tbl.count_rows() == 100
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
row_ids = data["row_id"]
|
||||||
|
|
||||||
|
for i in range(10):
|
||||||
|
start = row_ids[i * 10]
|
||||||
|
assert row_ids[i * 10 : (i + 1) * 10] == list(range(start, start + 10))
|
||||||
|
|
||||||
|
|
||||||
|
def test_shuffle_different_seeds(mem_db):
|
||||||
|
"""Test that different seeds produce different shuffle orders."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(50), "value": range(50)})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create two permutations with different shuffle seeds
|
||||||
|
perm1 = (
|
||||||
|
permutation_builder(tbl, "perm1")
|
||||||
|
.split_random(fixed=2)
|
||||||
|
.shuffle(seed=42)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
perm2 = (
|
||||||
|
permutation_builder(tbl, "perm2")
|
||||||
|
.split_random(fixed=2)
|
||||||
|
.shuffle(seed=123)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Results should be different due to different seeds
|
||||||
|
data1 = perm1.search(None).to_arrow().to_pydict()
|
||||||
|
data2 = perm2.search(None).to_arrow().to_pydict()
|
||||||
|
|
||||||
|
# Row order should be different
|
||||||
|
assert data1["row_id"] != data2["row_id"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_shuffle_combined_with_splits(mem_db):
|
||||||
|
"""Test shuffling combined with different split strategies."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table",
|
||||||
|
pa.table(
|
||||||
|
{
|
||||||
|
"id": range(100),
|
||||||
|
"category": (["A", "B", "C"] * 34)[:100],
|
||||||
|
"value": range(100),
|
||||||
|
}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test shuffle with random splits
|
||||||
|
perm_random = (
|
||||||
|
permutation_builder(tbl, "perm_random")
|
||||||
|
.split_random(ratios=[0.6, 0.4], seed=42)
|
||||||
|
.shuffle(seed=123, clump_size=None)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test shuffle with hash splits
|
||||||
|
perm_hash = (
|
||||||
|
permutation_builder(tbl, "perm_hash")
|
||||||
|
.split_hash(["category"], [1, 1], discard_weight=0)
|
||||||
|
.shuffle(seed=456, clump_size=5)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test shuffle with sequential splits
|
||||||
|
perm_sequential = (
|
||||||
|
permutation_builder(tbl, "perm_sequential")
|
||||||
|
.split_sequential(counts=[40, 35])
|
||||||
|
.shuffle(seed=789, clump_size=None)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify all permutations work and have expected properties
|
||||||
|
assert perm_random.count_rows() == 100
|
||||||
|
assert perm_hash.count_rows() == 100
|
||||||
|
assert perm_sequential.count_rows() == 75
|
||||||
|
|
||||||
|
# Verify shuffle affected the order
|
||||||
|
data_random = perm_random.search(None).to_arrow().to_pydict()
|
||||||
|
data_sequential = perm_sequential.search(None).to_arrow().to_pydict()
|
||||||
|
|
||||||
|
assert data_random["row_id"] != list(range(100))
|
||||||
|
assert data_sequential["row_id"] != list(range(75))
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_shuffle_maintains_order(mem_db):
|
||||||
|
"""Test that not calling shuffle maintains the original order."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(50), "value": range(50)})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create permutation without shuffle (should maintain some order)
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.split_sequential(counts=[25, 25]) # Sequential maintains order
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert permutation_tbl.count_rows() == 50
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
row_ids = data["row_id"]
|
||||||
|
|
||||||
|
# With sequential splits and no shuffle, should maintain order
|
||||||
|
assert row_ids == list(range(50))
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_basic(mem_db):
|
||||||
|
"""Test basic filtering functionality."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(100), "value": range(100, 200)})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter to only include rows where id < 50
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation").filter("id < 50").execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert permutation_tbl.count_rows() == 50
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
row_ids = data["row_id"]
|
||||||
|
|
||||||
|
# All row_ids should be less than 50
|
||||||
|
assert all(row_id < 50 for row_id in row_ids)
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_with_splits(mem_db):
|
||||||
|
"""Test filtering combined with split strategies."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table",
|
||||||
|
pa.table(
|
||||||
|
{
|
||||||
|
"id": range(100),
|
||||||
|
"category": (["A", "B", "C"] * 34)[:100],
|
||||||
|
"value": range(100),
|
||||||
|
}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter to only category A and B, then split
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.filter("category IN ('A', 'B')")
|
||||||
|
.split_random(ratios=[0.5, 0.5])
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Should have fewer than 100 rows due to filtering
|
||||||
|
row_count = permutation_tbl.count_rows()
|
||||||
|
assert row_count == 67
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
categories = data["category"]
|
||||||
|
|
||||||
|
# All categories should be A or B
|
||||||
|
assert all(cat in ["A", "B"] for cat in categories)
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_with_shuffle(mem_db):
|
||||||
|
"""Test filtering combined with shuffling."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table",
|
||||||
|
pa.table(
|
||||||
|
{
|
||||||
|
"id": range(100),
|
||||||
|
"category": (["A", "B", "C", "D"] * 25)[:100],
|
||||||
|
"value": range(100),
|
||||||
|
}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter and shuffle
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.filter("category IN ('A', 'C')")
|
||||||
|
.shuffle(seed=42)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
row_count = permutation_tbl.count_rows()
|
||||||
|
assert row_count == 50 # Should have 50 rows (A and C categories)
|
||||||
|
|
||||||
|
data = permutation_tbl.search(None).to_arrow().to_pydict()
|
||||||
|
row_ids = data["row_id"]
|
||||||
|
|
||||||
|
assert row_ids != sorted(row_ids)
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_empty_result(mem_db):
|
||||||
|
"""Test filtering that results in empty set."""
|
||||||
|
tbl = mem_db.create_table(
|
||||||
|
"test_table", pa.table({"id": range(10), "value": range(10)})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter that matches nothing
|
||||||
|
permutation_tbl = (
|
||||||
|
permutation_builder(tbl, "test_permutation")
|
||||||
|
.filter("value > 100") # No values > 100 in our data
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert permutation_tbl.count_rows() == 0
|
||||||
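The tests above depend on a `mem_db` pytest fixture (and a `permutation_builder` helper) that are defined elsewhere in the test suite and are not part of this diff. A minimal stand-in sketch is shown below; the `memory://` URI is an assumption about how an in-memory database is opened, so the suite's conftest.py is the authoritative definition.

# Hypothetical stand-in for the `mem_db` fixture used by the tests above.
# The "memory://" URI is an assumption, not taken from this diff.
import lancedb
import pytest


@pytest.fixture
def mem_db():
    # Each test gets a fresh in-memory database connection.
    return lancedb.connect("memory://")
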
@@ -22,6 +22,7 @@ from lancedb.rerankers import (
     JinaReranker,
     AnswerdotaiRerankers,
     VoyageAIReranker,
+    MRRReranker,
 )
 from lancedb.table import LanceTable
 
@@ -46,6 +47,7 @@ def get_test_table(tmp_path, use_tantivy):
         db,
         "my_table",
         schema=MyTable,
+        mode="overwrite",
     )
 
     # Need to test with a bunch of phrases to make sure sorting is consistent
@@ -96,7 +98,7 @@ def get_test_table(tmp_path, use_tantivy):
     )
 
     # Create a fts index
-    table.create_fts_index("text", use_tantivy=use_tantivy)
+    table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
 
     return table, MyTable
 
@@ -320,6 +322,34 @@ def test_rrf_reranker(tmp_path, use_tantivy):
     _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
 
 
+@pytest.mark.parametrize("use_tantivy", [True, False])
+def test_mrr_reranker(tmp_path, use_tantivy):
+    reranker = MRRReranker()
+    _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
+
+    # Test multi-vector part
+    table, schema = get_test_table(tmp_path, use_tantivy)
+    query = "single player experience"
+    rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
+    rs2 = (
+        table.search(query, vector_column_name="meta_vector")
+        .limit(10)
+        .with_row_id(True)
+    )
+    result = reranker.rerank_multivector([rs1, rs2])
+    assert "_relevance_score" in result.column_names
+    assert len(result) <= 20
+
+    if len(result) > 1:
+        assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
+            "The _relevance_score should be descending."
+        )
+
+    # Test with duplicate results
+    result_deduped = reranker.rerank_multivector([rs1, rs2, rs1])
+    assert len(result_deduped) == len(result)
+
+
 def test_rrf_reranker_distance():
     data = pa.table(
         {
@@ -674,6 +674,45 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
         "vector", replace=True, config=expected_config, name=None, train=True
     )
 
+    # Test with target_partition_size
+    table.create_index(
+        metric="l2",
+        num_sub_vectors=96,
+        vector_column_name="vector",
+        replace=True,
+        index_cache_size=256,
+        num_bits=4,
+        target_partition_size=8192,
+    )
+    expected_config = IvfPq(
+        distance_type="l2",
+        num_sub_vectors=96,
+        num_bits=4,
+        target_partition_size=8192,
+    )
+    mock_create_index.assert_called_with(
+        "vector", replace=True, config=expected_config, name=None, train=True
+    )
+
+    # target_partition_size has a default value,
+    # so `num_partitions` and `target_partition_size` are not required
+    table.create_index(
+        metric="l2",
+        num_sub_vectors=96,
+        vector_column_name="vector",
+        replace=True,
+        index_cache_size=256,
+        num_bits=4,
+    )
+    expected_config = IvfPq(
+        distance_type="l2",
+        num_sub_vectors=96,
+        num_bits=4,
+    )
+    mock_create_index.assert_called_with(
+        "vector", replace=True, config=expected_config, name=None, train=True
+    )
+
     table.create_index(
         vector_column_name="my_vector",
         metric="dot",
@@ -4,7 +4,10 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};
 
 use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
-use lancedb::{connection::Connection as LanceConnection, database::CreateTableMode};
+use lancedb::{
+    connection::Connection as LanceConnection,
+    database::{CreateTableMode, ReadConsistency},
+};
 use pyo3::{
     exceptions::{PyRuntimeError, PyValueError},
     pyclass, pyfunction, pymethods, Bound, FromPyObject, Py, PyAny, PyRef, PyResult, Python,
@@ -23,7 +26,7 @@ impl Connection {
         Self { inner: Some(inner) }
     }
 
-    fn get_inner(&self) -> PyResult<&LanceConnection> {
+    pub(crate) fn get_inner(&self) -> PyResult<&LanceConnection> {
         self.inner
             .as_ref()
             .ok_or_else(|| PyRuntimeError::new_err("Connection is closed"))
@@ -63,6 +66,18 @@ impl Connection {
         self.get_inner().map(|inner| inner.uri().to_string())
     }
 
+    #[pyo3(signature = ())]
+    pub fn get_read_consistency_interval(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
+        let inner = self_.get_inner()?.clone();
+        future_into_py(self_.py(), async move {
+            Ok(match inner.read_consistency().await.infer_error()? {
+                ReadConsistency::Manual => None,
+                ReadConsistency::Eventual(duration) => Some(duration.as_secs_f64()),
+                ReadConsistency::Strong => Some(0.0_f64),
+            })
+        })
+    }
+
     #[pyo3(signature = (namespace=vec![], start_after=None, limit=None))]
     pub fn table_names(
         self_: PyRef<'_, Self>,
@@ -163,6 +178,34 @@ impl Connection {
         })
     }
 
+    #[pyo3(signature = (target_table_name, source_uri, target_namespace=vec![], source_version=None, source_tag=None, is_shallow=true))]
+    pub fn clone_table(
+        self_: PyRef<'_, Self>,
+        target_table_name: String,
+        source_uri: String,
+        target_namespace: Vec<String>,
+        source_version: Option<u64>,
+        source_tag: Option<String>,
+        is_shallow: bool,
+    ) -> PyResult<Bound<'_, PyAny>> {
+        let inner = self_.get_inner()?.clone();
+
+        let mut builder = inner.clone_table(target_table_name, source_uri);
+        builder = builder.target_namespace(target_namespace);
+        if let Some(version) = source_version {
+            builder = builder.source_version(version);
+        }
+        if let Some(tag) = source_tag {
+            builder = builder.source_tag(tag);
+        }
+        builder = builder.is_shallow(is_shallow);
+
+        future_into_py(self_.py(), async move {
+            let table = builder.execute().await.infer_error()?;
+            Ok(Table::new(table))
+        })
+    }
+
     #[pyo3(signature = (cur_name, new_name, cur_namespace=vec![], new_namespace=vec![]))]
     pub fn rename_table(
         self_: PyRef<'_, Self>,
@@ -255,7 +298,7 @@ impl Connection {
 #[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None))]
 #[allow(clippy::too_many_arguments)]
 pub fn connect(
-    py: Python,
+    py: Python<'_>,
     uri: String,
     api_key: Option<String>,
     region: Option<String>,
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The LanceDB Authors
 
-use lancedb::index::vector::IvfFlatIndexBuilder;
+use lancedb::index::vector::{IvfFlatIndexBuilder, IvfRqIndexBuilder};
 use lancedb::index::{
     scalar::{BTreeIndexBuilder, FtsIndexBuilder},
     vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
@@ -63,6 +63,9 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
             if let Some(num_partitions) = params.num_partitions {
                 ivf_flat_builder = ivf_flat_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                ivf_flat_builder = ivf_flat_builder.target_partition_size(target_partition_size);
+            }
             Ok(LanceDbIndex::IvfFlat(ivf_flat_builder))
         },
         "IvfPq" => {
@@ -76,11 +79,30 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
             if let Some(num_partitions) = params.num_partitions {
                 ivf_pq_builder = ivf_pq_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                ivf_pq_builder = ivf_pq_builder.target_partition_size(target_partition_size);
+            }
             if let Some(num_sub_vectors) = params.num_sub_vectors {
                 ivf_pq_builder = ivf_pq_builder.num_sub_vectors(num_sub_vectors);
             }
             Ok(LanceDbIndex::IvfPq(ivf_pq_builder))
         },
+        "IvfRq" => {
+            let params = source.extract::<IvfRqParams>()?;
+            let distance_type = parse_distance_type(params.distance_type)?;
+            let mut ivf_rq_builder = IvfRqIndexBuilder::default()
+                .distance_type(distance_type)
+                .max_iterations(params.max_iterations)
+                .sample_rate(params.sample_rate)
+                .num_bits(params.num_bits);
+            if let Some(num_partitions) = params.num_partitions {
+                ivf_rq_builder = ivf_rq_builder.num_partitions(num_partitions);
+            }
+            if let Some(target_partition_size) = params.target_partition_size {
+                ivf_rq_builder = ivf_rq_builder.target_partition_size(target_partition_size);
+            }
+            Ok(LanceDbIndex::IvfRq(ivf_rq_builder))
+        },
         "HnswPq" => {
             let params = source.extract::<IvfHnswPqParams>()?;
             let distance_type = parse_distance_type(params.distance_type)?;
@@ -94,6 +116,9 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
             if let Some(num_partitions) = params.num_partitions {
                 hnsw_pq_builder = hnsw_pq_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                hnsw_pq_builder = hnsw_pq_builder.target_partition_size(target_partition_size);
+            }
             if let Some(num_sub_vectors) = params.num_sub_vectors {
                 hnsw_pq_builder = hnsw_pq_builder.num_sub_vectors(num_sub_vectors);
             }
@@ -111,6 +136,9 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
             if let Some(num_partitions) = params.num_partitions {
                 hnsw_sq_builder = hnsw_sq_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                hnsw_sq_builder = hnsw_sq_builder.target_partition_size(target_partition_size);
+            }
             Ok(LanceDbIndex::IvfHnswSq(hnsw_sq_builder))
         },
         not_supported => Err(PyValueError::new_err(format!(
@@ -144,6 +172,7 @@ struct IvfFlatParams {
     num_partitions: Option<u32>,
     max_iterations: u32,
     sample_rate: u32,
+    target_partition_size: Option<u32>,
 }
 
 #[derive(FromPyObject)]
@@ -154,6 +183,17 @@ struct IvfPqParams {
     num_bits: u32,
     max_iterations: u32,
     sample_rate: u32,
+    target_partition_size: Option<u32>,
+}
+
+#[derive(FromPyObject)]
+struct IvfRqParams {
+    distance_type: String,
+    num_partitions: Option<u32>,
+    num_bits: u32,
+    max_iterations: u32,
+    sample_rate: u32,
+    target_partition_size: Option<u32>,
 }
 
 #[derive(FromPyObject)]
@@ -166,6 +206,7 @@ struct IvfHnswPqParams {
     sample_rate: u32,
     m: u32,
     ef_construction: u32,
+    target_partition_size: Option<u32>,
 }
 
 #[derive(FromPyObject)]
@@ -176,6 +217,7 @@ struct IvfHnswSqParams {
     sample_rate: u32,
     m: u32,
     ef_construction: u32,
+    target_partition_size: Option<u32>,
 }
 
 #[pyclass(get_all)]
@@ -5,6 +5,7 @@ use arrow::RecordBatchStream;
 use connection::{connect, Connection};
 use env_logger::Env;
 use index::IndexConfig;
+use permutation::PyAsyncPermutationBuilder;
 use pyo3::{
     pymodule,
     types::{PyModule, PyModuleMethods},
@@ -22,6 +23,7 @@ pub mod connection;
 pub mod error;
 pub mod header;
 pub mod index;
+pub mod permutation;
 pub mod query;
 pub mod session;
 pub mod table;
@@ -49,7 +51,9 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<DeleteResult>()?;
     m.add_class::<DropColumnsResult>()?;
     m.add_class::<UpdateResult>()?;
+    m.add_class::<PyAsyncPermutationBuilder>()?;
     m.add_function(wrap_pyfunction!(connect, m)?)?;
+    m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
     m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
     m.add("__version__", env!("CARGO_PKG_VERSION"))?;
     Ok(())
python/src/permutation.rs (new file, 177 lines):

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

use std::sync::{Arc, Mutex};

use crate::{error::PythonErrorExt, table::Table};
use lancedb::dataloader::{
    permutation::{PermutationBuilder as LancePermutationBuilder, ShuffleStrategy},
    split::{SplitSizes, SplitStrategy},
};
use pyo3::{
    exceptions::PyRuntimeError, pyclass, pymethods, types::PyAnyMethods, Bound, PyAny, PyRefMut,
    PyResult,
};
use pyo3_async_runtimes::tokio::future_into_py;

/// Create a permutation builder for the given table
#[pyo3::pyfunction]
pub fn async_permutation_builder(
    table: Bound<'_, PyAny>,
    dest_table_name: String,
) -> PyResult<PyAsyncPermutationBuilder> {
    let table = table.getattr("_inner")?.downcast_into::<Table>()?;
    let inner_table = table.borrow().inner_ref()?.clone();
    let inner_builder = LancePermutationBuilder::new(inner_table);

    Ok(PyAsyncPermutationBuilder {
        state: Arc::new(Mutex::new(PyAsyncPermutationBuilderState {
            builder: Some(inner_builder),
            dest_table_name,
        })),
    })
}

struct PyAsyncPermutationBuilderState {
    builder: Option<LancePermutationBuilder>,
    dest_table_name: String,
}

#[pyclass(name = "AsyncPermutationBuilder")]
pub struct PyAsyncPermutationBuilder {
    state: Arc<Mutex<PyAsyncPermutationBuilderState>>,
}

impl PyAsyncPermutationBuilder {
    fn modify(
        &self,
        func: impl FnOnce(LancePermutationBuilder) -> LancePermutationBuilder,
    ) -> PyResult<Self> {
        let mut state = self.state.lock().unwrap();
        let builder = state
            .builder
            .take()
            .ok_or_else(|| PyRuntimeError::new_err("Builder already consumed"))?;
        state.builder = Some(func(builder));
        Ok(Self {
            state: self.state.clone(),
        })
    }
}

#[pymethods]
impl PyAsyncPermutationBuilder {
    #[pyo3(signature = (*, ratios=None, counts=None, fixed=None, seed=None))]
    pub fn split_random(
        slf: PyRefMut<'_, Self>,
        ratios: Option<Vec<f64>>,
        counts: Option<Vec<u64>>,
        fixed: Option<u64>,
        seed: Option<u64>,
    ) -> PyResult<Self> {
        // Check that exactly one split type is provided
        let split_args_count = [ratios.is_some(), counts.is_some(), fixed.is_some()]
            .iter()
            .filter(|&&x| x)
            .count();

        if split_args_count != 1 {
            return Err(pyo3::exceptions::PyValueError::new_err(
                "Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
            ));
        }

        let sizes = if let Some(ratios) = ratios {
            SplitSizes::Percentages(ratios)
        } else if let Some(counts) = counts {
            SplitSizes::Counts(counts)
        } else if let Some(fixed) = fixed {
            SplitSizes::Fixed(fixed)
        } else {
            unreachable!("One of the split arguments must be provided");
        };

        slf.modify(|builder| builder.with_split_strategy(SplitStrategy::Random { seed, sizes }))
    }

    #[pyo3(signature = (columns, split_weights, *, discard_weight=0))]
    pub fn split_hash(
        slf: PyRefMut<'_, Self>,
        columns: Vec<String>,
        split_weights: Vec<u64>,
        discard_weight: u64,
    ) -> PyResult<Self> {
        slf.modify(|builder| {
            builder.with_split_strategy(SplitStrategy::Hash {
                columns,
                split_weights,
                discard_weight,
            })
        })
    }

    #[pyo3(signature = (*, ratios=None, counts=None, fixed=None))]
    pub fn split_sequential(
        slf: PyRefMut<'_, Self>,
        ratios: Option<Vec<f64>>,
        counts: Option<Vec<u64>>,
        fixed: Option<u64>,
    ) -> PyResult<Self> {
        // Check that exactly one split type is provided
        let split_args_count = [ratios.is_some(), counts.is_some(), fixed.is_some()]
            .iter()
            .filter(|&&x| x)
            .count();

        if split_args_count != 1 {
            return Err(pyo3::exceptions::PyValueError::new_err(
                "Exactly one of 'ratios', 'counts', or 'fixed' must be provided",
            ));
        }

        let sizes = if let Some(ratios) = ratios {
            SplitSizes::Percentages(ratios)
        } else if let Some(counts) = counts {
            SplitSizes::Counts(counts)
        } else if let Some(fixed) = fixed {
            SplitSizes::Fixed(fixed)
        } else {
            unreachable!("One of the split arguments must be provided");
        };

        slf.modify(|builder| builder.with_split_strategy(SplitStrategy::Sequential { sizes }))
    }

    pub fn split_calculated(slf: PyRefMut<'_, Self>, calculation: String) -> PyResult<Self> {
        slf.modify(|builder| builder.with_split_strategy(SplitStrategy::Calculated { calculation }))
    }

    pub fn shuffle(
        slf: PyRefMut<'_, Self>,
        seed: Option<u64>,
        clump_size: Option<u64>,
    ) -> PyResult<Self> {
        slf.modify(|builder| {
            builder.with_shuffle_strategy(ShuffleStrategy::Random { seed, clump_size })
        })
    }

    pub fn filter(slf: PyRefMut<'_, Self>, filter: String) -> PyResult<Self> {
        slf.modify(|builder| builder.with_filter(filter))
    }

    pub fn execute(slf: PyRefMut<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
        let mut state = slf.state.lock().unwrap();
        let builder = state
            .builder
            .take()
            .ok_or_else(|| PyRuntimeError::new_err("Builder already consumed"))?;

        let dest_table_name = std::mem::take(&mut state.dest_table_name);

        future_into_py(slf.py(), async move {
            let table = builder.build(&dest_table_name).await.infer_error()?;
            Ok(Table::new(table))
        })
    }
}
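The `AsyncPermutationBuilder` above is what backs the fluent chain exercised by the Python tests earlier in this diff: each Python call (`filter`, `split_*`, `shuffle`, `execute`) maps onto the corresponding method here and, in turn, onto a `SplitStrategy` or `ShuffleStrategy` variant. A minimal sketch of how those methods compose is below; it assumes the same `permutation_builder` entry point used in the tests, whose import path is not shown in this diff.

# Sketch only: mirrors the builder methods defined in permutation.rs above.
# `db` is an open LanceDB connection and `permutation_builder` is the helper
# used by the tests earlier in this diff; both are passed in as assumptions.
import pyarrow as pa


def build_permutation(db, permutation_builder):
    tbl = db.create_table(
        "events", pa.table({"user": ["a", "b", "c", "d"] * 25, "value": range(100)})
    )
    return (
        permutation_builder(tbl, "events_permutation")
        .filter("value < 80")                            # -> with_filter
        .split_hash(["user"], [3, 1], discard_weight=0)  # -> SplitStrategy::Hash
        .shuffle(seed=42, clump_size=10)                 # -> ShuffleStrategy::Random
        .execute()                                       # builds the permutation table
    )
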
@@ -3,6 +3,7 @@
 use std::{collections::HashMap, sync::Arc};
 
 use crate::{
+    connection::Connection,
     error::PythonErrorExt,
     index::{extract_index_params, IndexConfig},
     query::{Query, TakeQuery},
@@ -249,7 +250,7 @@ impl Table {
 }
 
 impl Table {
-    fn inner_ref(&self) -> PyResult<&LanceDbTable> {
+    pub(crate) fn inner_ref(&self) -> PyResult<&LanceDbTable> {
         self.inner
             .as_ref()
             .ok_or_else(|| PyRuntimeError::new_err(format!("Table {} is closed", self.name)))
@@ -272,6 +273,13 @@ impl Table {
         self.inner.take();
     }
 
+    pub fn database(&self) -> PyResult<Connection> {
+        let inner = self.inner_ref()?.clone();
+        let inner_connection =
+            lancedb::Connection::new(inner.database().clone(), inner.embedding_registry().clone());
+        Ok(Connection::new(inner_connection))
+    }
+
     pub fn schema(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
         let inner = self_.inner_ref()?.clone();
         future_into_py(self_.py(), async move {
@@ -591,12 +599,11 @@ impl Table {
     }
 
     /// Optimize the on-disk data by compacting and pruning old data, for better performance.
-    #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
+    #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
     pub fn optimize(
         self_: PyRef<'_, Self>,
         cleanup_since_ms: Option<u64>,
         delete_unverified: Option<bool>,
-        retrain: Option<bool>,
     ) -> PyResult<Bound<'_, PyAny>> {
         let inner = self_.inner_ref()?.clone();
         let older_than = if let Some(ms) = cleanup_since_ms {
@@ -632,10 +639,9 @@ impl Table {
             .prune
             .unwrap();
         inner
-            .optimize(lancedb::table::OptimizeAction::Index(match retrain {
-                Some(true) => OptimizeOptions::retrain(),
-                _ => OptimizeOptions::default(),
-            }))
+            .optimize(lancedb::table::OptimizeAction::Index(
+                OptimizeOptions::default(),
+            ))
             .await
             .infer_error()?;
         Ok(OptimizeStats {
@@ -674,6 +680,9 @@ impl Table {
         if let Some(timeout) = parameters.timeout {
             builder.timeout(timeout);
         }
+        if let Some(use_index) = parameters.use_index {
+            builder.use_index(use_index);
+        }
 
         future_into_py(self_.py(), async move {
             let res = builder.execute(Box::new(batches)).await.infer_error()?;
@@ -833,6 +842,7 @@ pub struct MergeInsertParams {
     when_not_matched_by_source_delete: bool,
     when_not_matched_by_source_condition: Option<String>,
     timeout: Option<std::time::Duration>,
+    use_index: Option<bool>,
 }
 
 #[pyclass]
@@ -1,2 +1,2 @@
 [toolchain]
-channel = "1.86.0"
+channel = "1.90.0"
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.22.1-beta.0"
+version = "0.22.2"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -11,6 +11,7 @@ rust-version.workspace = true
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
+ahash = { workspace = true }
 arrow = { workspace = true }
 arrow-array = { workspace = true }
 arrow-data = { workspace = true }
@@ -24,18 +25,23 @@ datafusion-common.workspace = true
 datafusion-execution.workspace = true
 datafusion-expr.workspace = true
 datafusion-physical-plan.workspace = true
+datafusion.workspace = true
 object_store = { workspace = true }
 snafu = { workspace = true }
 half = { workspace = true }
 lazy_static.workspace = true
 lance = { workspace = true }
+lance-core = { workspace = true }
 lance-datafusion.workspace = true
+lance-datagen = { workspace = true }
+lance-file = { workspace = true }
 lance-io = { workspace = true }
 lance-index = { workspace = true }
 lance-table = { workspace = true }
 lance-linalg = { workspace = true }
 lance-testing = { workspace = true }
 lance-encoding = { workspace = true }
+lance-namespace = { workspace = true }
 moka = { workspace = true }
 pin-project = { workspace = true }
 tokio = { version = "1.23", features = ["rt-multi-thread"] }
@@ -45,11 +51,13 @@ bytes = "1"
 futures.workspace = true
 num-traits.workspace = true
 url.workspace = true
+rand.workspace = true
 regex.workspace = true
 serde = { version = "^1" }
 serde_json = { version = "1" }
 async-openai = { version = "0.20.0", optional = true }
 serde_with = { version = "3.8.1" }
+tempfile = "3.5.0"
 aws-sdk-bedrockruntime = { version = "1.27.0", optional = true }
 # For remote feature
 reqwest = { version = "0.12.0", default-features = false, features = [
@@ -60,9 +68,8 @@ reqwest = { version = "0.12.0", default-features = false, features = [
     "macos-system-configuration",
     "stream",
 ], optional = true }
-rand = { version = "0.9", features = ["small_rng"], optional = true }
 http = { version = "1", optional = true } # Matching what is in reqwest
-uuid = { version = "1.7.0", features = ["v4"], optional = true }
+uuid = { version = "1.7.0", features = ["v4"] }
 polars-arrow = { version = ">=0.37,<0.40.0", optional = true }
 polars = { version = ">=0.37,<0.40.0", optional = true }
 hf-hub = { version = "0.4.1", optional = true, default-features = false, features = [
@@ -81,19 +88,20 @@ crunchy.workspace = true
 bytemuck_derive.workspace = true
 
 [dev-dependencies]
+anyhow = "1"
 tempfile = "3.5.0"
-rand = { version = "0.9", features = ["small_rng"] }
 random_word = { version = "0.4.3", features = ["en"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 walkdir = "2"
-aws-sdk-dynamodb = { version = "1.38.0" }
-aws-sdk-s3 = { version = "1.38.0" }
-aws-sdk-kms = { version = "1.37" }
-aws-config = { version = "1.0" }
-aws-smithy-runtime = { version = "1.3" }
+aws-sdk-dynamodb = { version = "1.55.0" }
+aws-sdk-s3 = { version = "1.55.0" }
+aws-sdk-kms = { version = "1.48.0" }
+aws-config = { version = "1.5.10" }
+aws-smithy-runtime = { version = "1.9.1" }
 datafusion.workspace = true
 http-body = "1" # Matching reqwest
 rstest = "0.23.0"
+test-log = "0.2"
 
 
 [features]
@@ -103,7 +111,7 @@ oss = ["lance/oss", "lance-io/oss"]
 gcs = ["lance/gcp", "lance-io/gcp"]
 azure = ["lance/azure", "lance-io/azure"]
 dynamodb = ["lance/dynamodb", "aws"]
-remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
+remote = ["dep:reqwest", "dep:http"]
 fp16kernels = ["lance-linalg/fp16kernels"]
 s3-test = []
 bedrock = ["dep:aws-sdk-bedrockruntime"]
Some files were not shown because too many files have changed in this diff.