Mirror of https://github.com/lancedb/lancedb.git (synced 2025-12-23 05:19:58 +00:00)

Comparing 50 commits: python-v0. ... python-v0.
| Author | SHA1 | Date |
|---|---|---|
| | b67f13f642 | |
| | 2f12d67469 | |
| | 8d7cc29abb | |
| | a4404e9e18 | |
| | 077e5bb586 | |
| | 2ad71bdeca | |
| | 7c13615096 | |
| | f882f5b69a | |
| | a68311a893 | |
| | 846a5cea33 | |
| | e3dec647b5 | |
| | c58104cecc | |
| | b3b5362632 | |
| | abe06fee3d | |
| | 93a82fd371 | |
| | 0d379e6ffa | |
| | e1388bdfdd | |
| | 315a24c2bc | |
| | 6dd4cf6038 | |
| | f97e751b3c | |
| | e803a626a1 | |
| | 9403254442 | |
| | b2a38ac366 | |
| | bdb6c09c3b | |
| | 2bfdef2624 | |
| | 7982d5c082 | |
| | 7ff6ec7fe3 | |
| | ba1ded933a | |
| | b595d8a579 | |
| | 2a1d6d8abf | |
| | 440a466a13 | |
| | b9afd9c860 | |
| | a6b6f6a806 | |
| | ae1548b507 | |
| | 4e03ee82bc | |
| | 46a6846d07 | |
| | a207213358 | |
| | 6c321c694a | |
| | 5c00b2904c | |
| | 14677d7c18 | |
| | dd22a379b2 | |
| | 7747c9bcbf | |
| | c9d6fc43a6 | |
| | 581bcfbb88 | |
| | 3750639b5f | |
| | e744d54460 | |
| | 9d1ce4b5a5 | |
| | 729ce5e542 | |
| | de6739e7ec | |
| | 495216efdb | |
```diff
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.18.0"
+current_version = "0.18.2-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
```
```diff
@@ -87,26 +87,11 @@ glob = "node/package.json"
 replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
 search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-linux-arm64-musl\": \"{new_version}\""
-search = "\"@lancedb/vectordb-linux-arm64-musl\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-linux-x64-musl\": \"{new_version}\""
-search = "\"@lancedb/vectordb-linux-x64-musl\": \"{current_version}\""
 
 [[tool.bumpversion.files]]
 glob = "node/package.json"
 replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
 search = "\"@lancedb/vectordb-win32-x64-msvc\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-win32-arm64-msvc\": \"{new_version}\""
-search = "\"@lancedb/vectordb-win32-arm64-msvc\": \"{current_version}\""
 
 # Cargo files
 # ------------
 [[tool.bumpversion.files]]
```
```diff
@@ -34,6 +34,10 @@ rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"
 [target.x86_64-unknown-linux-musl]
 rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=-crt-static,+avx2,+fma,+f16c"]
 
+[target.aarch64-unknown-linux-musl]
+linker = "aarch64-linux-musl-gcc"
+rustflags = ["-C", "target-feature=-crt-static"]
+
 [target.aarch64-apple-darwin]
 rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]

@@ -44,4 +48,4 @@ rustflags = ["-Ctarget-feature=+crt-static"]
 
 # Experimental target for Arm64 Windows
 [target.aarch64-pc-windows-msvc]
-rustflags = ["-Ctarget-feature=+crt-static"]
+rustflags = ["-Ctarget-feature=+crt-static"]
```
```diff
@@ -36,8 +36,7 @@ runs:
       args: ${{ inputs.args }}
       before-script-linux: |
         set -e
-        yum install -y openssl-devel \
-        && curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$(uname -m).zip > /tmp/protoc.zip \
+        curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$(uname -m).zip > /tmp/protoc.zip \
         && unzip /tmp/protoc.zip -d /usr/local \
         && rm /tmp/protoc.zip
   - name: Build Arm Manylinux Wheel

@@ -52,7 +51,7 @@ runs:
       args: ${{ inputs.args }}
       before-script-linux: |
         set -e
-        yum install -y openssl-devel clang \
+        yum install -y clang \
         && curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-aarch_64.zip > /tmp/protoc.zip \
         && unzip /tmp/protoc.zip -d /usr/local \
         && rm /tmp/protoc.zip
```
**.github/workflows/npm-publish.yml** (vendored, 1080 changed lines): file diff suppressed because it is too large.
**.github/workflows/pypi-publish.yml** (vendored, 8 changed lines)
```diff
@@ -4,6 +4,10 @@ on:
   push:
     tags:
       - 'python-v*'
+  pull_request:
+    # This should trigger a dry run (we skip the final publish step)
+    paths:
+      - .github/workflows/pypi-publish.yml
 
 jobs:
   linux:

@@ -46,6 +50,7 @@
           arm-build: ${{ matrix.config.platform == 'aarch64' }}
           manylinux: ${{ matrix.config.manylinux }}
       - uses: ./.github/workflows/upload_wheel
+        if: startsWith(github.ref, 'refs/tags/python-v')
         with:
           pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
           fury_token: ${{ secrets.FURY_TOKEN }}

@@ -75,6 +80,7 @@
           python-minor-version: 8
           args: "--release --strip --target ${{ matrix.config.target }} --features fp16kernels"
       - uses: ./.github/workflows/upload_wheel
+        if: startsWith(github.ref, 'refs/tags/python-v')
         with:
           pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
           fury_token: ${{ secrets.FURY_TOKEN }}

@@ -96,10 +102,12 @@
           args: "--release --strip"
           vcpkg_token: ${{ secrets.VCPKG_GITHUB_PACKAGES }}
       - uses: ./.github/workflows/upload_wheel
+        if: startsWith(github.ref, 'refs/tags/python-v')
         with:
           pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
           fury_token: ${{ secrets.FURY_TOKEN }}
   gh-release:
+    if: startsWith(github.ref, 'refs/tags/python-v')
     runs-on: ubuntu-latest
     permissions:
       contents: write
```
**.github/workflows/python.yml** (vendored, 9 changed lines)
```diff
@@ -13,6 +13,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+env:
+  # Color output for pytest is off by default.
+  PYTEST_ADDOPTS: "--color=yes"
+  FORCE_COLOR: "1"
+
 jobs:
   lint:
     name: "Lint"

@@ -131,6 +136,10 @@
       - uses: ./.github/workflows/run_tests
         with:
           integration: true
+      - name: Test without pylance
+        run: |
+          pip uninstall -y pylance
+          pytest -vv python/tests/test_table.py
       # Make sure wheels are not included in the Rust cache
       - name: Delete wheels
        run: rm -rf target/wheels
```
**.github/workflows/rust.yml** (vendored, 150 changed lines)
```diff
@@ -157,153 +157,33 @@
 
   windows:
     runs-on: windows-2022
+    strategy:
+      matrix:
+        target:
+          - x86_64-pc-windows-msvc
+          - aarch64-pc-windows-msvc
     defaults:
       run:
         working-directory: rust/lancedb
     steps:
       - uses: actions/checkout@v4
       - uses: Swatinem/rust-cache@v2
         with:
           workspaces: rust
       - name: Install Protoc v21.12
         working-directory: C:\
-        run: |
-          New-Item -Path 'C:\protoc' -ItemType Directory
-          Set-Location C:\protoc
-          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
-          7z x protoc.zip
-          Add-Content $env:GITHUB_PATH "C:\protoc\bin"
-        shell: powershell
+        run: choco install --no-progress protoc
       - name: Build
         run: |
+          rustup target add ${{ matrix.target }}
           $env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT
           cargo build --features remote --tests --locked --target ${{ matrix.target }}
       - name: Run tests
+        # Can only run tests when target matches host
+        if: ${{ matrix.target == 'x86_64-pc-windows-msvc' }}
         run: |
           $env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT
           cargo test --features remote --locked
 
-  windows-arm64-cross:
-    # We cross compile in Node releases, so we want to make sure
-    # this can run successfully.
-    runs-on: ubuntu-latest
-    container: alpine:edge
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install dependencies (part 1)
-        run: |
-          set -e
-          apk add protobuf-dev curl clang lld llvm19 grep npm bash msitools sed
-      - name: Install rust
-        uses: actions-rust-lang/setup-rust-toolchain@v1
-        with:
-          target: aarch64-pc-windows-msvc
-      - name: Install dependencies (part 2)
-        run: |
-          set -e
-          mkdir -p sysroot
-          cd sysroot
-          sh ../ci/sysroot-aarch64-pc-windows-msvc.sh
-      - name: Check
-        env:
-          CC: clang
-          AR: llvm-ar
-          C_INCLUDE_PATH: /usr/aarch64-pc-windows-msvc/usr/include
-          CARGO_BUILD_TARGET: aarch64-pc-windows-msvc
-          RUSTFLAGS: -Ctarget-feature=+crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=lld -Clink-arg=/LIBPATH:/usr/aarch64-pc-windows-msvc/usr/lib -Clink-arg=arm64rt.lib
-        run: |
-          source $HOME/.cargo/env
-          cargo check --features remote --locked
-
-  windows-arm64:
-    runs-on: windows-4x-arm
-    steps:
-      - name: Install Git
-        run: |
-          Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
-          Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
-        shell: powershell
-      - name: Add Git to PATH
-        run: |
-          Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
-          $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-        shell: powershell
-      - name: Configure Git symlinks
-        run: git config --global core.symlinks true
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.13"
-      - name: Install Visual Studio Build Tools
-        run: |
-          Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
-          Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
-            "--installPath", "C:\BuildTools", `
-            "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
-            "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
-            "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
-            "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
-            "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
-            "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
-        shell: powershell
-      - name: Add Visual Studio Build Tools to PATH
-        run: |
-          $vsPath = "C:\BuildTools\VC\Tools\MSVC"
-          $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
-          Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
-          Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
-          Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
-          Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
-          Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
-
-          # Add MSVC runtime libraries to LIB
-          $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
-            "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
-            "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
-          Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
-
-          # Add INCLUDE paths
-          $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
-            "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
-            "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
-            "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
-          Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
-        shell: powershell
-      - name: Install Rust
-        run: |
-          Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
-          .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc --default-toolchain 1.83.0
-        shell: powershell
-      - name: Add Rust to PATH
-        run: |
-          Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
-        shell: powershell
-      - uses: Swatinem/rust-cache@v2
-        with:
-          workspaces: rust
-      - name: Install 7-Zip ARM
-        run: |
-          New-Item -Path 'C:\7zip' -ItemType Directory
-          Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
-          Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
-        shell: powershell
-      - name: Add 7-Zip to PATH
-        run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
-        shell: powershell
-      - name: Install Protoc v21.12
-        working-directory: C:\
-        run: |
-          if (Test-Path 'C:\protoc') {
-            Write-Host "Protoc directory exists, skipping installation"
-            return
-          }
-          New-Item -Path 'C:\protoc' -ItemType Directory
-          Set-Location C:\protoc
-          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
-          & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
-        shell: powershell
-      - name: Add Protoc to PATH
-        run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
-        shell: powershell
-      - name: Run tests
-        run: |
-          $env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT
-          cargo test --target aarch64-pc-windows-msvc --features remote --locked
 
   msrv:
     # Check the minimum supported Rust version
     name: MSRV Check - Rust v${{ matrix.msrv }}
```
**Cargo.lock** (generated, 1077 changed lines): file diff suppressed because it is too large.
**Cargo.toml** (21 changed lines)
```diff
@@ -21,14 +21,16 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"
 
 [workspace.dependencies]
-lance = { "version" = "=0.24.1", "features" = ["dynamodb"] }
-lance-io = { version = "=0.24.1" }
-lance-index = { version = "=0.24.1" }
-lance-linalg = { version = "=0.24.1" }
-lance-table = { version = "=0.24.1" }
-lance-testing = { version = "=0.24.1" }
-lance-datafusion = { version = "=0.24.1" }
-lance-encoding = { version = "=0.24.1" }
+lance = { "version" = "=0.25.0", "features" = [
+    "dynamodb",
+] }
+lance-io = { version = "=0.25.0" }
+lance-index = { version = "=0.25.0" }
+lance-linalg = { version = "=0.25.0" }
+lance-table = { version = "=0.25.0" }
+lance-testing = { version = "=0.25.0" }
+lance-datafusion = { version = "=0.25.0" }
+lance-encoding = { version = "=0.25.0" }
 # Note that this one does not include pyarrow
 arrow = { version = "54.1", optional = false }
 arrow-array = "54.1"

@@ -70,3 +72,6 @@ base64ct = "=1.6.0"
 
 # Workaround for: https://github.com/eira-fransham/crunchy/issues/13
 crunchy = "=0.2.2"
+
+# Workaround for: https://github.com/Lokathor/bytemuck/issues/306
+bytemuck_derive = ">=1.8.1, <1.9.0"
```
**README.md** (12 changed lines)
```diff
@@ -1,9 +1,17 @@
+<a href="https://cloud.lancedb.com" target="_blank">
+<img src="https://github.com/user-attachments/assets/92dad0a2-2a37-4ce1-b783-0d1b4f30a00c" alt="LanceDB Cloud Public Beta" width="100%" style="max-width: 100%;">
+</a>
+
 <div align="center">
 <p align="center">
 
-<img width="275" alt="LanceDB Logo" src="https://github.com/lancedb/lancedb/assets/5846846/37d7c7ad-c2fd-4f56-9f16-fffb0d17c73a">
+<picture>
+  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/ac270358-333e-4bea-a132-acefaa94040e">
+  <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/b864d814-0d29-4784-8fd9-807297c758c0">
+  <img alt="LanceDB Logo" src="https://github.com/user-attachments/assets/b864d814-0d29-4784-8fd9-807297c758c0" width=300>
+</picture>
 
-**Developer-friendly, database for multimodal AI**
+**Search More, Manage Less**
 
 <a href='https://github.com/lancedb/vectordb-recipes/tree/main' target="_blank"><img alt='LanceDB' src='https://img.shields.io/badge/VectorDB_Recipes-100000?style=for-the-badge&logo=LanceDB&logoColor=white&labelColor=645cfb&color=645cfb'/></a>
 <a href='https://lancedb.github.io/lancedb/' target="_blank"><img alt='lancdb' src='https://img.shields.io/badge/DOCS-100000?style=for-the-badge&logo=lancdb&logoColor=white&labelColor=645cfb&color=645cfb'/></a>
```
```diff
@@ -1,21 +0,0 @@
-#!/bin/bash
-set -e
-ARCH=${1:-x86_64}
-
-# We pass down the current user so that when we later mount the local files
-# into the container, the files are accessible by the current user.
-pushd ci/manylinux_node
-docker build \
-  -t lancedb-node-manylinux-$ARCH \
-  --build-arg="ARCH=$ARCH" \
-  --build-arg="DOCKER_USER=$(id -u)" \
-  --progress=plain \
-  .
-popd
-
-# We turn on memory swap to avoid OOM killer
-docker run \
-  -v $(pwd):/io -w /io \
-  --memory-swap=-1 \
-  lancedb-node-manylinux-$ARCH \
-  bash ci/manylinux_node/build_lancedb.sh $ARCH
```
```diff
@@ -1,34 +0,0 @@
-# Builds the macOS artifacts (nodejs binaries).
-# Usage: ./ci/build_macos_artifacts_nodejs.sh [target]
-# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
-set -e
-
-prebuild_rust() {
-  # Building here for the sake of easier debugging.
-  pushd rust/lancedb
-  echo "Building rust library for $1"
-  export RUST_BACKTRACE=1
-  cargo build --release --target $1
-  popd
-}
-
-build_node_binaries() {
-  pushd nodejs
-  echo "Building nodejs library for $1"
-  export RUST_TARGET=$1
-  npm run build-release
-  popd
-}
-
-if [ -n "$1" ]; then
-  targets=$1
-else
-  targets="x86_64-apple-darwin aarch64-apple-darwin"
-fi
-
-echo "Building artifacts for targets: $targets"
-for target in $targets
-do
-  prebuild_rust $target
-  build_node_binaries $target
-done
```
```diff
@@ -1,5 +1,5 @@
 # Many linux dockerfile with Rust, Node, and Lance dependencies installed.
-# This container allows building the node modules native libraries in an
+# This container allows building the node modules native libraries in an
 # environment with a very old glibc, so that we are compatible with a wide
 # range of linux distributions.
 ARG ARCH=x86_64

@@ -9,10 +9,6 @@ FROM quay.io/pypa/manylinux_2_28_${ARCH}
 ARG ARCH=x86_64
 ARG DOCKER_USER=default_user
 
-# Install static openssl
-COPY install_openssl.sh install_openssl.sh
-RUN ./install_openssl.sh ${ARCH} > /dev/null
-
 # Protobuf is also installed as root.
 COPY install_protobuf.sh install_protobuf.sh
 RUN ./install_protobuf.sh ${ARCH}

@@ -21,7 +17,7 @@ ENV DOCKER_USER=${DOCKER_USER}
 # Create a group and user, but only if it doesn't exist
 RUN echo ${ARCH} && id -u ${DOCKER_USER} >/dev/null 2>&1 || adduser --user-group --create-home --uid ${DOCKER_USER} build_user
 
-# We switch to the user to install Rust and Node, since those like to be
+# We switch to the user to install Rust and Node, since those like to be
 # installed at the user level.
 USER ${DOCKER_USER}
 
```
```diff
@@ -1,19 +0,0 @@
-#!/bin/bash
-# Builds the nodejs module for manylinux. Invoked by ci/build_linux_artifacts_nodejs.sh.
-set -e
-ARCH=${1:-x86_64}
-
-if [ "$ARCH" = "x86_64" ]; then
-  export OPENSSL_LIB_DIR=/usr/local/lib64/
-else
-  export OPENSSL_LIB_DIR=/usr/local/lib/
-fi
-export OPENSSL_STATIC=1
-export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
-
-#Alpine doesn't have .bashrc
-FILE=$HOME/.bashrc && test -f $FILE && source $FILE
-
-cd nodejs
-npm ci
-npm run build-release
```
```diff
@@ -4,14 +4,6 @@ set -e
 ARCH=${1:-x86_64}
 TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}
 
-if [ "$ARCH" = "x86_64" ]; then
-  export OPENSSL_LIB_DIR=/usr/local/lib64/
-else
-  export OPENSSL_LIB_DIR=/usr/local/lib/
-fi
-export OPENSSL_STATIC=1
-export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
-
 #Alpine doesn't have .bashrc
 FILE=$HOME/.bashrc && test -f $FILE && source $FILE
 
```
```diff
@@ -1,26 +0,0 @@
-#!/bin/bash
-# Builds openssl from source so we can statically link to it
-
-# this is to avoid the error we get with the system installation:
-# /usr/bin/ld: <library>: version node not found for symbol SSLeay@@OPENSSL_1.0.1
-# /usr/bin/ld: failed to set dynamic section sizes: Bad value
-set -e
-
-git clone -b OpenSSL_1_1_1v \
-  --single-branch \
-  https://github.com/openssl/openssl.git
-
-pushd openssl
-
-if [[ $1 == x86_64* ]]; then
-  ARCH=linux-x86_64
-else
-  # gnu target
-  ARCH=linux-aarch64
-fi
-
-./Configure no-shared $ARCH
-
-make
-
-make install
```
```diff
@@ -124,6 +124,9 @@ nav:
       - Overview: hybrid_search/hybrid_search.md
       - Comparing Rerankers: hybrid_search/eval.md
       - Airbnb financial data example: notebooks/hybrid_search.ipynb
+  - Late interaction with MultiVector search:
+      - Overview: guides/multi-vector.md
+      - Example: notebooks/Multivector_on_LanceDB.ipynb
   - RAG:
       - Vanilla RAG: rag/vanilla_rag.md
       - Multi-head RAG: rag/multi_head_rag.md

@@ -233,13 +236,6 @@ nav:
   - 👾 JavaScript (vectordb): javascript/modules.md
   - 👾 JavaScript (lancedb): js/globals.md
   - 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
-  - ☁️ LanceDB Cloud:
-      - Overview: cloud/index.md
-      - API reference:
-          - 🐍 Python: python/saas-python.md
-          - 👾 JavaScript: javascript/modules.md
-          - REST API: cloud/rest.md
-      - FAQs: cloud/cloud_faq.md
 
 - Quick start: basic.md
 - Concepts:

@@ -260,6 +256,9 @@ nav:
       - Overview: hybrid_search/hybrid_search.md
       - Comparing Rerankers: hybrid_search/eval.md
       - Airbnb financial data example: notebooks/hybrid_search.ipynb
+  - Late interaction with MultiVector search:
+      - Overview: guides/multi-vector.md
+      - Document search Example: notebooks/Multivector_on_LanceDB.ipynb
   - RAG:
       - Vanilla RAG: rag/vanilla_rag.md
       - Multi-head RAG: rag/multi_head_rag.md

@@ -363,13 +362,6 @@ nav:
   - Javascript (vectordb): javascript/modules.md
   - Javascript (lancedb): js/globals.md
   - Rust: https://docs.rs/lancedb/latest/lancedb/index.html
-  - LanceDB Cloud:
-      - Overview: cloud/index.md
-      - API reference:
-          - 🐍 Python: python/saas-python.md
-          - 👾 JavaScript: javascript/modules.md
-          - REST API: cloud/rest.md
-      - FAQs: cloud/cloud_faq.md
 
 extra_css:
   - styles/global.css
```
```diff
@@ -171,7 +171,7 @@ paths:
           distance_type:
             type: string
             description: |
-              The distance metric to use for search. L2, Cosine, Dot and Hamming are supported. Default is L2.
+              The distance metric to use for search. l2, Cosine, Dot and Hamming are supported. Default is l2.
           bypass_vector_index:
             type: boolean
             description: |

@@ -450,7 +450,7 @@ paths:
             type: string
             nullable: false
             description: |
-              The metric type to use for the index. L2, Cosine, Dot are supported.
+              The metric type to use for the index. l2, Cosine, Dot are supported.
           index_type:
             type: string
       responses:
```
```diff
@@ -69,7 +69,7 @@ Lance supports `IVF_PQ` index type by default.
 
 The following IVF_PQ paramters can be specified:
 
-- **distance_type**: The distance metric to use. By default it uses euclidean distance "`L2`".
+- **distance_type**: The distance metric to use. By default it uses euclidean distance "`l2`".
   We also support "cosine" and "dot" distance as well.
 - **num_partitions**: The number of partitions in the index. The default is the square root
   of the number of rows.
```
```diff
@@ -2,7 +2,7 @@
 
 LanceDB Cloud is a SaaS (software-as-a-service) solution that runs serverless in the cloud, clearly separating storage from compute. It's designed to be highly scalable without breaking the bank. LanceDB Cloud is currently in private beta with general availability coming soon, but you can apply for early access with the private beta release by signing up below.
 
-[Try out LanceDB Cloud](https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms){ .md-button .md-button--primary }
+[Try out LanceDB Cloud (Public Beta)](https://cloud.lancedb.com){ .md-button .md-button--primary }
 
 ## Architecture
```
```diff
@@ -59,7 +59,7 @@ Then the greedy search routine operates as follows:
 
 There are three key parameters to set when constructing an HNSW index:
 
-* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
+* `metric`: Use an `l2` euclidean distance metric. We also support `dot` and `cosine` distance.
 * `m`: The number of neighbors to select for each vector in the HNSW graph.
 * `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph.
```
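As context for the `metric` spelling change above, here is a hedged Python sketch of how these three parameters could be passed when building the index. The `index_type`, `m`, and `ef_construction` keyword names are assumptions based on the Python `create_index` signature, and the path and table are illustrative; verify against your installed lancedb version:

```python
import lancedb
import numpy as np

db = lancedb.connect("data/hnsw_demo")  # illustrative local path
tbl = db.create_table(
    "vectors",
    data=[{"id": i, "vector": np.random.random(128).tolist()} for i in range(4096)],
)

tbl.create_index(
    metric="l2",               # lowercase spelling, per the docs change above
    index_type="IVF_HNSW_SQ",  # assumed name of the HNSW-based index type
    m=20,                      # neighbors per node in the HNSW graph
    ef_construction=300,       # candidates evaluated while building the graph
)
```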
```diff
@@ -47,7 +47,7 @@ We can combine the above concepts to understand how to build and query an IVF-PQ
 
 There are three key parameters to set when constructing an IVF-PQ index:
 
-* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
+* `metric`: Use an `l2` euclidean distance metric. We also support `dot` and `cosine` distance.
 * `num_partitions`: The number of partitions in the IVF portion of the index.
 * `num_sub_vectors`: The number of sub-vectors that will be created during Product Quantization (PQ).
```
````diff
@@ -56,7 +56,7 @@ In Python, the index can be created as follows:
 ```python
 # Create and train the index for a 1536-dimensional vector
 # Make sure you have enough data in the table for an effective training step
-tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
+tbl.create_index(metric="l2", num_partitions=256, num_sub_vectors=96)
 ```
 !!! note
     `num_partitions`=256 and `num_sub_vectors`=96 does not work for every dataset. Those values needs to be adjusted for your particular dataset.
````
```diff
@@ -54,7 +54,7 @@ As mentioned, after creating embedding, each data point is represented as a vect
 
 Points that are close to each other in vector space are considered similar (or appear in similar contexts), and points that are far away are considered dissimilar. To quantify this closeness, we use distance as a metric which can be measured in the following way -
 
-1. **Euclidean Distance (L2)**: It calculates the straight-line distance between two points (vectors) in a multidimensional space.
+1. **Euclidean Distance (l2)**: It calculates the straight-line distance between two points (vectors) in a multidimensional space.
 2. **Cosine Similarity**: It measures the cosine of the angle between two vectors, providing a normalized measure of similarity based on their direction.
 3. **Dot product**: It is calculated as the sum of the products of their corresponding components. To measure relatedness it considers both the magnitude and direction of the vectors.
```
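Since these three measures recur throughout the renames in this change set, a small NumPy check makes the definitions concrete (vectors and values are illustrative, not from the docs):

```python
import numpy as np

a = np.array([1.0, 2.0, 2.0])
b = np.array([2.0, 0.0, 1.0])

l2 = np.linalg.norm(a - b)                                    # straight-line distance
cos_sim = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))     # cosine similarity
cosine_dist = 1.0 - cos_sim                                   # cosine *distance*, range [0, 2]
dot = a @ b                                                   # dot product, range (-inf, inf)

print(l2, cosine_dist, dot)

# For unit-normalized vectors, the dot product equals the cosine similarity:
an, bn = a / np.linalg.norm(a), b / np.linalg.norm(b)
assert np.isclose(an @ bn, cos_sim)
```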
```diff
@@ -8,15 +8,5 @@ LanceDB provides language APIs, allowing you to embed a database in your languag
 * 👾 [JavaScript](examples_js.md) examples
 * 🦀 Rust examples (coming soon)
 
-## Python Applications powered by LanceDB
-
-| Project Name | Description |
-| --- | --- |
-| **Ultralytics Explorer 🚀**<br>[](https://docs.ultralytics.com/datasets/explorer/)<br>[](https://colab.research.google.com/github/ultralytics/ultralytics/blob/main/docs/en/datasets/explorer/explorer.ipynb) | - 🔍 **Explore CV Datasets**: Semantic search, SQL queries, vector similarity, natural language.<br>- 🖥️ **GUI & Python API**: Seamless dataset interaction.<br>- ⚡ **Efficient & Scalable**: Leverages LanceDB for large datasets.<br>- 📊 **Detailed Analysis**: Easily analyze data patterns.<br>- 🌐 **Browser GUI Demo**: Create embeddings, search images, run queries. |
-| **Website Chatbot🤖**<br>[](https://github.com/lancedb/lancedb-vercel-chatbot)<br>[](https://vercel.com/new/clone?repository-url=https%3A%2F%2Fgithub.com%2Flancedb%2Flancedb-vercel-chatbot&env=OPENAI_API_KEY&envDescription=OpenAI%20API%20Key%20for%20chat%20completion.&project-name=lancedb-vercel-chatbot&repository-name=lancedb-vercel-chatbot&demo-title=LanceDB%20Chatbot%20Demo&demo-description=Demo%20website%20chatbot%20with%20LanceDB.&demo-url=https%3A%2F%2Flancedb.vercel.app&demo-image=https%3A%2F%2Fi.imgur.com%2FazVJtvr.png) | - 🌐 **Chatbot from Sitemap/Docs**: Create a chatbot using site or document context.<br>- 🚀 **Embed LanceDB in Next.js**: Lightweight, on-prem storage.<br>- 🧠 **AI-Powered Context Retrieval**: Efficiently access relevant data.<br>- 🔧 **Serverless & Native JS**: Seamless integration with Next.js.<br>- ⚡ **One-Click Deploy on Vercel**: Quick and easy setup.. |
-
-## Nodejs Applications powered by LanceDB
-
-| Project Name | Description |
-| --- | --- |
-| **Langchain Writing Assistant✍️ **<br>[](https://github.com/lancedb/vectordb-recipes/tree/main/applications/node/lanchain_writing_assistant) | - **📂 Data Source Integration**: Use your own data by specifying data source file, and the app instantly processes it to provide insights. <br>- **🧠 Intelligent Suggestions**: Powered by LangChain.js and LanceDB, it improves writing productivity and accuracy. <br>- **💡 Enhanced Writing Experience**: It delivers real-time contextual insights and factual suggestions while the user writes. |
+!!! tip "Hosted LanceDB"
+    If you want S3 cost-efficiency and local performance via a simple serverless API, checkout **LanceDB Cloud**. For private deployments, high performance at extreme scale, or if you have strict security requirements, talk to us about **LanceDB Enterprise**. [Learn more](https://docs.lancedb.com/)
```
**docs/src/guides/multi-vector.md** (new file, 85 lines)
````diff
@@ -0,0 +1,85 @@
+# Late interaction & MultiVector embedding type
+Late interaction is a technique used in retrieval that calculates the relevance of a query to a document by comparing their multi-vector representations. The key difference between late interaction and other popular methods:
+
+
+
+[Illustration from https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/]
+
+<b>No interaction:</b> Refers to independently embedding the query and document, which are then compared to calculate similarity without any interaction between them. This is typically used in vector search operations.
+
+<b>Partial interaction:</b> Refers to a specific approach where the similarity computation happens primarily between query vectors and document vectors, without extensive interaction between individual components of each. An example of this is dual-encoder models like BERT.
+
+<b>Early full interaction:</b> Refers to techniques like cross-encoders that process query and docs in pairs with full interaction across various stages of encoding. This is a powerful, but relatively slower, technique. Because it requires processing query and docs in pairs, doc embeddings can't be pre-computed for fast retrieval. This is why cross-encoders are typically used as reranking models combined with vector search. Learn more about [LanceDB Reranking support](https://lancedb.github.io/lancedb/reranking/).
+
+<b>Late interaction:</b> A technique that computes the doc and query embeddings independently; the interaction, or similarity evaluation, happens during the retrieval process. This is typically used in retrieval models like ColBERT. Unlike early interaction, it speeds up the retrieval process without compromising the depth of semantic analysis.
+
+## Internals of ColBERT
+Let's take a look at the steps involved in performing late-interaction retrieval with ColBERT:
+
+• ColBERT employs BERT-based encoders for both queries `(fQ)` and documents `(fD)`
+• A single BERT model is shared between query and document encoders; special tokens distinguish input types: `[Q]` for queries and `[D]` for documents
+
+**Query Encoder (fQ):**
+• Query q is tokenized into WordPiece tokens: `q1, q2, ..., ql`. The `[Q]` token is prepended right after BERT's `[CLS]` token
+• If query length < Nq, it's padded with [MASK] tokens up to Nq.
+• The padded sequence goes through BERT's transformer architecture
+• Final embeddings are L2-normalized.
+
+**Document Encoder (fD):**
+• Document d is tokenized into tokens `d1, d2, ..., dm`. The `[D]` token is prepended after the `[CLS]` token
+• Unlike queries, documents are NOT padded with `[MASK]` tokens
+• Document tokens are processed through BERT and the same linear layer
+
+**Late Interaction:**
+• Late interaction estimates the relevance score `S(q,d)` using the embeddings `Eq` and `Ed`. Late interaction happens after independent encoding
+• For each query embedding, the maximum similarity is computed against all document embeddings
+• The similarity measure can be cosine similarity or squared L2 distance
+
+**MaxSim Calculation:**
+```
+S(q,d) := Σ_{i ∈ |Eq|} max_{j ∈ |Ed|} (Eq_i · Ed_j^T)
+```
+• This finds the best matching document embedding for each query embedding
+• Captures relevance based on strongest local matches between contextual embeddings
````
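As an aside from the diff itself: the MaxSim formula above is easy to sanity-check outside LanceDB. A minimal NumPy sketch (the `Eq`/`Ed` names mirror the guide's notation; shapes and random data are purely illustrative):

```python
import numpy as np

def maxsim(Eq: np.ndarray, Ed: np.ndarray) -> float:
    """MaxSim late-interaction score.

    Eq: (num_query_tokens, dim) L2-normalized query token embeddings.
    Ed: (num_doc_tokens, dim) L2-normalized document token embeddings.
    """
    sim = Eq @ Ed.T                      # pairwise cosine similarities
    return float(sim.max(axis=1).sum())  # best doc token per query token, then sum

# Illustrative shapes: 8 query tokens, 120 doc tokens, 128-dim embeddings.
rng = np.random.default_rng(0)
Eq = rng.normal(size=(8, 128));   Eq /= np.linalg.norm(Eq, axis=1, keepdims=True)
Ed = rng.normal(size=(120, 128)); Ed /= np.linalg.norm(Ed, axis=1, keepdims=True)
print(maxsim(Eq, Ed))
```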
````diff
+## LanceDB MultiVector type
+LanceDB supports a multivector type; this is useful when you have multiple vectors for a single item (e.g. with ColBERT and ColPali).
+
+You can index a column of multivector type and search on it; the query can be a single vector or multiple vectors. For now, only the cosine metric is supported for multivector search. The vector value type can be float16, float32 or float64. LanceDB integrates [ConteXtualized Token Retriever (XTR)](https://arxiv.org/abs/2304.01982), which introduces a simple, yet novel, objective function that encourages the model to retrieve the most important document tokens first.
+
+```python
+import lancedb
+import numpy as np
+import pyarrow as pa
+
+db = lancedb.connect("data/multivector_demo")
+schema = pa.schema(
+    [
+        pa.field("id", pa.int64()),
+        # float16, float32, and float64 are supported
+        pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
+    ]
+)
+data = [
+    {
+        "id": i,
+        "vector": np.random.random(size=(2, 256)).tolist(),
+    }
+    for i in range(1024)
+]
+tbl = db.create_table("my_table", data=data, schema=schema)
+
+# only cosine similarity is supported for multi-vectors
+tbl.create_index(metric="cosine")
+
+# query with single vector
+query = np.random.random(256).astype(np.float16)
+tbl.search(query).to_arrow()
+
+# query with multiple vectors
+query = np.random.random(size=(2, 256))
+tbl.search(query).to_arrow()
+```
+Find more about vector search in LanceDB [here](https://lancedb.github.io/lancedb/search/#multivector-type).
````
```diff
@@ -4,6 +4,9 @@ LanceDB is an open-source vector database for AI that's designed to store, manag
 
 Both the database and the underlying data format are designed from the ground up to be **easy-to-use**, **scalable** and **cost-effective**.
 
+!!! tip "Hosted LanceDB"
+    If you want S3 cost-efficiency and local performance via a simple serverless API, checkout **LanceDB Cloud**. For private deployments, high performance at extreme scale, or if you have strict security requirements, talk to us about **LanceDB Enterprise**. [Learn more](https://docs.lancedb.com/)
+
 
 
 ## Truly multi-modal

@@ -20,7 +23,7 @@ LanceDB **OSS** is an **open-source**, batteries-included embedded vector databa
 
 LanceDB **Cloud** is a SaaS (software-as-a-service) solution that runs serverless in the cloud, making the storage clearly separated from compute. It's designed to be cost-effective and highly scalable without breaking the bank. LanceDB Cloud is currently in private beta with general availability coming soon, but you can apply for early access with the private beta release by signing up below.
 
-[Try out LanceDB Cloud](https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms){ .md-button .md-button--primary }
+[Try out LanceDB Cloud (Public Beta) Now](https://cloud.lancedb.com){ .md-button .md-button--primary }
 
 ## Why use LanceDB?
```
```diff
@@ -108,7 +108,7 @@ This method creates a scalar(for non-vector cols) or a vector index on a table.
 |:---|:---|:---|:---|
 |`vector_col`|`Optional[str]`| Provide if you want to create index on a vector column. |`None`|
 |`col_name`|`Optional[str]`| Provide if you want to create index on a non-vector column. |`None`|
-|`metric`|`Optional[str]` |Provide the metric to use for vector index. choice of metrics: 'L2', 'dot', 'cosine'. |`L2`|
+|`metric`|`Optional[str]` |Provide the metric to use for vector index. choice of metrics: 'l2', 'dot', 'cosine'. |`l2`|
 |`num_partitions`|`Optional[int]`|Number of partitions to use for the index.|`256`|
 |`num_sub_vectors`|`Optional[int]` |Number of sub-vectors to use for the index.|`96`|
 |`index_cache_size`|`Optional[int]` |Size of the index cache.|`None`|
```
````diff
@@ -125,7 +125,7 @@ The exhaustive list of parameters for `LanceDBVectorStore` vector store are :
 ```
 - **_table_exists(self, tbl_name: `Optional[str]` = `None`) -> `bool`** : Returns `True` if `tbl_name` exists in database.
 - __create_index(
-    self, scalar: `Optional[bool]` = False, col_name: `Optional[str]` = None, num_partitions: `Optional[int]` = 256, num_sub_vectors: `Optional[int]` = 96, index_cache_size: `Optional[int]` = None, metric: `Optional[str]` = "L2",
+    self, scalar: `Optional[bool]` = False, col_name: `Optional[str]` = None, num_partitions: `Optional[int]` = 256, num_sub_vectors: `Optional[int]` = 96, index_cache_size: `Optional[int]` = None, metric: `Optional[str]` = "l2",
 ) -> `None`__ : Creates a scalar(for non-vector cols) or a vector index on a table.
 Make sure your vector column has enough data before creating an index on it.
````
```diff
@@ -10,7 +10,7 @@ Distance metrics type.
 
 - [Cosine](MetricType.md#cosine)
 - [Dot](MetricType.md#dot)
-- [L2](MetricType.md#l2)
+- [l2](MetricType.md#l2)
 
 ## Enumeration Members
```
```diff
@@ -85,7 +85,7 @@ ___
 
 • `Optional` **metric\_type**: [`MetricType`](../enums/MetricType.md)
 
-Metric type, L2 or Cosine
+Metric type, l2 or Cosine
 
 #### Defined in
```
```diff
@@ -15,11 +15,9 @@ npm install @lancedb/lancedb
 This will download the appropriate native library for your platform. We currently
 support:
 
-- Linux (x86_64 and aarch64)
+- Linux (x86_64 and aarch64 on glibc and musl)
 - MacOS (Intel and ARM/M1/M2)
-- Windows (x86_64 only)
-
-We do not yet support musl-based Linux (such as Alpine Linux) or aarch64 Windows.
+- Windows (x86_64 and aarch64)
 
 ## Usage
```
````diff
@@ -126,6 +126,37 @@ the vectors.
 
 ***
 
+### ivfFlat()
+
+```ts
+static ivfFlat(options?): Index
+```
+
+Create an IvfFlat index
+
+This index groups vectors into partitions of similar vectors. Each partition keeps track of
+a centroid which is the average value of all vectors in the group.
+
+During a query the centroids are compared with the query vector to find the closest
+partitions. The vectors in these partitions are then searched to find
+the closest vectors.
+
+The partitioning process is called IVF and the `num_partitions` parameter controls how
+many groups to create.
+
+Note that training an IVF FLAT index on a large dataset is a slow operation and
+currently is also a memory intensive operation.
+
+#### Parameters
+
+* **options?**: `Partial`<[`IvfFlatOptions`](../interfaces/IvfFlatOptions.md)>
+
+#### Returns
+
+[`Index`](Index.md)
+
+***
+
 ### ivfPq()
 
 ```ts
````
**docs/src/js/functions/packBits.md** (new file, 19 lines)
````diff
@@ -0,0 +1,19 @@
+[**@lancedb/lancedb**](../README.md) • **Docs**
+
+***
+
+[@lancedb/lancedb](../globals.md) / packBits
+
+# Function: packBits()
+
+```ts
+function packBits(data): number[]
+```
+
+## Parameters
+
+* **data**: `number`[]
+
+## Returns
+
+`number`[]
````
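The new `packBits` helper is documented above without a description. For intuition, here is a NumPy sketch of the same bit-packing idea used for binary vectors; the exact packing order of `packBits` is an assumption here (check the implementation before mixing the two):

```python
import numpy as np

# A binary vector of 16 bits (0/1 values)...
bits = np.array([1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1], dtype=np.uint8)

# ...packed into 2 bytes, 8 bits per byte (most significant bit first).
packed = np.packbits(bits)
print(packed)                 # [178 241]
print(np.unpackbits(packed))  # round-trips back to the original bits
```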
```diff
@@ -39,6 +39,7 @@
 - [IndexConfig](interfaces/IndexConfig.md)
 - [IndexOptions](interfaces/IndexOptions.md)
 - [IndexStatistics](interfaces/IndexStatistics.md)
+- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
 - [IvfPqOptions](interfaces/IvfPqOptions.md)
 - [OpenTableOptions](interfaces/OpenTableOptions.md)
 - [OptimizeOptions](interfaces/OptimizeOptions.md)

@@ -66,3 +67,4 @@
 
 - [connect](functions/connect.md)
 - [makeArrowTable](functions/makeArrowTable.md)
+- [packBits](functions/packBits.md)
```
````diff
@@ -16,7 +16,7 @@ must be provided.
 ### dataType?
 
 ```ts
-optional dataType: string;
+optional dataType: string | DataType<Type, any>;
 ```
 
 A new data type for the column. If not provided then the data type will not be changed.
````
```diff
@@ -24,18 +24,18 @@ The following distance types are available:
 
 "l2" - Euclidean distance. This is a very common distance metric that
 accounts for both magnitude and direction when determining the distance
-between vectors. L2 distance has a range of [0, ∞).
+between vectors. l2 distance has a range of [0, ∞).
 
 "cosine" - Cosine distance. Cosine distance is a distance metric
 calculated from the cosine similarity between two vectors. Cosine
 similarity is a measure of similarity between two non-zero vectors of an
 inner product space. It is defined to equal the cosine of the angle
-between them. Unlike L2, the cosine distance is not affected by the
+between them. Unlike l2, the cosine distance is not affected by the
 magnitude of the vectors. Cosine distance has a range of [0, 2].
 
 "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
 distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
-L2 norm is 1), then dot distance is equivalent to the cosine distance.
+l2 norm is 1), then dot distance is equivalent to the cosine distance.
 
 ***
```
```diff
@@ -24,18 +24,18 @@ The following distance types are available:
 
 "l2" - Euclidean distance. This is a very common distance metric that
 accounts for both magnitude and direction when determining the distance
-between vectors. L2 distance has a range of [0, ∞).
+between vectors. l2 distance has a range of [0, ∞).
 
 "cosine" - Cosine distance. Cosine distance is a distance metric
 calculated from the cosine similarity between two vectors. Cosine
 similarity is a measure of similarity between two non-zero vectors of an
 inner product space. It is defined to equal the cosine of the angle
-between them. Unlike L2, the cosine distance is not affected by the
+between them. Unlike l2, the cosine distance is not affected by the
 magnitude of the vectors. Cosine distance has a range of [0, 2].
 
 "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
 distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
-L2 norm is 1), then dot distance is equivalent to the cosine distance.
+l2 norm is 1), then dot distance is equivalent to the cosine distance.
 
 ***
```
````diff
@@ -30,6 +30,17 @@ The type of the index
 
 ***
 
+### loss?
+
+```ts
+optional loss: number;
+```
+
+The KMeans loss value of the index,
+it is only present for vector indices.
+
+***
+
 ### numIndexedRows
 
 ```ts
````
**docs/src/js/interfaces/IvfFlatOptions.md** (new file, 112 lines)
````diff
@@ -0,0 +1,112 @@
+[**@lancedb/lancedb**](../README.md) • **Docs**
+
+***
+
+[@lancedb/lancedb](../globals.md) / IvfFlatOptions
+
+# Interface: IvfFlatOptions
+
+Options to create an `IVF_FLAT` index
+
+## Properties
+
+### distanceType?
+
+```ts
+optional distanceType: "l2" | "cosine" | "dot" | "hamming";
+```
+
+Distance type to use to build the index.
+
+Default value is "l2".
+
+This is used when training the index to calculate the IVF partitions
+(vectors are grouped in partitions with similar vectors according to this
+distance type).
+
+The distance type used to train an index MUST match the distance type used
+to search the index. Failure to do so will yield inaccurate results.
+
+The following distance types are available:
+
+"l2" - Euclidean distance. This is a very common distance metric that
+accounts for both magnitude and direction when determining the distance
+between vectors. l2 distance has a range of [0, ∞).
+
+"cosine" - Cosine distance. Cosine distance is a distance metric
+calculated from the cosine similarity between two vectors. Cosine
+similarity is a measure of similarity between two non-zero vectors of an
+inner product space. It is defined to equal the cosine of the angle
+between them. Unlike l2, the cosine distance is not affected by the
+magnitude of the vectors. Cosine distance has a range of [0, 2].
+
+Note: the cosine distance is undefined when one (or both) of the vectors
+are all zeros (there is no direction). These vectors are invalid and may
+never be returned from a vector search.
+
+"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+l2 norm is 1), then dot distance is equivalent to the cosine distance.
+
+"hamming" - Hamming distance. Hamming distance is a distance metric
+calculated from the number of bits that are different between two vectors.
+Hamming distance has a range of [0, dimension]. Note that the hamming distance
+is only valid for binary vectors.
+
+***
+
+### maxIterations?
+
+```ts
+optional maxIterations: number;
+```
+
+Max iteration to train IVF kmeans.
+
+When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter
+controls how many iterations of kmeans to run.
+
+Increasing this might improve the quality of the index but in most cases these extra
+iterations have diminishing returns.
+
+The default value is 50.
+
+***
+
+### numPartitions?
+
+```ts
+optional numPartitions: number;
+```
+
+The number of IVF partitions to create.
+
+This value should generally scale with the number of rows in the dataset.
+By default the number of partitions is the square root of the number of
+rows.
+
+If this value is too large then the first part of the search (picking the
+right partition) will be slow. If this value is too small then the second
+part of the search (searching within a partition) will be slow.
+
+***
+
+### sampleRate?
+
+```ts
+optional sampleRate: number;
+```
+
+The number of vectors, per partition, to sample when training IVF kmeans.
+
+When an IVF FLAT index is trained, we need to calculate partitions. These are groups
+of vectors that are similar to each other. To do this we use an algorithm called kmeans.
+
+Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
+random sample of the data. This parameter controls the size of the sample. The total
+number of vectors used to train the index is `sample_rate * num_partitions`.
+
+Increasing this value might improve the quality of the index but in most cases the
+default should be sufficient.
+
+The default value is 256.
````
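These options belong to the TypeScript API. As a rough cross-reference, here is a hedged Python sketch of creating an equivalent `IVF_FLAT` index; the `index_type`, `num_partitions`, `max_iterations`, and `sample_rate` keyword names are assumptions based on the Python `create_index` signature, so verify them against your installed lancedb version:

```python
import lancedb
import numpy as np

db = lancedb.connect("data/ivf_flat_demo")  # illustrative local path
tbl = db.create_table(
    "vectors",
    data=[{"id": i, "vector": np.random.random(128).tolist()} for i in range(4096)],
)

# Mirrors IvfFlatOptions: distanceType -> metric, numPartitions -> num_partitions, etc.
tbl.create_index(
    metric="l2",            # default distance type, per the docs above
    index_type="IVF_FLAT",
    num_partitions=64,      # ~sqrt(num_rows) is the documented default heuristic
    max_iterations=50,      # kmeans iterations (documented default)
    sample_rate=256,        # vectors sampled per partition for training (documented default)
)
```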
```diff
@@ -31,13 +31,13 @@ The following distance types are available:
 
 "l2" - Euclidean distance. This is a very common distance metric that
 accounts for both magnitude and direction when determining the distance
-between vectors. L2 distance has a range of [0, ∞).
+between vectors. l2 distance has a range of [0, ∞).
 
 "cosine" - Cosine distance. Cosine distance is a distance metric
 calculated from the cosine similarity between two vectors. Cosine
 similarity is a measure of similarity between two non-zero vectors of an
 inner product space. It is defined to equal the cosine of the angle
-between them. Unlike L2, the cosine distance is not affected by the
+between them. Unlike l2, the cosine distance is not affected by the
 magnitude of the vectors. Cosine distance has a range of [0, 2].
 
 Note: the cosine distance is undefined when one (or both) of the vectors

@@ -46,7 +46,7 @@ never be returned from a vector search.
 
 "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
 distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
-L2 norm is 1), then dot distance is equivalent to the cosine distance.
+l2 norm is 1), then dot distance is equivalent to the cosine distance.
 
 ***
```
**docs/src/notebooks/Multivector_on_LanceDB.ipynb** (new file, 667 lines): file diff suppressed because one or more lines are too long.
```diff
@@ -59,8 +59,6 @@ is also an [asynchronous API client](#connections-asynchronous).
 
 ::: lancedb.embeddings.open_clip.OpenClipEmbeddings
 
-::: lancedb.embeddings.utils.with_embeddings
-
 ## Context
 
 ::: lancedb.context.contextualize
```
```diff
@@ -15,7 +15,7 @@ Currently, LanceDB supports the following metrics:
 
 | Metric    | Description                                                                  |
 | --------- | ---------------------------------------------------------------------------- |
-| `l2`      | [Euclidean / L2 distance](https://en.wikipedia.org/wiki/Euclidean_distance)  |
+| `l2`      | [Euclidean / l2 distance](https://en.wikipedia.org/wiki/Euclidean_distance)  |
 | `cosine`  | [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity)         |
 | `dot`     | [Dot Production](https://en.wikipedia.org/wiki/Dot_product)                  |
 | `hamming` | [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance)           |
```
````diff
@@ -138,6 +138,19 @@ LanceDB supports binary vectors as a data type, and has the ability to search bi
     --8<-- "python/python/tests/docs/test_binary_vector.py:async_binary_vector"
     ```
 
+=== "TypeScript"
+
+    ```ts
+    --8<-- "nodejs/examples/search.test.ts:import"
+
+    --8<-- "nodejs/examples/search.test.ts:import_bin_util"
+
+    --8<-- "nodejs/examples/search.test.ts:ingest_binary_data"
+
+    --8<-- "nodejs/examples/search.test.ts:search_binary_data"
+    ```
+
 
 ## Multivector type
 
 LanceDB supports multivector type, this is useful when you have multiple vectors for a single item (e.g. with ColBert and ColPali).
````
```diff
@@ -7,7 +7,7 @@ performed on the top-k results returned by the vector search. However, pre-filte
 option that performs the filter prior to vector search. This can be useful to narrow down
 the search space of a very large dataset to reduce query latency.
 
-Note that both pre-filtering and post-filtering can yield false positives. For pre-filtering, if the filter is too selective, it might eliminate relevant items that the vector search would have otherwise identified as a good match. In this case, increasing `nprobes` parameter will help reduce such false positives. It is recommended to set `use_index=false` if you know that the filter is highly selective.
+Note that both pre-filtering and post-filtering can yield false positives. For pre-filtering, if the filter is too selective, it might eliminate relevant items that the vector search would have otherwise identified as a good match. In this case, increasing `nprobes` parameter will help reduce such false positives. It is recommended to call `bypass_vector_index()` if you know that the filter is highly selective.
 
 Similarly, a highly selective post-filter can lead to false positives. Increasing both `nprobes` and `refine_factor` can mitigate this issue. When deciding between pre-filtering and post-filtering, pre-filtering is generally the safer choice if you're uncertain.
```
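The doc change above swaps `use_index=false` for the `bypass_vector_index()` query-builder method. A minimal Python sketch of the recommended pattern follows; the table and column names are illustrative, and the `where(..., prefilter=True)` form reflects the sync API's signature, so confirm it against your version:

```python
import lancedb

db = lancedb.connect("data/filter_demo")  # illustrative local path
tbl = db.open_table("docs")

results = (
    tbl.search([0.1] * 128)                       # query vector
    .where("category = 'news'", prefilter=True)   # filter before the vector search
    .bypass_vector_index()                        # brute-force scan: exact results for highly selective filters
    .limit(10)
    .to_pandas()
)
```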
```diff
@@ -8,6 +8,10 @@ For trouble shooting, the best place to ask is in our Discord, under the relevan
 language channel. By asking in the language-specific channel, it makes it more
 likely that someone who knows the answer will see your question.
 
+## Common issues
+
+* Multiprocessing with `fork` is not supported. You should use `spawn` instead.
+
 ## Enabling logging
 
 To provide more information, especially for LanceDB Cloud related issues, enable
```
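A short Python illustration of the `spawn` recommendation added above (a generic sketch, not from the LanceDB docs; the table and paths are illustrative):

```python
import multiprocessing as mp

def worker(uri: str) -> int:
    import lancedb
    db = lancedb.connect(uri)  # each process opens its own connection
    return db.open_table("docs").count_rows()

if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # avoid fork: it can deadlock native threads
    with ctx.Pool(2) as pool:
        print(pool.map(worker, ["data/demo", "data/demo"]))
```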
**java/.gitignore** (vendored, new file, 3 lines)
```diff
@@ -0,0 +1,3 @@
+*.iml
+.java-version
+
```
```diff
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.18.0-final.0</version>
+    <version>0.18.2-beta.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
```
```diff
@@ -6,7 +6,7 @@
 
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.18.0-final.0</version>
+  <version>0.18.2-beta.1</version>
   <packaging>pom</packaging>
 
   <name>LanceDB Parent</name>
```
@@ -166,7 +166,6 @@
        <artifactId>maven-surefire-plugin</artifactId>
        <version>3.2.5</version>
        <configuration>
          <argLine>--add-opens=java.base/java.nio=ALL-UNNAMED</argLine>
          <forkNode
            implementation="org.apache.maven.plugin.surefire.extensions.SurefireForkNodeFactory" />
          <useSystemClassLoader>false</useSystemClassLoader>

86 node/package-lock.json generated
@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.18.0",
+      "version": "0.18.2-beta.0",
       "cpu": [
         "x64",
         "arm64"
@@ -52,14 +52,11 @@
       "uuid": "^9.0.0"
     },
     "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.18.0",
-      "@lancedb/vectordb-darwin-x64": "0.18.0",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.18.0",
-      "@lancedb/vectordb-linux-arm64-musl": "0.18.0",
-      "@lancedb/vectordb-linux-x64-gnu": "0.18.0",
-      "@lancedb/vectordb-linux-x64-musl": "0.18.0",
-      "@lancedb/vectordb-win32-arm64-msvc": "0.18.0",
-      "@lancedb/vectordb-win32-x64-msvc": "0.18.0"
+      "@lancedb/vectordb-darwin-arm64": "0.18.2-beta.0",
+      "@lancedb/vectordb-darwin-x64": "0.18.2-beta.0",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.18.2-beta.0",
+      "@lancedb/vectordb-linux-x64-gnu": "0.18.2-beta.0",
+      "@lancedb/vectordb-win32-x64-msvc": "0.18.2-beta.0"
     },
     "peerDependencies": {
       "@apache-arrow/ts": "^14.0.2",
@@ -330,9 +327,9 @@
     }
   },
   "node_modules/@lancedb/vectordb-darwin-arm64": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.18.0.tgz",
-    "integrity": "sha512-ormNCmny1j64aSZRrZeUQ1Zs8cOFKrW14NgTmW3AehDuru+Ep+8AriHA5Pmyi6raBOZfNzDSiZs/LTzzyVaa7g==",
+    "version": "0.18.2-beta.0",
+    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.18.2-beta.0.tgz",
+    "integrity": "sha512-FzIcElkS6R5I5kU1S5m7yLVTB1Duv1XcmZQtVmYl/JjNlfxS1WTtMzdzMqSBFohDcgU2Tkc5+1FpK1B94dUUbg==",
     "cpu": [
       "arm64"
     ],
@@ -343,9 +340,9 @@
     ]
   },
   "node_modules/@lancedb/vectordb-darwin-x64": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.18.0.tgz",
-    "integrity": "sha512-S4skQ1RXXQJciq40s84Kyy7v3YC+nao8pX4xUyxDcKRx+90Qg9eH+tehs6XLN1IjrQT/9CWKaE5wxZmv6Oys4g==",
+    "version": "0.18.2-beta.0",
+    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.18.2-beta.0.tgz",
+    "integrity": "sha512-jv+XludfLNBDm1DjdqyghwDMtd4E+ygwycQpkpK72wyZSh6Qytrgq+4dNi/zCZ3UChFLbKbIxrVxv9yENQn2Pg==",
     "cpu": [
       "x64"
     ],
@@ -356,22 +353,9 @@
     ]
   },
   "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.18.0.tgz",
-    "integrity": "sha512-1txr4tasVdxy321/4Fw8GJPjzrf84F02L9ffN8JebHmmR0S8uk2MKf2WsyLaSVRPd4YHIvvf3qmG0RGaUsb2sw==",
-    "cpu": [
-      "arm64"
-    ],
-    "license": "Apache-2.0",
-    "optional": true,
-    "os": [
-      "linux"
-    ]
-  },
-  "node_modules/@lancedb/vectordb-linux-arm64-musl": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.18.0.tgz",
-    "integrity": "sha512-8xS1xaoJeFDx6WmDBcfueWvIbdNX/ptQXfoC7hYICwNHizjlyt4O3Nxz8uG9URMF1y9saUYUditIHLzLVZc76g==",
+    "version": "0.18.2-beta.0",
+    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.18.2-beta.0.tgz",
+    "integrity": "sha512-8/fBpbNYhhpetf/pZv0DyPnQkeAbsiICMyCoRiNu5auvQK4AsGF1XvLWrDi68u9F0GysBKvuatYuGqa/yh+Anw==",
     "cpu": [
       "arm64"
     ],
@@ -382,9 +366,9 @@
     ]
   },
   "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.18.0.tgz",
-    "integrity": "sha512-8XUc2UnEV3awv0DGJS5gRA7yTkicX6oPN7GudXXxycCKL33FJ2ah7hkeDia9Bhk9MmvTonvsEDvUSqnglcpqfA==",
+    "version": "0.18.2-beta.0",
+    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.18.2-beta.0.tgz",
+    "integrity": "sha512-7a1Kc/2V2ff4HlLzXyXVdK0Z0VIFUt50v2SBRdlcycJ0NLW9ZqV+9UjB/NAOwMXVgYd7d3rKjACGkQzkpvcyeg==",
     "cpu": [
       "x64"
     ],
@@ -394,36 +378,10 @@
       "linux"
     ]
   },
-  "node_modules/@lancedb/vectordb-linux-x64-musl": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.18.0.tgz",
-    "integrity": "sha512-LV7TuWgLcL82Wdq+EH2Xs3+apqeLohwYLlVIauVAwKEHvdwyNxTOW9TaNLvHXcbylIh7agl2xXvASCNhYZAyzA==",
-    "cpu": [
-      "x64"
-    ],
-    "license": "Apache-2.0",
-    "optional": true,
-    "os": [
-      "linux"
-    ]
-  },
-  "node_modules/@lancedb/vectordb-win32-arm64-msvc": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.18.0.tgz",
-    "integrity": "sha512-kxdCnKfvnuDKoKZRUBbreMBpimHb+k9/pFR48GN6JSrIuzUDx5G1VjHKBmaFhbveZCOBjjtYlg/8ohnWQHZfeA==",
-    "cpu": [
-      "arm64"
-    ],
-    "license": "Apache-2.0",
-    "optional": true,
-    "os": [
-      "win32"
-    ]
-  },
   "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-    "version": "0.18.0",
-    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.18.0.tgz",
-    "integrity": "sha512-uAE80q50cAp4gHoGvclxJqZGqj3/9oN9kz8iXgNIxiPngqnN01kVyaj4ulm4Qk/nauWUhHJ3tjTh/+CpkhSc2Q==",
+    "version": "0.18.2-beta.0",
+    "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.18.2-beta.0.tgz",
+    "integrity": "sha512-EeCiSf2RtJMESnkIca28GI6rAStYj2q9sVIyNCXpmIZSkJVpfQ3iswHGAbHrEfaPl0J1Re9cnRHLLuqkumwiIQ==",
     "cpu": [
       "x64"
     ],

@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "description": " Serverless, low-latency vector database for AI applications",
   "private": false,
   "main": "dist/index.js",
@@ -85,20 +85,14 @@
     "aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
     "x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
     "aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
-    "x86_64-unknown-linux-musl": "@lancedb/vectordb-linux-x64-musl",
-    "aarch64-unknown-linux-musl": "@lancedb/vectordb-linux-arm64-musl",
-    "x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc",
-    "aarch64-pc-windows-msvc": "@lancedb/vectordb-win32-arm64-msvc"
+    "x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc"
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.18.0",
-    "@lancedb/vectordb-darwin-arm64": "0.18.0",
-    "@lancedb/vectordb-linux-x64-gnu": "0.18.0",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.18.0",
-    "@lancedb/vectordb-linux-x64-musl": "0.18.0",
-    "@lancedb/vectordb-linux-arm64-musl": "0.18.0",
-    "@lancedb/vectordb-win32-x64-msvc": "0.18.0",
-    "@lancedb/vectordb-win32-arm64-msvc": "0.18.0"
+    "@lancedb/vectordb-darwin-x64": "0.18.2-beta.1",
+    "@lancedb/vectordb-darwin-arm64": "0.18.2-beta.1",
+    "@lancedb/vectordb-linux-x64-gnu": "0.18.2-beta.1",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.18.2-beta.1",
+    "@lancedb/vectordb-win32-x64-msvc": "0.18.2-beta.1"
   }
 }

@@ -1299,7 +1299,7 @@ export interface IvfPQIndexConfig {
   index_name?: string

   /**
-   * Metric type, L2 or Cosine
+   * Metric type, l2 or Cosine
   */
   metric_type?: MetricType

@@ -22,3 +22,4 @@ build.rs
 jest.config.js
 tsconfig.json
 typedoc.json
+typedoc_post_process.js

@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.18.0"
+version = "0.18.2-beta.1"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
@@ -18,7 +18,7 @@ arrow-array.workspace = true
 arrow-schema.workspace = true
 env_logger.workspace = true
 futures.workspace = true
-lancedb = { path = "../rust/lancedb", features = ["remote"] }
+lancedb = { path = "../rust/lancedb" }
 napi = { version = "2.16.8", default-features = false, features = [
   "napi9",
   "async"
@@ -30,3 +30,8 @@ log.workspace = true

 [build-dependencies]
 napi-build = "2.1"
+
+[features]
+default = ["remote"]
+fp16kernels = ["lancedb/fp16kernels"]
+remote = ["lancedb/remote"]

@@ -11,11 +11,9 @@ npm install @lancedb/lancedb
 This will download the appropriate native library for your platform. We currently
 support:

-- Linux (x86_64 and aarch64)
+- Linux (x86_64 and aarch64 on glibc and musl)
 - MacOS (Intel and ARM/M1/M2)
-- Windows (x86_64 only)
-
-We do not yet support musl-based Linux (such as Alpine Linux) or aarch64 Windows.
+- Windows (x86_64 and aarch64)

 ## Usage

@@ -21,9 +21,11 @@ import {
  Int64,
  List,
  Schema,
  Uint8,
  Utf8,
  makeArrowTable,
} from "../lancedb/arrow";
import * as arrow from "../lancedb/arrow";
import {
  EmbeddingFunction,
  LanceSchema,
@@ -278,6 +280,15 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
    expect(res.getChild("y")?.toJSON()).toEqual([2, null, null, null]);
    expect(res.getChild("z")?.toJSON()).toEqual([null, null, 3n, 5n]);
  });

  it("should handle null vectors at end of data", async () => {
    // https://github.com/lancedb/lancedb/issues/2240
    const data = [{ vector: [1, 2, 3] }, { vector: null }];
    const db = await connect("memory://");

    const table = await db.createTable("my_table", data);
    expect(await table.countRows()).toEqual(2);
  });
  },
);

@@ -460,6 +471,8 @@ describe("When creating an index", () => {
      indexType: "IvfPq",
      columns: ["vec"],
    });
    const stats = await tbl.indexStats("vec_idx");
    expect(stats?.loss).toBeDefined();

    // Search without specifying the column
    let rst = await tbl
@@ -720,6 +733,7 @@ describe("When creating an index", () => {
    expect(stats?.distanceType).toBeUndefined();
    expect(stats?.indexType).toEqual("BTREE");
    expect(stats?.numIndices).toEqual(1);
    expect(stats?.loss).toBeUndefined();
  });

  test("when getting stats on non-existent index", async () => {
@@ -727,6 +741,38 @@ describe("When creating an index", () => {
    expect(stats).toBeUndefined();
  });

  test("create ivf_flat with binary vectors", async () => {
    const db = await connect(tmpDir.name);
    const binarySchema = new Schema([
      new Field("id", new Int32(), true),
      new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))),
    ]);
    const tbl = await db.createTable(
      "binary",
      makeArrowTable(
        Array(300)
          .fill(1)
          .map((_, i) => ({
            id: i,
            vec: Array(32)
              .fill(1)
              .map(() => Math.floor(Math.random() * 255)),
          })),
        { schema: binarySchema },
      ),
    );
    await tbl.createIndex("vec", {
      config: Index.ivfFlat({ numPartitions: 10, distanceType: "hamming" }),
    });

    // query with binary vectors
    const queryVec = Array(32)
      .fill(1)
      .map(() => Math.floor(Math.random() * 255));
    const rst = await tbl.query().limit(5).nearestTo(queryVec).toArrow();
    expect(rst.numRows).toBe(5);
  });

  // TODO: Move this test to the query API test (making sure we can reject queries
  // when the dimension is incorrect)
  test("two columns with different dimensions", async () => {
@@ -920,6 +966,93 @@ describe("schema evolution", function () {
      new Field("price", new Float64(), true),
    ]);
    expect(await table.schema()).toEqual(expectedSchema2);

    await table.alterColumns([
      {
        path: "vector",
        dataType: new FixedSizeList(2, new Field("item", new Float64(), true)),
      },
    ]);
    const expectedSchema3 = new Schema([
      new Field("new_id", new Int32(), true),
      new Field(
        "vector",
        new FixedSizeList(2, new Field("item", new Float64(), true)),
        true,
      ),
      new Field("price", new Float64(), true),
    ]);
    expect(await table.schema()).toEqual(expectedSchema3);
  });

  it("can cast to various types", async function () {
    const con = await connect(tmpDir.name);

    // integers
    const intTypes = [
      new arrow.Int8(),
      new arrow.Int16(),
      new arrow.Int32(),
      new arrow.Int64(),
      new arrow.Uint8(),
      new arrow.Uint16(),
      new arrow.Uint32(),
      new arrow.Uint64(),
    ];
    const tableInts = await con.createTable("ints", [{ id: 1n }], {
      schema: new Schema([new Field("id", new Int64(), true)]),
    });
    for (const intType of intTypes) {
      await tableInts.alterColumns([{ path: "id", dataType: intType }]);
      const schema = new Schema([new Field("id", intType, true)]);
      expect(await tableInts.schema()).toEqual(schema);
    }

    // floats
    const floatTypes = [
      new arrow.Float16(),
      new arrow.Float32(),
      new arrow.Float64(),
    ];
    const tableFloats = await con.createTable("floats", [{ val: 2.1 }], {
      schema: new Schema([new Field("val", new Float32(), true)]),
    });
    for (const floatType of floatTypes) {
      await tableFloats.alterColumns([{ path: "val", dataType: floatType }]);
      const schema = new Schema([new Field("val", floatType, true)]);
      expect(await tableFloats.schema()).toEqual(schema);
    }

    // Lists of floats
    const listTypes = [
      new arrow.List(new arrow.Field("item", new arrow.Float32(), true)),
      new arrow.FixedSizeList(
        2,
        new arrow.Field("item", new arrow.Float64(), true),
      ),
      new arrow.FixedSizeList(
        2,
        new arrow.Field("item", new arrow.Float16(), true),
      ),
      new arrow.FixedSizeList(
        2,
        new arrow.Field("item", new arrow.Float32(), true),
      ),
    ];
    const tableLists = await con.createTable("lists", [{ val: [2.1, 3.2] }], {
      schema: new Schema([
        new Field(
          "val",
          new FixedSizeList(2, new arrow.Field("item", new Float32())),
          true,
        ),
      ]),
    });
    for (const listType of listTypes) {
      await tableLists.alterColumns([{ path: "val", dataType: listType }]);
      const schema = new Schema([new Field("val", listType, true)]);
      expect(await tableLists.schema()).toEqual(schema);
    }
  });

  it("can drop a column from the schema", async function () {

@@ -132,6 +132,17 @@ test("basic table examples", async () => {
    },
  ]);
  // --8<-- [end:alter_columns]
  // --8<-- [start:alter_columns_vector]
  await tbl.alterColumns([
    {
      path: "vector",
      dataType: new arrow.FixedSizeList(
        2,
        new arrow.Field("item", new arrow.Float16(), false),
      ),
    },
  ]);
  // --8<-- [end:alter_columns_vector]
  // --8<-- [start:drop_columns]
  await tbl.dropColumns(["dbl_price"]);
  // --8<-- [end:drop_columns]

@@ -4,9 +4,12 @@ import { expect, test } from "@jest/globals";
 // --8<-- [start:import]
 import * as lancedb from "@lancedb/lancedb";
 // --8<-- [end:import]
+// --8<-- [start:import_bin_util]
+import { Field, FixedSizeList, Int32, Schema, Uint8 } from "apache-arrow";
+// --8<-- [end:import_bin_util]
 import { withTempDirectory } from "./util.ts";

-test("full text search", async () => {
+test("vector search", async () => {
   await withTempDirectory(async (databaseDir) => {
     {
       const db = await lancedb.connect(databaseDir);
@@ -14,8 +17,6 @@ test("full text search", async () => {
       const data = Array.from({ length: 10_000 }, (_, i) => ({
         vector: Array(128).fill(i),
         id: `${i}`,
-        content: "",
-        longId: `${i}`,
       }));

       await db.createTable("my_vectors", data);
@@ -52,5 +53,41 @@ test("full text search", async () => {
       expect(r.distance).toBeGreaterThanOrEqual(0.1);
       expect(r.distance).toBeLessThan(0.2);
     }

    {
      // --8<-- [start:ingest_binary_data]
      const schema = new Schema([
        new Field("id", new Int32(), true),
        new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))),
      ]);
      const data = lancedb.makeArrowTable(
        Array(1_000)
          .fill(0)
          .map((_, i) => ({
            // the 256 bits would be stored in 32 bytes;
            // if your data is already in this format, you can skip the packBits step
            id: i,
            vec: lancedb.packBits(Array(256).fill(i % 2)),
          })),
        { schema: schema },
      );

      const tbl = await db.createTable("binary_table", data);
      await tbl.createIndex("vec", {
        config: lancedb.Index.ivfFlat({
          numPartitions: 10,
          distanceType: "hamming",
        }),
      });
      // --8<-- [end:ingest_binary_data]

      // --8<-- [start:search_binary_data]
      const query = Array(32)
        .fill(1)
        .map(() => Math.floor(Math.random() * 255));
      const results = await tbl.query().nearestTo(query).limit(10).toArrow();
      // --8<-- [end:search_binary_data]
      expect(results.numRows).toBe(10);
    }
  });
});

@@ -8,7 +8,11 @@ import {
  Bool,
  BufferType,
  DataType,
  DateUnit,
  Date_,
  Decimal,
  Dictionary,
  Duration,
  Field,
  FixedSizeBinary,
  FixedSizeList,
@@ -21,19 +25,22 @@ import {
  LargeBinary,
  List,
  Null,
  Precision,
  RecordBatch,
  RecordBatchFileReader,
  RecordBatchFileWriter,
  RecordBatchStreamWriter,
  Schema,
  Struct,
  Timestamp,
  Type,
  Utf8,
  Vector,
  makeVector as arrowMakeVector,
+  vectorFromArray as badVectorFromArray,
  makeBuilder,
  makeData,
  makeTable,
-  vectorFromArray,
} from "apache-arrow";
import { Buffers } from "apache-arrow/data";
import { type EmbeddingFunction } from "./embedding/embedding_function";
@@ -179,6 +186,21 @@ export class VectorColumnOptions {
  }
}

// biome-ignore lint/suspicious/noExplicitAny: skip
function vectorFromArray(data: any, type?: DataType) {
  // Workaround for: https://github.com/apache/arrow/issues/45862
  // If FSL type with float
  if (DataType.isFixedSizeList(type) && DataType.isFloat(type.valueType)) {
    const extendedData = [...data, new Array(type.listSize).fill(0.0)];
    const array = badVectorFromArray(extendedData, type);
    return array.slice(0, data.length);
  } else if (type === undefined) {
    return badVectorFromArray(data);
  } else {
    return badVectorFromArray(data, type);
  }
}

/** Options to control the makeArrowTable call. */
export class MakeArrowTableOptions {
  /*
@@ -1170,3 +1192,137 @@ function validateSchemaEmbeddings(

  return new Schema(fields, schema.metadata);
}

interface JsonDataType {
  type: string;
  fields?: JsonField[];
  length?: number;
}

interface JsonField {
  name: string;
  type: JsonDataType;
  nullable: boolean;
  metadata: Map<string, string>;
}

// Matches format of https://github.com/lancedb/lance/blob/main/rust/lance/src/arrow/json.rs
export function dataTypeToJson(dataType: DataType): JsonDataType {
  switch (dataType.typeId) {
    // For primitives, matches https://github.com/lancedb/lance/blob/e12bb9eff2a52f753668d4b62c52e4d72b10d294/rust/lance-core/src/datatypes.rs#L185
    case Type.Null:
      return { type: "null" };
    case Type.Bool:
      return { type: "bool" };
    case Type.Int8:
      return { type: "int8" };
    case Type.Int16:
      return { type: "int16" };
    case Type.Int32:
      return { type: "int32" };
    case Type.Int64:
      return { type: "int64" };
    case Type.Uint8:
      return { type: "uint8" };
    case Type.Uint16:
      return { type: "uint16" };
    case Type.Uint32:
      return { type: "uint32" };
    case Type.Uint64:
      return { type: "uint64" };
    case Type.Int: {
      const bitWidth = (dataType as Int).bitWidth;
      const signed = (dataType as Int).isSigned;
      const prefix = signed ? "" : "u";
      return { type: `${prefix}int${bitWidth}` };
    }
    case Type.Float: {
      switch ((dataType as Float).precision) {
        case Precision.HALF:
          return { type: "halffloat" };
        case Precision.SINGLE:
          return { type: "float" };
        case Precision.DOUBLE:
          return { type: "double" };
      }
      throw Error("Unsupported float precision");
    }
    case Type.Float16:
      return { type: "halffloat" };
    case Type.Float32:
      return { type: "float" };
    case Type.Float64:
      return { type: "double" };
    case Type.Utf8:
      return { type: "string" };
    case Type.Binary:
      return { type: "binary" };
    case Type.LargeUtf8:
      return { type: "large_string" };
    case Type.LargeBinary:
      return { type: "large_binary" };
    case Type.List:
      return {
        type: "list",
        fields: [fieldToJson((dataType as List).children[0])],
      };
    case Type.FixedSizeList: {
      const fixedSizeList = dataType as FixedSizeList;
      return {
        type: "fixed_size_list",
        fields: [fieldToJson(fixedSizeList.children[0])],
        length: fixedSizeList.listSize,
      };
    }
    case Type.Struct:
      return {
        type: "struct",
        fields: (dataType as Struct).children.map(fieldToJson),
      };
    case Type.Date: {
      const unit = (dataType as Date_).unit;
      return {
        type: unit === DateUnit.DAY ? "date32:day" : "date64:ms",
      };
    }
    case Type.Timestamp: {
      const timestamp = dataType as Timestamp;
      const timezone = timestamp.timezone || "-";
      return {
        type: `timestamp:${timestamp.unit}:${timezone}`,
      };
    }
    case Type.Decimal: {
      const decimal = dataType as Decimal;
      return {
        type: `decimal:${decimal.bitWidth}:${decimal.precision}:${decimal.scale}`,
      };
    }
    case Type.Duration: {
      const duration = dataType as Duration;
      return { type: `duration:${duration.unit}` };
    }
    case Type.FixedSizeBinary: {
      const byteWidth = (dataType as FixedSizeBinary).byteWidth;
      return { type: `fixed_size_binary:${byteWidth}` };
    }
    case Type.Dictionary: {
      const dict = dataType as Dictionary;
      const indexType = dataTypeToJson(dict.indices);
      const valueType = dataTypeToJson(dict.valueType);
      return {
        type: `dict:${valueType.type}:${indexType.type}:false`,
      };
    }
  }
  throw new Error("Unsupported data type");
}

function fieldToJson(field: Field): JsonField {
  return {
    name: field.name,
    type: dataTypeToJson(field.type),
    nullable: field.nullable,
    metadata: field.metadata,
  };
}

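For orientation, a hedged sketch of the JSON shape this serialization produces for a typical vector column type; the expected output shown in comments is inferred from the `switch` cases above, not taken verbatim from the diff.

```ts
import { Field, FixedSizeList, Float32 } from "apache-arrow";

// A 2-wide fixed-size list of float32, i.e. a small vector column.
const fsl = new FixedSizeList(2, new Field("item", new Float32(), true));

// dataTypeToJson(fsl) should yield something like:
// {
//   type: "fixed_size_list",
//   fields: [
//     { name: "item", type: { type: "float" }, nullable: true, metadata: ... },
//   ],
//   length: 2,
// }
```
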
@@ -14,7 +14,6 @@ import {

export {
  AddColumnsSql,
-  ColumnAlteration,
  ConnectionOptions,
  IndexStatistics,
  IndexConfig,
@@ -54,6 +53,7 @@ export {
  Index,
  IndexOptions,
  IvfPqOptions,
+  IvfFlatOptions,
  HnswPqOptions,
  HnswSqOptions,
  FtsOptions,
@@ -65,6 +65,7 @@ export {
  UpdateOptions,
  OptimizeOptions,
  Version,
+  ColumnAlteration,
} from "./table";

export { MergeInsertBuilder } from "./merge";
@@ -79,7 +80,7 @@ export {
  DataLike,
  IntoVector,
} from "./arrow";
-export { IntoSql } from "./util";
+export { IntoSql, packBits } from "./util";

/**
 * Connect to a LanceDB instance at the given URI.

@@ -62,13 +62,13 @@ export interface IvfPqOptions {
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
-   * between vectors. L2 distance has a range of [0, ∞).
+   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
-   * between them. Unlike L2, the cosine distance is not affected by the
+   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * Note: the cosine distance is undefined when one (or both) of the vectors
@@ -77,7 +77,7 @@ export interface IvfPqOptions {
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
-   * L2 norm is 1), then dot distance is equivalent to the cosine distance.
+   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   */
  distanceType?: "l2" | "cosine" | "dot";

@@ -125,18 +125,18 @@ export interface HnswPqOptions {
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
-   * between vectors. L2 distance has a range of [0, ∞).
+   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
-   * between them. Unlike L2, the cosine distance is not affected by the
+   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
-   * L2 norm is 1), then dot distance is equivalent to the cosine distance.
+   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   */
  distanceType?: "l2" | "cosine" | "dot";

@@ -241,18 +241,18 @@ export interface HnswSqOptions {
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
-   * between vectors. L2 distance has a range of [0, ∞).
+   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
-   * between them. Unlike L2, the cosine distance is not affected by the
+   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
-   * L2 norm is 1), then dot distance is equivalent to the cosine distance.
+   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   */
  distanceType?: "l2" | "cosine" | "dot";

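A hedged sketch of keeping the training and search distance types in sync, since an index trained with one metric must be searched with the same metric (the table, column, and vectors are hypothetical, and `distanceType()` on the query builder is assumed to mirror the index option):

```ts
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("data/sample-lancedb"); // hypothetical path
const tbl = await db.openTable("docs"); // hypothetical table

// Train the index with cosine distance...
await tbl.createIndex("vec", {
  config: lancedb.Index.ivfPq({ distanceType: "cosine" }),
});

// ...and search with the same distance type; mixing "l2" here with a
// cosine-trained index would yield inaccurate results.
const hits = await tbl
  .query()
  .nearestTo([0.1, 0.2, 0.3])
  .distanceType("cosine")
  .limit(5)
  .toArrow();
```
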
@@ -327,6 +327,94 @@ export interface HnswSqOptions {
  efConstruction?: number;
}

/**
 * Options to create an `IVF_FLAT` index
 */
export interface IvfFlatOptions {
  /**
   * The number of IVF partitions to create.
   *
   * This value should generally scale with the number of rows in the dataset.
   * By default the number of partitions is the square root of the number of
   * rows.
   *
   * If this value is too large then the first part of the search (picking the
   * right partition) will be slow. If this value is too small then the second
   * part of the search (searching within a partition) will be slow.
   */
  numPartitions?: number;

  /**
   * Distance type to use to build the index.
   *
   * Default value is "l2".
   *
   * This is used when training the index to calculate the IVF partitions
   * (vectors are grouped in partitions with similar vectors according to this
   * distance type).
   *
   * The distance type used to train an index MUST match the distance type used
   * to search the index. Failure to do so will yield inaccurate results.
   *
   * The following distance types are available:
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * Note: the cosine distance is undefined when one (or both) of the vectors
   * are all zeros (there is no direction). These vectors are invalid and may
   * never be returned from a vector search.
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   *
   * "hamming" - Hamming distance. Hamming distance is a distance metric
   * calculated from the number of bits that are different between two vectors.
   * Hamming distance has a range of [0, dimension]. Note that the hamming distance
   * is only valid for binary vectors.
   */
  distanceType?: "l2" | "cosine" | "dot" | "hamming";

  /**
   * Max iterations to train IVF kmeans.
   *
   * When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter
   * controls how many iterations of kmeans to run.
   *
   * Increasing this might improve the quality of the index but in most cases these extra
   * iterations have diminishing returns.
   *
   * The default value is 50.
   */
  maxIterations?: number;

  /**
   * The number of vectors, per partition, to sample when training IVF kmeans.
   *
   * When an IVF FLAT index is trained, we need to calculate partitions. These are groups
   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
   *
   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
   * random sample of the data. This parameter controls the size of the sample. The total
   * number of vectors used to train the index is `sample_rate * num_partitions`.
   *
   * Increasing this value might improve the quality of the index but in most cases the
   * default should be sufficient.
   *
   * The default value is 256.
   */
  sampleRate?: number;
}

/**
 * Options to create a full text search index
 */
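Tying the `IvfFlatOptions` above together, a hedged usage sketch (the table and column names are hypothetical; the call shape mirrors the `ivf_flat` binary-vector test earlier in this diff):

```ts
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("data/sample-lancedb"); // hypothetical path
const tbl = await db.openTable("binary_table"); // hypothetical table

// IVF_FLAT keeps raw vectors in each partition (no PQ compression) and is
// the option documented above as supporting hamming distance for binary vectors.
await tbl.createIndex("vec", {
  config: lancedb.Index.ivfFlat({
    numPartitions: 10, // default: sqrt(num_rows)
    distanceType: "hamming", // must match the distance type used at query time
    maxIterations: 50, // kmeans training iterations (default 50)
    sampleRate: 256, // training sample size per partition (default 256)
  }),
});
```
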
@@ -426,6 +514,33 @@ export class Index {
    );
  }

  /**
   * Create an IvfFlat index
   *
   * This index groups vectors into partitions of similar vectors. Each partition keeps track of
   * a centroid which is the average value of all vectors in the group.
   *
   * During a query the centroids are compared with the query vector to find the closest
   * partitions. The vectors in these partitions are then searched to find
   * the closest vectors.
   *
   * The partitioning process is called IVF and the `num_partitions` parameter controls how
   * many groups to create.
   *
   * Note that training an IVF FLAT index on a large dataset is a slow operation and
   * currently is also a memory intensive operation.
   */
  static ivfFlat(options?: Partial<IvfFlatOptions>) {
    return new Index(
      LanceDbIndex.ivfFlat(
        options?.distanceType,
        options?.numPartitions,
        options?.maxIterations,
        options?.sampleRate,
      ),
    );
  }

  /**
   * Create a btree index
   *

@@ -4,8 +4,10 @@
import {
  Table as ArrowTable,
  Data,
  DataType,
  IntoVector,
  Schema,
  dataTypeToJson,
  fromDataToBuffer,
  tableFromIPC,
} from "./arrow";
@@ -15,13 +17,13 @@ import { IndexOptions } from "./indices";
import { MergeInsertBuilder } from "./merge";
import {
  AddColumnsSql,
-  ColumnAlteration,
  IndexConfig,
  IndexStatistics,
  OptimizeStats,
  Table as _NativeTable,
} from "./native";
import { Query, VectorQuery } from "./query";
+import { sanitizeType } from "./sanitize";
import { IntoSql, toSQL } from "./util";
export { IndexConfig } from "./native";

@@ -618,7 +620,27 @@ export class LocalTable extends Table {
  }

  async alterColumns(columnAlterations: ColumnAlteration[]): Promise<void> {
-    await this.inner.alterColumns(columnAlterations);
+    const processedAlterations = columnAlterations.map((alteration) => {
+      if (typeof alteration.dataType === "string") {
+        return {
+          ...alteration,
+          dataType: JSON.stringify({ type: alteration.dataType }),
+        };
+      } else if (alteration.dataType === undefined) {
+        return {
+          ...alteration,
+          dataType: undefined,
+        };
+      } else {
+        const dataType = sanitizeType(alteration.dataType);
+        return {
+          ...alteration,
+          dataType: JSON.stringify(dataTypeToJson(dataType)),
+        };
+      }
+    });
+
+    await this.inner.alterColumns(processedAlterations);
  }

  async dropColumns(columnNames: string[]): Promise<void> {
@@ -711,3 +733,38 @@ export class LocalTable extends Table {
    await this.inner.migrateManifestPathsV2();
  }
}

/**
 * A definition of a column alteration. The alteration changes the column at
 * `path` to have the new name `name`, to be nullable if `nullable` is true,
 * and to have the data type `data_type`. At least one of `rename` or `nullable`
 * must be provided.
 */
export interface ColumnAlteration {
  /**
   * The path to the column to alter. This is a dot-separated path to the column.
   * If it is a top-level column then it is just the name of the column. If it is
   * a nested column then it is the path to the column, e.g. "a.b.c" for a column
   * `c` nested inside a column `b` nested inside a column `a`.
   */
  path: string;
  /**
   * The new name of the column. If not provided then the name will not be changed.
   * This must be distinct from the names of all other columns in the table.
   */
  rename?: string;
  /**
   * A new data type for the column. If not provided then the data type will not be changed.
   * Changing data types is limited to casting to the same general type. For example, these
   * changes are valid:
   * * `int32` -> `int64` (integers)
   * * `double` -> `float` (floats)
   * * `string` -> `large_string` (strings)
   * But these changes are not:
   * * `int32` -> `double` (mix integers and floats)
   * * `string` -> `int32` (mix strings and integers)
   */
  dataType?: string | DataType;
  /** Set the new nullability. Note that a nullable column cannot be made non-nullable. */
  nullable?: boolean;
}

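A hedged sketch of how these alterations look at the call site, assuming an open table `tbl` and `import * as arrow from "apache-arrow"`; the casts mirror the ones exercised in the tests earlier in this diff:

```ts
// Rename one column and re-type two others in a single call. `dataType` may
// be a plain string for simple types or an arrow DataType for nested ones.
await tbl.alterColumns([
  { path: "id", rename: "item_id" },
  { path: "price", dataType: "double" }, // serialized as {"type":"double"}
  {
    path: "vector",
    dataType: new arrow.FixedSizeList(
      2,
      new arrow.Field("item", new arrow.Float16(), true),
    ),
  },
]);
```
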
@@ -35,6 +35,16 @@ export function toSQL(value: IntoSql): string {
  }
}

export function packBits(data: Array<number>): Array<number> {
  const packed = Array(data.length >> 3).fill(0);
  for (let i = 0; i < data.length; i++) {
    const byte = i >> 3;
    const bit = i & 7;
    packed[byte] |= data[i] << bit;
  }
  return packed;
}

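`packBits` packs one bit per input element, least-significant bit first within each byte (the `data.length >> 3` preallocation expects the bit count to be a multiple of 8, as with the 256-bit vectors in the examples above). A quick usage sketch; the import path follows the `packBits` re-export added in `index.ts` above:

```ts
import { packBits } from "@lancedb/lancedb";

// 8 bits -> 1 byte. Bit i lands at position (i & 7) of byte (i >> 3),
// so [1, 0, 0, 0, 0, 0, 0, 1] becomes 0b10000001 = 129.
console.log(packBits([1, 0, 0, 0, 0, 0, 0, 1])); // [129]

// A 256-bit binary vector packs into 32 bytes, ready for a
// FixedSizeList(32, Uint8) column like the one ingested above.
const vec = packBits(Array(256).fill(1)); // 32 bytes, each 255
```
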
export class TTLCache {
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  private readonly cache: Map<string, { value: any; expires: number }>;

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": [
     "win32"
   ],

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",

4 nodejs/package-lock.json generated
@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.18.0",
+  "version": "0.18.2-beta.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.18.0",
+      "version": "0.18.2-beta.0",
       "cpu": [
         "x64",
         "arm64"

@@ -11,7 +11,7 @@
     "ann"
   ],
   "private": false,
-  "version": "0.18.0",
+  "version": "0.18.2-beta.1",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
@@ -29,7 +29,6 @@
     "aarch64-apple-darwin",
     "x86_64-unknown-linux-gnu",
     "aarch64-unknown-linux-gnu",
     "x86_64-unknown-linux-musl",
     "aarch64-unknown-linux-musl",
     "x86_64-pc-windows-msvc",
     "aarch64-pc-windows-msvc"
@@ -74,8 +73,10 @@
     "artifacts": "napi artifacts",
     "build:debug": "napi build --platform --no-const-enum --dts ../lancedb/native.d.ts --js ../lancedb/native.js lancedb",
     "build:release": "napi build --platform --no-const-enum --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js dist/",
-    "build": "npm run build:debug && tsc -b && shx cp lancedb/native.d.ts dist/native.d.ts && shx cp lancedb/*.node dist/",
-    "build-release": "npm run build:release && tsc -b && shx cp lancedb/native.d.ts dist/native.d.ts",
+    "build": "npm run build:debug && npm run tsc && shx cp lancedb/*.node dist/",
+    "build-release": "npm run build:release && npm run tsc",
+    "tsc": "tsc -b",
+    "posttsc": "shx cp lancedb/native.d.ts dist/native.d.ts",
     "lint-ci": "biome ci .",
     "docs": "typedoc --plugin typedoc-plugin-markdown --treatWarningsAsErrors --out ../docs/src/js lancedb/index.ts",
     "postdocs": "node typedoc_post_process.js",

@@ -4,7 +4,9 @@

 use std::sync::Mutex;

 use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder};
-use lancedb::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder};
+use lancedb::index::vector::{
+    IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
+};
 use lancedb::index::Index as LanceDbIndex;
 use napi_derive::napi;

@@ -63,6 +65,32 @@ impl Index {
        })
    }

    #[napi(factory)]
    pub fn ivf_flat(
        distance_type: Option<String>,
        num_partitions: Option<u32>,
        max_iterations: Option<u32>,
        sample_rate: Option<u32>,
    ) -> napi::Result<Self> {
        let mut ivf_flat_builder = IvfFlatIndexBuilder::default();
        if let Some(distance_type) = distance_type {
            let distance_type = parse_distance_type(distance_type)?;
            ivf_flat_builder = ivf_flat_builder.distance_type(distance_type);
        }
        if let Some(num_partitions) = num_partitions {
            ivf_flat_builder = ivf_flat_builder.num_partitions(num_partitions);
        }
        if let Some(max_iterations) = max_iterations {
            ivf_flat_builder = ivf_flat_builder.max_iterations(max_iterations);
        }
        if let Some(sample_rate) = sample_rate {
            ivf_flat_builder = ivf_flat_builder.sample_rate(sample_rate);
        }
        Ok(Self {
            inner: Mutex::new(Some(LanceDbIndex::IvfFlat(ivf_flat_builder))),
        })
    }

    #[napi(factory)]
    pub fn btree() -> Self {
        Self {

@@ -498,6 +498,9 @@ pub struct IndexStatistics {
    pub distance_type: Option<String>,
    /// The number of parts this index is split into.
    pub num_indices: Option<u32>,
+    /// The KMeans loss value of the index,
+    /// it is only present for vector indices.
+    pub loss: Option<f64>,
}
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
    fn from(value: lancedb::index::IndexStatistics) -> Self {
@@ -507,6 +510,7 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
            index_type: value.index_type.to_string(),
            distance_type: value.distance_type.map(|d| d.to_string()),
            num_indices: value.num_indices,
+            loss: value.loss,
        }
    }
}

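On the TypeScript side the new field surfaces as the optional `loss` on the object returned by `indexStats` (as the test earlier in this diff exercises). A small hedged sketch, assuming a table `tbl` with a vector index named `vec_idx`:

```ts
const stats = await tbl.indexStats("vec_idx");
// `loss` is only present for vector indices; it is the kmeans training loss,
// so lower values generally mean tighter IVF partitions.
if (stats?.loss !== undefined) {
  console.log(`IVF kmeans loss: ${stats.loss}`);
}
```
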
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.21.1"
+current_version = "0.21.2"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.21.1"
+version = "0.21.2"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -33,10 +33,6 @@ pyo3-build-config = { version = "0.23", features = [
 ] }

 [features]
-default = ["default-tls", "remote"]
+default = ["remote"]
 fp16kernels = ["lancedb/fp16kernels"]
 remote = ["lancedb/remote"]
-# TLS
-default-tls = ["lancedb/default-tls"]
-native-tls = ["lancedb/native-tls"]
-rustls-tls = ["lancedb/rustls-tls"]

@@ -9,7 +9,6 @@ dependencies = [
     "pydantic>=1.10",
     "packaging",
     "overrides>=0.7",
-    "pylance>=0.23.2",
 ]
 description = "lancedb"
 authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
@@ -55,6 +54,7 @@ tests = [
     "polars>=0.19, <=1.3.0",
     "tantivy",
     "pyarrow-stubs",
+    "pylance>=0.23.2",
 ]
 dev = [
     "ruff",
@@ -63,7 +63,7 @@ dev = [
     'typing-extensions>=4.0.0; python_version < "3.11"',
 ]
 docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
-clip = ["torch", "pillow", "open-clip"]
+clip = ["torch", "pillow", "open-clip-torch"]
 embeddings = [
     "requests>=2.31.0",
     "openai>=1.6.1",

@@ -7,6 +7,7 @@ import os
 from concurrent.futures import ThreadPoolExecutor
 from datetime import timedelta
 from typing import Dict, Optional, Union, Any
+import warnings

 __version__ = importlib.metadata.version("lancedb")

@@ -213,3 +214,13 @@ __all__ = [
     "RemoteDBConnection",
     "__version__",
 ]
+
+
+def __warn_on_fork():
+    warnings.warn(
+        "lance is not fork-safe. If you are using multiprocessing, use spawn instead.",
+    )
+
+
+if hasattr(os, "register_at_fork"):
+    os.register_at_fork(before=__warn_on_fork)

@@ -94,6 +94,7 @@ class Query:
     def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
     def nearest_to_text(self, query: dict) -> FTSQuery: ...
     async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
+    def to_query_request(self) -> PyQueryRequest: ...

 class FTSQuery:
     def where(self, filter: str): ...
@@ -108,6 +109,7 @@ class FTSQuery:
     def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
     async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
     async def explain_plan(self) -> str: ...
+    def to_query_request(self) -> PyQueryRequest: ...

 class VectorQuery:
     async def execute(self) -> RecordBatchStream: ...
@@ -123,6 +125,7 @@ class VectorQuery:
     def nprobes(self, nprobes: int): ...
     def bypass_vector_index(self): ...
     def nearest_to_text(self, query: dict) -> HybridQuery: ...
+    def to_query_request(self) -> PyQueryRequest: ...

 class HybridQuery:
     def where(self, filter: str): ...
@@ -140,6 +143,33 @@ class HybridQuery:
     def to_fts_query(self) -> FTSQuery: ...
     def get_limit(self) -> int: ...
     def get_with_row_id(self) -> bool: ...
+    def to_query_request(self) -> PyQueryRequest: ...
+
+class PyFullTextSearchQuery:
+    columns: Optional[List[str]]
+    query: str
+    limit: Optional[int]
+    wand_factor: Optional[float]
+
+class PyQueryRequest:
+    limit: Optional[int]
+    offset: Optional[int]
+    filter: Optional[Union[str, bytes]]
+    full_text_search: Optional[PyFullTextSearchQuery]
+    select: Optional[Union[str, List[str]]]
+    fast_search: Optional[bool]
+    with_row_id: Optional[bool]
+    column: Optional[str]
+    query_vector: Optional[List[pa.Array]]
+    nprobes: Optional[int]
+    lower_bound: Optional[float]
+    upper_bound: Optional[float]
+    ef: Optional[int]
+    refine_factor: Optional[int]
+    distance_type: Optional[str]
+    bypass_vector_index: Optional[bool]
+    postfilter: Optional[bool]
+    norm: Optional[str]

 class CompactionStats:
     fragments_removed: int

@@ -7,10 +7,9 @@ from typing import Iterable, List, Optional, Union

 import numpy as np
 import pyarrow as pa
 import pyarrow.dataset

-from .util import safe_import_pandas
-
-pd = safe_import_pandas()
+from .dependencies import pandas as pd

 DATA = Union[List[dict], "pd.DataFrame", pa.Table, Iterable[pa.RecordBatch]]
 VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray]

@@ -8,9 +8,7 @@ import deprecation

 from . import __version__
 from .exceptions import MissingColumnError, MissingValueError
-from .util import safe_import_pandas
-
-pd = safe_import_pandas()
+from .dependencies import pandas as pd


 def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:

@@ -30,6 +30,7 @@ _TORCH_AVAILABLE = True
 _HUGGING_FACE_AVAILABLE = True
 _TENSORFLOW_AVAILABLE = True
 _RAY_AVAILABLE = True
+_LANCE_AVAILABLE = True


 class _LazyModule(ModuleType):
@@ -53,6 +54,7 @@ class _LazyModule(ModuleType):
         "torch": "torch.",
         "tensorflow": "tf.",
         "ray": "ray.",
+        "lance": "lance.",
     }

     def __init__(
@@ -169,6 +171,7 @@ if TYPE_CHECKING:
     import ray
     import tensorflow
     import torch
+    import lance
 else:
     # heavy/optional third party libs
     numpy, _NUMPY_AVAILABLE = _lazy_import("numpy")
@@ -178,6 +181,7 @@ else:
     datasets, _HUGGING_FACE_AVAILABLE = _lazy_import("datasets")
     tensorflow, _TENSORFLOW_AVAILABLE = _lazy_import("tensorflow")
     ray, _RAY_AVAILABLE = _lazy_import("ray")
+    lance, _LANCE_AVAILABLE = _lazy_import("lance")


 @lru_cache(maxsize=None)
@@ -232,6 +236,12 @@ def _check_for_ray(obj: Any, *, check_type: bool = True) -> bool:
     )


+def _check_for_lance(obj: Any, *, check_type: bool = True) -> bool:
+    return _LANCE_AVAILABLE and _might_be(
+        cast(Hashable, type(obj) if check_type else obj), "lance"
+    )
+
+
 __all__ = [
     # lazy-load third party libs
     "datasets",
@@ -241,6 +251,7 @@ __all__ = [
     "ray",
     "tensorflow",
     "torch",
+    "lance",
     # lazy utilities
     "_check_for_hugging_face",
     "_check_for_numpy",
@@ -249,6 +260,7 @@ __all__ = [
     "_check_for_tensorflow",
     "_check_for_torch",
     "_check_for_ray",
+    "_check_for_lance",
     "_LazyModule",
     # exported flags/guards
     "_NUMPY_AVAILABLE",
@@ -258,4 +270,5 @@ __all__ = [
     "_HUGGING_FACE_AVAILABLE",
     "_TENSORFLOW_AVAILABLE",
     "_RAY_AVAILABLE",
+    "_LANCE_AVAILABLE",
 ]

@@ -16,7 +16,6 @@ from .sentence_transformers import SentenceTransformerEmbeddings
 from .gte import GteEmbeddings
 from .transformers import TransformersEmbeddingFunction, ColbertEmbeddings
 from .imagebind import ImageBindEmbeddings
-from .utils import with_embeddings
 from .jinaai import JinaEmbeddings
 from .watsonx import WatsonxEmbeddings
 from .voyageai import VoyageAIEmbeddingFunction

@@ -17,12 +17,13 @@ class EmbeddingFunction(BaseModel, ABC):
     """
     An ABC for embedding functions.

-    All concrete embedding functions must implement the following:
+    All concrete embedding functions must implement the following methods:
     1. compute_query_embeddings() which takes a query and returns a list of embeddings
-    2. get_source_embeddings() which returns a list of embeddings for the source column
+    2. compute_source_embeddings() which returns a list of embeddings for
+       the source column
     For text data, the two will be the same. For multi-modal data, the source column
     might be images and the vector column might be text.
-    3. ndims method which returns the number of dimensions of the vector column
+    3. ndims() which returns the number of dimensions of the vector column
     """

     __slots__ = ("__weakref__",)  # pydantic 1.x compatibility

@@ -16,9 +16,8 @@ from functools import wraps
 from typing import Callable, List, Union
 import numpy as np
 import pyarrow as pa
-from lance.vector import vec_to_table

-from ..util import deprecated, safe_import_pandas
+from ..dependencies import pandas as pd


 # ruff: noqa: PERF203
@@ -41,8 +40,6 @@ def retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1):
         return wrapper


-pd = safe_import_pandas()
-
 DATA = Union[pa.Table, "pd.DataFrame"]
 TEXT = Union[str, List[str], pa.Array, pa.ChunkedArray, np.ndarray]
 IMAGES = Union[
@@ -87,52 +84,6 @@ class RateLimiter:
         return wrapper


-@deprecated
-def with_embeddings(
-    func: Callable,
-    data: DATA,
-    column: str = "text",
-    wrap_api: bool = True,
-    show_progress: bool = False,
-    batch_size: int = 1000,
-) -> pa.Table:
-    """Add a vector column to a table using the given embedding function.
-
-    The new columns will be called "vector".
-
-    Parameters
-    ----------
-    func : Callable
-        A function that takes a list of strings and returns a list of vectors.
-    data : pa.Table or pd.DataFrame
-        The data to add an embedding column to.
-    column : str, default "text"
-        The name of the column to use as input to the embedding function.
-    wrap_api : bool, default True
-        Whether to wrap the embedding function in a retry and rate limiter.
-    show_progress : bool, default False
-        Whether to show a progress bar.
-    batch_size : int, default 1000
-        The number of row values to pass to each call of the embedding function.
-
-    Returns
-    -------
-    pa.Table
-        The input table with a new column called "vector" containing the embeddings.
-    """
-    func = FunctionWrapper(func)
-    if wrap_api:
-        func = func.retry().rate_limit()
-    func = func.batch_size(batch_size)
-    if show_progress:
-        func = func.show_progress()
-    if pd is not None and isinstance(data, pd.DataFrame):
-        data = pa.Table.from_pandas(data, preserve_index=False)
-    embeddings = func(data[column].to_numpy())
-    table = vec_to_table(np.array(embeddings))
-    return data.append_column("vector", table["vector"])


 class FunctionWrapper:
     """
     A wrapper for embedding functions that adds rate limiting, retries, and batching.

@@ -150,7 +150,7 @@ class HnswPq:
|
||||
Parameters
|
||||
----------
|
||||
|
||||
distance_type: str, default "L2"
|
||||
distance_type: str, default "l2"
|
||||
|
||||
The distance metric used to train the index.
|
||||
|
||||
@@ -158,18 +158,18 @@ class HnswPq:
|
||||
|
||||
"l2" - Euclidean distance. This is a very common distance metric that
|
||||
accounts for both magnitude and direction when determining the distance
|
||||
between vectors. L2 distance has a range of [0, ∞).
|
||||
between vectors. l2 distance has a range of [0, ∞).
|
||||
|
||||
"cosine" - Cosine distance. Cosine distance is a distance metric
|
||||
calculated from the cosine similarity between two vectors. Cosine
|
||||
similarity is a measure of similarity between two non-zero vectors of an
|
||||
inner product space. It is defined to equal the cosine of the angle
|
||||
between them. Unlike L2, the cosine distance is not affected by the
|
||||
between them. Unlike l2, the cosine distance is not affected by the
|
||||
magnitude of the vectors. Cosine distance has a range of [0, 2].
|
||||
|
||||
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
|
||||
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
|
||||
L2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
l2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
|
||||
num_partitions, default sqrt(num_rows)
|
||||
|
||||
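The three metrics can be related in a few lines of numpy; this is an illustrative sketch, not LanceDB code, and it assumes "dot" distance is the negated dot product so that smaller always means closer. For unit-norm vectors all three metrics rank neighbors identically:

    import numpy as np

    rng = np.random.default_rng(0)
    q = rng.normal(size=8)
    docs = rng.normal(size=(5, 8))

    # normalize so the dot/cosine equivalence described above holds
    q /= np.linalg.norm(q)
    docs /= np.linalg.norm(docs, axis=1, keepdims=True)

    l2 = np.linalg.norm(docs - q, axis=1)   # "l2": range [0, inf)
    cosine = 1.0 - docs @ q                 # "cosine": range [0, 2]
    dot = -(docs @ q)                       # "dot": negated dot product (assumption)

    # for unit vectors, all three produce the same nearest-neighbor ordering
    assert (np.argsort(l2) == np.argsort(cosine)).all()
    assert (np.argsort(cosine) == np.argsort(dot)).all()
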
@@ -271,7 +271,7 @@ class HnswSq:
Parameters
----------

distance_type: str, default "L2"
distance_type: str, default "l2"

The distance metric used to train the index.

@@ -279,18 +279,18 @@ class HnswSq:

"l2" - Euclidean distance. This is a very common distance metric that
accounts for both magnitude and direction when determining the distance
between vectors. L2 distance has a range of [0, ∞).
between vectors. l2 distance has a range of [0, ∞).

"cosine" - Cosine distance. Cosine distance is a distance metric
calculated from the cosine similarity between two vectors. Cosine
similarity is a measure of similarity between two non-zero vectors of an
inner product space. It is defined to equal the cosine of the angle
between them. Unlike L2, the cosine distance is not affected by the
between them. Unlike l2, the cosine distance is not affected by the
magnitude of the vectors. Cosine distance has a range of [0, 2].

"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
L2 norm is 1), then dot distance is equivalent to the cosine distance.
l2 norm is 1), then dot distance is equivalent to the cosine distance.

num_partitions, default sqrt(num_rows)

@@ -369,7 +369,7 @@ class IvfFlat:

Attributes
----------
distance_type: str, default "L2"
distance_type: str, default "l2"
The distance metric used to train the index

This is used when training the index to calculate the IVF partitions
@@ -383,13 +383,13 @@ class IvfFlat:

"l2" - Euclidean distance. This is a very common distance metric that
accounts for both magnitude and direction when determining the distance
between vectors. L2 distance has a range of [0, ∞).
between vectors. l2 distance has a range of [0, ∞).

"cosine" - Cosine distance. Cosine distance is a distance metric
calculated from the cosine similarity between two vectors. Cosine
similarity is a measure of similarity between two non-zero vectors of an
inner product space. It is defined to equal the cosine of the angle
between them. Unlike L2, the cosine distance is not affected by the
between them. Unlike l2, the cosine distance is not affected by the
magnitude of the vectors. Cosine distance has a range of [0, 2].

Note: the cosine distance is undefined when one (or both) of the vectors
@@ -398,7 +398,7 @@ class IvfFlat:

"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
L2 norm is 1), then dot distance is equivalent to the cosine distance.
l2 norm is 1), then dot distance is equivalent to the cosine distance.

"hamming" - Hamming distance. Hamming distance is a distance metric
calculated as the number of positions at which the corresponding bits are
@@ -475,7 +475,7 @@ class IvfPq:

Attributes
----------
distance_type: str, default "L2"
distance_type: str, default "l2"
The distance metric used to train the index

This is used when training the index to calculate the IVF partitions
@@ -489,13 +489,13 @@ class IvfPq:

"l2" - Euclidean distance. This is a very common distance metric that
accounts for both magnitude and direction when determining the distance
between vectors. L2 distance has a range of [0, ∞).
between vectors. l2 distance has a range of [0, ∞).

"cosine" - Cosine distance. Cosine distance is a distance metric
calculated from the cosine similarity between two vectors. Cosine
similarity is a measure of similarity between two non-zero vectors of an
inner product space. It is defined to equal the cosine of the angle
between them. Unlike L2, the cosine distance is not affected by the
between them. Unlike l2, the cosine distance is not affected by the
magnitude of the vectors. Cosine distance has a range of [0, 2].

Note: the cosine distance is undefined when one (or both) of the vectors
@@ -504,7 +504,7 @@ class IvfPq:

"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
L2 norm is 1), then dot distance is equivalent to the cosine distance.
l2 norm is 1), then dot distance is equivalent to the cosine distance.
num_partitions: int, default sqrt(num_rows)
The number of IVF partitions to create.


@@ -5,6 +5,7 @@ import logging
from typing import Any, List, Optional, Tuple, Union, Literal

import pyarrow as pa
import pyarrow.dataset

from ..table import Table


@@ -14,6 +14,7 @@ from typing import (
Tuple,
Type,
Union,
Any,
)

import asyncio
@@ -26,10 +27,13 @@ import pydantic

from . import __version__
from .arrow import AsyncRecordBatchReader
from .dependencies import pandas as pd
from .rerankers.base import Reranker
from .rerankers.rrf import RRFReranker
from .rerankers.util import check_reranker_result
from .util import safe_import_pandas, flatten_columns
from .util import flatten_columns

from typing_extensions import Annotated

if TYPE_CHECKING:
import sys
@@ -40,6 +44,7 @@ if TYPE_CHECKING:
from ._lancedb import FTSQuery as LanceFTSQuery
from ._lancedb import HybridQuery as LanceHybridQuery
from ._lancedb import VectorQuery as LanceVectorQuery
from ._lancedb import PyQueryRequest
from .common import VEC
from .pydantic import LanceModel
from .table import Table
@@ -49,36 +54,118 @@ if TYPE_CHECKING:
else:
from typing_extensions import Self

pd = safe_import_pandas()

# Pydantic validation function for vector queries
def ensure_vector_query(
val: Any,
) -> Union[List[float], List[List[float]], pa.Array, List[pa.Array]]:
if isinstance(val, list):
if len(val) == 0:
raise ValueError("Vector query must be a non-empty list")
sample = val[0]
else:
if isinstance(val, float):
raise ValueError(
"Vector query must be a list of floats or a list of lists of floats"
)
sample = val
if isinstance(sample, pa.Array):
# val is array or list of array
return val
if isinstance(sample, list):
if len(sample) == 0:
raise ValueError("Vector query must be a non-empty list")
if isinstance(sample[0], float):
# val is list of list of floats
return val
if isinstance(sample, float):
# val is a list of floats
return val


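A quick sketch of the validator's behavior, assuming ensure_vector_query is importable from lancedb.query once this change lands:

    import pyarrow as pa

    from lancedb.query import ensure_vector_query  # assumed import path

    ensure_vector_query([1.0, 2.0, 3.0])            # single query: passes through
    ensure_vector_query([[1.0, 2.0], [3.0, 4.0]])   # batch of queries: passes through
    ensure_vector_query(pa.array([1.0, 2.0]))       # arrow arrays pass through

    try:
        ensure_vector_query([])                     # empty queries are rejected
    except ValueError as err:
        print(err)
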
class Query(pydantic.BaseModel):
"""The LanceDB Query
class FullTextSearchQuery(pydantic.BaseModel):
"""A LanceDB Full Text Search Query

Attributes
----------
vector : List[float]
the vector to search for
filter : Optional[str]
sql filter to refine the query with, optional
prefilter : bool
if True then apply the filter before vector search
k : int
top k results to return
metric : str
the distance metric between a pair of vectors,
columns: List[str]
The columns to search

can support L2 (default), Cosine and Dot.
[metric definitions][search]
columns : Optional[List[str]]
If None, then the table should select the column automatically.
query: str
The query to search for
limit: Optional[int] = None
The limit on the number of results to return
wand_factor: Optional[float] = None
The wand factor to use for the search
"""

columns: Optional[List[str]] = None
query: str
limit: Optional[int] = None
wand_factor: Optional[float] = None


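Constructing the new model directly is straightforward; the field values below are illustrative:

    from lancedb.query import FullTextSearchQuery

    fts = FullTextSearchQuery(
        query="vector database",    # text to search for
        columns=["title", "body"],  # None lets the table pick the column
        limit=10,                   # cap on the number of results
    )
    print(fts.query, fts.columns, fts.limit, fts.wand_factor)
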
class Query(pydantic.BaseModel):
"""A LanceDB Query

Queries are constructed by the `Table.search` method. This class is a
python representation of the query. Normally you will not need to interact
with this class directly. You can build up a query and execute it using
collection methods such as `to_batches()`, `to_arrow()`, `to_pandas()`,
etc.

However, you can use the `to_query_object()` method to get the underlying query object.
This can be useful for serializing a query or using it in a different context.

Attributes
----------
filter : Optional[str]
sql filter to refine the query with
limit : Optional[int]
The limit on the number of results to return. If this is a vector or FTS query,
then this is required. If this is a plain SQL query, then this is optional.
offset: Optional[int]
The offset to start fetching results from

This is ignored for vector / FTS search (will be None).
columns : Optional[Union[List[str], Dict[str, str]]]
which columns to return in the results
nprobes : int
The number of probes used - optional

This can be a list of column names or a dictionary. If it is a dictionary,
then the keys are the column names and the values are sql expressions to
use to calculate the result.

If this is None then all columns are returned. This can be expensive.
with_row_id : Optional[bool]
if True then include the row id in the results
vector : Optional[Union[List[float], List[List[float]], pa.Array, List[pa.Array]]]
the vector to search for, if this is a vector search or hybrid search. It will
be None for full text search and plain SQL filtering.
vector_column : Optional[str]
the name of the vector column to use for vector search

If this is None then a default vector column will be used.
distance_type : Optional[str]
the distance type to use for vector search

This can be l2 (default), cosine and dot. See [metric definitions][search] for
more details.

If this is not a vector search this will be None.
postfilter : bool
if True then apply the filter after vector / FTS search. This is ignored for
plain SQL filtering.
nprobes : Optional[int]
The number of IVF partitions to search. If this is None then a default
number of partitions will be used.

- A higher number makes search more accurate but also slower.

- See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.

Will be None if this is not a vector search.
refine_factor : Optional[int]
Refine the results by reading extra elements and re-ranking them in memory.

@@ -86,58 +173,130 @@ class Query(pydantic.BaseModel):

- See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.
offset: int
The offset to start fetching results from
fast_search: bool

Will be None if this is not a vector search.
lower_bound : Optional[float]
The lower bound for distance search

Only results with a distance greater than or equal to this value
will be returned.

This will only be set on vector search.
upper_bound : Optional[float]
The upper bound for distance search

Only results with a distance less than or equal to this value
will be returned.

This will only be set on vector search.
ef : Optional[int]
The size of the nearest neighbor list maintained during HNSW search

This will only be set on vector search.
full_text_query : Optional[Union[str, dict]]
The full text search query

This can be a string or a dictionary. A dictionary will be used to search
multiple columns. The keys are the column names and the values are the
search queries.

This will only be set on FTS or hybrid queries.
fast_search: Optional[bool]
Skip a flat search of unindexed data. This will improve
search performance but search results will not include unindexed data.

- *default False*.
The default is False
"""

# The name of the vector column to use for vector search.
vector_column: Optional[str] = None

# vector to search for
vector: Union[List[float], List[List[float]]]
#
# Note: today this will be floats on the sync path and pa.Array on the async
# path though in the future we should unify this to pa.Array everywhere
vector: Annotated[
Optional[Union[List[float], List[List[float]], pa.Array, List[pa.Array]]],
ensure_vector_query,
] = None

# sql filter to refine the query with
filter: Optional[str] = None

# if True then apply the filter before vector search
prefilter: bool = False
# if True then apply the filter after vector search
postfilter: Optional[bool] = None

# full text search query
full_text_query: Optional[Union[str, dict]] = None
full_text_query: Optional[FullTextSearchQuery] = None

# top k results to return
k: Optional[int] = None
limit: Optional[int] = None

# # metrics
metric: str = "L2"
# distance type to use for vector search
distance_type: Optional[str] = None

# which columns to return in the results
columns: Optional[Union[List[str], Dict[str, str]]] = None

# optional query parameters for tuning the results,
# e.g. `{"nprobes": "10", "refine_factor": "10"}`
nprobes: int = 10
# number of IVF partitions to search
nprobes: Optional[int] = None

# lower bound for distance search
lower_bound: Optional[float] = None

# upper bound for distance search
upper_bound: Optional[float] = None

# Refine factor.
# multiplier for the number of results to inspect for reranking
refine_factor: Optional[int] = None

with_row_id: bool = False
# if true, include the row id in the results
with_row_id: Optional[bool] = None

offset: int = 0
# offset to start fetching results from
offset: Optional[int] = None

fast_search: bool = False
# if true, will only search the indexed data
fast_search: Optional[bool] = None

# size of the nearest neighbor list maintained during HNSW search
ef: Optional[int] = None

# Default is true. Set to false to enforce a brute force search.
use_index: bool = True
# Bypass the vector index and use a brute force search
bypass_vector_index: Optional[bool] = None

@classmethod
def from_inner(cls, req: PyQueryRequest) -> Self:
query = cls()
query.limit = req.limit
query.offset = req.offset
query.filter = req.filter
query.full_text_query = req.full_text_search
query.columns = req.select
query.with_row_id = req.with_row_id
query.vector_column = req.column
query.vector = req.query_vector
query.distance_type = req.distance_type
query.nprobes = req.nprobes
query.lower_bound = req.lower_bound
query.upper_bound = req.upper_bound
query.ef = req.ef
query.refine_factor = req.refine_factor
query.bypass_vector_index = req.bypass_vector_index
query.postfilter = req.postfilter
if req.full_text_search is not None:
query.full_text_query = FullTextSearchQuery(
columns=req.full_text_search.columns,
query=req.full_text_search.query,
limit=req.full_text_search.limit,
wand_factor=req.full_text_search.wand_factor,
)
return query

class Config:
# This tells pydantic to allow custom types (needed for the `vector` query since
# pa.Array wouldn't be allowed otherwise)
arbitrary_types_allowed = True


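Since every field now has a default, a Query can be built directly and serialized; a small sketch (field values are illustrative; .dict() is the pydantic v1 spelling, use .model_dump() on pydantic v2):

    from lancedb.query import Query

    q = Query(
        vector=[0.1, 0.2, 0.3],
        filter="price > 10",
        limit=5,
        distance_type="l2",
        nprobes=20,
    )
    print(q.dict(exclude_none=True))
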
class LanceQueryBuilder(ABC):
@@ -153,9 +312,9 @@ class LanceQueryBuilder(ABC):
query_type: str,
vector_column_name: str,
ordering_field_name: Optional[str] = None,
fts_columns: Union[str, List[str]] = [],
fast_search: bool = False,
) -> LanceQueryBuilder:
fts_columns: Optional[Union[str, List[str]]] = None,
fast_search: Optional[bool] = None,
) -> Self:
"""
Create a query builder based on the given query and query type.

@@ -258,15 +417,15 @@ class LanceQueryBuilder(ABC):
def __init__(self, table: "Table"):
self._table = table
self._limit = None
self._offset = 0
self._offset = None
self._columns = None
self._where = None
self._prefilter = True
self._with_row_id = False
self._postfilter = None
self._with_row_id = None
self._vector = None
self._text = None
self._ef = None
self._use_index = True
self._bypass_vector_index = None

@deprecation.deprecated(
deprecated_in="0.3.1",
@@ -316,7 +475,7 @@ class LanceQueryBuilder(ABC):
raise NotImplementedError

@abstractmethod
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.Table:
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
"""
Execute the query and return the results as a pyarrow
[RecordBatchReader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html)
@@ -361,7 +520,7 @@ class LanceQueryBuilder(ABC):

return pl.from_arrow(self.to_arrow())

def limit(self, limit: Union[int, None]) -> LanceQueryBuilder:
def limit(self, limit: Union[int, None]) -> Self:
"""Set the maximum number of results to return.

Parameters
@@ -391,7 +550,7 @@ class LanceQueryBuilder(ABC):
self._limit = limit
return self

def offset(self, offset: int) -> LanceQueryBuilder:
def offset(self, offset: int) -> Self:
"""Set the offset for the results.

Parameters
@@ -410,7 +569,7 @@ class LanceQueryBuilder(ABC):
self._offset = offset
return self

def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
def select(self, columns: Union[list[str], dict[str, str]]) -> Self:
"""Set the columns to return.

Parameters
@@ -431,7 +590,7 @@ class LanceQueryBuilder(ABC):
raise ValueError("columns must be a list or a dictionary")
return self

def where(self, where: str, prefilter: bool = True) -> LanceQueryBuilder:
def where(self, where: str, prefilter: bool = True) -> Self:
"""Set the where clause.

Parameters
@@ -452,10 +611,10 @@ class LanceQueryBuilder(ABC):
The LanceQueryBuilder object.
"""
self._where = where
self._prefilter = prefilter
self._postfilter = not prefilter
return self

def with_row_id(self, with_row_id: bool) -> LanceQueryBuilder:
def with_row_id(self, with_row_id: bool) -> Self:
"""Set whether to return row ids.

Parameters
@@ -498,25 +657,9 @@ class LanceQueryBuilder(ABC):
-------
plan : str
""" # noqa: E501
ds = self._table.to_lance()
return ds.scanner(
nearest={
"column": self._vector_column,
"q": self._query,
"k": self._limit,
"metric": self._distance_type,
"nprobes": self._nprobes,
"refine_factor": self._refine_factor,
"use_index": self._use_index,
},
prefilter=self._prefilter,
filter=self._str_query,
limit=self._limit,
with_row_id=self._with_row_id,
offset=self._offset,
).explain_plan(verbose)
return self._table._explain_plan(self.to_query_object())

def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
def vector(self, vector: Union[np.ndarray, list]) -> Self:
"""Set the vector to search for.

Parameters
@@ -531,7 +674,7 @@ class LanceQueryBuilder(ABC):
"""
raise NotImplementedError

def text(self, text: str) -> LanceQueryBuilder:
def text(self, text: str) -> Self:
"""Set the text to search for.

Parameters
@@ -547,7 +690,7 @@ class LanceQueryBuilder(ABC):
raise NotImplementedError

@abstractmethod
def rerank(self, reranker: Reranker) -> LanceQueryBuilder:
def rerank(self, reranker: Reranker) -> Self:
"""Rerank the results using the specified reranker.

Parameters
@@ -562,6 +705,17 @@ class LanceQueryBuilder(ABC):
"""
raise NotImplementedError

@abstractmethod
def to_query_object(self) -> Query:
"""Return a serializable representation of the query

Returns
-------
Query
The serializable representation of the query
"""
raise NotImplementedError


class LanceVectorQueryBuilder(LanceQueryBuilder):
"""
@@ -591,24 +745,22 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
query: Union[np.ndarray, list, "PIL.Image.Image"],
vector_column: str,
str_query: Optional[str] = None,
fast_search: bool = False,
fast_search: Optional[bool] = None,
):
super().__init__(table)
if self._limit is None:
self._limit = 10
self._query = query
self._distance_type = "L2"
self._nprobes = 20
self._distance_type = None
self._nprobes = None
self._lower_bound = None
self._upper_bound = None
self._refine_factor = None
self._vector_column = vector_column
self._prefilter = False
self._postfilter = None
self._reranker = None
self._str_query = str_query
self._fast_search = fast_search

def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
"""Set the distance metric to use.

This is an alias for distance_type() and may be deprecated in the future.
@@ -616,8 +768,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):

Parameters
----------
metric: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
metric: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.

Returns
-------
@@ -627,7 +779,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
return self.distance_type(metric)

def distance_type(
self, distance_type: Literal["L2", "cosine", "dot"]
self, distance_type: Literal["l2", "cosine", "dot"]
) -> "LanceVectorQueryBuilder":
"""Set the distance metric to use.

@@ -641,8 +793,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):

Parameters
----------
distance_type: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
distance_type: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.

Returns
-------
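Putting the builder methods together, a typical sync vector search might read as follows (the database path, table, and column values are hypothetical):

    import lancedb

    db = lancedb.connect("/tmp/lancedb-demo")
    tbl = db.open_table("docs")  # assumed to exist with a "vector" column

    results = (
        tbl.search([0.1, 0.2, 0.3, 0.4])
        .distance_type("l2")               # lowercase spelling per this change
        .nprobes(20)
        .where("price > 10", prefilter=True)
        .limit(5)
        .to_arrow()
    )
    print(results.num_rows)
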
@@ -753,6 +905,34 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
"""
return self.to_batches().read_all()

def to_query_object(self) -> Query:
"""
Build a Query object

This can be used to serialize a query
"""
vector = self._query if isinstance(self._query, list) else self._query.tolist()
if isinstance(vector[0], np.ndarray):
vector = [v.tolist() for v in vector]
return Query(
vector=vector,
filter=self._where,
postfilter=self._postfilter,
limit=self._limit,
distance_type=self._distance_type,
columns=self._columns,
nprobes=self._nprobes,
lower_bound=self._lower_bound,
upper_bound=self._upper_bound,
refine_factor=self._refine_factor,
vector_column=self._vector_column,
with_row_id=self._with_row_id,
offset=self._offset,
fast_search=self._fast_search,
ef=self._ef,
bypass_vector_index=self._bypass_vector_index,
)

def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
"""
Execute the query and return the result as a RecordBatchReader object.
@@ -769,24 +949,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
vector = self._query if isinstance(self._query, list) else self._query.tolist()
if isinstance(vector[0], np.ndarray):
vector = [v.tolist() for v in vector]
query = Query(
vector=vector,
filter=self._where,
prefilter=self._prefilter,
k=self._limit,
metric=self._distance_type,
columns=self._columns,
nprobes=self._nprobes,
lower_bound=self._lower_bound,
upper_bound=self._upper_bound,
refine_factor=self._refine_factor,
vector_column=self._vector_column,
with_row_id=self._with_row_id,
offset=self._offset,
fast_search=self._fast_search,
ef=self._ef,
use_index=self._use_index,
)
query = self.to_query_object()
result_set = self._table._execute_query(query, batch_size)
if self._reranker is not None:
rs_table = result_set.read_all()
@@ -799,7 +962,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):

return result_set

def where(self, where: str, prefilter: bool = True) -> LanceVectorQueryBuilder:
def where(self, where: str, prefilter: Optional[bool] = None) -> LanceVectorQueryBuilder:
"""Set the where clause.

Parameters
@@ -811,8 +974,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
prefilter: bool, default True
If True, apply the filter before vector search, otherwise the
filter is applied on the result of vector search.
This feature is **EXPERIMENTAL** and may be removed and modified
without warning in the future.

Returns
-------
@@ -820,7 +981,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
The LanceQueryBuilder object.
"""
self._where = where
self._prefilter = prefilter
if prefilter is not None:
self._postfilter = not prefilter
return self

def rerank(
@@ -874,7 +1036,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
LanceVectorQueryBuilder
The LanceVectorQueryBuilder object.
"""
self._use_index = False
self._bypass_vector_index = True
return self


@@ -886,11 +1048,9 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
table: "Table",
query: str,
ordering_field_name: Optional[str] = None,
fts_columns: Union[str, List[str]] = [],
fts_columns: Optional[Union[str, List[str]]] = None,
):
super().__init__(table)
if self._limit is None:
self._limit = 10
self._query = query
self._phrase_query = False
self.ordering_field_name = ordering_field_name
@@ -916,6 +1076,19 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
self._phrase_query = phrase_query
return self

def to_query_object(self) -> Query:
return Query(
columns=self._columns,
filter=self._where,
limit=self._limit,
postfilter=self._postfilter,
with_row_id=self._with_row_id,
full_text_query=FullTextSearchQuery(
query=self._query, columns=self._fts_columns
),
offset=self._offset,
)

def to_arrow(self) -> pa.Table:
path, fs, exist = self._table._get_fts_index_path()
if exist:
@@ -927,19 +1100,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
"Phrase query is not yet supported in Lance FTS. "
"Use tantivy-based index instead for now."
)
query = Query(
columns=self._columns,
filter=self._where,
k=self._limit,
prefilter=self._prefilter,
with_row_id=self._with_row_id,
full_text_query={
"query": query,
"columns": self._fts_columns,
},
vector=[],
offset=self._offset,
)
query = self.to_query_object()
results = self._table._execute_query(query)
results = results.read_all()
if self._reranker is not None:
@@ -984,8 +1145,9 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
if self._phrase_query:
query = query.replace('"', "'")
query = f'"{query}"'
limit = self._limit if self._limit is not None else 10
row_ids, scores = search_index(
index, query, self._limit, ordering_field=self.ordering_field_name
index, query, limit, ordering_field=self.ordering_field_name
)
if len(row_ids) == 0:
empty_schema = pa.schema([pa.field("_score", pa.float32())])
@@ -1054,17 +1216,18 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
def to_arrow(self) -> pa.Table:
return self.to_batches().read_all()

def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
query = Query(
def to_query_object(self) -> Query:
return Query(
columns=self._columns,
filter=self._where,
k=self._limit,
limit=self._limit,
with_row_id=self._with_row_id,
vector=[],
# not actually respected in remote query
offset=self._offset or 0,
offset=self._offset,
)
return self._table._execute_query(query)

def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
query = self.to_query_object()
return self._table._execute_query(query, batch_size)

def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
"""Rerank the results using the specified reranker.
@@ -1099,18 +1262,18 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
table: "Table",
query: Optional[str] = None,
vector_column: Optional[str] = None,
fts_columns: Union[str, List[str]] = [],
fts_columns: Optional[Union[str, List[str]]] = None,
):
super().__init__(table)
self._query = query
self._vector_column = vector_column
self._fts_columns = fts_columns
self._norm = "score"
self._reranker = RRFReranker()
self._norm = None
self._reranker = None
self._nprobes = None
self._refine_factor = None
self._distance_type = None
self._phrase_query = False
self._phrase_query = None

def _validate_query(self, query, vector=None, text=None):
if query is not None and (vector is not None or text is not None):
@@ -1132,7 +1295,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):

return vector_query, text_query

def phrase_query(self, phrase_query: bool = True) -> LanceHybridQueryBuilder:
def phrase_query(self, phrase_query: Optional[bool] = None) -> LanceHybridQueryBuilder:
"""Set whether to use phrase query.

Parameters
@@ -1149,6 +1312,9 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._phrase_query = phrase_query
return self

def to_query_object(self) -> Query:
raise NotImplementedError("to_query_object not yet supported on a hybrid query")

def to_arrow(self) -> pa.Table:
vector_query, fts_query = self._validate_query(
self._query, self._vector, self._text
@@ -1170,8 +1336,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._vector_query.select(self._columns)
self._fts_query.select(self._columns)
if self._where:
self._vector_query.where(self._where, self._prefilter)
self._fts_query.where(self._where, self._prefilter)
self._vector_query.where(self._where, self._postfilter)
self._fts_query.where(self._where, self._postfilter)
if self._with_row_id:
self._vector_query.with_row_id(True)
self._fts_query.with_row_id(True)
@@ -1185,9 +1351,12 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._vector_query.refine_factor(self._refine_factor)
if self._ef:
self._vector_query.ef(self._ef)
if not self._use_index:
if self._bypass_vector_index:
self._vector_query.bypass_vector_index()

if self._reranker is None:
self._reranker = RRFReranker()

with ThreadPoolExecutor() as executor:
fts_future = executor.submit(self._fts_query.with_row_id(True).to_arrow)
vector_future = executor.submit(
@@ -1220,6 +1389,10 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
vector_results = LanceHybridQueryBuilder._rank(vector_results, "_distance")
fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")

original_distances = None
original_scores = None
original_distance_row_ids = None
original_score_row_ids = None
# normalize the scores to be between 0 and 1, 0 being most relevant
# We check whether the results (vector and FTS) are empty, because when
# they are, they often are missing the _rowid column, which causes an error
@@ -1249,7 +1422,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):

check_reranker_result(results)

if "_distance" in results.column_names:
if "_distance" in results.column_names and original_distances is not None:
# restore the original distances
indices = pc.index_in(
results["_rowid"], original_distance_row_ids, skip_nulls=True
@@ -1258,7 +1431,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
distance_i = results.column_names.index("_distance")
results = results.set_column(distance_i, "_distance", original_distances)

if "_score" in results.column_names:
if "_score" in results.column_names and original_scores is not None:
# restore the original scores
indices = pc.index_in(
results["_rowid"], original_score_row_ids, skip_nulls=True
@@ -1414,7 +1587,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._ef = ef
return self

def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
"""Set the distance metric to use.

This is an alias for distance_type() and may be deprecated in the future.
@@ -1422,8 +1595,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):

Parameters
----------
metric: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
metric: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.

Returns
-------
@@ -1433,7 +1606,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
return self.distance_type(metric)

def distance_type(
self, distance_type: Literal["L2", "cosine", "dot"]
self, distance_type: Literal["l2", "cosine", "dot"]
) -> "LanceHybridQueryBuilder":
"""Set the distance metric to use.

@@ -1447,8 +1620,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):

Parameters
----------
distance_type: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
distance_type: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.

Returns
-------
@@ -1499,12 +1672,12 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._use_index = False
self._bypass_vector_index = True
return self


class AsyncQueryBase(object):
def __init__(self, inner: Union[LanceQuery | LanceVectorQuery]):
def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
"""
Construct an AsyncQueryBase

@@ -1513,6 +1686,9 @@ class AsyncQueryBase(object):
"""
self._inner = inner

def to_query_object(self) -> Query:
return Query.from_inner(self._inner.to_query_request())

def where(self, predicate: str) -> Self:
"""
Only return rows matching the given predicate
@@ -1865,7 +2041,7 @@ class AsyncQuery(AsyncQueryBase):
)

def nearest_to_text(
self, query: str, columns: Union[str, List[str]] = []
self, query: str, columns: Union[str, List[str], None] = None
) -> AsyncFTSQuery:
"""
Find the documents that are most relevant to the given text query.
@@ -1889,6 +2065,8 @@ class AsyncQuery(AsyncQueryBase):
"""
if isinstance(columns, str):
columns = [columns]
if columns is None:
columns = []
return AsyncFTSQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)
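With columns now optional, the async FTS entry point can be called without naming columns; a sketch, assuming a table with an FTS index already exists:

    import asyncio

    import lancedb

    async def main():
        db = await lancedb.connect_async("/tmp/lancedb-demo")
        tbl = await db.open_table("docs")  # assumed to have an FTS index on "text"
        # columns may be omitted (None), a single name, or a list of names
        results = await tbl.query().nearest_to_text("vector database").limit(5).to_arrow()
        print(results.num_rows)

    asyncio.run(main())
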
@@ -2174,7 +2352,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
return self

def nearest_to_text(
self, query: str, columns: Union[str, List[str]] = []
self, query: str, columns: Union[str, List[str], None] = None
) -> AsyncHybridQuery:
"""
Find the documents that are most relevant to the given text query,
@@ -2202,6 +2380,8 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
"""
if isinstance(columns, str):
columns = [columns]
if columns is None:
columns = []
return AsyncHybridQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)

@@ -154,7 +154,7 @@ class RemoteTable(Table):

def create_index(
self,
metric="L2",
metric="l2",
vector_column_name: str = VECTOR_COLUMN_NAME,
index_cache_size: Optional[int] = None,
num_partitions: Optional[int] = None,
@@ -170,7 +170,7 @@ class RemoteTable(Table):
Parameters
----------
metric : str
The metric to use for the index. Default is "L2".
The metric to use for the index. Default is "l2".
vector_column_name : str
The name of the vector column. Default is "vector".

@@ -193,7 +193,7 @@ class RemoteTable(Table):
... table_name, # doctest: +SKIP
... schema=schema, # doctest: +SKIP
... )
>>> table.create_index("L2", "vector") # doctest: +SKIP
>>> table.create_index("l2", "vector") # doctest: +SKIP
"""

if num_partitions is not None:
@@ -282,7 +282,8 @@ class RemoteTable(Table):
"""Create a search query to find the nearest neighbors
of the given query vector. We currently support [vector search][search]

All query options are defined in [Query][lancedb.query.Query].
All query options are defined in
[LanceVectorQueryBuilder][lancedb.query.LanceVectorQueryBuilder].

Examples
--------
@@ -353,7 +354,16 @@ class RemoteTable(Table):
def _execute_query(
self, query: Query, batch_size: Optional[int] = None
) -> pa.RecordBatchReader:
return LOOP.run(self._table._execute_query(query, batch_size=batch_size))
async_iter = LOOP.run(self._table._execute_query(query, batch_size=batch_size))

def iter_sync():
try:
while True:
yield LOOP.run(async_iter.__anext__())
except StopAsyncIteration:
return

return pa.RecordBatchReader.from_batches(async_iter.schema, iter_sync())

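The pattern above — draining an async batch iterator through an event loop and re-wrapping it as a RecordBatchReader — can be sketched in isolation like this (self-contained illustration, not LanceDB code):

    import asyncio

    import pyarrow as pa

    SCHEMA = pa.schema([pa.field("x", pa.int64())])

    async def produce_batches():
        # stand-in for an async query result; yields RecordBatches
        for i in range(3):
            yield pa.record_batch([pa.array([i], type=pa.int64())], schema=SCHEMA)

    def to_sync_reader(agen, schema):
        loop = asyncio.new_event_loop()

        def iter_sync():
            try:
                while True:
                    yield loop.run_until_complete(agen.__anext__())
            except StopAsyncIteration:
                loop.close()

        # from_batches consumes the generator lazily, batch by batch
        return pa.RecordBatchReader.from_batches(schema, iter_sync())

    reader = to_sync_reader(produce_batches(), SCHEMA)
    print(reader.read_all())

This keeps the sync API streaming instead of materializing the whole result, which is the point of returning a reader rather than a table.
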
def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
"""Returns a [`LanceMergeInsertBuilder`][lancedb.merge.LanceMergeInsertBuilder]

@@ -42,7 +42,9 @@ class AnswerdotaiRerankers(Reranker):
rerankers = attempt_import_or_raise(
"rerankers"
) # import here for faster ops later
self.reranker = rerankers.Reranker(model_name, model_type, **kwargs)
self.reranker = rerankers.Reranker(
model_name=model_name, model_type=model_type, **kwargs
)

def _rerank(self, result_set: pa.Table, query: str):
docs = result_set[self.column].to_pylist()

@@ -28,12 +28,19 @@ from urllib.parse import urlparse
from . import __version__
from lancedb.arrow import peek_reader
from lancedb.background_loop import LOOP
from .dependencies import _check_for_hugging_face, _check_for_pandas
from .dependencies import (
_check_for_hugging_face,
_check_for_lance,
_check_for_pandas,
lance,
pandas as pd,
polars as pl,
)
import pyarrow as pa
import pyarrow.dataset
import pyarrow.compute as pc
import pyarrow.fs as pa_fs
import numpy as np
from lance import LanceDataset

from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
@@ -58,8 +65,6 @@ from .util import (
get_uri_scheme,
infer_vector_column_name,
join_uri,
safe_import_pandas,
safe_import_polars,
value_to_sql,
)
from .index import lang_mapping
@@ -88,10 +93,6 @@ if TYPE_CHECKING:
)


pd = safe_import_pandas()
pl = safe_import_polars()


def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
from lancedb.dependencies import datasets

@@ -100,7 +101,9 @@ def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
schema = data.features.arrow_schema
return pa.RecordBatchReader.from_batches(schema, data.data.to_batches())
elif isinstance(data, datasets.dataset_dict.DatasetDict):
schema = _schema_from_hf(data, schema)
schema = _schema_from_hf(data, None)
if "split" not in schema.names:
schema = schema.append(pa.field("split", pa.string()))
return pa.RecordBatchReader.from_batches(
schema, _to_batches_with_split(data)
)
@@ -130,7 +133,7 @@ def _into_pyarrow_reader(data) -> pa.RecordBatchReader:
return data.to_reader()
elif isinstance(data, pa.RecordBatch):
return pa.RecordBatchReader.from_batches(data.schema, [data])
elif isinstance(data, LanceDataset):
elif _check_for_lance(data) and isinstance(data, lance.LanceDataset):
return data.scanner().to_reader()
elif isinstance(data, pa.dataset.Dataset):
return data.scanner().to_reader()
@@ -245,7 +248,6 @@ def _sanitize_data(
target_schema = target_schema.with_metadata(new_metadata)

_validate_schema(target_schema)

reader = _cast_to_target_schema(reader, target_schema, allow_subschema)

return reader
@@ -263,12 +265,7 @@ def _cast_to_target_schema(
# Fast path when the schemas are already the same
return reader

fields = []
for field in reader.schema:
target_field = target_schema.field(field.name)
if target_field is None:
raise ValueError(f"Field {field.name} not found in target schema")
fields.append(target_field)
fields = _align_field_types(list(iter(reader.schema)), list(iter(target_schema)))
reordered_schema = pa.schema(fields, metadata=target_schema.metadata)
if not allow_subschema and len(reordered_schema) != len(target_schema):
raise ValueError(
@@ -289,6 +286,53 @@ def _cast_to_target_schema(
return pa.RecordBatchReader.from_batches(reordered_schema, gen())


def _align_field_types(
fields: List[pa.Field],
target_fields: List[pa.Field],
) -> List[pa.Field]:
"""
Apply the data types from the target_fields to the fields.
"""
new_fields = []
for field in fields:
target_field = next((f for f in target_fields if f.name == field.name), None)
if target_field is None:
raise ValueError(f"Field '{field.name}' not found in target schema")
if pa.types.is_struct(target_field.type):
new_type = pa.struct(
_align_field_types(
field.type.fields,
target_field.type.fields,
)
)
elif pa.types.is_list(target_field.type):
new_type = pa.list_(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0]
)
elif pa.types.is_large_list(target_field.type):
new_type = pa.large_list(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0]
)
elif pa.types.is_fixed_size_list(target_field.type):
new_type = pa.list_(
_align_field_types(
[field.type.value_field],
[target_field.type.value_field],
)[0],
target_field.type.list_size,
)
else:
new_type = target_field.type
new_fields.append(pa.field(field.name, new_type, field.nullable))
return new_fields


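The effect of the alignment is easiest to see with plain pyarrow: incoming data whose vector column was inferred as list<double> gets adapted to a table's fixed-size float vector type. Recent pyarrow releases support this cast directly (illustrative only, not LanceDB code):

    import pyarrow as pa

    incoming = pa.table({"vector": [[1.0, 2.0], [3.0, 4.0]]})  # list<double>
    target = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])

    aligned = incoming.cast(target)
    print(aligned.schema)  # vector: fixed_size_list<item: float>[2]
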
def _infer_subschema(
schema: List[pa.Field],
reference_fields: List[pa.Field],
@@ -373,7 +417,7 @@ def sanitize_create_table(
return data, schema


def _schema_from_hf(data, schema):
def _schema_from_hf(data, schema) -> pa.Schema:
"""
Extract pyarrow schema from HuggingFace DatasetDict
and validate that they're all the same schema between
@@ -577,7 +621,7 @@ class Table(ABC):

def create_index(
self,
metric="L2",
metric="l2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name: str = VECTOR_COLUMN_NAME,
@@ -596,10 +640,10 @@ class Table(ABC):

Parameters
----------
metric: str, default "L2"
metric: str, default "l2"
The distance metric to use when creating the index.
Valid values are "L2", "cosine", "dot", or "hamming".
L2 is euclidean distance.
Valid values are "l2", "cosine", "dot", or "hamming".
l2 is euclidean distance.
Hamming is available only for binary vectors.
num_partitions: int, default 256
The number of IVF partitions to use when creating the index.
@@ -885,7 +929,8 @@ class Table(ABC):
of the given query vector. We currently support [vector search][search]
and [full-text search][experimental-full-text-search].

All query options are defined in [Query][lancedb.query.Query].
All query options are defined in
[LanceQueryBuilder][lancedb.query.LanceQueryBuilder].

Examples
--------
@@ -1144,6 +1189,7 @@ class Table(ABC):
*,
cleanup_older_than: Optional[timedelta] = None,
delete_unverified: bool = False,
retrain: bool = False,
):
"""
Optimize the on-disk data and indices for better performance.
@@ -1167,6 +1213,11 @@ class Table(ABC):
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
retrain: bool, default False
If True, retrain the vector indices. This refines the IVF clustering
and quantization, which may improve search accuracy. It's faster than
re-creating the index from scratch, so it's recommended to try this first
when the data distribution has changed significantly.

Experimental API
----------------
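A usage sketch of the extended optimize() with the new flag (the connection and table are hypothetical):

    from datetime import timedelta

    import lancedb

    db = lancedb.connect("/tmp/lancedb-demo")
    tbl = db.open_table("docs")

    # compact, prune versions older than a day, and retrain the vector index
    tbl.optimize(cleanup_older_than=timedelta(days=1), retrain=True)
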
@@ -1393,7 +1444,7 @@ class LanceTable(Table):
|
||||
# Cacheable since it's deterministic
|
||||
return _table_path(self._conn.uri, self.name)
|
||||
|
||||
def to_lance(self, **kwargs) -> LanceDataset:
|
||||
def to_lance(self, **kwargs) -> lance.LanceDataset:
|
||||
"""Return the LanceDataset backing this table."""
|
||||
try:
|
||||
import lance
|
||||
@@ -2230,7 +2281,19 @@ class LanceTable(Table):
|
||||
def _execute_query(
|
||||
self, query: Query, batch_size: Optional[int] = None
|
||||
) -> pa.RecordBatchReader:
|
||||
return LOOP.run(self._table._execute_query(query, batch_size))
|
||||
async_iter = LOOP.run(self._table._execute_query(query, batch_size))
|
||||
|
||||
def iter_sync():
|
||||
try:
|
||||
while True:
|
||||
yield LOOP.run(async_iter.__anext__())
|
||||
except StopAsyncIteration:
|
||||
return
|
||||
|
||||
return pa.RecordBatchReader.from_batches(async_iter.schema, iter_sync())
|
||||
|
||||
def _explain_plan(self, query: Query) -> str:
|
||||
return LOOP.run(self._table._explain_plan(query))
|
||||
|
||||
def _do_merge(
|
||||
self,
|
||||
@@ -2301,6 +2364,7 @@ class LanceTable(Table):
|
||||
*,
|
||||
cleanup_older_than: Optional[timedelta] = None,
|
||||
delete_unverified: bool = False,
|
||||
retrain: bool = False,
|
||||
):
|
||||
"""
|
||||
Optimize the on-disk data and indices for better performance.
|
||||
@@ -2324,6 +2388,11 @@ class LanceTable(Table):
|
||||
in-progress operation (e.g. appending new data) and these files will not
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices, this would refine the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first,
|
||||
when the data distribution has changed significantly.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -2347,6 +2416,7 @@ class LanceTable(Table):
|
||||
self._table.optimize(
|
||||
cleanup_older_than=cleanup_older_than,
|
||||
delete_unverified=delete_unverified,
|
||||
retrain=retrain,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -2998,7 +3068,7 @@ class AsyncTable:
|
||||
query_type: Literal["auto"] = ...,
|
||||
ordering_field_name: Optional[str] = None,
|
||||
fts_columns: Optional[Union[str, List[str]]] = None,
|
||||
) -> Union[AsyncHybridQuery | AsyncFTSQuery | AsyncVectorQuery]: ...
|
||||
) -> Union[AsyncHybridQuery, AsyncFTSQuery, AsyncVectorQuery]: ...
|
||||
|
||||
@overload
|
||||
async def search(
|
||||
@@ -3047,7 +3117,7 @@ class AsyncTable:
|
||||
query_type: QueryType = "auto",
|
||||
ordering_field_name: Optional[str] = None,
|
||||
fts_columns: Optional[Union[str, List[str]]] = None,
|
||||
) -> Union[AsyncHybridQuery | AsyncFTSQuery | AsyncVectorQuery]:
|
||||
) -> Union[AsyncHybridQuery, AsyncFTSQuery, AsyncVectorQuery]:
|
||||
"""Create a search query to find the nearest neighbors
|
||||
of the given query vector. We currently support [vector search][search]
|
||||
and [full-text search][experimental-full-text-search].
|
||||
@@ -3209,12 +3279,12 @@ class AsyncTable:
|
||||
builder = builder.column(vector_column_name)
|
||||
return builder
|
||||
elif query_type == "fts":
|
||||
return self.query().nearest_to_text(query, columns=fts_columns or [])
|
||||
return self.query().nearest_to_text(query, columns=fts_columns)
|
||||
elif query_type == "hybrid":
|
||||
builder = self.query().nearest_to(vector_query)
|
||||
if vector_column_name:
|
||||
builder = builder.column(vector_column_name)
|
||||
return builder.nearest_to_text(query, columns=fts_columns or [])
|
||||
return builder.nearest_to_text(query, columns=fts_columns)
|
||||
else:
|
||||
raise ValueError(f"Unknown query type: '{query_type}'")
|
||||
|
||||
@@ -3231,16 +3301,13 @@ class AsyncTable:
        """
        return self.query().nearest_to(query_vector)

    async def _execute_query(
        self, query: Query, batch_size: Optional[int] = None
    ) -> pa.RecordBatchReader:
        # The sync remote table calls into this method, so we need to map the
        # query to the async version of the query and run that here. This is only
        # used for that code path right now.
    def _sync_query_to_async(
        self, query: Query
    ) -> AsyncHybridQuery | AsyncFTSQuery | AsyncVectorQuery | AsyncQuery:
        async_query = self.query()
        if query.k is not None:
            async_query = async_query.limit(query.k)
        if query.offset > 0:
        if query.limit is not None:
            async_query = async_query.limit(query.limit)
        if query.offset is not None:
            async_query = async_query.offset(query.offset)
        if query.columns:
            async_query = async_query.select(query.columns)
@@ -3252,35 +3319,49 @@ class AsyncTable:
            async_query = async_query.with_row_id()

        if query.vector:
            # we need the schema to get the vector column type
            # to determine whether the vector is a batch of queries or not
            async_query = (
                async_query.nearest_to(query.vector)
                .distance_type(query.metric)
                .nprobes(query.nprobes)
                .distance_range(query.lower_bound, query.upper_bound)
            async_query = async_query.nearest_to(query.vector).distance_range(
                query.lower_bound, query.upper_bound
            )
            if query.refine_factor:
            if query.distance_type is not None:
                async_query = async_query.distance_type(query.distance_type)
            if query.nprobes is not None:
                async_query = async_query.nprobes(query.nprobes)
            if query.refine_factor is not None:
                async_query = async_query.refine_factor(query.refine_factor)
            if query.vector_column:
                async_query = async_query.column(query.vector_column)
            if query.ef:
                async_query = async_query.ef(query.ef)
            if not query.use_index:
            if query.bypass_vector_index:
                async_query = async_query.bypass_vector_index()

        if not query.prefilter:
        if query.postfilter:
            async_query = async_query.postfilter()

        if isinstance(query.full_text_query, str):
            async_query = async_query.nearest_to_text(query.full_text_query)
        elif isinstance(query.full_text_query, dict):
            fts_query = query.full_text_query["query"]
            fts_columns = query.full_text_query.get("columns", []) or []
            async_query = async_query.nearest_to_text(fts_query, columns=fts_columns)
        if query.full_text_query:
            async_query = async_query.nearest_to_text(
                query.full_text_query.query, query.full_text_query.columns
            )
            if query.full_text_query.limit is not None:
                async_query = async_query.limit(query.full_text_query.limit)

        table = await async_query.to_arrow()
        return table.to_reader()
        return async_query

    async def _execute_query(
        self, query: Query, batch_size: Optional[int] = None
    ) -> pa.RecordBatchReader:
        # The sync table calls into this method, so we need to map the
        # query to the async version of the query and run that here. This is only
        # used for that code path right now.

        async_query = self._sync_query_to_async(query)

        return await async_query.to_batches(max_batch_length=batch_size)

    async def _explain_plan(self, query: Query) -> str:
        # This method is used by the sync table
        async_query = self._sync_query_to_async(query)
        return await async_query.explain_plan()

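The removed `_execute_query` body is extracted into `_sync_query_to_async`, so `_execute_query` and `_explain_plan` share a single Query-to-builder mapping. A rough sketch of the flow (field values are illustrative; these are private methods shown only to trace the code path):

# Map a sync Query onto the async builder, then execute or explain it.
query = Query(vector=[0.0, 0.0], limit=10, nprobes=20)
async_query = table._sync_query_to_async(query)            # e.g. AsyncVectorQuery
reader = await table._execute_query(query, batch_size=64)  # pa.RecordBatchReader
plan = await table._explain_plan(query)                    # str
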
    async def _do_merge(
        self,
@@ -3549,6 +3630,7 @@ class AsyncTable:
        *,
        cleanup_older_than: Optional[timedelta] = None,
        delete_unverified: bool = False,
        retrain=False,
    ) -> OptimizeStats:
        """
        Optimize the on-disk data and indices for better performance.
@@ -3572,6 +3654,11 @@ class AsyncTable:
            in-progress operation (e.g. appending new data) and these files will not
            be deleted unless they are at least 7 days old. If delete_unverified is True
            then these files will be deleted regardless of their age.
        retrain: bool, default False
            If True, retrain the vector indices. This refines the IVF clustering
            and quantization, which may improve search accuracy. It is faster than
            re-creating the index from scratch, so it is worth trying first when
            the data distribution has changed significantly.

        Experimental API
        ----------------
@@ -3595,7 +3682,9 @@ class AsyncTable:
        if cleanup_older_than is not None:
            cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
        return await self._inner.optimize(
            cleanup_since_ms=cleanup_since_ms, delete_unverified=delete_unverified
            cleanup_since_ms=cleanup_since_ms,
            delete_unverified=delete_unverified,
            retrain=retrain,
        )

    async def list_indices(self) -> Iterable[IndexConfig]:
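With `retrain` now forwarded to `_inner.optimize`, a caller can refresh the IVF clustering without rebuilding the index. A minimal sketch, assuming a table that already has a vector index:

from datetime import timedelta

# Compact data, prune versions older than a week, and retrain the
# existing vector index in one pass (retrain is experimental).
stats = await table.optimize(
    cleanup_older_than=timedelta(days=7),
    retrain=True,
)
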
@@ -3688,6 +3777,8 @@ class IndexStatistics:
        The distance type used by the index.
    num_indices: Optional[int]
        The number of parts the index is split into.
    loss: Optional[float]
        The KMeans loss for the index (vector indices only).
    """

    num_indexed_rows: int
@@ -3697,6 +3788,7 @@ class IndexStatistics:
    ]
    distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
    num_indices: Optional[int] = None
    loss: Optional[float] = None

# This exists for backwards compatibility with an older API, which returned
# a dictionary instead of a class.

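The two new fields let callers inspect index partitioning and clustering quality directly from the statistics object. Sketch (the index name "vector_idx" is illustrative):

stats = await table.index_stats("vector_idx")
print(stats.num_indices)  # how many parts the index is split into
print(stats.loss)         # KMeans loss; None for non-vector indices
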
@@ -157,24 +157,6 @@ def attempt_import_or_raise(module: str, mitigation=None):
        raise ImportError(f"Please install {mitigation or module}")


def safe_import_pandas():
    try:
        import pandas as pd

        return pd
    except ImportError:
        return None


def safe_import_polars():
    try:
        import polars as pl

        return pl
    except ImportError:
        return None


def flatten_columns(tbl: pa.Table, flatten: Optional[Union[int, bool]] = None):
    """
    Flatten all struct columns in a table.

@@ -83,6 +83,21 @@ def test_quickstart(tmp_path):
        }
    )
    # --8<-- [end:alter_columns]
    # --8<-- [start:alter_columns_vector]
    tbl.alter_columns(
        {
            "path": "vector",
            "data_type": pa.list_(pa.float16(), list_size=2),
        }
    )
    # --8<-- [end:alter_columns_vector]
    # Change it back since we can get a panic with fp16
    tbl.alter_columns(
        {
            "path": "vector",
            "data_type": pa.list_(pa.float32(), list_size=2),
        }
    )
    # --8<-- [start:drop_columns]
    tbl.drop_columns(["dbl_price"])
    # --8<-- [end:drop_columns]
@@ -162,6 +177,21 @@ async def test_quickstart_async(tmp_path):
        }
    )
    # --8<-- [end:alter_columns_async]
    # --8<-- [start:alter_columns_async_vector]
    await tbl.alter_columns(
        {
            "path": "vector",
            "data_type": pa.list_(pa.float16(), list_size=2),
        }
    )
    # --8<-- [end:alter_columns_async_vector]
    # Change it back since we can get a panic with fp16
    await tbl.alter_columns(
        {
            "path": "vector",
            "data_type": pa.list_(pa.float32(), list_size=2),
        }
    )
    # --8<-- [start:drop_columns_async]
    await tbl.drop_columns(["dbl_price"])
    # --8<-- [end:drop_columns_async]

@@ -15,7 +15,6 @@ from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.embeddings import (
    EmbeddingFunctionConfig,
    EmbeddingFunctionRegistry,
    with_embeddings,
)
from lancedb.embeddings.base import TextEmbeddingFunction
from lancedb.embeddings.registry import get_registry, register
@@ -27,23 +26,6 @@ def mock_embed_func(input_data):
    return [np.random.randn(128).tolist() for _ in range(len(input_data))]


def test_with_embeddings():
    for wrap_api in [True, False]:
        data = pa.Table.from_arrays(
            [
                pa.array(["foo", "bar"]),
                pa.array([10.0, 20.0]),
            ],
            names=["text", "price"],
        )
        data = with_embeddings(mock_embed_func, data, wrap_api=wrap_api)
        assert data.num_columns == 3
        assert data.num_rows == 2
        assert data.column_names == ["text", "price", "vector"]
        assert data.column("text").to_pylist() == ["foo", "bar"]
        assert data.column("price").to_pylist() == [10.0, 20.0]


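`with_embeddings` and its test are deleted here; the registry-based embedding API is the surviving path. A rough registry-based equivalent of the removed flow (the provider/model choice is an assumption, not taken from the diff):

import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

func = get_registry().get("sentence-transformers").create()  # illustrative model

class Words(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()

db = lancedb.connect("/tmp/db")
tbl = db.create_table("words", schema=Words)
tbl.add([{"text": "foo"}, {"text": "bar"}])  # vectors computed on ingest
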
def test_embedding_function(tmp_path):
    registry = EmbeddingFunctionRegistry.get_instance()


@@ -131,6 +131,7 @@ async def test_create_vector_index(some_table: AsyncTable):
    assert stats.num_indexed_rows == await some_table.count_rows()
    assert stats.num_unindexed_rows == 0
    assert stats.num_indices == 1
    assert stats.loss >= 0.0


@pytest.mark.asyncio
@@ -154,6 +155,7 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
    assert stats.num_indexed_rows == await some_table.count_rows()
    assert stats.num_unindexed_rows == 0
    assert stats.num_indices == 1
    assert stats.loss >= 0.0


@pytest.mark.asyncio

@@ -26,10 +26,12 @@ from lancedb.query import (
    AsyncVectorQuery,
    LanceVectorQueryBuilder,
    Query,
    FullTextSearchQuery,
)
from lancedb.rerankers.cross_encoder import CrossEncoderReranker
from lancedb.table import AsyncTable, LanceTable
from utils import exception_output
from importlib.util import find_spec


@pytest.fixture(scope="module")
@@ -392,12 +394,28 @@ def test_query_builder_batches(table):
    for item in rs:
        rs_list.append(item)
        assert isinstance(item, pa.RecordBatch)
    assert len(rs_list) == 1
    assert len(rs_list[0]["id"]) == 2
    assert len(rs_list) == 2
    assert len(rs_list[0]["id"]) == 1
    assert all(rs_list[0].to_pandas()["vector"][0] == [1.0, 2.0])
    assert rs_list[0].to_pandas()["id"][0] == 1
    assert all(rs_list[0].to_pandas()["vector"][1] == [3.0, 4.0])
    assert rs_list[0].to_pandas()["id"][1] == 2
    assert all(rs_list[1].to_pandas()["vector"][0] == [3.0, 4.0])
    assert rs_list[1].to_pandas()["id"][0] == 2

    rs = (
        LanceVectorQueryBuilder(table, [0, 0], "vector")
        .limit(2)
        .select(["id", "vector"])
        .to_batches(2)
    )
    rs_list = []
    for item in rs:
        rs_list.append(item)
        assert isinstance(item, pa.RecordBatch)
    assert len(rs_list) == 1
    assert len(rs_list[0]["id"]) == 2
    rs_list = rs_list[0].to_pandas()
    assert rs_list["id"][0] == 1
    assert rs_list["id"][1] == 2


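The reworked assertions encode the new `to_batches` contract: the argument is a maximum batch length, so a two-row result read with `to_batches(1)` now arrives as two one-row batches, while `to_batches(2)` still yields a single batch. A condensed sketch of the invariant:

reader = (
    LanceVectorQueryBuilder(table, [0, 0], "vector")
    .limit(2)
    .to_batches(1)  # each RecordBatch holds at most one row
)
assert all(batch.num_rows <= 1 for batch in reader)
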
def test_dynamic_projection(table):
@@ -452,7 +470,7 @@ def test_query_builder_with_metric(table):
    df_default = LanceVectorQueryBuilder(table, query, vector_column_name).to_pandas()
    df_l2 = (
        LanceVectorQueryBuilder(table, query, vector_column_name)
        .distance_type("L2")
        .distance_type("l2")
        .to_pandas()
    )
    tm.assert_frame_equal(df_default, df_l2)
@@ -488,12 +506,9 @@ def test_query_builder_with_different_vector_column():
        Query(
            vector=query,
            filter="b < 10",
            prefilter=True,
            k=2,
            metric="cosine",
            limit=2,
            distance_type="cosine",
            columns=["b"],
            nprobes=20,
            refine_factor=None,
            vector_column="foo_vector",
        ),
        None,
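The `Query` constructor call is updated to the renamed fields: `limit` replaces `k`, `distance_type` replaces `metric`, and the `prefilter` flag is dropped. For comparison, a sketch of old vs. new spelling (values are illustrative):

# Old (removed): Query(vector=v, filter="b < 10", prefilter=True, k=2, metric="cosine", ...)
# New (added):
q = Query(
    vector=[0.0] * 10,
    filter="b < 10",
    limit=2,
    distance_type="cosine",
    columns=["b"],
    nprobes=20,
    vector_column="foo_vector",
)
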
@@ -595,6 +610,10 @@ async def test_query_async(table_async: AsyncTable):
@pytest.mark.asyncio
@pytest.mark.slow
async def test_query_reranked_async(table_async: AsyncTable):
    # CrossEncoderReranker requires torch
    if find_spec("torch") is None:
        pytest.skip("torch not installed")

    # FTS with rerank
    await table_async.create_index("text", config=FTS(with_position=False))
    await check_query(
@@ -823,3 +842,223 @@ async def test_query_search_specified(mem_db_async: AsyncConnection):
    assert "No embedding functions are registered for any columns" in exception_output(
        e
    )


# Helper used in the following tests. Looks at the simple Python object `q` and
# checks that the properties match the expected values in kwargs.
def check_set_props(q, **kwargs):
    for k in dict(q):
        if not k.startswith("_"):
            if k in kwargs:
                assert kwargs[k] == getattr(q, k), (
                    f"{k} should be {kwargs[k]} but is {getattr(q, k)}"
                )
            else:
                assert getattr(q, k) is None, f"{k} should be None"


def test_query_serialization_sync(table: lancedb.table.Table):
    # Simple queries
    q = table.search().where("id = 1").limit(500).offset(10).to_query_object()
    check_set_props(q, limit=500, offset=10, filter="id = 1")

    q = table.search().select(["id", "vector"]).to_query_object()
    check_set_props(q, columns=["id", "vector"])

    q = table.search().with_row_id(True).to_query_object()
    check_set_props(q, with_row_id=True)

    # Vector queries
    q = table.search([5.0, 6.0]).limit(10).to_query_object()
    check_set_props(q, limit=10, vector_column="vector", vector=[5.0, 6.0])

    q = table.search([5.0, 6.0]).to_query_object()
    check_set_props(q, vector_column="vector", vector=[5.0, 6.0])

    q = (
        table.search([5.0, 6.0])
        .limit(10)
        .where("id = 1", prefilter=False)
        .to_query_object()
    )
    check_set_props(
        q,
        limit=10,
        vector_column="vector",
        filter="id = 1",
        postfilter=True,
        vector=[5.0, 6.0],
    )

    q = table.search([5.0, 6.0]).nprobes(10).refine_factor(5).to_query_object()
    check_set_props(
        q, vector_column="vector", vector=[5.0, 6.0], nprobes=10, refine_factor=5
    )

    q = table.search([5.0, 6.0]).distance_range(0.0, 1.0).to_query_object()
    check_set_props(
        q, vector_column="vector", vector=[5.0, 6.0], lower_bound=0.0, upper_bound=1.0
    )

    q = table.search([5.0, 6.0]).distance_type("cosine").to_query_object()
    check_set_props(
        q, distance_type="cosine", vector_column="vector", vector=[5.0, 6.0]
    )

    q = table.search([5.0, 6.0]).ef(7).to_query_object()
    check_set_props(q, ef=7, vector_column="vector", vector=[5.0, 6.0])

    q = table.search([5.0, 6.0]).bypass_vector_index().to_query_object()
    check_set_props(
        q, bypass_vector_index=True, vector_column="vector", vector=[5.0, 6.0]
    )

    # FTS queries
    q = table.search("foo").limit(10).to_query_object()
    check_set_props(
        q, limit=10, full_text_query=FullTextSearchQuery(columns=[], query="foo")
    )

    q = table.search("foo", query_type="fts").to_query_object()
    check_set_props(q, full_text_query=FullTextSearchQuery(columns=[], query="foo"))


@pytest.mark.asyncio
async def test_query_serialization_async(table_async: AsyncTable):
    # Simple queries
    q = table_async.query().where("id = 1").limit(500).offset(10).to_query_object()
    check_set_props(q, limit=500, offset=10, filter="id = 1", with_row_id=False)

    q = table_async.query().select(["id", "vector"]).to_query_object()
    check_set_props(q, columns=["id", "vector"], with_row_id=False)

    q = table_async.query().with_row_id().to_query_object()
    check_set_props(q, with_row_id=True)

    sample_vector = [pa.array([5.0, 6.0], type=pa.float32())]

    # Vector queries
    q = (await table_async.search([5.0, 6.0])).limit(10).to_query_object()
    check_set_props(
        q,
        limit=10,
        vector=sample_vector,
        postfilter=False,
        nprobes=20,
        with_row_id=False,
        bypass_vector_index=False,
    )

    q = (await table_async.search([5.0, 6.0])).to_query_object()
    check_set_props(
        q,
        vector=sample_vector,
        postfilter=False,
        nprobes=20,
        with_row_id=False,
        bypass_vector_index=False,
        limit=10,
    )

    q = (
        (await table_async.search([5.0, 6.0]))
        .limit(10)
        .where("id = 1")
        .postfilter()
        .to_query_object()
    )
    check_set_props(
        q,
        limit=10,
        filter="id = 1",
        postfilter=True,
        vector=sample_vector,
        nprobes=20,
        with_row_id=False,
        bypass_vector_index=False,
    )

    q = (
        (await table_async.search([5.0, 6.0]))
        .nprobes(10)
        .refine_factor(5)
        .to_query_object()
    )
    check_set_props(
        q,
        vector=sample_vector,
        nprobes=10,
        refine_factor=5,
        postfilter=False,
        with_row_id=False,
        bypass_vector_index=False,
        limit=10,
    )

    q = (
        (await table_async.search([5.0, 6.0]))
        .distance_range(0.0, 1.0)
        .to_query_object()
    )
    check_set_props(
        q,
        vector=sample_vector,
        lower_bound=0.0,
        upper_bound=1.0,
        postfilter=False,
        nprobes=20,
        with_row_id=False,
        bypass_vector_index=False,
        limit=10,
    )

    q = (await table_async.search([5.0, 6.0])).distance_type("cosine").to_query_object()
    check_set_props(
        q,
        distance_type="cosine",
        vector=sample_vector,
        postfilter=False,
        nprobes=20,
        with_row_id=False,
        bypass_vector_index=False,
        limit=10,
    )

    q = (await table_async.search([5.0, 6.0])).ef(7).to_query_object()
    check_set_props(
        q,
        ef=7,
        vector=sample_vector,
        postfilter=False,
        nprobes=20,
        with_row_id=False,
        bypass_vector_index=False,
        limit=10,
    )

    q = (await table_async.search([5.0, 6.0])).bypass_vector_index().to_query_object()
    check_set_props(
        q,
        bypass_vector_index=True,
        vector=sample_vector,
        postfilter=False,
        nprobes=20,
        with_row_id=False,
        limit=10,
    )

    # FTS queries
    q = (await table_async.search("foo")).limit(10).to_query_object()
    check_set_props(
        q,
        limit=10,
        full_text_query=FullTextSearchQuery(columns=[], query="foo"),
        with_row_id=False,
    )

    q = (await table_async.search("foo", query_type="fts")).to_query_object()
    check_set_props(
        q,
        full_text_query=FullTextSearchQuery(columns=[], query="foo"),
        with_row_id=False,
    )

@@ -315,7 +315,7 @@ def test_query_sync_minimal():
    assert body == {
        "distance_type": "l2",
        "k": 10,
        "prefilter": False,
        "prefilter": True,
        "refine_factor": None,
        "lower_bound": None,
        "upper_bound": None,
@@ -340,7 +340,7 @@ def test_query_sync_empty_query():
        "filter": "true",
        "vector": [],
        "columns": ["id"],
        "prefilter": False,
        "prefilter": True,
        "version": None,
    }

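Flipping the expected `"prefilter"` value from False to True reflects the sync client's new default: filters are applied before the vector search. Callers who want the old post-filtering behavior must opt in (sketch):

# New default: the filter runs before the ANN search.
table.search([0.0] * 10).where("id > 5").to_list()

# Old behavior: filter the ANN results afterwards.
table.search([0.0] * 10).where("id > 5", prefilter=False).to_list()
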
@@ -478,7 +478,7 @@ def test_query_sync_hybrid():
    assert body == {
        "distance_type": "l2",
        "k": 42,
        "prefilter": False,
        "prefilter": True,
        "refine_factor": None,
        "vector": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        "nprobes": 20,

@@ -368,6 +368,26 @@ def test_rrf_reranker_distance():
        assert score == fts_scores[rowid], "Score mismatch"
    assert found_match, "No results matched between hybrid and fts search"

    # Test for empty fts results
    fts_results = (
        table.search("abcxyz" * 100, query_type="fts").with_row_id(True).to_list()
    )
    hybrid_results = (
        table.search(query_type="hybrid")
        .vector([0.0] * 32)
        .text("abcxyz" * 100)
        .with_row_id(True)
        .rerank(reranker)
        .to_list()
    )
    assert len(fts_results) == 0
    # confirm that _rowid, _score, _distance & _relevance_score are present in hybrid
    assert len(hybrid_results) > 0
    assert "_rowid" in hybrid_results[0]
    assert "_score" in hybrid_results[0]
    assert "_distance" in hybrid_results[0]
    assert "_relevance_score" in hybrid_results[0]


@pytest.mark.skipif(
    os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"

@@ -8,13 +8,13 @@ from time import sleep
from typing import List
from unittest.mock import patch

import lance
import lancedb
from lancedb.index import HnswPq, HnswSq, IvfPq
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.dataset
import pytest
from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.db import AsyncConnection, DBConnection
@@ -231,6 +231,59 @@ def test_add(mem_db: DBConnection):
    _add(table, schema)


def test_add_struct(mem_db: DBConnection):
    # https://github.com/lancedb/lancedb/issues/2114
    schema = pa.schema(
        [
            (
                "stuff",
                pa.struct(
                    [
                        ("b", pa.int64()),
                        ("a", pa.int64()),
                        # TODO: also test subset of nested.
                    ]
                ),
            )
        ]
    )

    # Create test data with fields in same order
    data = [{"stuff": {"b": 1, "a": 2}}]
    # pa.Table.from_pylist() will reorder the fields. We need to make sure
    # we fix the field order later, before casting.
    table = mem_db.create_table("test", schema=schema)
    table.add(data)

    data = [{"stuff": {"b": 4}}]
    table.add(data)

    expected = pa.table(
        {
            "stuff": [{"b": 1, "a": 2}, {"b": 4, "a": None}],
        },
        schema=schema,
    )
    assert table.to_arrow() == expected

    # Also check struct in list
    schema = pa.schema(
        {
            "s_list": pa.list_(
                pa.struct(
                    [
                        ("b", pa.int64()),
                        ("a", pa.int64()),
                    ]
                )
            )
        }
    )
    data = [{"s_list": [{"b": 1, "a": 2}, {"b": 4}]}]
    table = mem_db.create_table("test", schema=schema)
    table.add(data)


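The comment in the test names the hazard this change guards against: `pa.Table.from_pylist` infers the struct layout from the dicts, which need not match the target schema's field order, so the ingest path must reorder struct fields before casting. A standalone sketch of the mismatch:

import pyarrow as pa

schema = pa.schema([("stuff", pa.struct([("b", pa.int64()), ("a", pa.int64())]))])
inferred = pa.Table.from_pylist([{"stuff": {"b": 1, "a": 2}}])
# The inferred struct type may not match `schema` field-for-field;
# compare the two layouts before casting.
print(inferred.schema.field("stuff").type)
print(schema.field("stuff").type)
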
def test_add_subschema(mem_db: DBConnection):
    schema = pa.schema(
        [
@@ -324,7 +377,10 @@ def test_add_nullability(mem_db: DBConnection):
    # We can't add nullable schema if it contains nulls
    with pytest.raises(
        Exception,
        match="Casting field 'vector' with null values to non-nullable",
        match=(
            "The field `vector` contained null values even though "
            "the field is marked non-null in the schema"
        ),
    ):
        table.add(data)

@@ -480,7 +536,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
    )

    table.create_index(
        metric="L2",
        metric="l2",
        num_partitions=256,
        num_sub_vectors=96,
        vector_column_name="vector",
@@ -489,7 +545,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
        num_bits=4,
    )
    expected_config = IvfPq(
        distance_type="L2",
        distance_type="l2",
        num_partitions=256,
        num_sub_vectors=96,
        num_bits=4,
@@ -594,6 +650,9 @@ def test_restore(mem_db: DBConnection):


def test_merge(tmp_db: DBConnection, tmp_path):
    pytest.importorskip("lance")
    import lance

    table = tmp_db.create_table(
        "my_table",
        schema=pa.schema(
@@ -1089,6 +1148,7 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):


def test_compact_cleanup(tmp_db: DBConnection):
    pytest.importorskip("lance")
    table = tmp_db.create_table(
        "my_table",
        data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
@@ -1166,6 +1226,7 @@ def setup_hybrid_search_table(db: DBConnection, embedding_func):
def test_hybrid_search(tmp_db: DBConnection):
    # This test uses an FTS index
    pytest.importorskip("lancedb.fts")
    pytest.importorskip("lance")

    table, MyTable, emb = setup_hybrid_search_table(tmp_db, "test")

@@ -1236,8 +1297,9 @@ def test_hybrid_search(tmp_db: DBConnection):
def test_hybrid_search_metric_type(tmp_db: DBConnection):
    # This test uses an FTS index
    pytest.importorskip("lancedb.fts")
    pytest.importorskip("lance")

    # Need to use nonnorm as the embedding function so L2 and dot results
    # Need to use nonnorm as the embedding function so l2 and dot results
    # are different
    table, _, _ = setup_hybrid_search_table(tmp_db, "nonnorm")


@@ -1,19 +1,28 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

use std::sync::Arc;

use arrow::array::make_array;
use arrow::array::Array;
use arrow::array::ArrayData;
use arrow::pyarrow::FromPyArrow;
use arrow::pyarrow::IntoPyArrow;
use lancedb::index::scalar::FullTextSearchQuery;
use lancedb::query::QueryExecutionOptions;
use lancedb::query::QueryFilter;
use lancedb::query::{
    ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
};
use lancedb::table::AnyQuery;
use pyo3::exceptions::PyNotImplementedError;
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
use pyo3::pymethods;
use pyo3::types::PyDict;
use pyo3::types::PyList;
use pyo3::Bound;
use pyo3::IntoPyObject;
use pyo3::PyAny;
use pyo3::PyRef;
use pyo3::PyResult;
@@ -24,6 +33,156 @@ use crate::arrow::RecordBatchStream;
use crate::error::PythonErrorExt;
use crate::util::parse_distance_type;

// Python representation of full text search parameters
#[derive(Clone)]
#[pyclass(get_all)]
pub struct PyFullTextSearchQuery {
    pub columns: Vec<String>,
    pub query: String,
    pub limit: Option<i64>,
    pub wand_factor: Option<f32>,
}

impl From<FullTextSearchQuery> for PyFullTextSearchQuery {
    fn from(query: FullTextSearchQuery) -> Self {
        PyFullTextSearchQuery {
            columns: query.columns,
            query: query.query,
            limit: query.limit,
            wand_factor: query.wand_factor,
        }
    }
}

// Python representation of query vector(s)
#[derive(Clone)]
pub struct PyQueryVectors(Vec<Arc<dyn Array>>);

impl<'py> IntoPyObject<'py> for PyQueryVectors {
    type Target = PyList;
    type Output = Bound<'py, Self::Target>;
    type Error = PyErr;

    fn into_pyobject(self, py: pyo3::Python<'py>) -> PyResult<Self::Output> {
        let py_objs = self
            .0
            .into_iter()
            .map(|v| v.to_data().into_pyarrow(py))
            .collect::<Result<Vec<_>, _>>()?;
        PyList::new(py, py_objs)
    }
}

// Python representation of a query
#[pyclass(get_all)]
pub struct PyQueryRequest {
    pub limit: Option<usize>,
    pub offset: Option<usize>,
    pub filter: Option<PyQueryFilter>,
    pub full_text_search: Option<PyFullTextSearchQuery>,
    pub select: PySelect,
    pub fast_search: Option<bool>,
    pub with_row_id: Option<bool>,
    pub column: Option<String>,
    pub query_vector: Option<PyQueryVectors>,
    pub nprobes: Option<usize>,
    pub lower_bound: Option<f32>,
    pub upper_bound: Option<f32>,
    pub ef: Option<usize>,
    pub refine_factor: Option<u32>,
    pub distance_type: Option<String>,
    pub bypass_vector_index: Option<bool>,
    pub postfilter: Option<bool>,
    pub norm: Option<String>,
}

impl From<AnyQuery> for PyQueryRequest {
    fn from(query: AnyQuery) -> Self {
        match query {
            AnyQuery::Query(query_request) => PyQueryRequest {
                limit: query_request.limit,
                offset: query_request.offset,
                filter: query_request.filter.map(PyQueryFilter),
                full_text_search: query_request
                    .full_text_search
                    .map(PyFullTextSearchQuery::from),
                select: PySelect(query_request.select),
                fast_search: Some(query_request.fast_search),
                with_row_id: Some(query_request.with_row_id),
                column: None,
                query_vector: None,
                nprobes: None,
                lower_bound: None,
                upper_bound: None,
                ef: None,
                refine_factor: None,
                distance_type: None,
                bypass_vector_index: None,
                postfilter: None,
                norm: None,
            },
            AnyQuery::VectorQuery(vector_query) => PyQueryRequest {
                limit: vector_query.base.limit,
                offset: vector_query.base.offset,
                filter: vector_query.base.filter.map(PyQueryFilter),
                full_text_search: None,
                select: PySelect(vector_query.base.select),
                fast_search: Some(vector_query.base.fast_search),
                with_row_id: Some(vector_query.base.with_row_id),
                column: vector_query.column,
                query_vector: Some(PyQueryVectors(vector_query.query_vector)),
                nprobes: Some(vector_query.nprobes),
                lower_bound: vector_query.lower_bound,
                upper_bound: vector_query.upper_bound,
                ef: vector_query.ef,
                refine_factor: vector_query.refine_factor,
                distance_type: vector_query.distance_type.map(|d| d.to_string()),
                bypass_vector_index: Some(!vector_query.use_index),
                postfilter: Some(!vector_query.base.prefilter),
                norm: vector_query.base.norm.map(|n| n.to_string()),
            },
        }
    }
}

// Python representation of query selection
#[derive(Clone)]
pub struct PySelect(Select);

impl<'py> IntoPyObject<'py> for PySelect {
    type Target = PyAny;
    type Output = Bound<'py, Self::Target>;
    type Error = PyErr;

    fn into_pyobject(self, py: pyo3::Python<'py>) -> PyResult<Self::Output> {
        match self.0 {
            Select::All => Ok(py.None().into_bound(py).into_any()),
            Select::Columns(columns) => Ok(columns.into_pyobject(py)?.into_any()),
            Select::Dynamic(columns) => Ok(columns.into_pyobject(py)?.into_any()),
        }
    }
}

// Python representation of query filter
#[derive(Clone)]
pub struct PyQueryFilter(QueryFilter);

impl<'py> IntoPyObject<'py> for PyQueryFilter {
    type Target = PyAny;
    type Output = Bound<'py, Self::Target>;
    type Error = PyErr;

    fn into_pyobject(self, py: pyo3::Python<'py>) -> PyResult<Self::Output> {
        match self.0 {
            QueryFilter::Datafusion(_) => Err(PyNotImplementedError::new_err(
                "Datafusion filter has no conversion to Python",
            )),
            QueryFilter::Sql(sql) => Ok(sql.into_pyobject(py)?.into_any()),
            QueryFilter::Substrait(substrait) => Ok(substrait.into_pyobject(py)?.into_any()),
        }
    }
}

#[pyclass]
pub struct Query {
    inner: LanceDbQuery,
@@ -121,6 +280,10 @@ impl Query {
            .map_err(|e| PyRuntimeError::new_err(e.to_string()))
        })
    }

    pub fn to_query_request(&self) -> PyQueryRequest {
        PyQueryRequest::from(AnyQuery::Query(self.inner.clone().into_request()))
    }
}

#[pyclass]
@@ -205,6 +368,12 @@ impl FTSQuery {
    pub fn get_query(&self) -> String {
        self.fts_query.query.clone()
    }

    pub fn to_query_request(&self) -> PyQueryRequest {
        let mut req = self.inner.clone().into_request();
        req.full_text_search = Some(self.fts_query.clone());
        PyQueryRequest::from(AnyQuery::Query(req))
    }
}

#[pyclass]
@@ -319,6 +488,10 @@ impl VectorQuery {
            inner_fts: fts_query,
        })
    }

    pub fn to_query_request(&self) -> PyQueryRequest {
        PyQueryRequest::from(AnyQuery::VectorQuery(self.inner.clone().into_request()))
    }
}

#[pyclass]
@@ -421,4 +594,17 @@ impl HybridQuery {
    pub fn get_with_row_id(&mut self) -> bool {
        self.inner_fts.inner.current_request().with_row_id
    }

    pub fn to_query_request(&self) -> PyQueryRequest {
        let mut req = self.inner_fts.to_query_request();
        let vec_req = self.inner_vec.to_query_request();
        req.query_vector = vec_req.query_vector;
        req.column = vec_req.column;
        req.distance_type = vec_req.distance_type;
        req.ef = vec_req.ef;
        req.refine_factor = vec_req.refine_factor;
        req.lower_bound = vec_req.lower_bound;
        req.upper_bound = vec_req.upper_bound;
        req
    }
}

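`HybridQuery::to_query_request` backs `to_query_object()` on the Python side: the FTS half is serialized first, then the vector-side fields are copied over it, so one request object carries both halves. A sketch using the builder path shown earlier in this diff (names mirror the async API; treat it as illustrative):

hybrid = table_async.query().nearest_to([0.0] * 32).nearest_to_text("puppy")
q = hybrid.to_query_object()
assert q.full_text_query.query == "puppy"  # FTS half
assert q.vector is not None                # vector half survives the merge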