mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 13:59:58 +00:00
Compare commits
12 Commits
lancedb-cl
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
66a881b33a | ||
|
|
a7515d6ee2 | ||
|
|
587c0824af | ||
|
|
b38a4269d0 | ||
|
|
119d88b9db | ||
|
|
74f660d223 | ||
|
|
b2b0979b90 | ||
|
|
ee2a40b182 | ||
|
|
4ca0b15354 | ||
|
|
d8c217b47d | ||
|
|
b724b1a01f | ||
|
|
abd75e0ead |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.13.0-beta.1"
|
||||
current_version = "0.13.0-beta.2"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
1
.github/workflows/nodejs.yml
vendored
1
.github/workflows/nodejs.yml
vendored
@@ -104,7 +104,6 @@ jobs:
|
||||
OPENAI_BASE_URL: http://0.0.0.0:8000
|
||||
run: |
|
||||
python ci/mock_openai.py &
|
||||
ss -ltnp | grep :8000
|
||||
cd nodejs/examples
|
||||
npm test
|
||||
macos:
|
||||
|
||||
382
.github/workflows/npm-publish.yml
vendored
382
.github/workflows/npm-publish.yml
vendored
@@ -226,108 +226,109 @@ jobs:
|
||||
path: |
|
||||
node/dist/lancedb-vectordb-win32*.tgz
|
||||
|
||||
node-windows-arm64:
|
||||
name: vectordb win32-arm64-msvc
|
||||
runs-on: windows-4x-arm
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install Git
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||
shell: powershell
|
||||
- name: Add Git to PATH
|
||||
run: |
|
||||
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||
shell: powershell
|
||||
- name: Configure Git symlinks
|
||||
run: git config --global core.symlinks true
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.13"
|
||||
- name: Install Visual Studio Build Tools
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||
"--installPath", "C:\BuildTools", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||
shell: powershell
|
||||
- name: Add Visual Studio Build Tools to PATH
|
||||
run: |
|
||||
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
|
||||
# node-windows-arm64:
|
||||
# name: vectordb win32-arm64-msvc
|
||||
# runs-on: windows-4x-arm
|
||||
# if: startsWith(github.ref, 'refs/tags/v')
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Install Git
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||
# shell: powershell
|
||||
# - name: Add Git to PATH
|
||||
# run: |
|
||||
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||
# shell: powershell
|
||||
# - name: Configure Git symlinks
|
||||
# run: git config --global core.symlinks true
|
||||
# - uses: actions/checkout@v4
|
||||
# - uses: actions/setup-python@v5
|
||||
# with:
|
||||
# python-version: "3.13"
|
||||
# - name: Install Visual Studio Build Tools
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||
# "--installPath", "C:\BuildTools", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||
# shell: powershell
|
||||
# - name: Add Visual Studio Build Tools to PATH
|
||||
# run: |
|
||||
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||
|
||||
# Add MSVC runtime libraries to LIB
|
||||
$env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
||||
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
||||
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||
Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
||||
# # Add MSVC runtime libraries to LIB
|
||||
# $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
||||
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
||||
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||
# Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
||||
|
||||
# Add INCLUDE paths
|
||||
$env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
||||
Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
||||
shell: powershell
|
||||
- name: Install Rust
|
||||
run: |
|
||||
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||
shell: powershell
|
||||
- name: Add Rust to PATH
|
||||
run: |
|
||||
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||
shell: powershell
|
||||
# # Add INCLUDE paths
|
||||
# $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
||||
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
||||
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
||||
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
||||
# Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
||||
# shell: powershell
|
||||
# - name: Install Rust
|
||||
# run: |
|
||||
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||
# shell: powershell
|
||||
# - name: Add Rust to PATH
|
||||
# run: |
|
||||
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||
# shell: powershell
|
||||
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
with:
|
||||
workspaces: rust
|
||||
- name: Install 7-Zip ARM
|
||||
run: |
|
||||
New-Item -Path 'C:\7zip' -ItemType Directory
|
||||
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||
shell: powershell
|
||||
- name: Add 7-Zip to PATH
|
||||
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||
shell: powershell
|
||||
- name: Install Protoc v21.12
|
||||
working-directory: C:\
|
||||
run: |
|
||||
if (Test-Path 'C:\protoc') {
|
||||
Write-Host "Protoc directory exists, skipping installation"
|
||||
return
|
||||
}
|
||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
||||
Set-Location C:\protoc
|
||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||
shell: powershell
|
||||
- name: Add Protoc to PATH
|
||||
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||
shell: powershell
|
||||
- name: Build Windows native node modules
|
||||
run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
|
||||
- name: Upload Windows ARM64 Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: node-native-windows-arm64
|
||||
path: |
|
||||
node/dist/*.node
|
||||
# - uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# workspaces: rust
|
||||
# - name: Install 7-Zip ARM
|
||||
# run: |
|
||||
# New-Item -Path 'C:\7zip' -ItemType Directory
|
||||
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||
# shell: powershell
|
||||
# - name: Add 7-Zip to PATH
|
||||
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||
# shell: powershell
|
||||
# - name: Install Protoc v21.12
|
||||
# working-directory: C:\
|
||||
# run: |
|
||||
# if (Test-Path 'C:\protoc') {
|
||||
# Write-Host "Protoc directory exists, skipping installation"
|
||||
# return
|
||||
# }
|
||||
# New-Item -Path 'C:\protoc' -ItemType Directory
|
||||
# Set-Location C:\protoc
|
||||
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||
# shell: powershell
|
||||
# - name: Add Protoc to PATH
|
||||
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||
# shell: powershell
|
||||
# - name: Build Windows native node modules
|
||||
# run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
|
||||
# - name: Upload Windows ARM64 Artifacts
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: node-native-windows-arm64
|
||||
# path: |
|
||||
# node/dist/*.node
|
||||
|
||||
nodejs-windows:
|
||||
name: lancedb ${{ matrix.target }}
|
||||
@@ -363,98 +364,99 @@ jobs:
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
|
||||
nodejs-windows-arm64:
|
||||
name: lancedb win32-arm64-msvc
|
||||
runs-on: windows-4x-arm
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install Git
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||
shell: powershell
|
||||
- name: Add Git to PATH
|
||||
run: |
|
||||
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||
shell: powershell
|
||||
- name: Configure Git symlinks
|
||||
run: git config --global core.symlinks true
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.13"
|
||||
- name: Install Visual Studio Build Tools
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||
"--installPath", "C:\BuildTools", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||
shell: powershell
|
||||
- name: Add Visual Studio Build Tools to PATH
|
||||
run: |
|
||||
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
|
||||
# nodejs-windows-arm64:
|
||||
# name: lancedb win32-arm64-msvc
|
||||
# runs-on: windows-4x-arm
|
||||
# if: startsWith(github.ref, 'refs/tags/v')
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Install Git
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||
# shell: powershell
|
||||
# - name: Add Git to PATH
|
||||
# run: |
|
||||
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||
# shell: powershell
|
||||
# - name: Configure Git symlinks
|
||||
# run: git config --global core.symlinks true
|
||||
# - uses: actions/checkout@v4
|
||||
# - uses: actions/setup-python@v5
|
||||
# with:
|
||||
# python-version: "3.13"
|
||||
# - name: Install Visual Studio Build Tools
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||
# "--installPath", "C:\BuildTools", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||
# shell: powershell
|
||||
# - name: Add Visual Studio Build Tools to PATH
|
||||
# run: |
|
||||
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||
|
||||
$env:LIB = ""
|
||||
Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||
shell: powershell
|
||||
- name: Install Rust
|
||||
run: |
|
||||
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||
shell: powershell
|
||||
- name: Add Rust to PATH
|
||||
run: |
|
||||
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||
shell: powershell
|
||||
# $env:LIB = ""
|
||||
# Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||
# shell: powershell
|
||||
# - name: Install Rust
|
||||
# run: |
|
||||
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||
# shell: powershell
|
||||
# - name: Add Rust to PATH
|
||||
# run: |
|
||||
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||
# shell: powershell
|
||||
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
with:
|
||||
workspaces: rust
|
||||
- name: Install 7-Zip ARM
|
||||
run: |
|
||||
New-Item -Path 'C:\7zip' -ItemType Directory
|
||||
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||
shell: powershell
|
||||
- name: Add 7-Zip to PATH
|
||||
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||
shell: powershell
|
||||
- name: Install Protoc v21.12
|
||||
working-directory: C:\
|
||||
run: |
|
||||
if (Test-Path 'C:\protoc') {
|
||||
Write-Host "Protoc directory exists, skipping installation"
|
||||
return
|
||||
}
|
||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
||||
Set-Location C:\protoc
|
||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||
shell: powershell
|
||||
- name: Add Protoc to PATH
|
||||
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||
shell: powershell
|
||||
- name: Build Windows native node modules
|
||||
run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
|
||||
- name: Upload Windows ARM64 Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: nodejs-native-windows-arm64
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
# - uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# workspaces: rust
|
||||
# - name: Install 7-Zip ARM
|
||||
# run: |
|
||||
# New-Item -Path 'C:\7zip' -ItemType Directory
|
||||
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||
# shell: powershell
|
||||
# - name: Add 7-Zip to PATH
|
||||
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||
# shell: powershell
|
||||
# - name: Install Protoc v21.12
|
||||
# working-directory: C:\
|
||||
# run: |
|
||||
# if (Test-Path 'C:\protoc') {
|
||||
# Write-Host "Protoc directory exists, skipping installation"
|
||||
# return
|
||||
# }
|
||||
# New-Item -Path 'C:\protoc' -ItemType Directory
|
||||
# Set-Location C:\protoc
|
||||
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||
# shell: powershell
|
||||
# - name: Add Protoc to PATH
|
||||
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||
# shell: powershell
|
||||
# - name: Build Windows native node modules
|
||||
# run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
|
||||
# - name: Upload Windows ARM64 Artifacts
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: nodejs-native-windows-arm64
|
||||
# path: |
|
||||
# nodejs/dist/*.node
|
||||
|
||||
release:
|
||||
name: vectordb NPM Publish
|
||||
@@ -476,7 +478,7 @@ jobs:
|
||||
env:
|
||||
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
|
||||
run: |
|
||||
# Tag beta as "preview" instead of default "latest". See lancedb
|
||||
# Tag beta as "preview" instead of default "latest". See lancedb
|
||||
# npm publish step for more info.
|
||||
if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
|
||||
PUBLISH_ARGS="--tag preview"
|
||||
|
||||
14
Cargo.toml
14
Cargo.toml
@@ -23,13 +23,13 @@ rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.19.2", "features" = [
|
||||
"dynamodb",
|
||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
||||
lance-index = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
||||
lance-linalg = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
||||
lance-table = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
||||
lance-testing = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
||||
lance-datafusion = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
||||
lance-encoding = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
||||
]}
|
||||
lance-index = "=0.19.2"
|
||||
lance-linalg = "=0.19.2"
|
||||
lance-table = "=0.19.2"
|
||||
lance-testing = "=0.19.2"
|
||||
lance-datafusion = "=0.19.2"
|
||||
lance-encoding = "=0.19.2"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "52.2", optional = false }
|
||||
arrow-array = "52.2"
|
||||
|
||||
@@ -790,6 +790,27 @@ Use the `drop_table()` method on the database to remove a table.
|
||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||
If the table does not exist an exception is raised.
|
||||
|
||||
## Handling bad vectors
|
||||
|
||||
In LanceDB Python, you can use the `on_bad_vectors` parameter to choose how
|
||||
invalid vector values are handled. Invalid vectors are vectors that are not valid
|
||||
because:
|
||||
|
||||
1. They are the wrong dimension
|
||||
2. They contain NaN values
|
||||
3. They are null but are on a non-nullable field
|
||||
|
||||
By default, LanceDB will raise an error if it encounters a bad vector. You can
|
||||
also choose one of the following options:
|
||||
|
||||
* `drop`: Ignore rows with bad vectors
|
||||
* `fill`: Replace bad values (NaNs) or missing values (too few dimensions) with
|
||||
the fill value specified in the `fill_value` parameter. An input like
|
||||
`[1.0, NaN, 3.0]` will be replaced with `[1.0, 0.0, 3.0]` if `fill_value=0.0`.
|
||||
* `null`: Replace bad vectors with null (only works if the column is nullable).
|
||||
A bad vector `[1.0, NaN, 3.0]` will be replaced with `null` if the column is
|
||||
nullable. If the vector column is non-nullable, then bad vectors will cause an
|
||||
error
|
||||
|
||||
## Consistency
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.13.0-beta.1</version>
|
||||
<version>0.13.0-beta.2</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.13.0-beta.1</version>
|
||||
<version>0.13.0-beta.2</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>LanceDB Parent</name>
|
||||
|
||||
76
node/package-lock.json
generated
76
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,12 +52,12 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -327,66 +327,6 @@
|
||||
"@jridgewell/sourcemap-codec": "^1.4.10"
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.13.0-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0-beta.1.tgz",
|
||||
"integrity": "sha512-beOrf6selCzzhLgDG8Nibma4nO/CSnA1wUKRmlJHEPtGcg7PW18z6MP/nfwQMpMR/FLRfTo8pPTbpzss47MiQQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.13.0-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0-beta.1.tgz",
|
||||
"integrity": "sha512-YdraGRF/RbJRkKh0v3xT03LUhq47T2GtCvJ5gZp8wKlh4pHa8LuhLU0DIdvmG/DT5vuQA+td8HDkBm/e3EOdNg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.13.0-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0-beta.1.tgz",
|
||||
"integrity": "sha512-Pp0O/uhEqof1oLaWrNbv+Ym+q8kBkiCqaA5+2eAZ6a3e9U+Ozkvb0FQrHuyi9adJ5wKQ4NabyQE9BMf2bYpOnQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.13.0-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0-beta.1.tgz",
|
||||
"integrity": "sha512-y8nxOye4egfWF5FGED9EfkmZ1O5HnRLU4a61B8m5JSpkivO9v2epTcbYN0yt/7ZFCgtqMfJ8VW4Mi7qQcz3KDA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.13.0-beta.1",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0-beta.1.tgz",
|
||||
"integrity": "sha512-STMDP9dp0TBLkB3ro+16pKcGy6bmbhRuEZZZ1Tp5P75yTPeVh4zIgWkidMdU1qBbEYM7xacnsp9QAwgLnMU/Ow==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
]
|
||||
},
|
||||
"node_modules/@neon-rs/cli": {
|
||||
"version": "0.0.160",
|
||||
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
@@ -89,11 +89,11 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.13.0-beta.1"
|
||||
version = "0.13.0-beta.2"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -187,6 +187,81 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
|
||||
},
|
||||
);
|
||||
|
||||
// TODO: https://github.com/lancedb/lancedb/issues/1832
|
||||
it.skip("should be able to omit nullable fields", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const schema = new arrow.Schema([
|
||||
new arrow.Field(
|
||||
"vector",
|
||||
new arrow.FixedSizeList(
|
||||
2,
|
||||
new arrow.Field("item", new arrow.Float64()),
|
||||
),
|
||||
true,
|
||||
),
|
||||
new arrow.Field("item", new arrow.Utf8(), true),
|
||||
new arrow.Field("price", new arrow.Float64(), false),
|
||||
]);
|
||||
const table = await db.createEmptyTable("test", schema);
|
||||
|
||||
const data1 = { item: "foo", price: 10.0 };
|
||||
await table.add([data1]);
|
||||
const data2 = { vector: [3.1, 4.1], price: 2.0 };
|
||||
await table.add([data2]);
|
||||
const data3 = { vector: [5.9, 26.5], item: "bar", price: 3.0 };
|
||||
await table.add([data3]);
|
||||
|
||||
let res = await table.query().limit(10).toArray();
|
||||
const resVector = res.map((r) => r.get("vector").toArray());
|
||||
expect(resVector).toEqual([null, data2.vector, data3.vector]);
|
||||
const resItem = res.map((r) => r.get("item").toArray());
|
||||
expect(resItem).toEqual(["foo", null, "bar"]);
|
||||
const resPrice = res.map((r) => r.get("price").toArray());
|
||||
expect(resPrice).toEqual([10.0, 2.0, 3.0]);
|
||||
|
||||
const data4 = { item: "foo" };
|
||||
// We can't omit a column if it's not nullable
|
||||
await expect(table.add([data4])).rejects.toThrow("Invalid user input");
|
||||
|
||||
// But we can alter columns to make them nullable
|
||||
await table.alterColumns([{ path: "price", nullable: true }]);
|
||||
await table.add([data4]);
|
||||
|
||||
res = (await table.query().limit(10).toArray()).map((r) => r.toJSON());
|
||||
expect(res).toEqual([data1, data2, data3, data4]);
|
||||
});
|
||||
|
||||
it("should be able to insert nullable data for non-nullable fields", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const schema = new arrow.Schema([
|
||||
new arrow.Field("x", new arrow.Float64(), false),
|
||||
new arrow.Field("id", new arrow.Utf8(), false),
|
||||
]);
|
||||
const table = await db.createEmptyTable("test", schema);
|
||||
|
||||
const data1 = { x: 4.1, id: "foo" };
|
||||
await table.add([data1]);
|
||||
const res = (await table.query().toArray())[0];
|
||||
expect(res.x).toEqual(data1.x);
|
||||
expect(res.id).toEqual(data1.id);
|
||||
|
||||
const data2 = { x: null, id: "bar" };
|
||||
await expect(table.add([data2])).rejects.toThrow(
|
||||
"declared as non-nullable but contains null values",
|
||||
);
|
||||
|
||||
// But we can alter columns to make them nullable
|
||||
await table.alterColumns([{ path: "x", nullable: true }]);
|
||||
await table.add([data2]);
|
||||
|
||||
const res2 = await table.query().toArray();
|
||||
expect(res2.length).toBe(2);
|
||||
expect(res2[0].x).toEqual(data1.x);
|
||||
expect(res2[0].id).toEqual(data1.id);
|
||||
expect(res2[1].x).toBeNull();
|
||||
expect(res2[1].id).toEqual(data2.id);
|
||||
});
|
||||
|
||||
it("should return the table as an instance of an arrow table", async () => {
|
||||
const arrowTbl = await table.toArrow();
|
||||
expect(arrowTbl).toBeInstanceOf(ArrowTable);
|
||||
@@ -998,4 +1073,18 @@ describe("column name options", () => {
|
||||
const results = await table.query().where("`camelCase` = 1").toArray();
|
||||
expect(results[0].camelCase).toBe(1);
|
||||
});
|
||||
|
||||
test("can make multiple vector queries in one go", async () => {
|
||||
const results = await table
|
||||
.query()
|
||||
.nearestTo([0.1, 0.2])
|
||||
.addQueryVector([0.1, 0.2])
|
||||
.limit(1)
|
||||
.toArray();
|
||||
console.log(results);
|
||||
expect(results.length).toBe(2);
|
||||
results.sort((a, b) => a.query_index - b.query_index);
|
||||
expect(results[0].query_index).toBe(0);
|
||||
expect(results[1].query_index).toBe(1);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,12 +6,16 @@ import { withTempDirectory } from "./util.ts";
|
||||
import * as lancedb from "@lancedb/lancedb";
|
||||
import "@lancedb/lancedb/embedding/transformers";
|
||||
import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding";
|
||||
import { EmbeddingFunction } from "@lancedb/lancedb/embedding";
|
||||
import { Utf8 } from "apache-arrow";
|
||||
|
||||
test("full text search", async () => {
|
||||
await withTempDirectory(async (databaseDir) => {
|
||||
const db = await lancedb.connect(databaseDir);
|
||||
const func = await getRegistry().get("huggingface").create();
|
||||
console.log(getRegistry());
|
||||
const func = (await getRegistry()
|
||||
.get("huggingface")
|
||||
?.create()) as EmbeddingFunction;
|
||||
|
||||
const facts = [
|
||||
"Albert Einstein was a theoretical physicist.",
|
||||
@@ -56,4 +60,4 @@ test("full text search", async () => {
|
||||
|
||||
expect(actual[0]["text"]).toBe("The human body has 206 bones.");
|
||||
});
|
||||
});
|
||||
}, 100_000);
|
||||
|
||||
@@ -19,9 +19,6 @@ import { EmbeddingFunctionConfig, getRegistry } from "./registry";
|
||||
|
||||
export { EmbeddingFunction, TextEmbeddingFunction } from "./embedding_function";
|
||||
|
||||
// We need to explicitly export '*' so that the `register` decorator actually registers the class.
|
||||
export * from "./openai";
|
||||
export * from "./transformers";
|
||||
export * from "./registry";
|
||||
|
||||
/**
|
||||
|
||||
@@ -17,8 +17,6 @@ import {
|
||||
type EmbeddingFunctionConstructor,
|
||||
} from "./embedding_function";
|
||||
import "reflect-metadata";
|
||||
import { OpenAIEmbeddingFunction } from "./openai";
|
||||
import { TransformersEmbeddingFunction } from "./transformers";
|
||||
|
||||
type CreateReturnType<T> = T extends { init: () => Promise<void> }
|
||||
? Promise<T>
|
||||
@@ -73,10 +71,6 @@ export class EmbeddingFunctionRegistry {
|
||||
};
|
||||
}
|
||||
|
||||
get(name: "openai"): EmbeddingFunctionCreate<OpenAIEmbeddingFunction>;
|
||||
get(
|
||||
name: "huggingface",
|
||||
): EmbeddingFunctionCreate<TransformersEmbeddingFunction>;
|
||||
get<T extends EmbeddingFunction<unknown>>(
|
||||
name: string,
|
||||
): EmbeddingFunctionCreate<T> | undefined;
|
||||
|
||||
@@ -492,6 +492,42 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
||||
super.doCall((inner) => inner.bypassVectorIndex());
|
||||
return this;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a query vector to the search
|
||||
*
|
||||
* This method can be called multiple times to add multiple query vectors
|
||||
* to the search. If multiple query vectors are added, then they will be searched
|
||||
* in parallel, and the results will be concatenated. A column called `query_index`
|
||||
* will be added to indicate the index of the query vector that produced the result.
|
||||
*
|
||||
* Performance wise, this is equivalent to running multiple queries concurrently.
|
||||
*/
|
||||
addQueryVector(vector: IntoVector): VectorQuery {
|
||||
if (vector instanceof Promise) {
|
||||
const res = (async () => {
|
||||
try {
|
||||
const v = await vector;
|
||||
const arr = Float32Array.from(v);
|
||||
//
|
||||
// biome-ignore lint/suspicious/noExplicitAny: we need to get the `inner`, but js has no package scoping
|
||||
const value: any = this.addQueryVector(arr);
|
||||
const inner = value.inner as
|
||||
| NativeVectorQuery
|
||||
| Promise<NativeVectorQuery>;
|
||||
return inner;
|
||||
} catch (e) {
|
||||
return Promise.reject(e);
|
||||
}
|
||||
})();
|
||||
return new VectorQuery(res);
|
||||
} else {
|
||||
super.doCall((inner) => {
|
||||
inner.addQueryVector(Float32Array.from(vector));
|
||||
});
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** A builder for LanceDB queries. */
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
@@ -10,11 +10,13 @@
|
||||
"vector database",
|
||||
"ann"
|
||||
],
|
||||
"version": "0.13.0-beta.1",
|
||||
"version": "0.13.0-beta.2",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
"./embedding": "./dist/embedding/index.js"
|
||||
"./embedding": "./dist/embedding/index.js",
|
||||
"./embedding/openai": "./dist/embedding/openai.js",
|
||||
"./embedding/transformers": "./dist/embedding/transformers.js"
|
||||
},
|
||||
"types": "dist/index.d.ts",
|
||||
"napi": {
|
||||
|
||||
@@ -135,6 +135,16 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().column(&column);
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn add_query_vector(&mut self, vector: Float32Array) -> Result<()> {
|
||||
self.inner = self
|
||||
.inner
|
||||
.clone()
|
||||
.add_query_vector(vector.as_ref())
|
||||
.default_error()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn distance_type(&mut self, distance_type: String) -> napi::Result<()> {
|
||||
let distance_type = parse_distance_type(distance_type)?;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.16.0-beta.0"
|
||||
current_version = "0.16.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.16.0-beta.0"
|
||||
version = "0.16.0"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -4,7 +4,7 @@ name = "lancedb"
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"nest-asyncio~=1.0",
|
||||
"pylance==0.19.2-beta.3",
|
||||
"pylance==0.19.2",
|
||||
"tqdm>=4.27.0",
|
||||
"pydantic>=1.10",
|
||||
"packaging",
|
||||
|
||||
@@ -943,12 +943,16 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
|
||||
class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||
def to_arrow(self) -> pa.Table:
|
||||
ds = self._table.to_lance()
|
||||
return ds.to_table(
|
||||
query = Query(
|
||||
columns=self._columns,
|
||||
filter=self._where,
|
||||
limit=self._limit,
|
||||
k=self._limit or 10,
|
||||
with_row_id=self._with_row_id,
|
||||
vector=[],
|
||||
# not actually respected in remote query
|
||||
offset=self._offset or 0,
|
||||
)
|
||||
return self._table._execute_query(query).read_all()
|
||||
|
||||
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
|
||||
"""Rerank the results using the specified reranker.
|
||||
@@ -1491,7 +1495,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
return pa.array(vec)
|
||||
|
||||
def nearest_to(
|
||||
self, query_vector: Optional[Union[VEC, Tuple]] = None
|
||||
self, query_vector: Optional[Union[VEC, Tuple, List[VEC]]] = None
|
||||
) -> AsyncVectorQuery:
|
||||
"""
|
||||
Find the nearest vectors to the given query vector.
|
||||
@@ -1529,10 +1533,30 @@ class AsyncQuery(AsyncQueryBase):
|
||||
|
||||
Vector searches always have a [limit][]. If `limit` has not been called then
|
||||
a default `limit` of 10 will be used.
|
||||
|
||||
Typically, a single vector is passed in as the query. However, you can also
|
||||
pass in multiple vectors. This can be useful if you want to find the nearest
|
||||
vectors to multiple query vectors. This is not expected to be faster than
|
||||
making multiple queries concurrently; it is just a convenience method.
|
||||
If multiple vectors are passed in then an additional column `query_index`
|
||||
will be added to the results. This column will contain the index of the
|
||||
query vector that the result is nearest to.
|
||||
"""
|
||||
return AsyncVectorQuery(
|
||||
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
|
||||
)
|
||||
if (
|
||||
isinstance(query_vector, list)
|
||||
and len(query_vector) > 0
|
||||
and not isinstance(query_vector[0], (float, int))
|
||||
):
|
||||
# multiple have been passed
|
||||
query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
|
||||
new_self = self._inner.nearest_to(query_vectors[0])
|
||||
for v in query_vectors[1:]:
|
||||
new_self.add_query_vector(v)
|
||||
return AsyncVectorQuery(new_self)
|
||||
else:
|
||||
return AsyncVectorQuery(
|
||||
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
|
||||
)
|
||||
|
||||
def nearest_to_text(
|
||||
self, query: str, columns: Union[str, List[str]] = []
|
||||
|
||||
@@ -327,10 +327,6 @@ class RemoteTable(Table):
|
||||
- and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vector.
|
||||
"""
|
||||
# empty query builder is not supported in saas, raise error
|
||||
if query is None and query_type != "hybrid":
|
||||
raise ValueError("Empty query is not supported")
|
||||
|
||||
return LanceQueryBuilder.create(
|
||||
self,
|
||||
query,
|
||||
|
||||
@@ -1567,7 +1567,7 @@ class LanceTable(Table):
|
||||
"append" and "overwrite".
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contains NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
One of "error", "drop", "fill", "null".
|
||||
fill_value: float, default 0.
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
|
||||
@@ -1851,7 +1851,7 @@ class LanceTable(Table):
|
||||
data but will validate against any schema that's specified.
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contains NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
One of "error", "drop", "fill", "null".
|
||||
fill_value: float, default 0.
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
embedding_functions: list of EmbeddingFunctionModel, default None
|
||||
@@ -2151,13 +2151,11 @@ def _sanitize_schema(
|
||||
vector column to fixed_size_list(float32) if necessary.
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contains NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
One of "error", "drop", "fill", "null".
|
||||
fill_value: float, default 0.
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
"""
|
||||
if schema is not None:
|
||||
if data.schema == schema:
|
||||
return data
|
||||
# cast the columns to the expected types
|
||||
data = data.combine_chunks()
|
||||
for field in schema:
|
||||
@@ -2177,6 +2175,7 @@ def _sanitize_schema(
|
||||
vector_column_name=field.name,
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
table_schema=schema,
|
||||
)
|
||||
return pa.Table.from_arrays(
|
||||
[data[name] for name in schema.names], schema=schema
|
||||
@@ -2197,6 +2196,7 @@ def _sanitize_schema(
|
||||
def _sanitize_vector_column(
|
||||
data: pa.Table,
|
||||
vector_column_name: str,
|
||||
table_schema: Optional[pa.Schema] = None,
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
) -> pa.Table:
|
||||
@@ -2211,12 +2211,16 @@ def _sanitize_vector_column(
|
||||
The name of the vector column.
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contains NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
One of "error", "drop", "fill", "null".
|
||||
fill_value: float, default 0.0
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
"""
|
||||
# ChunkedArray is annoying to work with, so we combine chunks here
|
||||
vec_arr = data[vector_column_name].combine_chunks()
|
||||
if table_schema is not None:
|
||||
field = table_schema.field(vector_column_name)
|
||||
else:
|
||||
field = None
|
||||
typ = data[vector_column_name].type
|
||||
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
|
||||
# if it's a variable size list array,
|
||||
@@ -2243,7 +2247,11 @@ def _sanitize_vector_column(
|
||||
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
||||
)
|
||||
else:
|
||||
if pc.any(pc.is_null(vec_arr.values, nan_is_null=True)).as_py():
|
||||
if (
|
||||
field is not None
|
||||
and not field.nullable
|
||||
and pc.any(pc.is_null(vec_arr.values)).as_py()
|
||||
) or (pc.any(pc.is_nan(vec_arr.values)).as_py()):
|
||||
data = _sanitize_nans(
|
||||
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
||||
)
|
||||
@@ -2287,6 +2295,12 @@ def _sanitize_jagged(data, fill_value, on_bad_vectors, vec_arr, vector_column_na
|
||||
)
|
||||
elif on_bad_vectors == "drop":
|
||||
data = data.filter(correct_ndims)
|
||||
elif on_bad_vectors == "null":
|
||||
data = data.set_column(
|
||||
data.column_names.index(vector_column_name),
|
||||
vector_column_name,
|
||||
pc.if_else(correct_ndims, vec_arr, pa.scalar(None)),
|
||||
)
|
||||
return data
|
||||
|
||||
|
||||
@@ -2303,7 +2317,8 @@ def _sanitize_nans(
|
||||
raise ValueError(
|
||||
f"Vector column {vector_column_name} has NaNs. "
|
||||
"Set on_bad_vectors='drop' to remove them, or "
|
||||
"set on_bad_vectors='fill' and fill_value=<value> to replace them."
|
||||
"set on_bad_vectors='fill' and fill_value=<value> to replace them. "
|
||||
"Or set on_bad_vectors='null' to replace them with null."
|
||||
)
|
||||
elif on_bad_vectors == "fill":
|
||||
if fill_value is None:
|
||||
@@ -2323,6 +2338,17 @@ def _sanitize_nans(
|
||||
np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
|
||||
not_nulls = np.any(np_arr, axis=1)
|
||||
data = data.filter(~not_nulls)
|
||||
elif on_bad_vectors == "null":
|
||||
# null = pa.nulls(len(vec_arr)).cast(vec_arr.type)
|
||||
# values = pc.if_else(pc.is_nan(vec_arr.values), fill_value, vec_arr.values)
|
||||
np_arr = np.isnan(vec_arr.values.to_numpy(zero_copy_only=False))
|
||||
np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
|
||||
no_nans = np.any(np_arr, axis=1)
|
||||
data = data.set_column(
|
||||
data.column_names.index(vector_column_name),
|
||||
vector_column_name,
|
||||
pc.if_else(no_nans, vec_arr, pa.scalar(None)),
|
||||
)
|
||||
return data
|
||||
|
||||
|
||||
@@ -2588,7 +2614,7 @@ class AsyncTable:
|
||||
"append" and "overwrite".
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contains NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
One of "error", "drop", "fill", "null".
|
||||
fill_value: float, default 0.
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
|
||||
|
||||
@@ -81,14 +81,15 @@ def test_embedding_function(tmp_path):
|
||||
|
||||
|
||||
def test_embedding_with_bad_results(tmp_path):
|
||||
@register("mock-embedding")
|
||||
class MockEmbeddingFunction(TextEmbeddingFunction):
|
||||
@register("null-embedding")
|
||||
class NullEmbeddingFunction(TextEmbeddingFunction):
|
||||
def ndims(self):
|
||||
return 128
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> list[Union[np.array, None]]:
|
||||
# Return None, which is bad if field is non-nullable
|
||||
return [
|
||||
None if i % 2 == 0 else np.random.randn(self.ndims())
|
||||
for i in range(len(texts))
|
||||
@@ -96,13 +97,17 @@ def test_embedding_with_bad_results(tmp_path):
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
model = registry.get("mock-embedding").create()
|
||||
model = registry.get("null-embedding").create()
|
||||
|
||||
class Schema(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
vector: Vector(model.ndims()) = model.VectorField()
|
||||
|
||||
table = db.create_table("test", schema=Schema, mode="overwrite")
|
||||
with pytest.raises(ValueError):
|
||||
# Default on_bad_vectors is "error"
|
||||
table.add([{"text": "hello world"}])
|
||||
|
||||
table.add(
|
||||
[{"text": "hello world"}, {"text": "bar"}],
|
||||
on_bad_vectors="drop",
|
||||
@@ -112,13 +117,33 @@ def test_embedding_with_bad_results(tmp_path):
|
||||
assert len(table) == 1
|
||||
assert df.iloc[0]["text"] == "bar"
|
||||
|
||||
# table = db.create_table("test2", schema=Schema, mode="overwrite")
|
||||
# table.add(
|
||||
# [{"text": "hello world"}, {"text": "bar"}],
|
||||
# )
|
||||
# assert len(table) == 2
|
||||
# tbl = table.to_arrow()
|
||||
# assert tbl["vector"].null_count == 1
|
||||
@register("nan-embedding")
|
||||
class NanEmbeddingFunction(TextEmbeddingFunction):
|
||||
def ndims(self):
|
||||
return 128
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> list[Union[np.array, None]]:
|
||||
# Return NaN to produce bad vectors
|
||||
return [
|
||||
[np.NAN] * 128 if i % 2 == 0 else np.random.randn(self.ndims())
|
||||
for i in range(len(texts))
|
||||
]
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
model = registry.get("nan-embedding").create()
|
||||
|
||||
table = db.create_table("test2", schema=Schema, mode="overwrite")
|
||||
table.alter_columns(dict(path="vector", nullable=True))
|
||||
table.add(
|
||||
[{"text": "hello world"}, {"text": "bar"}],
|
||||
on_bad_vectors="null",
|
||||
)
|
||||
assert len(table) == 2
|
||||
tbl = table.to_arrow()
|
||||
assert tbl["vector"].null_count == 1
|
||||
|
||||
|
||||
def test_with_existing_vectors(tmp_path):
|
||||
|
||||
@@ -197,6 +197,23 @@ def test_query_sync_minimal():
|
||||
assert data == expected
|
||||
|
||||
|
||||
def test_query_sync_empty_query():
|
||||
def handler(body):
|
||||
assert body == {
|
||||
"k": 10,
|
||||
"filter": "true",
|
||||
"vector": [],
|
||||
"columns": ["id"],
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3]})
|
||||
|
||||
with query_test_table(handler) as table:
|
||||
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
|
||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||
assert data == expected
|
||||
|
||||
|
||||
def test_query_sync_maximal():
|
||||
def handler(body):
|
||||
assert body == {
|
||||
@@ -229,6 +246,17 @@ def test_query_sync_maximal():
|
||||
)
|
||||
|
||||
|
||||
def test_query_sync_multiple_vectors():
|
||||
def handler(_body):
|
||||
return pa.table({"id": [1]})
|
||||
|
||||
with query_test_table(handler) as table:
|
||||
results = table.search([[1, 2, 3], [4, 5, 6]]).limit(1).to_list()
|
||||
assert len(results) == 2
|
||||
results.sort(key=lambda x: x["query_index"])
|
||||
assert results == [{"id": 1, "query_index": 0}, {"id": 1, "query_index": 1}]
|
||||
|
||||
|
||||
def test_query_sync_fts():
|
||||
def handler(body):
|
||||
assert body == {
|
||||
|
||||
@@ -240,6 +240,121 @@ def test_add(db):
|
||||
_add(table, schema)
|
||||
|
||||
|
||||
def test_add_subschema(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||
pa.field("item", pa.string(), nullable=True),
|
||||
pa.field("price", pa.float64(), nullable=False),
|
||||
]
|
||||
)
|
||||
table = db.create_table("test", schema=schema)
|
||||
|
||||
data = {"price": 10.0, "item": "foo"}
|
||||
table.add([data])
|
||||
data = {"price": 2.0, "vector": [3.1, 4.1]}
|
||||
table.add([data])
|
||||
data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
|
||||
table.add([data])
|
||||
|
||||
expected = pa.table(
|
||||
{
|
||||
"vector": [None, [3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", None, "bar"],
|
||||
"price": [10.0, 2.0, 3.0],
|
||||
},
|
||||
schema=schema,
|
||||
)
|
||||
assert table.to_arrow() == expected
|
||||
|
||||
data = {"item": "foo"}
|
||||
# We can't omit a column if it's not nullable
|
||||
with pytest.raises(OSError, match="Invalid user input"):
|
||||
table.add([data])
|
||||
|
||||
# We can add it if we make the column nullable
|
||||
table.alter_columns(dict(path="price", nullable=True))
|
||||
table.add([data])
|
||||
|
||||
expected_schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||
pa.field("item", pa.string(), nullable=True),
|
||||
pa.field("price", pa.float64(), nullable=True),
|
||||
]
|
||||
)
|
||||
expected = pa.table(
|
||||
{
|
||||
"vector": [None, [3.1, 4.1], [5.9, 26.5], None],
|
||||
"item": ["foo", None, "bar", "foo"],
|
||||
"price": [10.0, 2.0, 3.0, None],
|
||||
},
|
||||
schema=expected_schema,
|
||||
)
|
||||
assert table.to_arrow() == expected
|
||||
|
||||
|
||||
def test_add_nullability(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2), nullable=False),
|
||||
pa.field("id", pa.string(), nullable=False),
|
||||
]
|
||||
)
|
||||
table = db.create_table("test", schema=schema)
|
||||
|
||||
nullable_schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||
pa.field("id", pa.string(), nullable=True),
|
||||
]
|
||||
)
|
||||
data = pa.table(
|
||||
{
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"id": ["foo", "bar"],
|
||||
},
|
||||
schema=nullable_schema,
|
||||
)
|
||||
# We can add nullable schema if it doesn't actually contain nulls
|
||||
table.add(data)
|
||||
|
||||
expected = data.cast(schema)
|
||||
assert table.to_arrow() == expected
|
||||
|
||||
data = pa.table(
|
||||
{
|
||||
"vector": [None],
|
||||
"id": ["baz"],
|
||||
},
|
||||
schema=nullable_schema,
|
||||
)
|
||||
# We can't add nullable schema if it contains nulls
|
||||
with pytest.raises(Exception, match="Vector column vector has NaNs"):
|
||||
table.add(data)
|
||||
|
||||
# But we can make it nullable
|
||||
table.alter_columns(dict(path="vector", nullable=True))
|
||||
table.add(data)
|
||||
|
||||
expected_schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||
pa.field("id", pa.string(), nullable=False),
|
||||
]
|
||||
)
|
||||
expected = pa.table(
|
||||
{
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5], None],
|
||||
"id": ["foo", "bar", "baz"],
|
||||
},
|
||||
schema=expected_schema,
|
||||
)
|
||||
assert table.to_arrow() == expected
|
||||
|
||||
|
||||
def test_add_pydantic_model(db):
|
||||
# https://github.com/lancedb/lancedb/issues/562
|
||||
|
||||
@@ -892,10 +1007,15 @@ def test_empty_query(db):
|
||||
table = LanceTable.create(db, "my_table2", data=[{"id": i} for i in range(100)])
|
||||
df = table.search().select(["id"]).to_pandas()
|
||||
assert len(df) == 10
|
||||
# None is the same as default
|
||||
df = table.search().select(["id"]).limit(None).to_pandas()
|
||||
assert len(df) == 100
|
||||
assert len(df) == 10
|
||||
# invalid limist is the same as None, wihch is the same as default
|
||||
df = table.search().select(["id"]).limit(-1).to_pandas()
|
||||
assert len(df) == 100
|
||||
assert len(df) == 10
|
||||
# valid limit should work
|
||||
df = table.search().select(["id"]).limit(42).to_pandas()
|
||||
assert len(df) == 42
|
||||
|
||||
|
||||
def test_search_with_schema_inf_single_vector(db):
|
||||
|
||||
@@ -142,6 +142,13 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().only_if(predicate);
|
||||
}
|
||||
|
||||
pub fn add_query_vector(&mut self, vector: Bound<'_, PyAny>) -> PyResult<()> {
|
||||
let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
|
||||
let array = make_array(data);
|
||||
self.inner = self.inner.clone().add_query_vector(array).infer_error()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn select(&mut self, columns: Vec<(String, String)>) {
|
||||
self.inner = self.inner.clone().select(Select::dynamic(&columns));
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-node"
|
||||
version = "0.13.0-beta.1"
|
||||
version = "0.13.0-beta.2"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.13.0-beta.1"
|
||||
version = "0.13.0-beta.2"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
@@ -46,6 +46,7 @@ serde = { version = "^1" }
|
||||
serde_json = { version = "1" }
|
||||
async-openai = { version = "0.20.0", optional = true }
|
||||
serde_with = { version = "3.8.1" }
|
||||
aws-sdk-bedrockruntime = { version = "1.27.0", optional = true }
|
||||
# For remote feature
|
||||
reqwest = { version = "0.12.0", features = ["gzip", "json", "stream"], optional = true }
|
||||
rand = { version = "0.8.3", features = ["small_rng"], optional = true}
|
||||
@@ -72,11 +73,13 @@ aws-config = { version = "1.0" }
|
||||
aws-smithy-runtime = { version = "1.3" }
|
||||
http-body = "1" # Matching reqwest
|
||||
|
||||
|
||||
[features]
|
||||
default = []
|
||||
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
|
||||
fp16kernels = ["lance-linalg/fp16kernels"]
|
||||
s3-test = []
|
||||
bedrock = ["dep:aws-sdk-bedrockruntime"]
|
||||
openai = ["dep:async-openai", "dep:reqwest"]
|
||||
polars = ["dep:polars-arrow", "dep:polars"]
|
||||
sentence-transformers = [
|
||||
@@ -94,3 +97,7 @@ required-features = ["openai"]
|
||||
[[example]]
|
||||
name = "sentence_transformers"
|
||||
required-features = ["sentence-transformers"]
|
||||
|
||||
[[example]]
|
||||
name = "bedrock"
|
||||
required-features = ["bedrock"]
|
||||
|
||||
89
rust/lancedb/examples/bedrock.rs
Normal file
89
rust/lancedb/examples/bedrock.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
use std::{iter::once, sync::Arc};
|
||||
|
||||
use arrow_array::{Float64Array, Int32Array, RecordBatch, RecordBatchIterator, StringArray};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use aws_config::Region;
|
||||
use aws_sdk_bedrockruntime::Client;
|
||||
use futures::StreamExt;
|
||||
use lancedb::{
|
||||
arrow::IntoArrow,
|
||||
connect,
|
||||
embeddings::{bedrock::BedrockEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},
|
||||
query::{ExecutableQuery, QueryBase},
|
||||
Result,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let tempdir = tempfile::tempdir().unwrap();
|
||||
let tempdir = tempdir.path().to_str().unwrap();
|
||||
|
||||
// create Bedrock embedding function
|
||||
let region: String = "us-east-1".to_string();
|
||||
let config = aws_config::defaults(aws_config::BehaviorVersion::latest())
|
||||
.region(Region::new(region))
|
||||
.load()
|
||||
.await;
|
||||
|
||||
let embedding = Arc::new(BedrockEmbeddingFunction::new(
|
||||
Client::new(&config), // AWS Region
|
||||
));
|
||||
|
||||
let db = connect(tempdir).execute().await?;
|
||||
db.embedding_registry()
|
||||
.register("bedrock", embedding.clone())?;
|
||||
|
||||
let table = db
|
||||
.create_table("vectors", make_data())
|
||||
.add_embedding(EmbeddingDefinition::new(
|
||||
"text",
|
||||
"bedrock",
|
||||
Some("embeddings"),
|
||||
))?
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
// execute vector search
|
||||
let query = Arc::new(StringArray::from_iter_values(once("something warm")));
|
||||
let query_vector = embedding.compute_query_embeddings(query)?;
|
||||
let mut results = table
|
||||
.vector_search(query_vector)?
|
||||
.limit(1)
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
let rb = results.next().await.unwrap()?;
|
||||
let out = rb
|
||||
.column_by_name("text")
|
||||
.unwrap()
|
||||
.as_any()
|
||||
.downcast_ref::<StringArray>()
|
||||
.unwrap();
|
||||
let text = out.iter().next().unwrap().unwrap();
|
||||
println!("Closest match: {}", text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_data() -> impl IntoArrow {
|
||||
let schema = Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, true),
|
||||
Field::new("text", DataType::Utf8, false),
|
||||
Field::new("price", DataType::Float64, false),
|
||||
]);
|
||||
|
||||
let id = Int32Array::from(vec![1, 2, 3, 4]);
|
||||
let text = StringArray::from_iter_values(vec![
|
||||
"Black T-Shirt",
|
||||
"Leather Jacket",
|
||||
"Winter Parka",
|
||||
"Hooded Sweatshirt",
|
||||
]);
|
||||
let price = Float64Array::from(vec![10.0, 50.0, 100.0, 30.0]);
|
||||
let schema = Arc::new(schema);
|
||||
let rb = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![Arc::new(id), Arc::new(text), Arc::new(price)],
|
||||
)
|
||||
.unwrap();
|
||||
Box::new(RecordBatchIterator::new(vec![Ok(rb)], schema))
|
||||
}
|
||||
@@ -17,6 +17,9 @@ pub mod openai;
|
||||
#[cfg(feature = "sentence-transformers")]
|
||||
pub mod sentence_transformers;
|
||||
|
||||
#[cfg(feature = "bedrock")]
|
||||
pub mod bedrock;
|
||||
|
||||
use lance::arrow::RecordBatchExt;
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
|
||||
210
rust/lancedb/src/embeddings/bedrock.rs
Normal file
210
rust/lancedb/src/embeddings/bedrock.rs
Normal file
@@ -0,0 +1,210 @@
|
||||
use aws_sdk_bedrockruntime::Client as BedrockClient;
|
||||
use std::{borrow::Cow, fmt::Formatter, str::FromStr, sync::Arc};
|
||||
|
||||
use arrow::array::{AsArray, Float32Builder};
|
||||
use arrow_array::{Array, ArrayRef, FixedSizeListArray, Float32Array};
|
||||
use arrow_data::ArrayData;
|
||||
use arrow_schema::DataType;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use super::EmbeddingFunction;
|
||||
use crate::{Error, Result};
|
||||
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::task::block_in_place;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum BedrockEmbeddingModel {
|
||||
TitanEmbedding,
|
||||
CohereLarge,
|
||||
}
|
||||
|
||||
impl BedrockEmbeddingModel {
|
||||
fn ndims(&self) -> usize {
|
||||
match self {
|
||||
Self::TitanEmbedding => 1536,
|
||||
Self::CohereLarge => 1024,
|
||||
}
|
||||
}
|
||||
|
||||
fn model_id(&self) -> &str {
|
||||
match self {
|
||||
Self::TitanEmbedding => "amazon.titan-embed-text-v1",
|
||||
Self::CohereLarge => "cohere.embed-english-v3",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for BedrockEmbeddingModel {
|
||||
type Err = Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
match s {
|
||||
"titan-embed-text-v1" => Ok(Self::TitanEmbedding),
|
||||
"cohere-embed-english-v3" => Ok(Self::CohereLarge),
|
||||
_ => Err(Error::InvalidInput {
|
||||
message: "Invalid model. Available models are: 'titan-embed-text-v1', 'cohere-embed-english-v3'".to_string()
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BedrockEmbeddingFunction {
|
||||
model: BedrockEmbeddingModel,
|
||||
client: BedrockClient,
|
||||
}
|
||||
|
||||
impl BedrockEmbeddingFunction {
|
||||
pub fn new(client: BedrockClient) -> Self {
|
||||
Self {
|
||||
model: BedrockEmbeddingModel::TitanEmbedding,
|
||||
client,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_model(client: BedrockClient, model: BedrockEmbeddingModel) -> Self {
|
||||
Self { model, client }
|
||||
}
|
||||
}
|
||||
|
||||
impl EmbeddingFunction for BedrockEmbeddingFunction {
|
||||
fn name(&self) -> &str {
|
||||
"bedrock"
|
||||
}
|
||||
|
||||
fn source_type(&self) -> Result<Cow<DataType>> {
|
||||
Ok(Cow::Owned(DataType::Utf8))
|
||||
}
|
||||
|
||||
fn dest_type(&self) -> Result<Cow<DataType>> {
|
||||
let n_dims = self.model.ndims();
|
||||
Ok(Cow::Owned(DataType::new_fixed_size_list(
|
||||
DataType::Float32,
|
||||
n_dims as i32,
|
||||
false,
|
||||
)))
|
||||
}
|
||||
|
||||
fn compute_source_embeddings(&self, source: ArrayRef) -> Result<ArrayRef> {
|
||||
let len = source.len();
|
||||
let n_dims = self.model.ndims();
|
||||
let inner = self.compute_inner(source)?;
|
||||
|
||||
let fsl = DataType::new_fixed_size_list(DataType::Float32, n_dims as i32, false);
|
||||
|
||||
let array_data = ArrayData::builder(fsl)
|
||||
.len(len)
|
||||
.add_child_data(inner.into_data())
|
||||
.build()?;
|
||||
|
||||
Ok(Arc::new(FixedSizeListArray::from(array_data)))
|
||||
}
|
||||
|
||||
fn compute_query_embeddings(&self, input: Arc<dyn Array>) -> Result<Arc<dyn Array>> {
|
||||
let arr = self.compute_inner(input)?;
|
||||
Ok(Arc::new(arr))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for BedrockEmbeddingFunction {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("BedrockEmbeddingFunction")
|
||||
.field("model", &self.model)
|
||||
// Skip client field as it doesn't implement Debug
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl BedrockEmbeddingFunction {
|
||||
fn compute_inner(&self, source: Arc<dyn Array>) -> Result<Float32Array> {
|
||||
if source.is_nullable() {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "Expected non-nullable data type".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
if !matches!(source.data_type(), DataType::Utf8 | DataType::LargeUtf8) {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "Expected Utf8 data type".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut builder = Float32Builder::new();
|
||||
|
||||
let texts = match source.data_type() {
|
||||
DataType::Utf8 => source
|
||||
.as_string::<i32>()
|
||||
.into_iter()
|
||||
.map(|s| s.expect("array is non-nullable").to_string())
|
||||
.collect::<Vec<String>>(),
|
||||
DataType::LargeUtf8 => source
|
||||
.as_string::<i64>()
|
||||
.into_iter()
|
||||
.map(|s| s.expect("array is non-nullable").to_string())
|
||||
.collect::<Vec<String>>(),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
for text in texts {
|
||||
let request_body = match self.model {
|
||||
BedrockEmbeddingModel::TitanEmbedding => {
|
||||
json!({
|
||||
"inputText": text
|
||||
})
|
||||
}
|
||||
BedrockEmbeddingModel::CohereLarge => {
|
||||
json!({
|
||||
"texts": [text],
|
||||
"input_type": "search_document"
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
let client = self.client.clone();
|
||||
let model_id = self.model.model_id().to_string();
|
||||
let request_body = request_body.clone();
|
||||
|
||||
let response = block_in_place(move || {
|
||||
Handle::current().block_on(async move {
|
||||
client
|
||||
.invoke_model()
|
||||
.model_id(model_id)
|
||||
.body(aws_sdk_bedrockruntime::primitives::Blob::new(
|
||||
serde_json::to_vec(&request_body).unwrap(),
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
})
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let response_json: Value =
|
||||
serde_json::from_slice(response.body.as_ref()).map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to parse response: {}", e),
|
||||
})?;
|
||||
|
||||
let embedding = match self.model {
|
||||
BedrockEmbeddingModel::TitanEmbedding => response_json["embedding"]
|
||||
.as_array()
|
||||
.ok_or_else(|| Error::Runtime {
|
||||
message: "Missing embedding in response".to_string(),
|
||||
})?
|
||||
.iter()
|
||||
.map(|v| v.as_f64().unwrap() as f32)
|
||||
.collect::<Vec<f32>>(),
|
||||
BedrockEmbeddingModel::CohereLarge => response_json["embeddings"][0]
|
||||
.as_array()
|
||||
.ok_or_else(|| Error::Runtime {
|
||||
message: "Missing embeddings in response".to_string(),
|
||||
})?
|
||||
.iter()
|
||||
.map(|v| v.as_f64().unwrap() as f32)
|
||||
.collect::<Vec<f32>>(),
|
||||
};
|
||||
|
||||
builder.append_slice(&embedding);
|
||||
}
|
||||
|
||||
Ok(builder.finish())
|
||||
}
|
||||
}
|
||||
@@ -475,6 +475,7 @@ impl<T: HasQuery> QueryBase for T {
|
||||
|
||||
/// Options for controlling the execution of a query
|
||||
#[non_exhaustive]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct QueryExecutionOptions {
|
||||
/// The maximum number of rows that will be contained in a single
|
||||
/// `RecordBatch` delivered by the query.
|
||||
@@ -650,7 +651,7 @@ impl Query {
|
||||
pub fn nearest_to(self, vector: impl IntoQueryVector) -> Result<VectorQuery> {
|
||||
let mut vector_query = self.into_vector();
|
||||
let query_vector = vector.to_query_vector(&DataType::Float32, "default")?;
|
||||
vector_query.query_vector = Some(query_vector);
|
||||
vector_query.query_vector.push(query_vector);
|
||||
Ok(vector_query)
|
||||
}
|
||||
}
|
||||
@@ -701,7 +702,7 @@ pub struct VectorQuery {
|
||||
// the column based on the dataset's schema.
|
||||
pub(crate) column: Option<String>,
|
||||
// IVF PQ - ANN search.
|
||||
pub(crate) query_vector: Option<Arc<dyn Array>>,
|
||||
pub(crate) query_vector: Vec<Arc<dyn Array>>,
|
||||
pub(crate) nprobes: usize,
|
||||
pub(crate) refine_factor: Option<u32>,
|
||||
pub(crate) distance_type: Option<DistanceType>,
|
||||
@@ -714,7 +715,7 @@ impl VectorQuery {
|
||||
Self {
|
||||
base,
|
||||
column: None,
|
||||
query_vector: None,
|
||||
query_vector: Vec::new(),
|
||||
nprobes: 20,
|
||||
refine_factor: None,
|
||||
distance_type: None,
|
||||
@@ -734,6 +735,22 @@ impl VectorQuery {
|
||||
self
|
||||
}
|
||||
|
||||
/// Add another query vector to the search.
|
||||
///
|
||||
/// Multiple searches will be dispatched as part of the query.
|
||||
/// This is a convenience method for adding multiple query vectors
|
||||
/// to the search. It is not expected to be faster than issuing
|
||||
/// multiple queries concurrently.
|
||||
///
|
||||
/// The output data will contain an additional columns `query_index` which
|
||||
/// will contain the index of the query vector that was used to generate the
|
||||
/// result.
|
||||
pub fn add_query_vector(mut self, vector: impl IntoQueryVector) -> Result<Self> {
|
||||
let query_vector = vector.to_query_vector(&DataType::Float32, "default")?;
|
||||
self.query_vector.push(query_vector);
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Set the number of partitions to search (probe)
|
||||
///
|
||||
/// This argument is only used when the vector column has an IVF PQ index.
|
||||
@@ -854,6 +871,7 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::*;
|
||||
use arrow::{compute::concat_batches, datatypes::Int32Type};
|
||||
use arrow_array::{
|
||||
cast::AsArray, Float32Array, Int32Array, RecordBatch, RecordBatchIterator,
|
||||
RecordBatchReader,
|
||||
@@ -883,7 +901,10 @@ mod tests {
|
||||
|
||||
let vector = Float32Array::from_iter_values([0.1, 0.2]);
|
||||
let query = table.query().nearest_to(&[0.1, 0.2]).unwrap();
|
||||
assert_eq!(*query.query_vector.unwrap().as_ref().as_primitive(), vector);
|
||||
assert_eq!(
|
||||
*query.query_vector.first().unwrap().as_ref().as_primitive(),
|
||||
vector
|
||||
);
|
||||
|
||||
let new_vector = Float32Array::from_iter_values([9.8, 8.7]);
|
||||
|
||||
@@ -899,7 +920,7 @@ mod tests {
|
||||
.refine_factor(999);
|
||||
|
||||
assert_eq!(
|
||||
*query.query_vector.unwrap().as_ref().as_primitive(),
|
||||
*query.query_vector.first().unwrap().as_ref().as_primitive(),
|
||||
new_vector
|
||||
);
|
||||
assert_eq!(query.base.limit.unwrap(), 100);
|
||||
@@ -1197,4 +1218,34 @@ mod tests {
|
||||
assert!(batch.column_by_name("_rowid").is_some());
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multiple_query_vectors() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let table = make_test_table(&tmp_dir).await;
|
||||
let query = table
|
||||
.query()
|
||||
.nearest_to(&[0.1, 0.2, 0.3, 0.4])
|
||||
.unwrap()
|
||||
.add_query_vector(&[0.5, 0.6, 0.7, 0.8])
|
||||
.unwrap()
|
||||
.limit(1);
|
||||
|
||||
let plan = query.explain_plan(true).await.unwrap();
|
||||
assert!(plan.contains("UnionExec"));
|
||||
|
||||
let results = query
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
let results = concat_batches(&results[0].schema(), &results).unwrap();
|
||||
assert_eq!(results.num_rows(), 2); // One result for each query vector.
|
||||
let query_index = results["query_index"].as_primitive::<Int32Type>();
|
||||
// We don't guarantee order.
|
||||
assert!(query_index.values().contains(&0));
|
||||
assert!(query_index.values().contains(&1));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::index::IndexStatistics;
|
||||
use crate::query::Select;
|
||||
use crate::table::AddDataMode;
|
||||
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
|
||||
use crate::Error;
|
||||
use crate::{Error, Table};
|
||||
use arrow_array::RecordBatchReader;
|
||||
use arrow_ipc::reader::FileReader;
|
||||
use arrow_schema::{DataType, SchemaRef};
|
||||
@@ -185,6 +185,71 @@ impl<S: HttpSend> RemoteTable<S> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn apply_vector_query_params(
|
||||
mut body: serde_json::Value,
|
||||
query: &VectorQuery,
|
||||
) -> Result<Vec<serde_json::Value>> {
|
||||
Self::apply_query_params(&mut body, &query.base)?;
|
||||
|
||||
// Apply general parameters, before we dispatch based on number of query vectors.
|
||||
body["prefilter"] = query.base.prefilter.into();
|
||||
body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
|
||||
body["nprobes"] = query.nprobes.into();
|
||||
body["refine_factor"] = query.refine_factor.into();
|
||||
if let Some(vector_column) = query.column.as_ref() {
|
||||
body["vector_column"] = serde_json::Value::String(vector_column.clone());
|
||||
}
|
||||
if !query.use_index {
|
||||
body["bypass_vector_index"] = serde_json::Value::Bool(true);
|
||||
}
|
||||
|
||||
fn vector_to_json(vector: &arrow_array::ArrayRef) -> Result<serde_json::Value> {
|
||||
match vector.data_type() {
|
||||
DataType::Float32 => {
|
||||
let array = vector
|
||||
.as_any()
|
||||
.downcast_ref::<arrow_array::Float32Array>()
|
||||
.unwrap();
|
||||
Ok(serde_json::Value::Array(
|
||||
array
|
||||
.values()
|
||||
.iter()
|
||||
.map(|v| {
|
||||
serde_json::Value::Number(
|
||||
serde_json::Number::from_f64(*v as f64).unwrap(),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
))
|
||||
}
|
||||
_ => Err(Error::InvalidInput {
|
||||
message: "VectorQuery vector must be of type Float32".into(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
match query.query_vector.len() {
|
||||
0 => {
|
||||
// Server takes empty vector, not null or undefined.
|
||||
body["vector"] = serde_json::Value::Array(Vec::new());
|
||||
Ok(vec![body])
|
||||
}
|
||||
1 => {
|
||||
body["vector"] = vector_to_json(&query.query_vector[0])?;
|
||||
Ok(vec![body])
|
||||
}
|
||||
_ => {
|
||||
let mut bodies = Vec::with_capacity(query.query_vector.len());
|
||||
for vector in &query.query_vector {
|
||||
let mut body = body.clone();
|
||||
body["vector"] = vector_to_json(vector)?;
|
||||
bodies.push(body);
|
||||
}
|
||||
Ok(bodies)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
@@ -306,51 +371,29 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
||||
) -> Result<Arc<dyn ExecutionPlan>> {
|
||||
let request = self.client.post(&format!("/v1/table/{}/query/", self.name));
|
||||
|
||||
let mut body = serde_json::Value::Object(Default::default());
|
||||
Self::apply_query_params(&mut body, &query.base)?;
|
||||
let body = serde_json::Value::Object(Default::default());
|
||||
let bodies = Self::apply_vector_query_params(body, query)?;
|
||||
|
||||
body["prefilter"] = query.base.prefilter.into();
|
||||
body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
|
||||
body["nprobes"] = query.nprobes.into();
|
||||
body["refine_factor"] = query.refine_factor.into();
|
||||
|
||||
let vector: Vec<f32> = if let Some(vector) = query.query_vector.as_ref() {
|
||||
match vector.data_type() {
|
||||
DataType::Float32 => vector
|
||||
.as_any()
|
||||
.downcast_ref::<arrow_array::Float32Array>()
|
||||
.unwrap()
|
||||
.values()
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect(),
|
||||
_ => {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "VectorQuery vector must be of type Float32".into(),
|
||||
})
|
||||
}
|
||||
}
|
||||
let mut futures = Vec::with_capacity(bodies.len());
|
||||
for body in bodies {
|
||||
let request = request.try_clone().unwrap().json(&body);
|
||||
let future = async move {
|
||||
let (request_id, response) = self.client.send(request, true).await?;
|
||||
self.read_arrow_stream(&request_id, response).await
|
||||
};
|
||||
futures.push(future);
|
||||
}
|
||||
let streams = futures::future::try_join_all(futures).await?;
|
||||
if streams.len() == 1 {
|
||||
let stream = streams.into_iter().next().unwrap();
|
||||
Ok(Arc::new(OneShotExec::new(stream)))
|
||||
} else {
|
||||
// Server takes empty vector, not null or undefined.
|
||||
Vec::new()
|
||||
};
|
||||
body["vector"] = serde_json::json!(vector);
|
||||
|
||||
if let Some(vector_column) = query.column.as_ref() {
|
||||
body["vector_column"] = serde_json::Value::String(vector_column.clone());
|
||||
let stream_execs = streams
|
||||
.into_iter()
|
||||
.map(|stream| Arc::new(OneShotExec::new(stream)) as Arc<dyn ExecutionPlan>)
|
||||
.collect();
|
||||
Table::multi_vector_plan(stream_execs)
|
||||
}
|
||||
|
||||
if !query.use_index {
|
||||
body["bypass_vector_index"] = serde_json::Value::Bool(true);
|
||||
}
|
||||
|
||||
let request = request.json(&body);
|
||||
|
||||
let (request_id, response) = self.client.send(request, true).await?;
|
||||
|
||||
let stream = self.read_arrow_stream(&request_id, response).await?;
|
||||
|
||||
Ok(Arc::new(OneShotExec::new(stream)))
|
||||
}
|
||||
|
||||
async fn plain_query(
|
||||
@@ -655,6 +698,7 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
|
||||
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
||||
@@ -1207,6 +1251,52 @@ mod tests {
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_multiple_vectors() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(request.url().path(), "/v1/table/my_table/query/");
|
||||
assert_eq!(
|
||||
request.headers().get("Content-Type").unwrap(),
|
||||
JSON_CONTENT_TYPE
|
||||
);
|
||||
let data = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
|
||||
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
|
||||
)
|
||||
.unwrap();
|
||||
let response_body = write_ipc_file(&data);
|
||||
http::Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, ARROW_FILE_CONTENT_TYPE)
|
||||
.body(response_body)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let query = table
|
||||
.query()
|
||||
.nearest_to(vec![0.1, 0.2, 0.3])
|
||||
.unwrap()
|
||||
.add_query_vector(vec![0.4, 0.5, 0.6])
|
||||
.unwrap();
|
||||
let plan = query.explain_plan(true).await.unwrap();
|
||||
assert!(plan.contains("UnionExec"), "Plan: {}", plan);
|
||||
|
||||
let results = query
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
let results = concat_batches(&results[0].schema(), &results).unwrap();
|
||||
|
||||
let query_index = results["query_index"].as_primitive::<Int32Type>();
|
||||
// We don't guarantee order.
|
||||
assert!(query_index.values().contains(&0));
|
||||
assert!(query_index.values().contains(&1));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_index() {
|
||||
let cases = [
|
||||
|
||||
@@ -24,6 +24,9 @@ use arrow_array::{RecordBatchIterator, RecordBatchReader};
|
||||
use arrow_schema::{Field, Schema, SchemaRef};
|
||||
use async_trait::async_trait;
|
||||
use datafusion_physical_plan::display::DisplayableExecutionPlan;
|
||||
use datafusion_physical_plan::projection::ProjectionExec;
|
||||
use datafusion_physical_plan::repartition::RepartitionExec;
|
||||
use datafusion_physical_plan::union::UnionExec;
|
||||
use datafusion_physical_plan::ExecutionPlan;
|
||||
use futures::{StreamExt, TryStreamExt};
|
||||
use lance::dataset::builder::DatasetBuilder;
|
||||
@@ -972,6 +975,57 @@ impl Table {
|
||||
) -> Result<Option<IndexStatistics>> {
|
||||
self.inner.index_stats(index_name.as_ref()).await
|
||||
}
|
||||
|
||||
// Take many execution plans and map them into a single plan that adds
|
||||
// a query_index column and unions them.
|
||||
pub(crate) fn multi_vector_plan(
|
||||
plans: Vec<Arc<dyn ExecutionPlan>>,
|
||||
) -> Result<Arc<dyn ExecutionPlan>> {
|
||||
if plans.is_empty() {
|
||||
return Err(Error::InvalidInput {
|
||||
message: "No plans provided".to_string(),
|
||||
});
|
||||
}
|
||||
// Projection to keeping all existing columns
|
||||
let first_plan = plans[0].clone();
|
||||
let project_all_columns = first_plan
|
||||
.schema()
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, field)| {
|
||||
let expr =
|
||||
datafusion_physical_plan::expressions::Column::new(field.name().as_str(), i);
|
||||
let expr = Arc::new(expr) as Arc<dyn datafusion_physical_plan::PhysicalExpr>;
|
||||
(expr, field.name().clone())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let projected_plans = plans
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(plan_i, plan)| {
|
||||
let query_index = datafusion_common::ScalarValue::Int32(Some(plan_i as i32));
|
||||
let query_index_expr =
|
||||
datafusion_physical_plan::expressions::Literal::new(query_index);
|
||||
let query_index_expr =
|
||||
Arc::new(query_index_expr) as Arc<dyn datafusion_physical_plan::PhysicalExpr>;
|
||||
let mut projections = vec![(query_index_expr, "query_index".to_string())];
|
||||
projections.extend_from_slice(&project_all_columns);
|
||||
let projection = ProjectionExec::try_new(projections, plan).unwrap();
|
||||
Arc::new(projection) as Arc<dyn datafusion_physical_plan::ExecutionPlan>
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let unioned = Arc::new(UnionExec::new(projected_plans));
|
||||
// We require 1 partition in the final output
|
||||
let repartitioned = RepartitionExec::try_new(
|
||||
unioned,
|
||||
datafusion_physical_plan::Partitioning::RoundRobinBatch(1),
|
||||
)
|
||||
.unwrap();
|
||||
Ok(Arc::new(repartitioned))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NativeTable> for Table {
|
||||
@@ -1784,9 +1838,25 @@ impl TableInternal for NativeTable {
|
||||
) -> Result<Arc<dyn ExecutionPlan>> {
|
||||
let ds_ref = self.dataset.get().await?;
|
||||
|
||||
if query.query_vector.len() > 1 {
|
||||
// If there are multiple query vectors, create a plan for each of them and union them.
|
||||
let query_vecs = query.query_vector.clone();
|
||||
let plan_futures = query_vecs
|
||||
.into_iter()
|
||||
.map(|query_vector| {
|
||||
let mut sub_query = query.clone();
|
||||
sub_query.query_vector = vec![query_vector];
|
||||
let options_ref = options.clone();
|
||||
async move { self.create_plan(&sub_query, options_ref).await }
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let plans = futures::future::try_join_all(plan_futures).await?;
|
||||
return Table::multi_vector_plan(plans);
|
||||
}
|
||||
|
||||
let mut scanner: Scanner = ds_ref.scan();
|
||||
|
||||
if let Some(query_vector) = query.query_vector.as_ref() {
|
||||
if let Some(query_vector) = query.query_vector.first() {
|
||||
// If there is a vector query, default to limit=10 if unspecified
|
||||
let column = if let Some(col) = query.column.as_ref() {
|
||||
col.clone()
|
||||
@@ -1828,18 +1898,11 @@ impl TableInternal for NativeTable {
|
||||
query_vector,
|
||||
query.base.limit.unwrap_or(DEFAULT_TOP_K),
|
||||
)?;
|
||||
scanner.limit(
|
||||
query.base.limit.map(|limit| limit as i64),
|
||||
query.base.offset.map(|offset| offset as i64),
|
||||
)?;
|
||||
} else {
|
||||
// If there is no vector query, it's ok to not have a limit
|
||||
scanner.limit(
|
||||
query.base.limit.map(|limit| limit as i64),
|
||||
query.base.offset.map(|offset| offset as i64),
|
||||
)?;
|
||||
}
|
||||
|
||||
scanner.limit(
|
||||
query.base.limit.map(|limit| limit as i64),
|
||||
query.base.offset.map(|offset| offset as i64),
|
||||
)?;
|
||||
scanner.nprobs(query.nprobes);
|
||||
scanner.use_index(query.use_index);
|
||||
scanner.prefilter(query.base.prefilter);
|
||||
|
||||
Reference in New Issue
Block a user