mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-03 18:32:55 +00:00
Compare commits
12 Commits
lancedb-cl
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
66a881b33a | ||
|
|
a7515d6ee2 | ||
|
|
587c0824af | ||
|
|
b38a4269d0 | ||
|
|
119d88b9db | ||
|
|
74f660d223 | ||
|
|
b2b0979b90 | ||
|
|
ee2a40b182 | ||
|
|
4ca0b15354 | ||
|
|
d8c217b47d | ||
|
|
b724b1a01f | ||
|
|
abd75e0ead |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.13.0-beta.1"
|
current_version = "0.13.0-beta.2"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
1
.github/workflows/nodejs.yml
vendored
1
.github/workflows/nodejs.yml
vendored
@@ -104,7 +104,6 @@ jobs:
|
|||||||
OPENAI_BASE_URL: http://0.0.0.0:8000
|
OPENAI_BASE_URL: http://0.0.0.0:8000
|
||||||
run: |
|
run: |
|
||||||
python ci/mock_openai.py &
|
python ci/mock_openai.py &
|
||||||
ss -ltnp | grep :8000
|
|
||||||
cd nodejs/examples
|
cd nodejs/examples
|
||||||
npm test
|
npm test
|
||||||
macos:
|
macos:
|
||||||
|
|||||||
380
.github/workflows/npm-publish.yml
vendored
380
.github/workflows/npm-publish.yml
vendored
@@ -226,108 +226,109 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
node/dist/lancedb-vectordb-win32*.tgz
|
node/dist/lancedb-vectordb-win32*.tgz
|
||||||
|
|
||||||
node-windows-arm64:
|
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
|
||||||
name: vectordb win32-arm64-msvc
|
# node-windows-arm64:
|
||||||
runs-on: windows-4x-arm
|
# name: vectordb win32-arm64-msvc
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
# runs-on: windows-4x-arm
|
||||||
steps:
|
# if: startsWith(github.ref, 'refs/tags/v')
|
||||||
- uses: actions/checkout@v4
|
# steps:
|
||||||
- name: Install Git
|
# - uses: actions/checkout@v4
|
||||||
run: |
|
# - name: Install Git
|
||||||
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
# run: |
|
||||||
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||||
shell: powershell
|
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||||
- name: Add Git to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Git to PATH
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
# run: |
|
||||||
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||||
shell: powershell
|
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||||
- name: Configure Git symlinks
|
# shell: powershell
|
||||||
run: git config --global core.symlinks true
|
# - name: Configure Git symlinks
|
||||||
- uses: actions/checkout@v4
|
# run: git config --global core.symlinks true
|
||||||
- uses: actions/setup-python@v5
|
# - uses: actions/checkout@v4
|
||||||
with:
|
# - uses: actions/setup-python@v5
|
||||||
python-version: "3.13"
|
# with:
|
||||||
- name: Install Visual Studio Build Tools
|
# python-version: "3.13"
|
||||||
run: |
|
# - name: Install Visual Studio Build Tools
|
||||||
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
# run: |
|
||||||
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||||
"--installPath", "C:\BuildTools", `
|
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
# "--installPath", "C:\BuildTools", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||||
shell: powershell
|
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||||
- name: Add Visual Studio Build Tools to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Visual Studio Build Tools to PATH
|
||||||
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
# run: |
|
||||||
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||||
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||||
|
|
||||||
# Add MSVC runtime libraries to LIB
|
# # Add MSVC runtime libraries to LIB
|
||||||
$env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
# $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||||
Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
# Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
||||||
|
|
||||||
# Add INCLUDE paths
|
# # Add INCLUDE paths
|
||||||
$env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
# $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
||||||
Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
# Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Rust
|
# - name: Install Rust
|
||||||
run: |
|
# run: |
|
||||||
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||||
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Rust to PATH
|
# - name: Add Rust to PATH
|
||||||
run: |
|
# run: |
|
||||||
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
|
|
||||||
- uses: Swatinem/rust-cache@v2
|
# - uses: Swatinem/rust-cache@v2
|
||||||
with:
|
# with:
|
||||||
workspaces: rust
|
# workspaces: rust
|
||||||
- name: Install 7-Zip ARM
|
# - name: Install 7-Zip ARM
|
||||||
run: |
|
# run: |
|
||||||
New-Item -Path 'C:\7zip' -ItemType Directory
|
# New-Item -Path 'C:\7zip' -ItemType Directory
|
||||||
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||||
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add 7-Zip to PATH
|
# - name: Add 7-Zip to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Protoc v21.12
|
# - name: Install Protoc v21.12
|
||||||
working-directory: C:\
|
# working-directory: C:\
|
||||||
run: |
|
# run: |
|
||||||
if (Test-Path 'C:\protoc') {
|
# if (Test-Path 'C:\protoc') {
|
||||||
Write-Host "Protoc directory exists, skipping installation"
|
# Write-Host "Protoc directory exists, skipping installation"
|
||||||
return
|
# return
|
||||||
}
|
# }
|
||||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
# New-Item -Path 'C:\protoc' -ItemType Directory
|
||||||
Set-Location C:\protoc
|
# Set-Location C:\protoc
|
||||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||||
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Protoc to PATH
|
# - name: Add Protoc to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Build Windows native node modules
|
# - name: Build Windows native node modules
|
||||||
run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
|
# run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
|
||||||
- name: Upload Windows ARM64 Artifacts
|
# - name: Upload Windows ARM64 Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
# uses: actions/upload-artifact@v4
|
||||||
with:
|
# with:
|
||||||
name: node-native-windows-arm64
|
# name: node-native-windows-arm64
|
||||||
path: |
|
# path: |
|
||||||
node/dist/*.node
|
# node/dist/*.node
|
||||||
|
|
||||||
nodejs-windows:
|
nodejs-windows:
|
||||||
name: lancedb ${{ matrix.target }}
|
name: lancedb ${{ matrix.target }}
|
||||||
@@ -363,98 +364,99 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
nodejs/dist/*.node
|
nodejs/dist/*.node
|
||||||
|
|
||||||
nodejs-windows-arm64:
|
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
|
||||||
name: lancedb win32-arm64-msvc
|
# nodejs-windows-arm64:
|
||||||
runs-on: windows-4x-arm
|
# name: lancedb win32-arm64-msvc
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
# runs-on: windows-4x-arm
|
||||||
steps:
|
# if: startsWith(github.ref, 'refs/tags/v')
|
||||||
- uses: actions/checkout@v4
|
# steps:
|
||||||
- name: Install Git
|
# - uses: actions/checkout@v4
|
||||||
run: |
|
# - name: Install Git
|
||||||
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
# run: |
|
||||||
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||||
shell: powershell
|
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||||
- name: Add Git to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Git to PATH
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
# run: |
|
||||||
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||||
shell: powershell
|
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||||
- name: Configure Git symlinks
|
# shell: powershell
|
||||||
run: git config --global core.symlinks true
|
# - name: Configure Git symlinks
|
||||||
- uses: actions/checkout@v4
|
# run: git config --global core.symlinks true
|
||||||
- uses: actions/setup-python@v5
|
# - uses: actions/checkout@v4
|
||||||
with:
|
# - uses: actions/setup-python@v5
|
||||||
python-version: "3.13"
|
# with:
|
||||||
- name: Install Visual Studio Build Tools
|
# python-version: "3.13"
|
||||||
run: |
|
# - name: Install Visual Studio Build Tools
|
||||||
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
# run: |
|
||||||
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||||
"--installPath", "C:\BuildTools", `
|
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
# "--installPath", "C:\BuildTools", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||||
shell: powershell
|
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||||
- name: Add Visual Studio Build Tools to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Visual Studio Build Tools to PATH
|
||||||
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
# run: |
|
||||||
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||||
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||||
|
|
||||||
$env:LIB = ""
|
# $env:LIB = ""
|
||||||
Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
# Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Rust
|
# - name: Install Rust
|
||||||
run: |
|
# run: |
|
||||||
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||||
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Rust to PATH
|
# - name: Add Rust to PATH
|
||||||
run: |
|
# run: |
|
||||||
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
|
|
||||||
- uses: Swatinem/rust-cache@v2
|
# - uses: Swatinem/rust-cache@v2
|
||||||
with:
|
# with:
|
||||||
workspaces: rust
|
# workspaces: rust
|
||||||
- name: Install 7-Zip ARM
|
# - name: Install 7-Zip ARM
|
||||||
run: |
|
# run: |
|
||||||
New-Item -Path 'C:\7zip' -ItemType Directory
|
# New-Item -Path 'C:\7zip' -ItemType Directory
|
||||||
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||||
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add 7-Zip to PATH
|
# - name: Add 7-Zip to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Protoc v21.12
|
# - name: Install Protoc v21.12
|
||||||
working-directory: C:\
|
# working-directory: C:\
|
||||||
run: |
|
# run: |
|
||||||
if (Test-Path 'C:\protoc') {
|
# if (Test-Path 'C:\protoc') {
|
||||||
Write-Host "Protoc directory exists, skipping installation"
|
# Write-Host "Protoc directory exists, skipping installation"
|
||||||
return
|
# return
|
||||||
}
|
# }
|
||||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
# New-Item -Path 'C:\protoc' -ItemType Directory
|
||||||
Set-Location C:\protoc
|
# Set-Location C:\protoc
|
||||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||||
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Protoc to PATH
|
# - name: Add Protoc to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Build Windows native node modules
|
# - name: Build Windows native node modules
|
||||||
run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
|
# run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
|
||||||
- name: Upload Windows ARM64 Artifacts
|
# - name: Upload Windows ARM64 Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
# uses: actions/upload-artifact@v4
|
||||||
with:
|
# with:
|
||||||
name: nodejs-native-windows-arm64
|
# name: nodejs-native-windows-arm64
|
||||||
path: |
|
# path: |
|
||||||
nodejs/dist/*.node
|
# nodejs/dist/*.node
|
||||||
|
|
||||||
release:
|
release:
|
||||||
name: vectordb NPM Publish
|
name: vectordb NPM Publish
|
||||||
|
|||||||
14
Cargo.toml
14
Cargo.toml
@@ -23,13 +23,13 @@ rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
|||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.19.2", "features" = [
|
lance = { "version" = "=0.19.2", "features" = [
|
||||||
"dynamodb",
|
"dynamodb",
|
||||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
]}
|
||||||
lance-index = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
lance-index = "=0.19.2"
|
||||||
lance-linalg = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
lance-linalg = "=0.19.2"
|
||||||
lance-table = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
lance-table = "=0.19.2"
|
||||||
lance-testing = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
lance-testing = "=0.19.2"
|
||||||
lance-datafusion = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
lance-datafusion = "=0.19.2"
|
||||||
lance-encoding = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2-beta.3" }
|
lance-encoding = "=0.19.2"
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "52.2", optional = false }
|
arrow = { version = "52.2", optional = false }
|
||||||
arrow-array = "52.2"
|
arrow-array = "52.2"
|
||||||
|
|||||||
@@ -790,6 +790,27 @@ Use the `drop_table()` method on the database to remove a table.
|
|||||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||||
If the table does not exist an exception is raised.
|
If the table does not exist an exception is raised.
|
||||||
|
|
||||||
|
## Handling bad vectors
|
||||||
|
|
||||||
|
In LanceDB Python, you can use the `on_bad_vectors` parameter to choose how
|
||||||
|
invalid vector values are handled. Invalid vectors are vectors that are not valid
|
||||||
|
because:
|
||||||
|
|
||||||
|
1. They are the wrong dimension
|
||||||
|
2. They contain NaN values
|
||||||
|
3. They are null but are on a non-nullable field
|
||||||
|
|
||||||
|
By default, LanceDB will raise an error if it encounters a bad vector. You can
|
||||||
|
also choose one of the following options:
|
||||||
|
|
||||||
|
* `drop`: Ignore rows with bad vectors
|
||||||
|
* `fill`: Replace bad values (NaNs) or missing values (too few dimensions) with
|
||||||
|
the fill value specified in the `fill_value` parameter. An input like
|
||||||
|
`[1.0, NaN, 3.0]` will be replaced with `[1.0, 0.0, 3.0]` if `fill_value=0.0`.
|
||||||
|
* `null`: Replace bad vectors with null (only works if the column is nullable).
|
||||||
|
A bad vector `[1.0, NaN, 3.0]` will be replaced with `null` if the column is
|
||||||
|
nullable. If the vector column is non-nullable, then bad vectors will cause an
|
||||||
|
error
|
||||||
|
|
||||||
## Consistency
|
## Consistency
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.13.0-beta.1</version>
|
<version>0.13.0-beta.2</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.13.0-beta.1</version>
|
<version>0.13.0-beta.2</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
||||||
<name>LanceDB Parent</name>
|
<name>LanceDB Parent</name>
|
||||||
|
|||||||
76
node/package-lock.json
generated
76
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
@@ -52,12 +52,12 @@
|
|||||||
"uuid": "^9.0.0"
|
"uuid": "^9.0.0"
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1",
|
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1"
|
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2"
|
||||||
},
|
},
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"@apache-arrow/ts": "^14.0.2",
|
"@apache-arrow/ts": "^14.0.2",
|
||||||
@@ -327,66 +327,6 @@
|
|||||||
"@jridgewell/sourcemap-codec": "^1.4.10"
|
"@jridgewell/sourcemap-codec": "^1.4.10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-beOrf6selCzzhLgDG8Nibma4nO/CSnA1wUKRmlJHEPtGcg7PW18z6MP/nfwQMpMR/FLRfTo8pPTbpzss47MiQQ==",
|
|
||||||
"cpu": [
|
|
||||||
"arm64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"darwin"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-YdraGRF/RbJRkKh0v3xT03LUhq47T2GtCvJ5gZp8wKlh4pHa8LuhLU0DIdvmG/DT5vuQA+td8HDkBm/e3EOdNg==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"darwin"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-Pp0O/uhEqof1oLaWrNbv+Ym+q8kBkiCqaA5+2eAZ6a3e9U+Ozkvb0FQrHuyi9adJ5wKQ4NabyQE9BMf2bYpOnQ==",
|
|
||||||
"cpu": [
|
|
||||||
"arm64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"linux"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-y8nxOye4egfWF5FGED9EfkmZ1O5HnRLU4a61B8m5JSpkivO9v2epTcbYN0yt/7ZFCgtqMfJ8VW4Mi7qQcz3KDA==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"linux"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-STMDP9dp0TBLkB3ro+16pKcGy6bmbhRuEZZZ1Tp5P75yTPeVh4zIgWkidMdU1qBbEYM7xacnsp9QAwgLnMU/Ow==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"win32"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@neon-rs/cli": {
|
"node_modules/@neon-rs/cli": {
|
||||||
"version": "0.0.160",
|
"version": "0.0.160",
|
||||||
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"description": " Serverless, low-latency vector database for AI applications",
|
"description": " Serverless, low-latency vector database for AI applications",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
@@ -89,11 +89,11 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1",
|
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1"
|
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.13.0-beta.1"
|
version = "0.13.0-beta.2"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
|
|||||||
@@ -187,6 +187,81 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// TODO: https://github.com/lancedb/lancedb/issues/1832
|
||||||
|
it.skip("should be able to omit nullable fields", async () => {
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const schema = new arrow.Schema([
|
||||||
|
new arrow.Field(
|
||||||
|
"vector",
|
||||||
|
new arrow.FixedSizeList(
|
||||||
|
2,
|
||||||
|
new arrow.Field("item", new arrow.Float64()),
|
||||||
|
),
|
||||||
|
true,
|
||||||
|
),
|
||||||
|
new arrow.Field("item", new arrow.Utf8(), true),
|
||||||
|
new arrow.Field("price", new arrow.Float64(), false),
|
||||||
|
]);
|
||||||
|
const table = await db.createEmptyTable("test", schema);
|
||||||
|
|
||||||
|
const data1 = { item: "foo", price: 10.0 };
|
||||||
|
await table.add([data1]);
|
||||||
|
const data2 = { vector: [3.1, 4.1], price: 2.0 };
|
||||||
|
await table.add([data2]);
|
||||||
|
const data3 = { vector: [5.9, 26.5], item: "bar", price: 3.0 };
|
||||||
|
await table.add([data3]);
|
||||||
|
|
||||||
|
let res = await table.query().limit(10).toArray();
|
||||||
|
const resVector = res.map((r) => r.get("vector").toArray());
|
||||||
|
expect(resVector).toEqual([null, data2.vector, data3.vector]);
|
||||||
|
const resItem = res.map((r) => r.get("item").toArray());
|
||||||
|
expect(resItem).toEqual(["foo", null, "bar"]);
|
||||||
|
const resPrice = res.map((r) => r.get("price").toArray());
|
||||||
|
expect(resPrice).toEqual([10.0, 2.0, 3.0]);
|
||||||
|
|
||||||
|
const data4 = { item: "foo" };
|
||||||
|
// We can't omit a column if it's not nullable
|
||||||
|
await expect(table.add([data4])).rejects.toThrow("Invalid user input");
|
||||||
|
|
||||||
|
// But we can alter columns to make them nullable
|
||||||
|
await table.alterColumns([{ path: "price", nullable: true }]);
|
||||||
|
await table.add([data4]);
|
||||||
|
|
||||||
|
res = (await table.query().limit(10).toArray()).map((r) => r.toJSON());
|
||||||
|
expect(res).toEqual([data1, data2, data3, data4]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should be able to insert nullable data for non-nullable fields", async () => {
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const schema = new arrow.Schema([
|
||||||
|
new arrow.Field("x", new arrow.Float64(), false),
|
||||||
|
new arrow.Field("id", new arrow.Utf8(), false),
|
||||||
|
]);
|
||||||
|
const table = await db.createEmptyTable("test", schema);
|
||||||
|
|
||||||
|
const data1 = { x: 4.1, id: "foo" };
|
||||||
|
await table.add([data1]);
|
||||||
|
const res = (await table.query().toArray())[0];
|
||||||
|
expect(res.x).toEqual(data1.x);
|
||||||
|
expect(res.id).toEqual(data1.id);
|
||||||
|
|
||||||
|
const data2 = { x: null, id: "bar" };
|
||||||
|
await expect(table.add([data2])).rejects.toThrow(
|
||||||
|
"declared as non-nullable but contains null values",
|
||||||
|
);
|
||||||
|
|
||||||
|
// But we can alter columns to make them nullable
|
||||||
|
await table.alterColumns([{ path: "x", nullable: true }]);
|
||||||
|
await table.add([data2]);
|
||||||
|
|
||||||
|
const res2 = await table.query().toArray();
|
||||||
|
expect(res2.length).toBe(2);
|
||||||
|
expect(res2[0].x).toEqual(data1.x);
|
||||||
|
expect(res2[0].id).toEqual(data1.id);
|
||||||
|
expect(res2[1].x).toBeNull();
|
||||||
|
expect(res2[1].id).toEqual(data2.id);
|
||||||
|
});
|
||||||
|
|
||||||
it("should return the table as an instance of an arrow table", async () => {
|
it("should return the table as an instance of an arrow table", async () => {
|
||||||
const arrowTbl = await table.toArrow();
|
const arrowTbl = await table.toArrow();
|
||||||
expect(arrowTbl).toBeInstanceOf(ArrowTable);
|
expect(arrowTbl).toBeInstanceOf(ArrowTable);
|
||||||
@@ -998,4 +1073,18 @@ describe("column name options", () => {
|
|||||||
const results = await table.query().where("`camelCase` = 1").toArray();
|
const results = await table.query().where("`camelCase` = 1").toArray();
|
||||||
expect(results[0].camelCase).toBe(1);
|
expect(results[0].camelCase).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("can make multiple vector queries in one go", async () => {
|
||||||
|
const results = await table
|
||||||
|
.query()
|
||||||
|
.nearestTo([0.1, 0.2])
|
||||||
|
.addQueryVector([0.1, 0.2])
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
console.log(results);
|
||||||
|
expect(results.length).toBe(2);
|
||||||
|
results.sort((a, b) => a.query_index - b.query_index);
|
||||||
|
expect(results[0].query_index).toBe(0);
|
||||||
|
expect(results[1].query_index).toBe(1);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -6,12 +6,16 @@ import { withTempDirectory } from "./util.ts";
|
|||||||
import * as lancedb from "@lancedb/lancedb";
|
import * as lancedb from "@lancedb/lancedb";
|
||||||
import "@lancedb/lancedb/embedding/transformers";
|
import "@lancedb/lancedb/embedding/transformers";
|
||||||
import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding";
|
import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding";
|
||||||
|
import { EmbeddingFunction } from "@lancedb/lancedb/embedding";
|
||||||
import { Utf8 } from "apache-arrow";
|
import { Utf8 } from "apache-arrow";
|
||||||
|
|
||||||
test("full text search", async () => {
|
test("full text search", async () => {
|
||||||
await withTempDirectory(async (databaseDir) => {
|
await withTempDirectory(async (databaseDir) => {
|
||||||
const db = await lancedb.connect(databaseDir);
|
const db = await lancedb.connect(databaseDir);
|
||||||
const func = await getRegistry().get("huggingface").create();
|
console.log(getRegistry());
|
||||||
|
const func = (await getRegistry()
|
||||||
|
.get("huggingface")
|
||||||
|
?.create()) as EmbeddingFunction;
|
||||||
|
|
||||||
const facts = [
|
const facts = [
|
||||||
"Albert Einstein was a theoretical physicist.",
|
"Albert Einstein was a theoretical physicist.",
|
||||||
@@ -56,4 +60,4 @@ test("full text search", async () => {
|
|||||||
|
|
||||||
expect(actual[0]["text"]).toBe("The human body has 206 bones.");
|
expect(actual[0]["text"]).toBe("The human body has 206 bones.");
|
||||||
});
|
});
|
||||||
});
|
}, 100_000);
|
||||||
|
|||||||
@@ -19,9 +19,6 @@ import { EmbeddingFunctionConfig, getRegistry } from "./registry";
|
|||||||
|
|
||||||
export { EmbeddingFunction, TextEmbeddingFunction } from "./embedding_function";
|
export { EmbeddingFunction, TextEmbeddingFunction } from "./embedding_function";
|
||||||
|
|
||||||
// We need to explicitly export '*' so that the `register` decorator actually registers the class.
|
|
||||||
export * from "./openai";
|
|
||||||
export * from "./transformers";
|
|
||||||
export * from "./registry";
|
export * from "./registry";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -17,8 +17,6 @@ import {
|
|||||||
type EmbeddingFunctionConstructor,
|
type EmbeddingFunctionConstructor,
|
||||||
} from "./embedding_function";
|
} from "./embedding_function";
|
||||||
import "reflect-metadata";
|
import "reflect-metadata";
|
||||||
import { OpenAIEmbeddingFunction } from "./openai";
|
|
||||||
import { TransformersEmbeddingFunction } from "./transformers";
|
|
||||||
|
|
||||||
type CreateReturnType<T> = T extends { init: () => Promise<void> }
|
type CreateReturnType<T> = T extends { init: () => Promise<void> }
|
||||||
? Promise<T>
|
? Promise<T>
|
||||||
@@ -73,10 +71,6 @@ export class EmbeddingFunctionRegistry {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
get(name: "openai"): EmbeddingFunctionCreate<OpenAIEmbeddingFunction>;
|
|
||||||
get(
|
|
||||||
name: "huggingface",
|
|
||||||
): EmbeddingFunctionCreate<TransformersEmbeddingFunction>;
|
|
||||||
get<T extends EmbeddingFunction<unknown>>(
|
get<T extends EmbeddingFunction<unknown>>(
|
||||||
name: string,
|
name: string,
|
||||||
): EmbeddingFunctionCreate<T> | undefined;
|
): EmbeddingFunctionCreate<T> | undefined;
|
||||||
|
|||||||
@@ -492,6 +492,42 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
|||||||
super.doCall((inner) => inner.bypassVectorIndex());
|
super.doCall((inner) => inner.bypassVectorIndex());
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add a query vector to the search
|
||||||
|
*
|
||||||
|
* This method can be called multiple times to add multiple query vectors
|
||||||
|
* to the search. If multiple query vectors are added, then they will be searched
|
||||||
|
* in parallel, and the results will be concatenated. A column called `query_index`
|
||||||
|
* will be added to indicate the index of the query vector that produced the result.
|
||||||
|
*
|
||||||
|
* Performance wise, this is equivalent to running multiple queries concurrently.
|
||||||
|
*/
|
||||||
|
addQueryVector(vector: IntoVector): VectorQuery {
|
||||||
|
if (vector instanceof Promise) {
|
||||||
|
const res = (async () => {
|
||||||
|
try {
|
||||||
|
const v = await vector;
|
||||||
|
const arr = Float32Array.from(v);
|
||||||
|
//
|
||||||
|
// biome-ignore lint/suspicious/noExplicitAny: we need to get the `inner`, but js has no package scoping
|
||||||
|
const value: any = this.addQueryVector(arr);
|
||||||
|
const inner = value.inner as
|
||||||
|
| NativeVectorQuery
|
||||||
|
| Promise<NativeVectorQuery>;
|
||||||
|
return inner;
|
||||||
|
} catch (e) {
|
||||||
|
return Promise.reject(e);
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
return new VectorQuery(res);
|
||||||
|
} else {
|
||||||
|
super.doCall((inner) => {
|
||||||
|
inner.addQueryVector(Float32Array.from(vector));
|
||||||
|
});
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** A builder for LanceDB queries. */
|
/** A builder for LanceDB queries. */
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
@@ -10,11 +10,13 @@
|
|||||||
"vector database",
|
"vector database",
|
||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
"./embedding": "./dist/embedding/index.js"
|
"./embedding": "./dist/embedding/index.js",
|
||||||
|
"./embedding/openai": "./dist/embedding/openai.js",
|
||||||
|
"./embedding/transformers": "./dist/embedding/transformers.js"
|
||||||
},
|
},
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
"napi": {
|
"napi": {
|
||||||
|
|||||||
@@ -135,6 +135,16 @@ impl VectorQuery {
|
|||||||
self.inner = self.inner.clone().column(&column);
|
self.inner = self.inner.clone().column(&column);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi]
|
||||||
|
pub fn add_query_vector(&mut self, vector: Float32Array) -> Result<()> {
|
||||||
|
self.inner = self
|
||||||
|
.inner
|
||||||
|
.clone()
|
||||||
|
.add_query_vector(vector.as_ref())
|
||||||
|
.default_error()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[napi]
|
#[napi]
|
||||||
pub fn distance_type(&mut self, distance_type: String) -> napi::Result<()> {
|
pub fn distance_type(&mut self, distance_type: String) -> napi::Result<()> {
|
||||||
let distance_type = parse_distance_type(distance_type)?;
|
let distance_type = parse_distance_type(distance_type)?;
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.16.0-beta.0"
|
current_version = "0.16.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.16.0-beta.0"
|
version = "0.16.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ name = "lancedb"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"deprecation",
|
"deprecation",
|
||||||
"nest-asyncio~=1.0",
|
"nest-asyncio~=1.0",
|
||||||
"pylance==0.19.2-beta.3",
|
"pylance==0.19.2",
|
||||||
"tqdm>=4.27.0",
|
"tqdm>=4.27.0",
|
||||||
"pydantic>=1.10",
|
"pydantic>=1.10",
|
||||||
"packaging",
|
"packaging",
|
||||||
|
|||||||
@@ -943,12 +943,16 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
|||||||
|
|
||||||
class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||||
def to_arrow(self) -> pa.Table:
|
def to_arrow(self) -> pa.Table:
|
||||||
ds = self._table.to_lance()
|
query = Query(
|
||||||
return ds.to_table(
|
|
||||||
columns=self._columns,
|
columns=self._columns,
|
||||||
filter=self._where,
|
filter=self._where,
|
||||||
limit=self._limit,
|
k=self._limit or 10,
|
||||||
|
with_row_id=self._with_row_id,
|
||||||
|
vector=[],
|
||||||
|
# not actually respected in remote query
|
||||||
|
offset=self._offset or 0,
|
||||||
)
|
)
|
||||||
|
return self._table._execute_query(query).read_all()
|
||||||
|
|
||||||
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
|
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
|
||||||
"""Rerank the results using the specified reranker.
|
"""Rerank the results using the specified reranker.
|
||||||
@@ -1491,7 +1495,7 @@ class AsyncQuery(AsyncQueryBase):
|
|||||||
return pa.array(vec)
|
return pa.array(vec)
|
||||||
|
|
||||||
def nearest_to(
|
def nearest_to(
|
||||||
self, query_vector: Optional[Union[VEC, Tuple]] = None
|
self, query_vector: Optional[Union[VEC, Tuple, List[VEC]]] = None
|
||||||
) -> AsyncVectorQuery:
|
) -> AsyncVectorQuery:
|
||||||
"""
|
"""
|
||||||
Find the nearest vectors to the given query vector.
|
Find the nearest vectors to the given query vector.
|
||||||
@@ -1529,7 +1533,27 @@ class AsyncQuery(AsyncQueryBase):
|
|||||||
|
|
||||||
Vector searches always have a [limit][]. If `limit` has not been called then
|
Vector searches always have a [limit][]. If `limit` has not been called then
|
||||||
a default `limit` of 10 will be used.
|
a default `limit` of 10 will be used.
|
||||||
|
|
||||||
|
Typically, a single vector is passed in as the query. However, you can also
|
||||||
|
pass in multiple vectors. This can be useful if you want to find the nearest
|
||||||
|
vectors to multiple query vectors. This is not expected to be faster than
|
||||||
|
making multiple queries concurrently; it is just a convenience method.
|
||||||
|
If multiple vectors are passed in then an additional column `query_index`
|
||||||
|
will be added to the results. This column will contain the index of the
|
||||||
|
query vector that the result is nearest to.
|
||||||
"""
|
"""
|
||||||
|
if (
|
||||||
|
isinstance(query_vector, list)
|
||||||
|
and len(query_vector) > 0
|
||||||
|
and not isinstance(query_vector[0], (float, int))
|
||||||
|
):
|
||||||
|
# multiple have been passed
|
||||||
|
query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
|
||||||
|
new_self = self._inner.nearest_to(query_vectors[0])
|
||||||
|
for v in query_vectors[1:]:
|
||||||
|
new_self.add_query_vector(v)
|
||||||
|
return AsyncVectorQuery(new_self)
|
||||||
|
else:
|
||||||
return AsyncVectorQuery(
|
return AsyncVectorQuery(
|
||||||
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
|
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -327,10 +327,6 @@ class RemoteTable(Table):
|
|||||||
- and also the "_distance" column which is the distance between the query
|
- and also the "_distance" column which is the distance between the query
|
||||||
vector and the returned vector.
|
vector and the returned vector.
|
||||||
"""
|
"""
|
||||||
# empty query builder is not supported in saas, raise error
|
|
||||||
if query is None and query_type != "hybrid":
|
|
||||||
raise ValueError("Empty query is not supported")
|
|
||||||
|
|
||||||
return LanceQueryBuilder.create(
|
return LanceQueryBuilder.create(
|
||||||
self,
|
self,
|
||||||
query,
|
query,
|
||||||
|
|||||||
@@ -1567,7 +1567,7 @@ class LanceTable(Table):
|
|||||||
"append" and "overwrite".
|
"append" and "overwrite".
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
|
|
||||||
@@ -1851,7 +1851,7 @@ class LanceTable(Table):
|
|||||||
data but will validate against any schema that's specified.
|
data but will validate against any schema that's specified.
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
embedding_functions: list of EmbeddingFunctionModel, default None
|
embedding_functions: list of EmbeddingFunctionModel, default None
|
||||||
@@ -2151,13 +2151,11 @@ def _sanitize_schema(
|
|||||||
vector column to fixed_size_list(float32) if necessary.
|
vector column to fixed_size_list(float32) if necessary.
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
"""
|
"""
|
||||||
if schema is not None:
|
if schema is not None:
|
||||||
if data.schema == schema:
|
|
||||||
return data
|
|
||||||
# cast the columns to the expected types
|
# cast the columns to the expected types
|
||||||
data = data.combine_chunks()
|
data = data.combine_chunks()
|
||||||
for field in schema:
|
for field in schema:
|
||||||
@@ -2177,6 +2175,7 @@ def _sanitize_schema(
|
|||||||
vector_column_name=field.name,
|
vector_column_name=field.name,
|
||||||
on_bad_vectors=on_bad_vectors,
|
on_bad_vectors=on_bad_vectors,
|
||||||
fill_value=fill_value,
|
fill_value=fill_value,
|
||||||
|
table_schema=schema,
|
||||||
)
|
)
|
||||||
return pa.Table.from_arrays(
|
return pa.Table.from_arrays(
|
||||||
[data[name] for name in schema.names], schema=schema
|
[data[name] for name in schema.names], schema=schema
|
||||||
@@ -2197,6 +2196,7 @@ def _sanitize_schema(
|
|||||||
def _sanitize_vector_column(
|
def _sanitize_vector_column(
|
||||||
data: pa.Table,
|
data: pa.Table,
|
||||||
vector_column_name: str,
|
vector_column_name: str,
|
||||||
|
table_schema: Optional[pa.Schema] = None,
|
||||||
on_bad_vectors: str = "error",
|
on_bad_vectors: str = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
) -> pa.Table:
|
) -> pa.Table:
|
||||||
@@ -2211,12 +2211,16 @@ def _sanitize_vector_column(
|
|||||||
The name of the vector column.
|
The name of the vector column.
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.0
|
fill_value: float, default 0.0
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
"""
|
"""
|
||||||
# ChunkedArray is annoying to work with, so we combine chunks here
|
# ChunkedArray is annoying to work with, so we combine chunks here
|
||||||
vec_arr = data[vector_column_name].combine_chunks()
|
vec_arr = data[vector_column_name].combine_chunks()
|
||||||
|
if table_schema is not None:
|
||||||
|
field = table_schema.field(vector_column_name)
|
||||||
|
else:
|
||||||
|
field = None
|
||||||
typ = data[vector_column_name].type
|
typ = data[vector_column_name].type
|
||||||
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
|
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
|
||||||
# if it's a variable size list array,
|
# if it's a variable size list array,
|
||||||
@@ -2243,7 +2247,11 @@ def _sanitize_vector_column(
|
|||||||
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if pc.any(pc.is_null(vec_arr.values, nan_is_null=True)).as_py():
|
if (
|
||||||
|
field is not None
|
||||||
|
and not field.nullable
|
||||||
|
and pc.any(pc.is_null(vec_arr.values)).as_py()
|
||||||
|
) or (pc.any(pc.is_nan(vec_arr.values)).as_py()):
|
||||||
data = _sanitize_nans(
|
data = _sanitize_nans(
|
||||||
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
||||||
)
|
)
|
||||||
@@ -2287,6 +2295,12 @@ def _sanitize_jagged(data, fill_value, on_bad_vectors, vec_arr, vector_column_na
|
|||||||
)
|
)
|
||||||
elif on_bad_vectors == "drop":
|
elif on_bad_vectors == "drop":
|
||||||
data = data.filter(correct_ndims)
|
data = data.filter(correct_ndims)
|
||||||
|
elif on_bad_vectors == "null":
|
||||||
|
data = data.set_column(
|
||||||
|
data.column_names.index(vector_column_name),
|
||||||
|
vector_column_name,
|
||||||
|
pc.if_else(correct_ndims, vec_arr, pa.scalar(None)),
|
||||||
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
@@ -2304,6 +2318,7 @@ def _sanitize_nans(
|
|||||||
f"Vector column {vector_column_name} has NaNs. "
|
f"Vector column {vector_column_name} has NaNs. "
|
||||||
"Set on_bad_vectors='drop' to remove them, or "
|
"Set on_bad_vectors='drop' to remove them, or "
|
||||||
"set on_bad_vectors='fill' and fill_value=<value> to replace them. "
|
"set on_bad_vectors='fill' and fill_value=<value> to replace them. "
|
||||||
|
"Or set on_bad_vectors='null' to replace them with null."
|
||||||
)
|
)
|
||||||
elif on_bad_vectors == "fill":
|
elif on_bad_vectors == "fill":
|
||||||
if fill_value is None:
|
if fill_value is None:
|
||||||
@@ -2323,6 +2338,17 @@ def _sanitize_nans(
         np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
         not_nulls = np.any(np_arr, axis=1)
         data = data.filter(~not_nulls)
+    elif on_bad_vectors == "null":
+        # null = pa.nulls(len(vec_arr)).cast(vec_arr.type)
+        # values = pc.if_else(pc.is_nan(vec_arr.values), fill_value, vec_arr.values)
+        np_arr = np.isnan(vec_arr.values.to_numpy(zero_copy_only=False))
+        np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
+        no_nans = ~np.any(np_arr, axis=1)
+        data = data.set_column(
+            data.column_names.index(vector_column_name),
+            vector_column_name,
+            pc.if_else(no_nans, vec_arr, pa.scalar(None)),
+        )
     return data


@@ -2588,7 +2614,7 @@ class AsyncTable:
             "append" and "overwrite".
         on_bad_vectors: str, default "error"
             What to do if any of the vectors are not the same size or contain NaNs.
-            One of "error", "drop", "fill".
+            One of "error", "drop", "fill", "null".
         fill_value: float, default 0.
             The value to use when filling vectors. Only used if on_bad_vectors="fill".

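For reference, the new mode slots in beside the existing ones at the `add` call site. A hedged usage sketch (`table` and `rows` are hypothetical, invented for illustration):

```python
# Rows whose "vector" is the wrong length or contains NaNs are handled
# according to on_bad_vectors:
table.add(rows, on_bad_vectors="error")               # raise (the default)
table.add(rows, on_bad_vectors="drop")                # drop the bad rows
table.add(rows, on_bad_vectors="fill", fill_value=0)  # replace with fill_value
table.add(rows, on_bad_vectors="null")                # store a null vector
# "null" assumes the vector column is nullable, e.g.:
table.alter_columns(dict(path="vector", nullable=True))
```
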
@@ -81,14 +81,15 @@ def test_embedding_function(tmp_path):


 def test_embedding_with_bad_results(tmp_path):
-    @register("mock-embedding")
-    class MockEmbeddingFunction(TextEmbeddingFunction):
+    @register("null-embedding")
+    class NullEmbeddingFunction(TextEmbeddingFunction):
         def ndims(self):
             return 128

         def generate_embeddings(
             self, texts: Union[List[str], np.ndarray]
         ) -> list[Union[np.array, None]]:
+            # Return None, which is bad if field is non-nullable
             return [
                 None if i % 2 == 0 else np.random.randn(self.ndims())
                 for i in range(len(texts))
@@ -96,13 +97,17 @@ def test_embedding_with_bad_results(tmp_path):

     db = lancedb.connect(tmp_path)
     registry = EmbeddingFunctionRegistry.get_instance()
-    model = registry.get("mock-embedding").create()
+    model = registry.get("null-embedding").create()

     class Schema(LanceModel):
         text: str = model.SourceField()
         vector: Vector(model.ndims()) = model.VectorField()

     table = db.create_table("test", schema=Schema, mode="overwrite")
+    with pytest.raises(ValueError):
+        # Default on_bad_vectors is "error"
+        table.add([{"text": "hello world"}])
+
     table.add(
         [{"text": "hello world"}, {"text": "bar"}],
         on_bad_vectors="drop",
@@ -112,13 +117,33 @@ def test_embedding_with_bad_results(tmp_path):
     assert len(table) == 1
     assert df.iloc[0]["text"] == "bar"

-    # table = db.create_table("test2", schema=Schema, mode="overwrite")
-    # table.add(
-    #     [{"text": "hello world"}, {"text": "bar"}],
-    # )
-    # assert len(table) == 2
-    # tbl = table.to_arrow()
-    # assert tbl["vector"].null_count == 1
+    @register("nan-embedding")
+    class NanEmbeddingFunction(TextEmbeddingFunction):
+        def ndims(self):
+            return 128
+
+        def generate_embeddings(
+            self, texts: Union[List[str], np.ndarray]
+        ) -> list[Union[np.array, None]]:
+            # Return NaN to produce bad vectors
+            return [
+                [np.NAN] * 128 if i % 2 == 0 else np.random.randn(self.ndims())
+                for i in range(len(texts))
+            ]
+
+    db = lancedb.connect(tmp_path)
+    registry = EmbeddingFunctionRegistry.get_instance()
+    model = registry.get("nan-embedding").create()
+
+    table = db.create_table("test2", schema=Schema, mode="overwrite")
+    table.alter_columns(dict(path="vector", nullable=True))
+    table.add(
+        [{"text": "hello world"}, {"text": "bar"}],
+        on_bad_vectors="null",
+    )
+    assert len(table) == 2
+    tbl = table.to_arrow()
+    assert tbl["vector"].null_count == 1


 def test_with_existing_vectors(tmp_path):
@@ -197,6 +197,23 @@ def test_query_sync_minimal():
     assert data == expected


+def test_query_sync_empty_query():
+    def handler(body):
+        assert body == {
+            "k": 10,
+            "filter": "true",
+            "vector": [],
+            "columns": ["id"],
+        }
+
+        return pa.table({"id": [1, 2, 3]})
+
+    with query_test_table(handler) as table:
+        data = table.search(None).where("true").select(["id"]).limit(10).to_list()
+        expected = [{"id": 1}, {"id": 2}, {"id": 3}]
+        assert data == expected
+
+
 def test_query_sync_maximal():
     def handler(body):
         assert body == {
@@ -229,6 +246,17 @@ def test_query_sync_maximal():
     )


+def test_query_sync_multiple_vectors():
+    def handler(_body):
+        return pa.table({"id": [1]})
+
+    with query_test_table(handler) as table:
+        results = table.search([[1, 2, 3], [4, 5, 6]]).limit(1).to_list()
+        assert len(results) == 2
+        results.sort(key=lambda x: x["query_index"])
+        assert results == [{"id": 1, "query_index": 0}, {"id": 1, "query_index": 1}]
+
+
 def test_query_sync_fts():
     def handler(body):
         assert body == {
@@ -240,6 +240,121 @@ def test_add(db):
     _add(table, schema)


+def test_add_subschema(tmp_path):
+    db = lancedb.connect(tmp_path)
+    schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
+            pa.field("item", pa.string(), nullable=True),
+            pa.field("price", pa.float64(), nullable=False),
+        ]
+    )
+    table = db.create_table("test", schema=schema)
+
+    data = {"price": 10.0, "item": "foo"}
+    table.add([data])
+    data = {"price": 2.0, "vector": [3.1, 4.1]}
+    table.add([data])
+    data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
+    table.add([data])
+
+    expected = pa.table(
+        {
+            "vector": [None, [3.1, 4.1], [5.9, 26.5]],
+            "item": ["foo", None, "bar"],
+            "price": [10.0, 2.0, 3.0],
+        },
+        schema=schema,
+    )
+    assert table.to_arrow() == expected
+
+    data = {"item": "foo"}
+    # We can't omit a column if it's not nullable
+    with pytest.raises(OSError, match="Invalid user input"):
+        table.add([data])
+
+    # We can add it if we make the column nullable
+    table.alter_columns(dict(path="price", nullable=True))
+    table.add([data])
+
+    expected_schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
+            pa.field("item", pa.string(), nullable=True),
+            pa.field("price", pa.float64(), nullable=True),
+        ]
+    )
+    expected = pa.table(
+        {
+            "vector": [None, [3.1, 4.1], [5.9, 26.5], None],
+            "item": ["foo", None, "bar", "foo"],
+            "price": [10.0, 2.0, 3.0, None],
+        },
+        schema=expected_schema,
+    )
+    assert table.to_arrow() == expected
+
+
+def test_add_nullability(tmp_path):
+    db = lancedb.connect(tmp_path)
+    schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), 2), nullable=False),
+            pa.field("id", pa.string(), nullable=False),
+        ]
+    )
+    table = db.create_table("test", schema=schema)
+
+    nullable_schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
+            pa.field("id", pa.string(), nullable=True),
+        ]
+    )
+    data = pa.table(
+        {
+            "vector": [[3.1, 4.1], [5.9, 26.5]],
+            "id": ["foo", "bar"],
+        },
+        schema=nullable_schema,
+    )
+    # We can add nullable schema if it doesn't actually contain nulls
+    table.add(data)
+
+    expected = data.cast(schema)
+    assert table.to_arrow() == expected
+
+    data = pa.table(
+        {
+            "vector": [None],
+            "id": ["baz"],
+        },
+        schema=nullable_schema,
+    )
+    # We can't add nullable schema if it contains nulls
+    with pytest.raises(Exception, match="Vector column vector has NaNs"):
+        table.add(data)
+
+    # But we can make it nullable
+    table.alter_columns(dict(path="vector", nullable=True))
+    table.add(data)
+
+    expected_schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
+            pa.field("id", pa.string(), nullable=False),
+        ]
+    )
+    expected = pa.table(
+        {
+            "vector": [[3.1, 4.1], [5.9, 26.5], None],
+            "id": ["foo", "bar", "baz"],
+        },
+        schema=expected_schema,
+    )
+    assert table.to_arrow() == expected
+
+
 def test_add_pydantic_model(db):
     # https://github.com/lancedb/lancedb/issues/562

@@ -892,10 +1007,15 @@ def test_empty_query(db):
     table = LanceTable.create(db, "my_table2", data=[{"id": i} for i in range(100)])
     df = table.search().select(["id"]).to_pandas()
     assert len(df) == 10
+    # None is the same as default
     df = table.search().select(["id"]).limit(None).to_pandas()
-    assert len(df) == 100
+    assert len(df) == 10
+    # an invalid limit is the same as None, which is the same as default
     df = table.search().select(["id"]).limit(-1).to_pandas()
-    assert len(df) == 100
+    assert len(df) == 10
+    # a valid limit should work
+    df = table.search().select(["id"]).limit(42).to_pandas()
+    assert len(df) == 42


 def test_search_with_schema_inf_single_vector(db):
@@ -142,6 +142,13 @@ impl VectorQuery {
         self.inner = self.inner.clone().only_if(predicate);
     }

+    pub fn add_query_vector(&mut self, vector: Bound<'_, PyAny>) -> PyResult<()> {
+        let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
+        let array = make_array(data);
+        self.inner = self.inner.clone().add_query_vector(array).infer_error()?;
+        Ok(())
+    }
+
     pub fn select(&mut self, columns: Vec<(String, String)>) {
         self.inner = self.inner.clone().select(Select::dynamic(&columns));
     }
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.13.0-beta.1"
+version = "0.13.0-beta.2"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.13.0-beta.1"
+version = "0.13.0-beta.2"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -46,6 +46,7 @@ serde = { version = "^1" }
 serde_json = { version = "1" }
 async-openai = { version = "0.20.0", optional = true }
 serde_with = { version = "3.8.1" }
+aws-sdk-bedrockruntime = { version = "1.27.0", optional = true }
 # For remote feature
 reqwest = { version = "0.12.0", features = ["gzip", "json", "stream"], optional = true }
 rand = { version = "0.8.3", features = ["small_rng"], optional = true}
@@ -72,11 +73,13 @@ aws-config = { version = "1.0" }
 aws-smithy-runtime = { version = "1.3" }
 http-body = "1" # Matching reqwest


 [features]
 default = []
 remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
 fp16kernels = ["lance-linalg/fp16kernels"]
 s3-test = []
+bedrock = ["dep:aws-sdk-bedrockruntime"]
 openai = ["dep:async-openai", "dep:reqwest"]
 polars = ["dep:polars-arrow", "dep:polars"]
 sentence-transformers = [
@@ -94,3 +97,7 @@ required-features = ["openai"]
 [[example]]
 name = "sentence_transformers"
 required-features = ["sentence-transformers"]
+
+[[example]]
+name = "bedrock"
+required-features = ["bedrock"]
rust/lancedb/examples/bedrock.rs (new file, 89 lines)
@@ -0,0 +1,89 @@
+use std::{iter::once, sync::Arc};
+
+use arrow_array::{Float64Array, Int32Array, RecordBatch, RecordBatchIterator, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use aws_config::Region;
+use aws_sdk_bedrockruntime::Client;
+use futures::StreamExt;
+use lancedb::{
+    arrow::IntoArrow,
+    connect,
+    embeddings::{bedrock::BedrockEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},
+    query::{ExecutableQuery, QueryBase},
+    Result,
+};
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let tempdir = tempfile::tempdir().unwrap();
+    let tempdir = tempdir.path().to_str().unwrap();
+
+    // create Bedrock embedding function
+    let region: String = "us-east-1".to_string();
+    let config = aws_config::defaults(aws_config::BehaviorVersion::latest())
+        .region(Region::new(region))
+        .load()
+        .await;
+
+    let embedding = Arc::new(BedrockEmbeddingFunction::new(
+        Client::new(&config), // Bedrock client built from the AWS config
+    ));
+
+    let db = connect(tempdir).execute().await?;
+    db.embedding_registry()
+        .register("bedrock", embedding.clone())?;
+
+    let table = db
+        .create_table("vectors", make_data())
+        .add_embedding(EmbeddingDefinition::new(
+            "text",
+            "bedrock",
+            Some("embeddings"),
+        ))?
+        .execute()
+        .await?;
+
+    // execute vector search
+    let query = Arc::new(StringArray::from_iter_values(once("something warm")));
+    let query_vector = embedding.compute_query_embeddings(query)?;
+    let mut results = table
+        .vector_search(query_vector)?
+        .limit(1)
+        .execute()
+        .await?;
+
+    let rb = results.next().await.unwrap()?;
+    let out = rb
+        .column_by_name("text")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+    let text = out.iter().next().unwrap().unwrap();
+    println!("Closest match: {}", text);
+    Ok(())
+}
+
+fn make_data() -> impl IntoArrow {
+    let schema = Schema::new(vec![
+        Field::new("id", DataType::Int32, true),
+        Field::new("text", DataType::Utf8, false),
+        Field::new("price", DataType::Float64, false),
+    ]);
+
+    let id = Int32Array::from(vec![1, 2, 3, 4]);
+    let text = StringArray::from_iter_values(vec![
+        "Black T-Shirt",
+        "Leather Jacket",
+        "Winter Parka",
+        "Hooded Sweatshirt",
+    ]);
+    let price = Float64Array::from(vec![10.0, 50.0, 100.0, 30.0]);
+    let schema = Arc::new(schema);
+    let rb = RecordBatch::try_new(
+        schema.clone(),
+        vec![Arc::new(id), Arc::new(text), Arc::new(price)],
+    )
+    .unwrap();
+    Box::new(RecordBatchIterator::new(vec![Ok(rb)], schema))
+}
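Given the `[[example]]` registration above, this example can presumably be run with `cargo run --example bedrock --features bedrock` (the standard Cargo invocation for a feature-gated example; valid AWS credentials with Bedrock access are assumed).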
@@ -17,6 +17,9 @@ pub mod openai;
 #[cfg(feature = "sentence-transformers")]
 pub mod sentence_transformers;

+#[cfg(feature = "bedrock")]
+pub mod bedrock;
+
 use lance::arrow::RecordBatchExt;
 use std::{
     borrow::Cow,
rust/lancedb/src/embeddings/bedrock.rs (new file, 210 lines)
@@ -0,0 +1,210 @@
+use aws_sdk_bedrockruntime::Client as BedrockClient;
+use std::{borrow::Cow, fmt::Formatter, str::FromStr, sync::Arc};
+
+use arrow::array::{AsArray, Float32Builder};
+use arrow_array::{Array, ArrayRef, FixedSizeListArray, Float32Array};
+use arrow_data::ArrayData;
+use arrow_schema::DataType;
+use serde_json::{json, Value};
+
+use super::EmbeddingFunction;
+use crate::{Error, Result};
+
+use tokio::runtime::Handle;
+use tokio::task::block_in_place;
+
+#[derive(Debug)]
+pub enum BedrockEmbeddingModel {
+    TitanEmbedding,
+    CohereLarge,
+}
+
+impl BedrockEmbeddingModel {
+    fn ndims(&self) -> usize {
+        match self {
+            Self::TitanEmbedding => 1536,
+            Self::CohereLarge => 1024,
+        }
+    }
+
+    fn model_id(&self) -> &str {
+        match self {
+            Self::TitanEmbedding => "amazon.titan-embed-text-v1",
+            Self::CohereLarge => "cohere.embed-english-v3",
+        }
+    }
+}
+
+impl FromStr for BedrockEmbeddingModel {
+    type Err = Error;
+
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        match s {
+            "titan-embed-text-v1" => Ok(Self::TitanEmbedding),
+            "cohere-embed-english-v3" => Ok(Self::CohereLarge),
+            _ => Err(Error::InvalidInput {
+                message: "Invalid model. Available models are: 'titan-embed-text-v1', 'cohere-embed-english-v3'".to_string()
+            }),
+        }
+    }
+}
+
+pub struct BedrockEmbeddingFunction {
+    model: BedrockEmbeddingModel,
+    client: BedrockClient,
+}
+
+impl BedrockEmbeddingFunction {
+    pub fn new(client: BedrockClient) -> Self {
+        Self {
+            model: BedrockEmbeddingModel::TitanEmbedding,
+            client,
+        }
+    }
+
+    pub fn with_model(client: BedrockClient, model: BedrockEmbeddingModel) -> Self {
+        Self { model, client }
+    }
+}
+
+impl EmbeddingFunction for BedrockEmbeddingFunction {
+    fn name(&self) -> &str {
+        "bedrock"
+    }
+
+    fn source_type(&self) -> Result<Cow<DataType>> {
+        Ok(Cow::Owned(DataType::Utf8))
+    }
+
+    fn dest_type(&self) -> Result<Cow<DataType>> {
+        let n_dims = self.model.ndims();
+        Ok(Cow::Owned(DataType::new_fixed_size_list(
+            DataType::Float32,
+            n_dims as i32,
+            false,
+        )))
+    }
+
+    fn compute_source_embeddings(&self, source: ArrayRef) -> Result<ArrayRef> {
+        let len = source.len();
+        let n_dims = self.model.ndims();
+        let inner = self.compute_inner(source)?;
+
+        let fsl = DataType::new_fixed_size_list(DataType::Float32, n_dims as i32, false);
+
+        let array_data = ArrayData::builder(fsl)
+            .len(len)
+            .add_child_data(inner.into_data())
+            .build()?;
+
+        Ok(Arc::new(FixedSizeListArray::from(array_data)))
+    }
+
+    fn compute_query_embeddings(&self, input: Arc<dyn Array>) -> Result<Arc<dyn Array>> {
+        let arr = self.compute_inner(input)?;
+        Ok(Arc::new(arr))
+    }
+}
+
+impl std::fmt::Debug for BedrockEmbeddingFunction {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("BedrockEmbeddingFunction")
+            .field("model", &self.model)
+            // Skip client field as it doesn't implement Debug
+            .finish()
+    }
+}
+
+impl BedrockEmbeddingFunction {
+    fn compute_inner(&self, source: Arc<dyn Array>) -> Result<Float32Array> {
+        if source.is_nullable() {
+            return Err(Error::InvalidInput {
+                message: "Expected non-nullable data type".to_string(),
+            });
+        }
+
+        if !matches!(source.data_type(), DataType::Utf8 | DataType::LargeUtf8) {
+            return Err(Error::InvalidInput {
+                message: "Expected Utf8 data type".to_string(),
+            });
+        }
+
+        let mut builder = Float32Builder::new();
+
+        let texts = match source.data_type() {
+            DataType::Utf8 => source
+                .as_string::<i32>()
+                .into_iter()
+                .map(|s| s.expect("array is non-nullable").to_string())
+                .collect::<Vec<String>>(),
+            DataType::LargeUtf8 => source
+                .as_string::<i64>()
+                .into_iter()
+                .map(|s| s.expect("array is non-nullable").to_string())
+                .collect::<Vec<String>>(),
+            _ => unreachable!(),
+        };
+
+        for text in texts {
+            let request_body = match self.model {
+                BedrockEmbeddingModel::TitanEmbedding => {
+                    json!({
+                        "inputText": text
+                    })
+                }
+                BedrockEmbeddingModel::CohereLarge => {
+                    json!({
+                        "texts": [text],
+                        "input_type": "search_document"
+                    })
+                }
+            };
+
+            let client = self.client.clone();
+            let model_id = self.model.model_id().to_string();
+            let request_body = request_body.clone();
+
+            let response = block_in_place(move || {
+                Handle::current().block_on(async move {
+                    client
+                        .invoke_model()
+                        .model_id(model_id)
+                        .body(aws_sdk_bedrockruntime::primitives::Blob::new(
+                            serde_json::to_vec(&request_body).unwrap(),
+                        ))
+                        .send()
+                        .await
+                })
+            })
+            .unwrap();
+
+            let response_json: Value =
+                serde_json::from_slice(response.body.as_ref()).map_err(|e| Error::Runtime {
+                    message: format!("Failed to parse response: {}", e),
+                })?;
+
+            let embedding = match self.model {
+                BedrockEmbeddingModel::TitanEmbedding => response_json["embedding"]
+                    .as_array()
+                    .ok_or_else(|| Error::Runtime {
+                        message: "Missing embedding in response".to_string(),
+                    })?
+                    .iter()
+                    .map(|v| v.as_f64().unwrap() as f32)
+                    .collect::<Vec<f32>>(),
+                BedrockEmbeddingModel::CohereLarge => response_json["embeddings"][0]
+                    .as_array()
+                    .ok_or_else(|| Error::Runtime {
+                        message: "Missing embeddings in response".to_string(),
+                    })?
+                    .iter()
+                    .map(|v| v.as_f64().unwrap() as f32)
+                    .collect::<Vec<f32>>(),
+            };
+
+            builder.append_slice(&embedding);
+        }
+
+        Ok(builder.finish())
+    }
+}
@@ -475,6 +475,7 @@ impl<T: HasQuery> QueryBase for T {

 /// Options for controlling the execution of a query
 #[non_exhaustive]
+#[derive(Debug, Clone)]
 pub struct QueryExecutionOptions {
     /// The maximum number of rows that will be contained in a single
     /// `RecordBatch` delivered by the query.
@@ -650,7 +651,7 @@ impl Query {
     pub fn nearest_to(self, vector: impl IntoQueryVector) -> Result<VectorQuery> {
         let mut vector_query = self.into_vector();
         let query_vector = vector.to_query_vector(&DataType::Float32, "default")?;
-        vector_query.query_vector = Some(query_vector);
+        vector_query.query_vector.push(query_vector);
         Ok(vector_query)
     }
 }
@@ -701,7 +702,7 @@ pub struct VectorQuery {
     // the column based on the dataset's schema.
     pub(crate) column: Option<String>,
     // IVF PQ - ANN search.
-    pub(crate) query_vector: Option<Arc<dyn Array>>,
+    pub(crate) query_vector: Vec<Arc<dyn Array>>,
     pub(crate) nprobes: usize,
     pub(crate) refine_factor: Option<u32>,
     pub(crate) distance_type: Option<DistanceType>,
@@ -714,7 +715,7 @@ impl VectorQuery {
         Self {
             base,
             column: None,
-            query_vector: None,
+            query_vector: Vec::new(),
             nprobes: 20,
             refine_factor: None,
             distance_type: None,
@@ -734,6 +735,22 @@ impl VectorQuery {
         self
     }

+    /// Add another query vector to the search.
+    ///
+    /// Multiple searches will be dispatched as part of the query.
+    /// This is a convenience method for adding multiple query vectors
+    /// to the search. It is not expected to be faster than issuing
+    /// multiple queries concurrently.
+    ///
+    /// The output data will contain an additional column `query_index` which
+    /// will contain the index of the query vector that was used to generate the
+    /// result.
+    pub fn add_query_vector(mut self, vector: impl IntoQueryVector) -> Result<Self> {
+        let query_vector = vector.to_query_vector(&DataType::Float32, "default")?;
+        self.query_vector.push(query_vector);
+        Ok(self)
+    }
+
     /// Set the number of partitions to search (probe)
     ///
     /// This argument is only used when the vector column has an IVF PQ index.
@@ -854,6 +871,7 @@ mod tests {
     use std::sync::Arc;

     use super::*;
+    use arrow::{compute::concat_batches, datatypes::Int32Type};
     use arrow_array::{
         cast::AsArray, Float32Array, Int32Array, RecordBatch, RecordBatchIterator,
         RecordBatchReader,
@@ -883,7 +901,10 @@ mod tests {

         let vector = Float32Array::from_iter_values([0.1, 0.2]);
         let query = table.query().nearest_to(&[0.1, 0.2]).unwrap();
-        assert_eq!(*query.query_vector.unwrap().as_ref().as_primitive(), vector);
+        assert_eq!(
+            *query.query_vector.first().unwrap().as_ref().as_primitive(),
+            vector
+        );

         let new_vector = Float32Array::from_iter_values([9.8, 8.7]);

@@ -899,7 +920,7 @@ mod tests {
             .refine_factor(999);

         assert_eq!(
-            *query.query_vector.unwrap().as_ref().as_primitive(),
+            *query.query_vector.first().unwrap().as_ref().as_primitive(),
             new_vector
         );
         assert_eq!(query.base.limit.unwrap(), 100);
@@ -1197,4 +1218,34 @@ mod tests {
             assert!(batch.column_by_name("_rowid").is_some());
         }
     }
+
+    #[tokio::test]
+    async fn test_multiple_query_vectors() {
+        let tmp_dir = tempdir().unwrap();
+        let table = make_test_table(&tmp_dir).await;
+        let query = table
+            .query()
+            .nearest_to(&[0.1, 0.2, 0.3, 0.4])
+            .unwrap()
+            .add_query_vector(&[0.5, 0.6, 0.7, 0.8])
+            .unwrap()
+            .limit(1);
+
+        let plan = query.explain_plan(true).await.unwrap();
+        assert!(plan.contains("UnionExec"));
+
+        let results = query
+            .execute()
+            .await
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+        let results = concat_batches(&results[0].schema(), &results).unwrap();
+        assert_eq!(results.num_rows(), 2); // One result for each query vector.
+        let query_index = results["query_index"].as_primitive::<Int32Type>();
+        // We don't guarantee order.
+        assert!(query_index.values().contains(&0));
+        assert!(query_index.values().contains(&1));
+    }
 }
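On the Python side this multi-vector capability surfaces through the sync client, where passing a list of vectors fans out into one search per vector, as exercised by the `test_query_sync_multiple_vectors` test above. A hedged usage sketch (`table` and the vectors are hypothetical):

```python
# Two query vectors are dispatched as separate searches; the results come
# back unioned, with a query_index column telling which vector produced
# each row.
results = table.search([[1, 2, 3], [4, 5, 6]]).limit(1).to_list()
by_query = sorted(results, key=lambda r: r["query_index"])
```
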
@@ -6,7 +6,7 @@ use crate::index::IndexStatistics;
 use crate::query::Select;
 use crate::table::AddDataMode;
 use crate::utils::{supported_btree_data_type, supported_vector_data_type};
-use crate::Error;
+use crate::{Error, Table};
 use arrow_array::RecordBatchReader;
 use arrow_ipc::reader::FileReader;
 use arrow_schema::{DataType, SchemaRef};
@@ -185,6 +185,71 @@ impl<S: HttpSend> RemoteTable<S> {

         Ok(())
     }
+
+    fn apply_vector_query_params(
+        mut body: serde_json::Value,
+        query: &VectorQuery,
+    ) -> Result<Vec<serde_json::Value>> {
+        Self::apply_query_params(&mut body, &query.base)?;
+
+        // Apply general parameters, before we dispatch based on number of query vectors.
+        body["prefilter"] = query.base.prefilter.into();
+        body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
+        body["nprobes"] = query.nprobes.into();
+        body["refine_factor"] = query.refine_factor.into();
+        if let Some(vector_column) = query.column.as_ref() {
+            body["vector_column"] = serde_json::Value::String(vector_column.clone());
+        }
+        if !query.use_index {
+            body["bypass_vector_index"] = serde_json::Value::Bool(true);
+        }
+
+        fn vector_to_json(vector: &arrow_array::ArrayRef) -> Result<serde_json::Value> {
+            match vector.data_type() {
+                DataType::Float32 => {
+                    let array = vector
+                        .as_any()
+                        .downcast_ref::<arrow_array::Float32Array>()
+                        .unwrap();
+                    Ok(serde_json::Value::Array(
+                        array
+                            .values()
+                            .iter()
+                            .map(|v| {
+                                serde_json::Value::Number(
+                                    serde_json::Number::from_f64(*v as f64).unwrap(),
+                                )
+                            })
+                            .collect(),
+                    ))
+                }
+                _ => Err(Error::InvalidInput {
+                    message: "VectorQuery vector must be of type Float32".into(),
+                }),
+            }
+        }
+
+        match query.query_vector.len() {
+            0 => {
+                // Server takes empty vector, not null or undefined.
+                body["vector"] = serde_json::Value::Array(Vec::new());
+                Ok(vec![body])
+            }
+            1 => {
+                body["vector"] = vector_to_json(&query.query_vector[0])?;
+                Ok(vec![body])
+            }
+            _ => {
+                let mut bodies = Vec::with_capacity(query.query_vector.len());
+                for vector in &query.query_vector {
+                    let mut body = body.clone();
+                    body["vector"] = vector_to_json(vector)?;
+                    bodies.push(body);
+                }
+                Ok(bodies)
+            }
+        }
+    }
 }

 #[derive(Deserialize)]
@@ -306,51 +371,29 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let request = self.client.post(&format!("/v1/table/{}/query/", self.name));

-        let mut body = serde_json::Value::Object(Default::default());
-        Self::apply_query_params(&mut body, &query.base)?;
-
-        body["prefilter"] = query.base.prefilter.into();
-        body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
-        body["nprobes"] = query.nprobes.into();
-        body["refine_factor"] = query.refine_factor.into();
-
-        let vector: Vec<f32> = if let Some(vector) = query.query_vector.as_ref() {
-            match vector.data_type() {
-                DataType::Float32 => vector
-                    .as_any()
-                    .downcast_ref::<arrow_array::Float32Array>()
-                    .unwrap()
-                    .values()
-                    .iter()
-                    .cloned()
-                    .collect(),
-                _ => {
-                    return Err(Error::InvalidInput {
-                        message: "VectorQuery vector must be of type Float32".into(),
-                    })
-                }
-            }
-        } else {
-            // Server takes empty vector, not null or undefined.
-            Vec::new()
-        };
-        body["vector"] = serde_json::json!(vector);
-
-        if let Some(vector_column) = query.column.as_ref() {
-            body["vector_column"] = serde_json::Value::String(vector_column.clone());
-        }
-
-        if !query.use_index {
-            body["bypass_vector_index"] = serde_json::Value::Bool(true);
-        }
-
-        let request = request.json(&body);
-        let (request_id, response) = self.client.send(request, true).await?;
-        let stream = self.read_arrow_stream(&request_id, response).await?;
-        Ok(Arc::new(OneShotExec::new(stream)))
+        let body = serde_json::Value::Object(Default::default());
+        let bodies = Self::apply_vector_query_params(body, query)?;
+
+        let mut futures = Vec::with_capacity(bodies.len());
+        for body in bodies {
+            let request = request.try_clone().unwrap().json(&body);
+            let future = async move {
+                let (request_id, response) = self.client.send(request, true).await?;
+                self.read_arrow_stream(&request_id, response).await
+            };
+            futures.push(future);
+        }
+        let streams = futures::future::try_join_all(futures).await?;
+        if streams.len() == 1 {
+            let stream = streams.into_iter().next().unwrap();
+            Ok(Arc::new(OneShotExec::new(stream)))
+        } else {
+            let stream_execs = streams
+                .into_iter()
+                .map(|stream| Arc::new(OneShotExec::new(stream)) as Arc<dyn ExecutionPlan>)
+                .collect();
+            Table::multi_vector_plan(stream_execs)
+        }
     }

     async fn plain_query(
@@ -655,6 +698,7 @@ mod tests {

     use super::*;

+    use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
     use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
     use arrow_schema::{DataType, Field, Schema};
     use futures::{future::BoxFuture, StreamExt, TryFutureExt};
@@ -1207,6 +1251,52 @@ mod tests {
             .unwrap();
     }

+    #[tokio::test]
+    async fn test_query_multiple_vectors() {
+        let table = Table::new_with_handler("my_table", |request| {
+            assert_eq!(request.method(), "POST");
+            assert_eq!(request.url().path(), "/v1/table/my_table/query/");
+            assert_eq!(
+                request.headers().get("Content-Type").unwrap(),
+                JSON_CONTENT_TYPE
+            );
+            let data = RecordBatch::try_new(
+                Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
+                vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+            )
+            .unwrap();
+            let response_body = write_ipc_file(&data);
+            http::Response::builder()
+                .status(200)
+                .header(CONTENT_TYPE, ARROW_FILE_CONTENT_TYPE)
+                .body(response_body)
+                .unwrap()
+        });
+
+        let query = table
+            .query()
+            .nearest_to(vec![0.1, 0.2, 0.3])
+            .unwrap()
+            .add_query_vector(vec![0.4, 0.5, 0.6])
+            .unwrap();
+        let plan = query.explain_plan(true).await.unwrap();
+        assert!(plan.contains("UnionExec"), "Plan: {}", plan);
+
+        let results = query
+            .execute()
+            .await
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+        let results = concat_batches(&results[0].schema(), &results).unwrap();
+
+        let query_index = results["query_index"].as_primitive::<Int32Type>();
+        // We don't guarantee order.
+        assert!(query_index.values().contains(&0));
+        assert!(query_index.values().contains(&1));
+    }
+
     #[tokio::test]
     async fn test_create_index() {
         let cases = [
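Schematically, `apply_vector_query_params` expands one logical query into one JSON body per query vector, with only the `"vector"` field differing between bodies; the responses are then unioned client-side. A sketch of the assumed request shape (field values illustrative only):

```python
common = {
    "k": 10,               # from the base query params
    "prefilter": False,
    "distance_type": "l2",
    "nprobes": 20,
    "refine_factor": None,
}
# One POST body per query vector; the unioned results carry a query_index
# column identifying the originating vector.
bodies = [
    {**common, "vector": [0.1, 0.2, 0.3]},
    {**common, "vector": [0.4, 0.5, 0.6]},
]
```
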
@@ -24,6 +24,9 @@ use arrow_array::{RecordBatchIterator, RecordBatchReader};
 use arrow_schema::{Field, Schema, SchemaRef};
 use async_trait::async_trait;
 use datafusion_physical_plan::display::DisplayableExecutionPlan;
+use datafusion_physical_plan::projection::ProjectionExec;
+use datafusion_physical_plan::repartition::RepartitionExec;
+use datafusion_physical_plan::union::UnionExec;
 use datafusion_physical_plan::ExecutionPlan;
 use futures::{StreamExt, TryStreamExt};
 use lance::dataset::builder::DatasetBuilder;
@@ -972,6 +975,57 @@ impl Table {
     ) -> Result<Option<IndexStatistics>> {
         self.inner.index_stats(index_name.as_ref()).await
     }
+
+    // Take many execution plans and map them into a single plan that adds
+    // a query_index column and unions them.
+    pub(crate) fn multi_vector_plan(
+        plans: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        if plans.is_empty() {
+            return Err(Error::InvalidInput {
+                message: "No plans provided".to_string(),
+            });
+        }
+        // Projection keeping all existing columns
+        let first_plan = plans[0].clone();
+        let project_all_columns = first_plan
+            .schema()
+            .fields()
+            .iter()
+            .enumerate()
+            .map(|(i, field)| {
+                let expr =
+                    datafusion_physical_plan::expressions::Column::new(field.name().as_str(), i);
+                let expr = Arc::new(expr) as Arc<dyn datafusion_physical_plan::PhysicalExpr>;
+                (expr, field.name().clone())
+            })
+            .collect::<Vec<_>>();
+
+        let projected_plans = plans
+            .into_iter()
+            .enumerate()
+            .map(|(plan_i, plan)| {
+                let query_index = datafusion_common::ScalarValue::Int32(Some(plan_i as i32));
+                let query_index_expr =
+                    datafusion_physical_plan::expressions::Literal::new(query_index);
+                let query_index_expr =
+                    Arc::new(query_index_expr) as Arc<dyn datafusion_physical_plan::PhysicalExpr>;
+                let mut projections = vec![(query_index_expr, "query_index".to_string())];
+                projections.extend_from_slice(&project_all_columns);
+                let projection = ProjectionExec::try_new(projections, plan).unwrap();
+                Arc::new(projection) as Arc<dyn datafusion_physical_plan::ExecutionPlan>
+            })
+            .collect::<Vec<_>>();
+
+        let unioned = Arc::new(UnionExec::new(projected_plans));
+        // We require 1 partition in the final output
+        let repartitioned = RepartitionExec::try_new(
+            unioned,
+            datafusion_physical_plan::Partitioning::RoundRobinBatch(1),
+        )
+        .unwrap();
+        Ok(Arc::new(repartitioned))
+    }
 }

 impl From<NativeTable> for Table {
@@ -1784,9 +1838,25 @@ impl TableInternal for NativeTable {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let ds_ref = self.dataset.get().await?;

+        if query.query_vector.len() > 1 {
+            // If there are multiple query vectors, create a plan for each of them and union them.
+            let query_vecs = query.query_vector.clone();
+            let plan_futures = query_vecs
+                .into_iter()
+                .map(|query_vector| {
+                    let mut sub_query = query.clone();
+                    sub_query.query_vector = vec![query_vector];
+                    let options_ref = options.clone();
+                    async move { self.create_plan(&sub_query, options_ref).await }
+                })
+                .collect::<Vec<_>>();
+            let plans = futures::future::try_join_all(plan_futures).await?;
+            return Table::multi_vector_plan(plans);
+        }
+
         let mut scanner: Scanner = ds_ref.scan();

-        if let Some(query_vector) = query.query_vector.as_ref() {
+        if let Some(query_vector) = query.query_vector.first() {
             // If there is a vector query, default to limit=10 if unspecified
             let column = if let Some(col) = query.column.as_ref() {
                 col.clone()
@@ -1828,18 +1898,11 @@ impl TableInternal for NativeTable {
                 query_vector,
                 query.base.limit.unwrap_or(DEFAULT_TOP_K),
             )?;
-            scanner.limit(
-                query.base.limit.map(|limit| limit as i64),
-                query.base.offset.map(|offset| offset as i64),
-            )?;
-        } else {
-            // If there is no vector query, it's ok to not have a limit
-            scanner.limit(
-                query.base.limit.map(|limit| limit as i64),
-                query.base.offset.map(|offset| offset as i64),
-            )?;
         }
+        scanner.limit(
+            query.base.limit.map(|limit| limit as i64),
+            query.base.offset.map(|offset| offset as i64),
+        )?;
         scanner.nprobs(query.nprobes);
         scanner.use_index(query.use_index);
         scanner.prefilter(query.base.prefilter);