mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 06:39:57 +00:00
Compare commits
8 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
66a881b33a | ||
|
|
a7515d6ee2 | ||
|
|
587c0824af | ||
|
|
b38a4269d0 | ||
|
|
119d88b9db | ||
|
|
74f660d223 | ||
|
|
b2b0979b90 | ||
|
|
ee2a40b182 |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.13.0-beta.1"
|
current_version = "0.13.0-beta.2"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
1
.github/workflows/nodejs.yml
vendored
1
.github/workflows/nodejs.yml
vendored
@@ -104,7 +104,6 @@ jobs:
|
|||||||
OPENAI_BASE_URL: http://0.0.0.0:8000
|
OPENAI_BASE_URL: http://0.0.0.0:8000
|
||||||
run: |
|
run: |
|
||||||
python ci/mock_openai.py &
|
python ci/mock_openai.py &
|
||||||
ss -ltnp | grep :8000
|
|
||||||
cd nodejs/examples
|
cd nodejs/examples
|
||||||
npm test
|
npm test
|
||||||
macos:
|
macos:
|
||||||
|
|||||||
382
.github/workflows/npm-publish.yml
vendored
382
.github/workflows/npm-publish.yml
vendored
@@ -226,108 +226,109 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
node/dist/lancedb-vectordb-win32*.tgz
|
node/dist/lancedb-vectordb-win32*.tgz
|
||||||
|
|
||||||
node-windows-arm64:
|
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
|
||||||
name: vectordb win32-arm64-msvc
|
# node-windows-arm64:
|
||||||
runs-on: windows-4x-arm
|
# name: vectordb win32-arm64-msvc
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
# runs-on: windows-4x-arm
|
||||||
steps:
|
# if: startsWith(github.ref, 'refs/tags/v')
|
||||||
- uses: actions/checkout@v4
|
# steps:
|
||||||
- name: Install Git
|
# - uses: actions/checkout@v4
|
||||||
run: |
|
# - name: Install Git
|
||||||
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
# run: |
|
||||||
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||||
shell: powershell
|
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||||
- name: Add Git to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Git to PATH
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
# run: |
|
||||||
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||||
shell: powershell
|
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||||
- name: Configure Git symlinks
|
# shell: powershell
|
||||||
run: git config --global core.symlinks true
|
# - name: Configure Git symlinks
|
||||||
- uses: actions/checkout@v4
|
# run: git config --global core.symlinks true
|
||||||
- uses: actions/setup-python@v5
|
# - uses: actions/checkout@v4
|
||||||
with:
|
# - uses: actions/setup-python@v5
|
||||||
python-version: "3.13"
|
# with:
|
||||||
- name: Install Visual Studio Build Tools
|
# python-version: "3.13"
|
||||||
run: |
|
# - name: Install Visual Studio Build Tools
|
||||||
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
# run: |
|
||||||
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||||
"--installPath", "C:\BuildTools", `
|
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
# "--installPath", "C:\BuildTools", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||||
shell: powershell
|
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||||
- name: Add Visual Studio Build Tools to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Visual Studio Build Tools to PATH
|
||||||
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
# run: |
|
||||||
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||||
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||||
|
|
||||||
# Add MSVC runtime libraries to LIB
|
# # Add MSVC runtime libraries to LIB
|
||||||
$env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
# $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||||
Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
# Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
||||||
|
|
||||||
# Add INCLUDE paths
|
# # Add INCLUDE paths
|
||||||
$env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
# $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
||||||
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
||||||
Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
# Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Rust
|
# - name: Install Rust
|
||||||
run: |
|
# run: |
|
||||||
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||||
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Rust to PATH
|
# - name: Add Rust to PATH
|
||||||
run: |
|
# run: |
|
||||||
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
|
|
||||||
- uses: Swatinem/rust-cache@v2
|
# - uses: Swatinem/rust-cache@v2
|
||||||
with:
|
# with:
|
||||||
workspaces: rust
|
# workspaces: rust
|
||||||
- name: Install 7-Zip ARM
|
# - name: Install 7-Zip ARM
|
||||||
run: |
|
# run: |
|
||||||
New-Item -Path 'C:\7zip' -ItemType Directory
|
# New-Item -Path 'C:\7zip' -ItemType Directory
|
||||||
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||||
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add 7-Zip to PATH
|
# - name: Add 7-Zip to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Protoc v21.12
|
# - name: Install Protoc v21.12
|
||||||
working-directory: C:\
|
# working-directory: C:\
|
||||||
run: |
|
# run: |
|
||||||
if (Test-Path 'C:\protoc') {
|
# if (Test-Path 'C:\protoc') {
|
||||||
Write-Host "Protoc directory exists, skipping installation"
|
# Write-Host "Protoc directory exists, skipping installation"
|
||||||
return
|
# return
|
||||||
}
|
# }
|
||||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
# New-Item -Path 'C:\protoc' -ItemType Directory
|
||||||
Set-Location C:\protoc
|
# Set-Location C:\protoc
|
||||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||||
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Protoc to PATH
|
# - name: Add Protoc to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Build Windows native node modules
|
# - name: Build Windows native node modules
|
||||||
run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
|
# run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
|
||||||
- name: Upload Windows ARM64 Artifacts
|
# - name: Upload Windows ARM64 Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
# uses: actions/upload-artifact@v4
|
||||||
with:
|
# with:
|
||||||
name: node-native-windows-arm64
|
# name: node-native-windows-arm64
|
||||||
path: |
|
# path: |
|
||||||
node/dist/*.node
|
# node/dist/*.node
|
||||||
|
|
||||||
nodejs-windows:
|
nodejs-windows:
|
||||||
name: lancedb ${{ matrix.target }}
|
name: lancedb ${{ matrix.target }}
|
||||||
@@ -363,98 +364,99 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
nodejs/dist/*.node
|
nodejs/dist/*.node
|
||||||
|
|
||||||
nodejs-windows-arm64:
|
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
|
||||||
name: lancedb win32-arm64-msvc
|
# nodejs-windows-arm64:
|
||||||
runs-on: windows-4x-arm
|
# name: lancedb win32-arm64-msvc
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
# runs-on: windows-4x-arm
|
||||||
steps:
|
# if: startsWith(github.ref, 'refs/tags/v')
|
||||||
- uses: actions/checkout@v4
|
# steps:
|
||||||
- name: Install Git
|
# - uses: actions/checkout@v4
|
||||||
run: |
|
# - name: Install Git
|
||||||
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
# run: |
|
||||||
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||||
shell: powershell
|
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||||
- name: Add Git to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Git to PATH
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
# run: |
|
||||||
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||||
shell: powershell
|
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||||
- name: Configure Git symlinks
|
# shell: powershell
|
||||||
run: git config --global core.symlinks true
|
# - name: Configure Git symlinks
|
||||||
- uses: actions/checkout@v4
|
# run: git config --global core.symlinks true
|
||||||
- uses: actions/setup-python@v5
|
# - uses: actions/checkout@v4
|
||||||
with:
|
# - uses: actions/setup-python@v5
|
||||||
python-version: "3.13"
|
# with:
|
||||||
- name: Install Visual Studio Build Tools
|
# python-version: "3.13"
|
||||||
run: |
|
# - name: Install Visual Studio Build Tools
|
||||||
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
# run: |
|
||||||
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||||
"--installPath", "C:\BuildTools", `
|
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
# "--installPath", "C:\BuildTools", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||||
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||||
shell: powershell
|
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||||
- name: Add Visual Studio Build Tools to PATH
|
# shell: powershell
|
||||||
run: |
|
# - name: Add Visual Studio Build Tools to PATH
|
||||||
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
# run: |
|
||||||
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||||
|
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||||
|
|
||||||
$env:LIB = ""
|
# $env:LIB = ""
|
||||||
Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
# Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Rust
|
# - name: Install Rust
|
||||||
run: |
|
# run: |
|
||||||
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||||
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Rust to PATH
|
# - name: Add Rust to PATH
|
||||||
run: |
|
# run: |
|
||||||
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
|
|
||||||
- uses: Swatinem/rust-cache@v2
|
# - uses: Swatinem/rust-cache@v2
|
||||||
with:
|
# with:
|
||||||
workspaces: rust
|
# workspaces: rust
|
||||||
- name: Install 7-Zip ARM
|
# - name: Install 7-Zip ARM
|
||||||
run: |
|
# run: |
|
||||||
New-Item -Path 'C:\7zip' -ItemType Directory
|
# New-Item -Path 'C:\7zip' -ItemType Directory
|
||||||
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||||
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add 7-Zip to PATH
|
# - name: Add 7-Zip to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Install Protoc v21.12
|
# - name: Install Protoc v21.12
|
||||||
working-directory: C:\
|
# working-directory: C:\
|
||||||
run: |
|
# run: |
|
||||||
if (Test-Path 'C:\protoc') {
|
# if (Test-Path 'C:\protoc') {
|
||||||
Write-Host "Protoc directory exists, skipping installation"
|
# Write-Host "Protoc directory exists, skipping installation"
|
||||||
return
|
# return
|
||||||
}
|
# }
|
||||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
# New-Item -Path 'C:\protoc' -ItemType Directory
|
||||||
Set-Location C:\protoc
|
# Set-Location C:\protoc
|
||||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||||
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Add Protoc to PATH
|
# - name: Add Protoc to PATH
|
||||||
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||||
shell: powershell
|
# shell: powershell
|
||||||
- name: Build Windows native node modules
|
# - name: Build Windows native node modules
|
||||||
run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
|
# run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
|
||||||
- name: Upload Windows ARM64 Artifacts
|
# - name: Upload Windows ARM64 Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
# uses: actions/upload-artifact@v4
|
||||||
with:
|
# with:
|
||||||
name: nodejs-native-windows-arm64
|
# name: nodejs-native-windows-arm64
|
||||||
path: |
|
# path: |
|
||||||
nodejs/dist/*.node
|
# nodejs/dist/*.node
|
||||||
|
|
||||||
release:
|
release:
|
||||||
name: vectordb NPM Publish
|
name: vectordb NPM Publish
|
||||||
@@ -476,7 +478,7 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
|
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
# Tag beta as "preview" instead of default "latest". See lancedb
|
# Tag beta as "preview" instead of default "latest". See lancedb
|
||||||
# npm publish step for more info.
|
# npm publish step for more info.
|
||||||
if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
|
if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
|
||||||
PUBLISH_ARGS="--tag preview"
|
PUBLISH_ARGS="--tag preview"
|
||||||
|
|||||||
14
Cargo.toml
14
Cargo.toml
@@ -23,13 +23,13 @@ rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
|||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.19.2", "features" = [
|
lance = { "version" = "=0.19.2", "features" = [
|
||||||
"dynamodb",
|
"dynamodb",
|
||||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
|
]}
|
||||||
lance-index = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
|
lance-index = "=0.19.2"
|
||||||
lance-linalg = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
|
lance-linalg = "=0.19.2"
|
||||||
lance-table = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
|
lance-table = "=0.19.2"
|
||||||
lance-testing = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
|
lance-testing = "=0.19.2"
|
||||||
lance-datafusion = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
|
lance-datafusion = "=0.19.2"
|
||||||
lance-encoding = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
|
lance-encoding = "=0.19.2"
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "52.2", optional = false }
|
arrow = { version = "52.2", optional = false }
|
||||||
arrow-array = "52.2"
|
arrow-array = "52.2"
|
||||||
|
|||||||
@@ -790,6 +790,27 @@ Use the `drop_table()` method on the database to remove a table.
|
|||||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||||
If the table does not exist an exception is raised.
|
If the table does not exist an exception is raised.
|
||||||
|
|
||||||
|
## Handling bad vectors
|
||||||
|
|
||||||
|
In LanceDB Python, you can use the `on_bad_vectors` parameter to choose how
|
||||||
|
invalid vector values are handled. Vectors are considered invalid if any of the
|
||||||
|
following applies:
|
||||||
|
|
||||||
|
1. They are the wrong dimension
|
||||||
|
2. They contain NaN values
|
||||||
|
3. They are null but are on a non-nullable field
|
||||||
|
|
||||||
|
By default, LanceDB will raise an error if it encounters a bad vector. You can
|
||||||
|
also choose one of the following options:
|
||||||
|
|
||||||
|
* `drop`: Ignore rows with bad vectors
|
||||||
|
* `fill`: Replace bad values (NaNs) or missing values (too few dimensions) with
|
||||||
|
the fill value specified in the `fill_value` parameter. An input like
|
||||||
|
`[1.0, NaN, 3.0]` will be replaced with `[1.0, 0.0, 3.0]` if `fill_value=0.0`.
|
||||||
|
* `null`: Replace bad vectors with null (only works if the column is nullable).
|
||||||
|
A bad vector `[1.0, NaN, 3.0]` will be replaced with `null` if the column is
|
||||||
|
nullable. If the vector column is non-nullable, then bad vectors will cause an
|
||||||
|
error.
|
||||||
|
|
||||||
## Consistency
|
## Consistency
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.13.0-beta.1</version>
|
<version>0.13.0-beta.2</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.13.0-beta.1</version>
|
<version>0.13.0-beta.2</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
||||||
<name>LanceDB Parent</name>
|
<name>LanceDB Parent</name>
|
||||||
|
|||||||
76
node/package-lock.json
generated
76
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
@@ -52,12 +52,12 @@
|
|||||||
"uuid": "^9.0.0"
|
"uuid": "^9.0.0"
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1",
|
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1"
|
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2"
|
||||||
},
|
},
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"@apache-arrow/ts": "^14.0.2",
|
"@apache-arrow/ts": "^14.0.2",
|
||||||
@@ -327,66 +327,6 @@
|
|||||||
"@jridgewell/sourcemap-codec": "^1.4.10"
|
"@jridgewell/sourcemap-codec": "^1.4.10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-beOrf6selCzzhLgDG8Nibma4nO/CSnA1wUKRmlJHEPtGcg7PW18z6MP/nfwQMpMR/FLRfTo8pPTbpzss47MiQQ==",
|
|
||||||
"cpu": [
|
|
||||||
"arm64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"darwin"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-YdraGRF/RbJRkKh0v3xT03LUhq47T2GtCvJ5gZp8wKlh4pHa8LuhLU0DIdvmG/DT5vuQA+td8HDkBm/e3EOdNg==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"darwin"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-Pp0O/uhEqof1oLaWrNbv+Ym+q8kBkiCqaA5+2eAZ6a3e9U+Ozkvb0FQrHuyi9adJ5wKQ4NabyQE9BMf2bYpOnQ==",
|
|
||||||
"cpu": [
|
|
||||||
"arm64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"linux"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-y8nxOye4egfWF5FGED9EfkmZ1O5HnRLU4a61B8m5JSpkivO9v2epTcbYN0yt/7ZFCgtqMfJ8VW4Mi7qQcz3KDA==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"linux"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
|
||||||
"version": "0.13.0-beta.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0-beta.1.tgz",
|
|
||||||
"integrity": "sha512-STMDP9dp0TBLkB3ro+16pKcGy6bmbhRuEZZZ1Tp5P75yTPeVh4zIgWkidMdU1qBbEYM7xacnsp9QAwgLnMU/Ow==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"win32"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@neon-rs/cli": {
|
"node_modules/@neon-rs/cli": {
|
||||||
"version": "0.0.160",
|
"version": "0.0.160",
|
||||||
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"description": " Serverless, low-latency vector database for AI applications",
|
"description": " Serverless, low-latency vector database for AI applications",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
@@ -89,11 +89,11 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
|
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
|
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1",
|
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2",
|
||||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1"
|
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.13.0-beta.1"
|
version = "0.13.0-beta.2"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
|
|||||||
@@ -187,6 +187,81 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// TODO: https://github.com/lancedb/lancedb/issues/1832
|
||||||
|
it.skip("should be able to omit nullable fields", async () => {
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const schema = new arrow.Schema([
|
||||||
|
new arrow.Field(
|
||||||
|
"vector",
|
||||||
|
new arrow.FixedSizeList(
|
||||||
|
2,
|
||||||
|
new arrow.Field("item", new arrow.Float64()),
|
||||||
|
),
|
||||||
|
true,
|
||||||
|
),
|
||||||
|
new arrow.Field("item", new arrow.Utf8(), true),
|
||||||
|
new arrow.Field("price", new arrow.Float64(), false),
|
||||||
|
]);
|
||||||
|
const table = await db.createEmptyTable("test", schema);
|
||||||
|
|
||||||
|
const data1 = { item: "foo", price: 10.0 };
|
||||||
|
await table.add([data1]);
|
||||||
|
const data2 = { vector: [3.1, 4.1], price: 2.0 };
|
||||||
|
await table.add([data2]);
|
||||||
|
const data3 = { vector: [5.9, 26.5], item: "bar", price: 3.0 };
|
||||||
|
await table.add([data3]);
|
||||||
|
|
||||||
|
let res = await table.query().limit(10).toArray();
|
||||||
|
const resVector = res.map((r) => r.get("vector").toArray());
|
||||||
|
expect(resVector).toEqual([null, data2.vector, data3.vector]);
|
||||||
|
const resItem = res.map((r) => r.get("item").toArray());
|
||||||
|
expect(resItem).toEqual(["foo", null, "bar"]);
|
||||||
|
const resPrice = res.map((r) => r.get("price").toArray());
|
||||||
|
expect(resPrice).toEqual([10.0, 2.0, 3.0]);
|
||||||
|
|
||||||
|
const data4 = { item: "foo" };
|
||||||
|
// We can't omit a column if it's not nullable
|
||||||
|
await expect(table.add([data4])).rejects.toThrow("Invalid user input");
|
||||||
|
|
||||||
|
// But we can alter columns to make them nullable
|
||||||
|
await table.alterColumns([{ path: "price", nullable: true }]);
|
||||||
|
await table.add([data4]);
|
||||||
|
|
||||||
|
res = (await table.query().limit(10).toArray()).map((r) => r.toJSON());
|
||||||
|
expect(res).toEqual([data1, data2, data3, data4]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should be able to insert nullable data for non-nullable fields", async () => {
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const schema = new arrow.Schema([
|
||||||
|
new arrow.Field("x", new arrow.Float64(), false),
|
||||||
|
new arrow.Field("id", new arrow.Utf8(), false),
|
||||||
|
]);
|
||||||
|
const table = await db.createEmptyTable("test", schema);
|
||||||
|
|
||||||
|
const data1 = { x: 4.1, id: "foo" };
|
||||||
|
await table.add([data1]);
|
||||||
|
const res = (await table.query().toArray())[0];
|
||||||
|
expect(res.x).toEqual(data1.x);
|
||||||
|
expect(res.id).toEqual(data1.id);
|
||||||
|
|
||||||
|
const data2 = { x: null, id: "bar" };
|
||||||
|
await expect(table.add([data2])).rejects.toThrow(
|
||||||
|
"declared as non-nullable but contains null values",
|
||||||
|
);
|
||||||
|
|
||||||
|
// But we can alter columns to make them nullable
|
||||||
|
await table.alterColumns([{ path: "x", nullable: true }]);
|
||||||
|
await table.add([data2]);
|
||||||
|
|
||||||
|
const res2 = await table.query().toArray();
|
||||||
|
expect(res2.length).toBe(2);
|
||||||
|
expect(res2[0].x).toEqual(data1.x);
|
||||||
|
expect(res2[0].id).toEqual(data1.id);
|
||||||
|
expect(res2[1].x).toBeNull();
|
||||||
|
expect(res2[1].id).toEqual(data2.id);
|
||||||
|
});
|
||||||
|
|
||||||
it("should return the table as an instance of an arrow table", async () => {
|
it("should return the table as an instance of an arrow table", async () => {
|
||||||
const arrowTbl = await table.toArrow();
|
const arrowTbl = await table.toArrow();
|
||||||
expect(arrowTbl).toBeInstanceOf(ArrowTable);
|
expect(arrowTbl).toBeInstanceOf(ArrowTable);
|
||||||
|
|||||||
@@ -6,12 +6,16 @@ import { withTempDirectory } from "./util.ts";
|
|||||||
import * as lancedb from "@lancedb/lancedb";
|
import * as lancedb from "@lancedb/lancedb";
|
||||||
import "@lancedb/lancedb/embedding/transformers";
|
import "@lancedb/lancedb/embedding/transformers";
|
||||||
import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding";
|
import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding";
|
||||||
|
import { EmbeddingFunction } from "@lancedb/lancedb/embedding";
|
||||||
import { Utf8 } from "apache-arrow";
|
import { Utf8 } from "apache-arrow";
|
||||||
|
|
||||||
test("full text search", async () => {
|
test("full text search", async () => {
|
||||||
await withTempDirectory(async (databaseDir) => {
|
await withTempDirectory(async (databaseDir) => {
|
||||||
const db = await lancedb.connect(databaseDir);
|
const db = await lancedb.connect(databaseDir);
|
||||||
const func = await getRegistry().get("huggingface").create();
|
console.log(getRegistry());
|
||||||
|
const func = (await getRegistry()
|
||||||
|
.get("huggingface")
|
||||||
|
?.create()) as EmbeddingFunction;
|
||||||
|
|
||||||
const facts = [
|
const facts = [
|
||||||
"Albert Einstein was a theoretical physicist.",
|
"Albert Einstein was a theoretical physicist.",
|
||||||
@@ -56,4 +60,4 @@ test("full text search", async () => {
|
|||||||
|
|
||||||
expect(actual[0]["text"]).toBe("The human body has 206 bones.");
|
expect(actual[0]["text"]).toBe("The human body has 206 bones.");
|
||||||
});
|
});
|
||||||
});
|
}, 100_000);
|
||||||
|
|||||||
@@ -19,9 +19,6 @@ import { EmbeddingFunctionConfig, getRegistry } from "./registry";
|
|||||||
|
|
||||||
export { EmbeddingFunction, TextEmbeddingFunction } from "./embedding_function";
|
export { EmbeddingFunction, TextEmbeddingFunction } from "./embedding_function";
|
||||||
|
|
||||||
// We need to explicitly export '*' so that the `register` decorator actually registers the class.
|
|
||||||
export * from "./openai";
|
|
||||||
export * from "./transformers";
|
|
||||||
export * from "./registry";
|
export * from "./registry";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -17,8 +17,6 @@ import {
|
|||||||
type EmbeddingFunctionConstructor,
|
type EmbeddingFunctionConstructor,
|
||||||
} from "./embedding_function";
|
} from "./embedding_function";
|
||||||
import "reflect-metadata";
|
import "reflect-metadata";
|
||||||
import { OpenAIEmbeddingFunction } from "./openai";
|
|
||||||
import { TransformersEmbeddingFunction } from "./transformers";
|
|
||||||
|
|
||||||
type CreateReturnType<T> = T extends { init: () => Promise<void> }
|
type CreateReturnType<T> = T extends { init: () => Promise<void> }
|
||||||
? Promise<T>
|
? Promise<T>
|
||||||
@@ -73,10 +71,6 @@ export class EmbeddingFunctionRegistry {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
get(name: "openai"): EmbeddingFunctionCreate<OpenAIEmbeddingFunction>;
|
|
||||||
get(
|
|
||||||
name: "huggingface",
|
|
||||||
): EmbeddingFunctionCreate<TransformersEmbeddingFunction>;
|
|
||||||
get<T extends EmbeddingFunction<unknown>>(
|
get<T extends EmbeddingFunction<unknown>>(
|
||||||
name: string,
|
name: string,
|
||||||
): EmbeddingFunctionCreate<T> | undefined;
|
): EmbeddingFunctionCreate<T> | undefined;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
@@ -10,11 +10,13 @@
|
|||||||
"vector database",
|
"vector database",
|
||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"version": "0.13.0-beta.1",
|
"version": "0.13.0-beta.2",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
"./embedding": "./dist/embedding/index.js"
|
"./embedding": "./dist/embedding/index.js",
|
||||||
|
"./embedding/openai": "./dist/embedding/openai.js",
|
||||||
|
"./embedding/transformers": "./dist/embedding/transformers.js"
|
||||||
},
|
},
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
"napi": {
|
"napi": {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.16.0-beta.1"
|
current_version = "0.16.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.16.0-beta.1"
|
version = "0.16.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -1567,7 +1567,7 @@ class LanceTable(Table):
|
|||||||
"append" and "overwrite".
|
"append" and "overwrite".
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
|
|
||||||
@@ -1851,7 +1851,7 @@ class LanceTable(Table):
|
|||||||
data but will validate against any schema that's specified.
|
data but will validate against any schema that's specified.
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
embedding_functions: list of EmbeddingFunctionModel, default None
|
embedding_functions: list of EmbeddingFunctionModel, default None
|
||||||
@@ -2151,13 +2151,11 @@ def _sanitize_schema(
|
|||||||
vector column to fixed_size_list(float32) if necessary.
|
vector column to fixed_size_list(float32) if necessary.
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
"""
|
"""
|
||||||
if schema is not None:
|
if schema is not None:
|
||||||
if data.schema == schema:
|
|
||||||
return data
|
|
||||||
# cast the columns to the expected types
|
# cast the columns to the expected types
|
||||||
data = data.combine_chunks()
|
data = data.combine_chunks()
|
||||||
for field in schema:
|
for field in schema:
|
||||||
@@ -2177,6 +2175,7 @@ def _sanitize_schema(
|
|||||||
vector_column_name=field.name,
|
vector_column_name=field.name,
|
||||||
on_bad_vectors=on_bad_vectors,
|
on_bad_vectors=on_bad_vectors,
|
||||||
fill_value=fill_value,
|
fill_value=fill_value,
|
||||||
|
table_schema=schema,
|
||||||
)
|
)
|
||||||
return pa.Table.from_arrays(
|
return pa.Table.from_arrays(
|
||||||
[data[name] for name in schema.names], schema=schema
|
[data[name] for name in schema.names], schema=schema
|
||||||
@@ -2197,6 +2196,7 @@ def _sanitize_schema(
|
|||||||
def _sanitize_vector_column(
|
def _sanitize_vector_column(
|
||||||
data: pa.Table,
|
data: pa.Table,
|
||||||
vector_column_name: str,
|
vector_column_name: str,
|
||||||
|
table_schema: Optional[pa.Schema] = None,
|
||||||
on_bad_vectors: str = "error",
|
on_bad_vectors: str = "error",
|
||||||
fill_value: float = 0.0,
|
fill_value: float = 0.0,
|
||||||
) -> pa.Table:
|
) -> pa.Table:
|
||||||
@@ -2211,12 +2211,16 @@ def _sanitize_vector_column(
|
|||||||
The name of the vector column.
|
The name of the vector column.
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.0
|
fill_value: float, default 0.0
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
"""
|
"""
|
||||||
# ChunkedArray is annoying to work with, so we combine chunks here
|
# ChunkedArray is annoying to work with, so we combine chunks here
|
||||||
vec_arr = data[vector_column_name].combine_chunks()
|
vec_arr = data[vector_column_name].combine_chunks()
|
||||||
|
if table_schema is not None:
|
||||||
|
field = table_schema.field(vector_column_name)
|
||||||
|
else:
|
||||||
|
field = None
|
||||||
typ = data[vector_column_name].type
|
typ = data[vector_column_name].type
|
||||||
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
|
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
|
||||||
# if it's a variable size list array,
|
# if it's a variable size list array,
|
||||||
@@ -2243,7 +2247,11 @@ def _sanitize_vector_column(
|
|||||||
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if pc.any(pc.is_null(vec_arr.values, nan_is_null=True)).as_py():
|
if (
|
||||||
|
field is not None
|
||||||
|
and not field.nullable
|
||||||
|
and pc.any(pc.is_null(vec_arr.values)).as_py()
|
||||||
|
) or (pc.any(pc.is_nan(vec_arr.values)).as_py()):
|
||||||
data = _sanitize_nans(
|
data = _sanitize_nans(
|
||||||
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
|
||||||
)
|
)
|
||||||
@@ -2287,6 +2295,12 @@ def _sanitize_jagged(data, fill_value, on_bad_vectors, vec_arr, vector_column_na
|
|||||||
)
|
)
|
||||||
elif on_bad_vectors == "drop":
|
elif on_bad_vectors == "drop":
|
||||||
data = data.filter(correct_ndims)
|
data = data.filter(correct_ndims)
|
||||||
|
elif on_bad_vectors == "null":
|
||||||
|
data = data.set_column(
|
||||||
|
data.column_names.index(vector_column_name),
|
||||||
|
vector_column_name,
|
||||||
|
pc.if_else(correct_ndims, vec_arr, pa.scalar(None)),
|
||||||
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
@@ -2303,7 +2317,8 @@ def _sanitize_nans(
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Vector column {vector_column_name} has NaNs. "
|
f"Vector column {vector_column_name} has NaNs. "
|
||||||
"Set on_bad_vectors='drop' to remove them, or "
|
"Set on_bad_vectors='drop' to remove them, or "
|
||||||
"set on_bad_vectors='fill' and fill_value=<value> to replace them."
|
"set on_bad_vectors='fill' and fill_value=<value> to replace them. "
|
||||||
|
"Or set on_bad_vectors='null' to replace them with null."
|
||||||
)
|
)
|
||||||
elif on_bad_vectors == "fill":
|
elif on_bad_vectors == "fill":
|
||||||
if fill_value is None:
|
if fill_value is None:
|
||||||
@@ -2323,6 +2338,17 @@ def _sanitize_nans(
|
|||||||
np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
|
np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
|
||||||
not_nulls = np.any(np_arr, axis=1)
|
not_nulls = np.any(np_arr, axis=1)
|
||||||
data = data.filter(~not_nulls)
|
data = data.filter(~not_nulls)
|
||||||
|
elif on_bad_vectors == "null":
|
||||||
|
# null = pa.nulls(len(vec_arr)).cast(vec_arr.type)
|
||||||
|
# values = pc.if_else(pc.is_nan(vec_arr.values), fill_value, vec_arr.values)
|
||||||
|
np_arr = np.isnan(vec_arr.values.to_numpy(zero_copy_only=False))
|
||||||
|
np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
|
||||||
|
no_nans = np.any(np_arr, axis=1)
|
||||||
|
data = data.set_column(
|
||||||
|
data.column_names.index(vector_column_name),
|
||||||
|
vector_column_name,
|
||||||
|
pc.if_else(no_nans, vec_arr, pa.scalar(None)),
|
||||||
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
@@ -2588,7 +2614,7 @@ class AsyncTable:
|
|||||||
"append" and "overwrite".
|
"append" and "overwrite".
|
||||||
on_bad_vectors: str, default "error"
|
on_bad_vectors: str, default "error"
|
||||||
What to do if any of the vectors are not the same size or contains NaNs.
|
What to do if any of the vectors are not the same size or contains NaNs.
|
||||||
One of "error", "drop", "fill".
|
One of "error", "drop", "fill", "null".
|
||||||
fill_value: float, default 0.
|
fill_value: float, default 0.
|
||||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||||
|
|
||||||
|
|||||||
@@ -81,14 +81,15 @@ def test_embedding_function(tmp_path):
|
|||||||
|
|
||||||
|
|
||||||
def test_embedding_with_bad_results(tmp_path):
|
def test_embedding_with_bad_results(tmp_path):
|
||||||
@register("mock-embedding")
|
@register("null-embedding")
|
||||||
class MockEmbeddingFunction(TextEmbeddingFunction):
|
class NullEmbeddingFunction(TextEmbeddingFunction):
|
||||||
def ndims(self):
|
def ndims(self):
|
||||||
return 128
|
return 128
|
||||||
|
|
||||||
def generate_embeddings(
|
def generate_embeddings(
|
||||||
self, texts: Union[List[str], np.ndarray]
|
self, texts: Union[List[str], np.ndarray]
|
||||||
) -> list[Union[np.array, None]]:
|
) -> list[Union[np.array, None]]:
|
||||||
|
# Return None, which is bad if field is non-nullable
|
||||||
return [
|
return [
|
||||||
None if i % 2 == 0 else np.random.randn(self.ndims())
|
None if i % 2 == 0 else np.random.randn(self.ndims())
|
||||||
for i in range(len(texts))
|
for i in range(len(texts))
|
||||||
@@ -96,13 +97,17 @@ def test_embedding_with_bad_results(tmp_path):
|
|||||||
|
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
registry = EmbeddingFunctionRegistry.get_instance()
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
model = registry.get("mock-embedding").create()
|
model = registry.get("null-embedding").create()
|
||||||
|
|
||||||
class Schema(LanceModel):
|
class Schema(LanceModel):
|
||||||
text: str = model.SourceField()
|
text: str = model.SourceField()
|
||||||
vector: Vector(model.ndims()) = model.VectorField()
|
vector: Vector(model.ndims()) = model.VectorField()
|
||||||
|
|
||||||
table = db.create_table("test", schema=Schema, mode="overwrite")
|
table = db.create_table("test", schema=Schema, mode="overwrite")
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
# Default on_bad_vectors is "error"
|
||||||
|
table.add([{"text": "hello world"}])
|
||||||
|
|
||||||
table.add(
|
table.add(
|
||||||
[{"text": "hello world"}, {"text": "bar"}],
|
[{"text": "hello world"}, {"text": "bar"}],
|
||||||
on_bad_vectors="drop",
|
on_bad_vectors="drop",
|
||||||
@@ -112,13 +117,33 @@ def test_embedding_with_bad_results(tmp_path):
|
|||||||
assert len(table) == 1
|
assert len(table) == 1
|
||||||
assert df.iloc[0]["text"] == "bar"
|
assert df.iloc[0]["text"] == "bar"
|
||||||
|
|
||||||
# table = db.create_table("test2", schema=Schema, mode="overwrite")
|
@register("nan-embedding")
|
||||||
# table.add(
|
class NanEmbeddingFunction(TextEmbeddingFunction):
|
||||||
# [{"text": "hello world"}, {"text": "bar"}],
|
def ndims(self):
|
||||||
# )
|
return 128
|
||||||
# assert len(table) == 2
|
|
||||||
# tbl = table.to_arrow()
|
def generate_embeddings(
|
||||||
# assert tbl["vector"].null_count == 1
|
self, texts: Union[List[str], np.ndarray]
|
||||||
|
) -> list[Union[np.array, None]]:
|
||||||
|
# Return NaN to produce bad vectors
|
||||||
|
return [
|
||||||
|
[np.NAN] * 128 if i % 2 == 0 else np.random.randn(self.ndims())
|
||||||
|
for i in range(len(texts))
|
||||||
|
]
|
||||||
|
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
model = registry.get("nan-embedding").create()
|
||||||
|
|
||||||
|
table = db.create_table("test2", schema=Schema, mode="overwrite")
|
||||||
|
table.alter_columns(dict(path="vector", nullable=True))
|
||||||
|
table.add(
|
||||||
|
[{"text": "hello world"}, {"text": "bar"}],
|
||||||
|
on_bad_vectors="null",
|
||||||
|
)
|
||||||
|
assert len(table) == 2
|
||||||
|
tbl = table.to_arrow()
|
||||||
|
assert tbl["vector"].null_count == 1
|
||||||
|
|
||||||
|
|
||||||
def test_with_existing_vectors(tmp_path):
|
def test_with_existing_vectors(tmp_path):
|
||||||
|
|||||||
@@ -240,6 +240,121 @@ def test_add(db):
|
|||||||
_add(table, schema)
|
_add(table, schema)
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_subschema(tmp_path):
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||||
|
pa.field("item", pa.string(), nullable=True),
|
||||||
|
pa.field("price", pa.float64(), nullable=False),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
table = db.create_table("test", schema=schema)
|
||||||
|
|
||||||
|
data = {"price": 10.0, "item": "foo"}
|
||||||
|
table.add([data])
|
||||||
|
data = {"price": 2.0, "vector": [3.1, 4.1]}
|
||||||
|
table.add([data])
|
||||||
|
data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
|
||||||
|
table.add([data])
|
||||||
|
|
||||||
|
expected = pa.table(
|
||||||
|
{
|
||||||
|
"vector": [None, [3.1, 4.1], [5.9, 26.5]],
|
||||||
|
"item": ["foo", None, "bar"],
|
||||||
|
"price": [10.0, 2.0, 3.0],
|
||||||
|
},
|
||||||
|
schema=schema,
|
||||||
|
)
|
||||||
|
assert table.to_arrow() == expected
|
||||||
|
|
||||||
|
data = {"item": "foo"}
|
||||||
|
# We can't omit a column if it's not nullable
|
||||||
|
with pytest.raises(OSError, match="Invalid user input"):
|
||||||
|
table.add([data])
|
||||||
|
|
||||||
|
# We can add it if we make the column nullable
|
||||||
|
table.alter_columns(dict(path="price", nullable=True))
|
||||||
|
table.add([data])
|
||||||
|
|
||||||
|
expected_schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||||
|
pa.field("item", pa.string(), nullable=True),
|
||||||
|
pa.field("price", pa.float64(), nullable=True),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
expected = pa.table(
|
||||||
|
{
|
||||||
|
"vector": [None, [3.1, 4.1], [5.9, 26.5], None],
|
||||||
|
"item": ["foo", None, "bar", "foo"],
|
||||||
|
"price": [10.0, 2.0, 3.0, None],
|
||||||
|
},
|
||||||
|
schema=expected_schema,
|
||||||
|
)
|
||||||
|
assert table.to_arrow() == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_nullability(tmp_path):
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2), nullable=False),
|
||||||
|
pa.field("id", pa.string(), nullable=False),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
table = db.create_table("test", schema=schema)
|
||||||
|
|
||||||
|
nullable_schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||||
|
pa.field("id", pa.string(), nullable=True),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
data = pa.table(
|
||||||
|
{
|
||||||
|
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||||
|
"id": ["foo", "bar"],
|
||||||
|
},
|
||||||
|
schema=nullable_schema,
|
||||||
|
)
|
||||||
|
# We can add nullable schema if it doesn't actually contain nulls
|
||||||
|
table.add(data)
|
||||||
|
|
||||||
|
expected = data.cast(schema)
|
||||||
|
assert table.to_arrow() == expected
|
||||||
|
|
||||||
|
data = pa.table(
|
||||||
|
{
|
||||||
|
"vector": [None],
|
||||||
|
"id": ["baz"],
|
||||||
|
},
|
||||||
|
schema=nullable_schema,
|
||||||
|
)
|
||||||
|
# We can't add nullable schema if it contains nulls
|
||||||
|
with pytest.raises(Exception, match="Vector column vector has NaNs"):
|
||||||
|
table.add(data)
|
||||||
|
|
||||||
|
# But we can make it nullable
|
||||||
|
table.alter_columns(dict(path="vector", nullable=True))
|
||||||
|
table.add(data)
|
||||||
|
|
||||||
|
expected_schema = pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 2), nullable=True),
|
||||||
|
pa.field("id", pa.string(), nullable=False),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
expected = pa.table(
|
||||||
|
{
|
||||||
|
"vector": [[3.1, 4.1], [5.9, 26.5], None],
|
||||||
|
"id": ["foo", "bar", "baz"],
|
||||||
|
},
|
||||||
|
schema=expected_schema,
|
||||||
|
)
|
||||||
|
assert table.to_arrow() == expected
|
||||||
|
|
||||||
|
|
||||||
def test_add_pydantic_model(db):
|
def test_add_pydantic_model(db):
|
||||||
# https://github.com/lancedb/lancedb/issues/562
|
# https://github.com/lancedb/lancedb/issues/562
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-node"
|
name = "lancedb-node"
|
||||||
version = "0.13.0-beta.1"
|
version = "0.13.0-beta.2"
|
||||||
description = "Serverless, low-latency vector database for AI applications"
|
description = "Serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.13.0-beta.1"
|
version = "0.13.0-beta.2"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -46,6 +46,7 @@ serde = { version = "^1" }
|
|||||||
serde_json = { version = "1" }
|
serde_json = { version = "1" }
|
||||||
async-openai = { version = "0.20.0", optional = true }
|
async-openai = { version = "0.20.0", optional = true }
|
||||||
serde_with = { version = "3.8.1" }
|
serde_with = { version = "3.8.1" }
|
||||||
|
aws-sdk-bedrockruntime = { version = "1.27.0", optional = true }
|
||||||
# For remote feature
|
# For remote feature
|
||||||
reqwest = { version = "0.12.0", features = ["gzip", "json", "stream"], optional = true }
|
reqwest = { version = "0.12.0", features = ["gzip", "json", "stream"], optional = true }
|
||||||
rand = { version = "0.8.3", features = ["small_rng"], optional = true}
|
rand = { version = "0.8.3", features = ["small_rng"], optional = true}
|
||||||
@@ -72,11 +73,13 @@ aws-config = { version = "1.0" }
|
|||||||
aws-smithy-runtime = { version = "1.3" }
|
aws-smithy-runtime = { version = "1.3" }
|
||||||
http-body = "1" # Matching reqwest
|
http-body = "1" # Matching reqwest
|
||||||
|
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
|
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
|
||||||
fp16kernels = ["lance-linalg/fp16kernels"]
|
fp16kernels = ["lance-linalg/fp16kernels"]
|
||||||
s3-test = []
|
s3-test = []
|
||||||
|
bedrock = ["dep:aws-sdk-bedrockruntime"]
|
||||||
openai = ["dep:async-openai", "dep:reqwest"]
|
openai = ["dep:async-openai", "dep:reqwest"]
|
||||||
polars = ["dep:polars-arrow", "dep:polars"]
|
polars = ["dep:polars-arrow", "dep:polars"]
|
||||||
sentence-transformers = [
|
sentence-transformers = [
|
||||||
@@ -94,3 +97,7 @@ required-features = ["openai"]
|
|||||||
[[example]]
|
[[example]]
|
||||||
name = "sentence_transformers"
|
name = "sentence_transformers"
|
||||||
required-features = ["sentence-transformers"]
|
required-features = ["sentence-transformers"]
|
||||||
|
|
||||||
|
[[example]]
|
||||||
|
name = "bedrock"
|
||||||
|
required-features = ["bedrock"]
|
||||||
|
|||||||
89
rust/lancedb/examples/bedrock.rs
Normal file
89
rust/lancedb/examples/bedrock.rs
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
use std::{iter::once, sync::Arc};
|
||||||
|
|
||||||
|
use arrow_array::{Float64Array, Int32Array, RecordBatch, RecordBatchIterator, StringArray};
|
||||||
|
use arrow_schema::{DataType, Field, Schema};
|
||||||
|
use aws_config::Region;
|
||||||
|
use aws_sdk_bedrockruntime::Client;
|
||||||
|
use futures::StreamExt;
|
||||||
|
use lancedb::{
|
||||||
|
arrow::IntoArrow,
|
||||||
|
connect,
|
||||||
|
embeddings::{bedrock::BedrockEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},
|
||||||
|
query::{ExecutableQuery, QueryBase},
|
||||||
|
Result,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
let tempdir = tempfile::tempdir().unwrap();
|
||||||
|
let tempdir = tempdir.path().to_str().unwrap();
|
||||||
|
|
||||||
|
// create Bedrock embedding function
|
||||||
|
let region: String = "us-east-1".to_string();
|
||||||
|
let config = aws_config::defaults(aws_config::BehaviorVersion::latest())
|
||||||
|
.region(Region::new(region))
|
||||||
|
.load()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let embedding = Arc::new(BedrockEmbeddingFunction::new(
|
||||||
|
Client::new(&config), // AWS Region
|
||||||
|
));
|
||||||
|
|
||||||
|
let db = connect(tempdir).execute().await?;
|
||||||
|
db.embedding_registry()
|
||||||
|
.register("bedrock", embedding.clone())?;
|
||||||
|
|
||||||
|
let table = db
|
||||||
|
.create_table("vectors", make_data())
|
||||||
|
.add_embedding(EmbeddingDefinition::new(
|
||||||
|
"text",
|
||||||
|
"bedrock",
|
||||||
|
Some("embeddings"),
|
||||||
|
))?
|
||||||
|
.execute()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// execute vector search
|
||||||
|
let query = Arc::new(StringArray::from_iter_values(once("something warm")));
|
||||||
|
let query_vector = embedding.compute_query_embeddings(query)?;
|
||||||
|
let mut results = table
|
||||||
|
.vector_search(query_vector)?
|
||||||
|
.limit(1)
|
||||||
|
.execute()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let rb = results.next().await.unwrap()?;
|
||||||
|
let out = rb
|
||||||
|
.column_by_name("text")
|
||||||
|
.unwrap()
|
||||||
|
.as_any()
|
||||||
|
.downcast_ref::<StringArray>()
|
||||||
|
.unwrap();
|
||||||
|
let text = out.iter().next().unwrap().unwrap();
|
||||||
|
println!("Closest match: {}", text);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_data() -> impl IntoArrow {
|
||||||
|
let schema = Schema::new(vec![
|
||||||
|
Field::new("id", DataType::Int32, true),
|
||||||
|
Field::new("text", DataType::Utf8, false),
|
||||||
|
Field::new("price", DataType::Float64, false),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let id = Int32Array::from(vec![1, 2, 3, 4]);
|
||||||
|
let text = StringArray::from_iter_values(vec![
|
||||||
|
"Black T-Shirt",
|
||||||
|
"Leather Jacket",
|
||||||
|
"Winter Parka",
|
||||||
|
"Hooded Sweatshirt",
|
||||||
|
]);
|
||||||
|
let price = Float64Array::from(vec![10.0, 50.0, 100.0, 30.0]);
|
||||||
|
let schema = Arc::new(schema);
|
||||||
|
let rb = RecordBatch::try_new(
|
||||||
|
schema.clone(),
|
||||||
|
vec![Arc::new(id), Arc::new(text), Arc::new(price)],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
Box::new(RecordBatchIterator::new(vec![Ok(rb)], schema))
|
||||||
|
}
|
||||||
@@ -17,6 +17,9 @@ pub mod openai;
|
|||||||
#[cfg(feature = "sentence-transformers")]
|
#[cfg(feature = "sentence-transformers")]
|
||||||
pub mod sentence_transformers;
|
pub mod sentence_transformers;
|
||||||
|
|
||||||
|
#[cfg(feature = "bedrock")]
|
||||||
|
pub mod bedrock;
|
||||||
|
|
||||||
use lance::arrow::RecordBatchExt;
|
use lance::arrow::RecordBatchExt;
|
||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
borrow::Cow,
|
||||||
|
|||||||
210
rust/lancedb/src/embeddings/bedrock.rs
Normal file
210
rust/lancedb/src/embeddings/bedrock.rs
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
use aws_sdk_bedrockruntime::Client as BedrockClient;
|
||||||
|
use std::{borrow::Cow, fmt::Formatter, str::FromStr, sync::Arc};
|
||||||
|
|
||||||
|
use arrow::array::{AsArray, Float32Builder};
|
||||||
|
use arrow_array::{Array, ArrayRef, FixedSizeListArray, Float32Array};
|
||||||
|
use arrow_data::ArrayData;
|
||||||
|
use arrow_schema::DataType;
|
||||||
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
|
use super::EmbeddingFunction;
|
||||||
|
use crate::{Error, Result};
|
||||||
|
|
||||||
|
use tokio::runtime::Handle;
|
||||||
|
use tokio::task::block_in_place;
|
||||||
|
|
||||||
|
/// Embedding models that can be invoked through Amazon Bedrock.
///
/// The type is a plain tag, so it is `Copy` and can be compared for equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BedrockEmbeddingModel {
    /// Model id `amazon.titan-embed-text-v1`; produces 1536-dimensional vectors.
    TitanEmbedding,
    /// Model id `cohere.embed-english-v3`; produces 1024-dimensional vectors.
    CohereLarge,
}

impl BedrockEmbeddingModel {
    /// Returns the number of dimensions of the vectors produced by this model.
    fn ndims(&self) -> usize {
        match self {
            Self::TitanEmbedding => 1536,
            Self::CohereLarge => 1024,
        }
    }

    /// Returns the Bedrock model identifier used when invoking this model.
    ///
    /// The identifiers are string literals, so the result is `'static` and
    /// does not keep `self` borrowed.
    fn model_id(&self) -> &'static str {
        match self {
            Self::TitanEmbedding => "amazon.titan-embed-text-v1",
            Self::CohereLarge => "cohere.embed-english-v3",
        }
    }
}
|
||||||
|
|
||||||
|
impl FromStr for BedrockEmbeddingModel {
|
||||||
|
type Err = Error;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
|
"titan-embed-text-v1" => Ok(Self::TitanEmbedding),
|
||||||
|
"cohere-embed-english-v3" => Ok(Self::CohereLarge),
|
||||||
|
_ => Err(Error::InvalidInput {
|
||||||
|
message: "Invalid model. Available models are: 'titan-embed-text-v1', 'cohere-embed-english-v3'".to_string()
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct BedrockEmbeddingFunction {
|
||||||
|
model: BedrockEmbeddingModel,
|
||||||
|
client: BedrockClient,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BedrockEmbeddingFunction {
|
||||||
|
pub fn new(client: BedrockClient) -> Self {
|
||||||
|
Self {
|
||||||
|
model: BedrockEmbeddingModel::TitanEmbedding,
|
||||||
|
client,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_model(client: BedrockClient, model: BedrockEmbeddingModel) -> Self {
|
||||||
|
Self { model, client }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EmbeddingFunction for BedrockEmbeddingFunction {
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
"bedrock"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn source_type(&self) -> Result<Cow<DataType>> {
|
||||||
|
Ok(Cow::Owned(DataType::Utf8))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn dest_type(&self) -> Result<Cow<DataType>> {
|
||||||
|
let n_dims = self.model.ndims();
|
||||||
|
Ok(Cow::Owned(DataType::new_fixed_size_list(
|
||||||
|
DataType::Float32,
|
||||||
|
n_dims as i32,
|
||||||
|
false,
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn compute_source_embeddings(&self, source: ArrayRef) -> Result<ArrayRef> {
|
||||||
|
let len = source.len();
|
||||||
|
let n_dims = self.model.ndims();
|
||||||
|
let inner = self.compute_inner(source)?;
|
||||||
|
|
||||||
|
let fsl = DataType::new_fixed_size_list(DataType::Float32, n_dims as i32, false);
|
||||||
|
|
||||||
|
let array_data = ArrayData::builder(fsl)
|
||||||
|
.len(len)
|
||||||
|
.add_child_data(inner.into_data())
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
Ok(Arc::new(FixedSizeListArray::from(array_data)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn compute_query_embeddings(&self, input: Arc<dyn Array>) -> Result<Arc<dyn Array>> {
|
||||||
|
let arr = self.compute_inner(input)?;
|
||||||
|
Ok(Arc::new(arr))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for BedrockEmbeddingFunction {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("BedrockEmbeddingFunction")
|
||||||
|
.field("model", &self.model)
|
||||||
|
// Skip client field as it doesn't implement Debug
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BedrockEmbeddingFunction {
|
||||||
|
fn compute_inner(&self, source: Arc<dyn Array>) -> Result<Float32Array> {
|
||||||
|
if source.is_nullable() {
|
||||||
|
return Err(Error::InvalidInput {
|
||||||
|
message: "Expected non-nullable data type".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if !matches!(source.data_type(), DataType::Utf8 | DataType::LargeUtf8) {
|
||||||
|
return Err(Error::InvalidInput {
|
||||||
|
message: "Expected Utf8 data type".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut builder = Float32Builder::new();
|
||||||
|
|
||||||
|
let texts = match source.data_type() {
|
||||||
|
DataType::Utf8 => source
|
||||||
|
.as_string::<i32>()
|
||||||
|
.into_iter()
|
||||||
|
.map(|s| s.expect("array is non-nullable").to_string())
|
||||||
|
.collect::<Vec<String>>(),
|
||||||
|
DataType::LargeUtf8 => source
|
||||||
|
.as_string::<i64>()
|
||||||
|
.into_iter()
|
||||||
|
.map(|s| s.expect("array is non-nullable").to_string())
|
||||||
|
.collect::<Vec<String>>(),
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
|
||||||
|
for text in texts {
|
||||||
|
let request_body = match self.model {
|
||||||
|
BedrockEmbeddingModel::TitanEmbedding => {
|
||||||
|
json!({
|
||||||
|
"inputText": text
|
||||||
|
})
|
||||||
|
}
|
||||||
|
BedrockEmbeddingModel::CohereLarge => {
|
||||||
|
json!({
|
||||||
|
"texts": [text],
|
||||||
|
"input_type": "search_document"
|
||||||
|
})
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let client = self.client.clone();
|
||||||
|
let model_id = self.model.model_id().to_string();
|
||||||
|
let request_body = request_body.clone();
|
||||||
|
|
||||||
|
let response = block_in_place(move || {
|
||||||
|
Handle::current().block_on(async move {
|
||||||
|
client
|
||||||
|
.invoke_model()
|
||||||
|
.model_id(model_id)
|
||||||
|
.body(aws_sdk_bedrockruntime::primitives::Blob::new(
|
||||||
|
serde_json::to_vec(&request_body).unwrap(),
|
||||||
|
))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let response_json: Value =
|
||||||
|
serde_json::from_slice(response.body.as_ref()).map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to parse response: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let embedding = match self.model {
|
||||||
|
BedrockEmbeddingModel::TitanEmbedding => response_json["embedding"]
|
||||||
|
.as_array()
|
||||||
|
.ok_or_else(|| Error::Runtime {
|
||||||
|
message: "Missing embedding in response".to_string(),
|
||||||
|
})?
|
||||||
|
.iter()
|
||||||
|
.map(|v| v.as_f64().unwrap() as f32)
|
||||||
|
.collect::<Vec<f32>>(),
|
||||||
|
BedrockEmbeddingModel::CohereLarge => response_json["embeddings"][0]
|
||||||
|
.as_array()
|
||||||
|
.ok_or_else(|| Error::Runtime {
|
||||||
|
message: "Missing embeddings in response".to_string(),
|
||||||
|
})?
|
||||||
|
.iter()
|
||||||
|
.map(|v| v.as_f64().unwrap() as f32)
|
||||||
|
.collect::<Vec<f32>>(),
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.append_slice(&embedding);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(builder.finish())
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user