Compare commits

...

8 Commits

Author SHA1 Message Date
Lance Release
66a881b33a Bump version: 0.16.0-beta.2 → 0.16.0 2024-11-15 20:17:34 +00:00
Lance Release
a7515d6ee2 Bump version: 0.16.0-beta.1 → 0.16.0-beta.2 2024-11-15 20:17:34 +00:00
Will Jones
587c0824af feat: flexible null handling and insert subschemas in Python (#1827)
* Test that we can insert subschemas (omit nullable columns) in Python.
* More work is needed to support this in Node. See:
https://github.com/lancedb/lancedb/issues/1832
* Test that we can insert data with nullable schema but no nulls in
non-nullable schema.
* Add `"null"` option for `on_bad_vectors` where we fill with null if
the vector is bad.
* Make null values not considered bad if the field itself is nullable.
2024-11-15 11:33:00 -08:00
Will Jones
b38a4269d0 fix(node): make openai and huggingface optional dependencies (#1809)
BREAKING CHANGE: openai and huggingface now have separate entrypoints.

Closes [#1624](https://github.com/lancedb/lancedb/issues/1624)
2024-11-14 15:04:35 -08:00
Will Jones
119d88b9db ci: disable Windows Arm64 until the release builds work (#1833)
Started to actually fix this, but it was taking too long
https://github.com/lancedb/lancedb/pull/1831
2024-11-14 15:04:23 -08:00
StevenSu
74f660d223 feat: add new feature, add amazon bedrock embedding function (#1788)
Add amazon bedrock embedding function to rust sdk.

1.  Add BedrockEmbeddingModel ( lancedb/src/embeddings/bedrock.rs)
2. Add example lancedb/examples/bedrock.rs
2024-11-14 11:04:59 -08:00
Lance Release
b2b0979b90 Updating package-lock.json 2024-11-14 04:42:38 +00:00
Lance Release
ee2a40b182 Bump version: 0.13.0-beta.1 → 0.13.0-beta.2 2024-11-14 04:42:19 +00:00
31 changed files with 828 additions and 319 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.13.0-beta.1"
current_version = "0.13.0-beta.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -104,7 +104,6 @@ jobs:
OPENAI_BASE_URL: http://0.0.0.0:8000
run: |
python ci/mock_openai.py &
ss -ltnp | grep :8000
cd nodejs/examples
npm test
macos:

View File

@@ -226,108 +226,109 @@ jobs:
path: |
node/dist/lancedb-vectordb-win32*.tgz
node-windows-arm64:
name: vectordb win32-arm64-msvc
runs-on: windows-4x-arm
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v4
- name: Install Git
run: |
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
shell: powershell
- name: Add Git to PATH
run: |
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
shell: powershell
- name: Configure Git symlinks
run: git config --global core.symlinks true
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.13"
- name: Install Visual Studio Build Tools
run: |
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
"--installPath", "C:\BuildTools", `
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
shell: powershell
- name: Add Visual Studio Build Tools to PATH
run: |
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
# node-windows-arm64:
# name: vectordb win32-arm64-msvc
# runs-on: windows-4x-arm
# if: startsWith(github.ref, 'refs/tags/v')
# steps:
# - uses: actions/checkout@v4
# - name: Install Git
# run: |
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
# shell: powershell
# - name: Add Git to PATH
# run: |
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
# shell: powershell
# - name: Configure Git symlinks
# run: git config --global core.symlinks true
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: "3.13"
# - name: Install Visual Studio Build Tools
# run: |
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
# "--installPath", "C:\BuildTools", `
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
# shell: powershell
# - name: Add Visual Studio Build Tools to PATH
# run: |
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
# Add MSVC runtime libraries to LIB
$env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
# # Add MSVC runtime libraries to LIB
# $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
# "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
# Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
# Add INCLUDE paths
$env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
shell: powershell
- name: Install Rust
run: |
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
shell: powershell
- name: Add Rust to PATH
run: |
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
shell: powershell
# # Add INCLUDE paths
# $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
# "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
# Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
# shell: powershell
# - name: Install Rust
# run: |
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
# shell: powershell
# - name: Add Rust to PATH
# run: |
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
# shell: powershell
- uses: Swatinem/rust-cache@v2
with:
workspaces: rust
- name: Install 7-Zip ARM
run: |
New-Item -Path 'C:\7zip' -ItemType Directory
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
shell: powershell
- name: Add 7-Zip to PATH
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
shell: powershell
- name: Install Protoc v21.12
working-directory: C:\
run: |
if (Test-Path 'C:\protoc') {
Write-Host "Protoc directory exists, skipping installation"
return
}
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
shell: powershell
- name: Add Protoc to PATH
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Build Windows native node modules
run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
- name: Upload Windows ARM64 Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-windows-arm64
path: |
node/dist/*.node
# - uses: Swatinem/rust-cache@v2
# with:
# workspaces: rust
# - name: Install 7-Zip ARM
# run: |
# New-Item -Path 'C:\7zip' -ItemType Directory
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
# shell: powershell
# - name: Add 7-Zip to PATH
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
# shell: powershell
# - name: Install Protoc v21.12
# working-directory: C:\
# run: |
# if (Test-Path 'C:\protoc') {
# Write-Host "Protoc directory exists, skipping installation"
# return
# }
# New-Item -Path 'C:\protoc' -ItemType Directory
# Set-Location C:\protoc
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
# shell: powershell
# - name: Add Protoc to PATH
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
# shell: powershell
# - name: Build Windows native node modules
# run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
# - name: Upload Windows ARM64 Artifacts
# uses: actions/upload-artifact@v4
# with:
# name: node-native-windows-arm64
# path: |
# node/dist/*.node
nodejs-windows:
name: lancedb ${{ matrix.target }}
@@ -363,98 +364,99 @@ jobs:
path: |
nodejs/dist/*.node
nodejs-windows-arm64:
name: lancedb win32-arm64-msvc
runs-on: windows-4x-arm
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v4
- name: Install Git
run: |
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
shell: powershell
- name: Add Git to PATH
run: |
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
shell: powershell
- name: Configure Git symlinks
run: git config --global core.symlinks true
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.13"
- name: Install Visual Studio Build Tools
run: |
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
"--installPath", "C:\BuildTools", `
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
shell: powershell
- name: Add Visual Studio Build Tools to PATH
run: |
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
# TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
# nodejs-windows-arm64:
# name: lancedb win32-arm64-msvc
# runs-on: windows-4x-arm
# if: startsWith(github.ref, 'refs/tags/v')
# steps:
# - uses: actions/checkout@v4
# - name: Install Git
# run: |
# Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
# Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
# shell: powershell
# - name: Add Git to PATH
# run: |
# Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
# $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
# shell: powershell
# - name: Configure Git symlinks
# run: git config --global core.symlinks true
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: "3.13"
# - name: Install Visual Studio Build Tools
# run: |
# Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
# Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
# "--installPath", "C:\BuildTools", `
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
# "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
# "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
# "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
# "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
# "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
# shell: powershell
# - name: Add Visual Studio Build Tools to PATH
# run: |
# $vsPath = "C:\BuildTools\VC\Tools\MSVC"
# $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
# Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
# Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
$env:LIB = ""
Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
shell: powershell
- name: Install Rust
run: |
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
shell: powershell
- name: Add Rust to PATH
run: |
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
shell: powershell
# $env:LIB = ""
# Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
# shell: powershell
# - name: Install Rust
# run: |
# Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
# .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
# shell: powershell
# - name: Add Rust to PATH
# run: |
# Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
# shell: powershell
- uses: Swatinem/rust-cache@v2
with:
workspaces: rust
- name: Install 7-Zip ARM
run: |
New-Item -Path 'C:\7zip' -ItemType Directory
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
shell: powershell
- name: Add 7-Zip to PATH
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
shell: powershell
- name: Install Protoc v21.12
working-directory: C:\
run: |
if (Test-Path 'C:\protoc') {
Write-Host "Protoc directory exists, skipping installation"
return
}
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
shell: powershell
- name: Add Protoc to PATH
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Build Windows native node modules
run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
- name: Upload Windows ARM64 Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-windows-arm64
path: |
nodejs/dist/*.node
# - uses: Swatinem/rust-cache@v2
# with:
# workspaces: rust
# - name: Install 7-Zip ARM
# run: |
# New-Item -Path 'C:\7zip' -ItemType Directory
# Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
# Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
# shell: powershell
# - name: Add 7-Zip to PATH
# run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
# shell: powershell
# - name: Install Protoc v21.12
# working-directory: C:\
# run: |
# if (Test-Path 'C:\protoc') {
# Write-Host "Protoc directory exists, skipping installation"
# return
# }
# New-Item -Path 'C:\protoc' -ItemType Directory
# Set-Location C:\protoc
# Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
# & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
# shell: powershell
# - name: Add Protoc to PATH
# run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
# shell: powershell
# - name: Build Windows native node modules
# run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
# - name: Upload Windows ARM64 Artifacts
# uses: actions/upload-artifact@v4
# with:
# name: nodejs-native-windows-arm64
# path: |
# nodejs/dist/*.node
release:
name: vectordb NPM Publish
@@ -476,7 +478,7 @@ jobs:
env:
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
run: |
# Tag beta as "preview" instead of default "latest". See lancedb
# Tag beta as "preview" instead of default "latest". See lancedb
# npm publish step for more info.
if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
PUBLISH_ARGS="--tag preview"

View File

@@ -23,13 +23,13 @@ rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
[workspace.dependencies]
lance = { "version" = "=0.19.2", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
lance-index = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
lance-linalg = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
lance-table = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
lance-testing = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
lance-datafusion = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
lance-encoding = { "version" = "=0.19.2", git = "https://github.com/lancedb/lance.git", tag = "v0.19.2" }
]}
lance-index = "=0.19.2"
lance-linalg = "=0.19.2"
lance-table = "=0.19.2"
lance-testing = "=0.19.2"
lance-datafusion = "=0.19.2"
lance-encoding = "=0.19.2"
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false }
arrow-array = "52.2"

View File

@@ -790,6 +790,27 @@ Use the `drop_table()` method on the database to remove a table.
This permanently removes the table and is not recoverable, unlike deleting rows.
If the table does not exist an exception is raised.
## Handling bad vectors
In LanceDB Python, you can use the `on_bad_vectors` parameter to choose how
invalid vector values are handled. A vector is considered invalid if any of the
following is true:
1. It has the wrong dimension
2. It contains NaN values
3. It is null but the field is non-nullable
By default, LanceDB will raise an error if it encounters a bad vector. You can
also choose one of the following options:
* `drop`: Ignore rows with bad vectors
* `fill`: Replace bad values (NaNs) or missing values (too few dimensions) with
the fill value specified in the `fill_value` parameter. An input like
`[1.0, NaN, 3.0]` will be replaced with `[1.0, 0.0, 3.0]` if `fill_value=0.0`.
* `null`: Replace bad vectors with null. This only works if the vector column is
nullable: a bad vector such as `[1.0, NaN, 3.0]` will be replaced with `null`.
If the vector column is non-nullable, bad vectors will still cause an
error.
## Consistency

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.0-beta.1</version>
<version>0.13.0-beta.2</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.0-beta.1</version>
<version>0.13.0-beta.2</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

76
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"cpu": [
"x64",
"arm64"
@@ -52,12 +52,12 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1"
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -327,66 +327,6 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.13.0-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0-beta.1.tgz",
"integrity": "sha512-beOrf6selCzzhLgDG8Nibma4nO/CSnA1wUKRmlJHEPtGcg7PW18z6MP/nfwQMpMR/FLRfTo8pPTbpzss47MiQQ==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.13.0-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0-beta.1.tgz",
"integrity": "sha512-YdraGRF/RbJRkKh0v3xT03LUhq47T2GtCvJ5gZp8wKlh4pHa8LuhLU0DIdvmG/DT5vuQA+td8HDkBm/e3EOdNg==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.13.0-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0-beta.1.tgz",
"integrity": "sha512-Pp0O/uhEqof1oLaWrNbv+Ym+q8kBkiCqaA5+2eAZ6a3e9U+Ozkvb0FQrHuyi9adJ5wKQ4NabyQE9BMf2bYpOnQ==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.13.0-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0-beta.1.tgz",
"integrity": "sha512-y8nxOye4egfWF5FGED9EfkmZ1O5HnRLU4a61B8m5JSpkivO9v2epTcbYN0yt/7ZFCgtqMfJ8VW4Mi7qQcz3KDA==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.13.0-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0-beta.1.tgz",
"integrity": "sha512-STMDP9dp0TBLkB3ro+16pKcGy6bmbhRuEZZZ1Tp5P75yTPeVh4zIgWkidMdU1qBbEYM7xacnsp9QAwgLnMU/Ow==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"win32"
]
},
"node_modules/@neon-rs/cli": {
"version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -89,11 +89,11 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.1",
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.1",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.1"
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.13.0-beta.1"
version = "0.13.0-beta.2"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -187,6 +187,81 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
},
);
// TODO: https://github.com/lancedb/lancedb/issues/1832
// Skipped until the issue above is resolved.
//
// Intended behavior: rows may leave out columns that the table schema marks
// as nullable; the omitted values are expected to read back as null. Leaving
// out a non-nullable column is expected to be rejected until that column is
// altered to be nullable.
it.skip("should be able to omit nullable fields", async () => {
const db = await connect(tmpDir.name);
// Schema: "vector" and "item" are nullable (third Field argument is true),
// "price" is non-nullable (third Field argument is false).
const schema = new arrow.Schema([
new arrow.Field(
"vector",
new arrow.FixedSizeList(
2,
new arrow.Field("item", new arrow.Float64()),
),
true,
),
new arrow.Field("item", new arrow.Utf8(), true),
new arrow.Field("price", new arrow.Float64(), false),
]);
const table = await db.createEmptyTable("test", schema);
// data1 omits "vector", data2 omits "item", data3 supplies every column.
const data1 = { item: "foo", price: 10.0 };
await table.add([data1]);
const data2 = { vector: [3.1, 4.1], price: 2.0 };
await table.add([data2]);
const data3 = { vector: [5.9, 26.5], item: "bar", price: 3.0 };
await table.add([data3]);
// Read everything back and compare column by column; omitted values are
// expected to be null.
// NOTE(review): `.toArray()` is called on every fetched value, including the
// one expected to be null and the string/number columns — confirm this works
// when the test is re-enabled.
let res = await table.query().limit(10).toArray();
const resVector = res.map((r) => r.get("vector").toArray());
expect(resVector).toEqual([null, data2.vector, data3.vector]);
const resItem = res.map((r) => r.get("item").toArray());
expect(resItem).toEqual(["foo", null, "bar"]);
const resPrice = res.map((r) => r.get("price").toArray());
expect(resPrice).toEqual([10.0, 2.0, 3.0]);
// data4 omits the non-nullable "price" column.
const data4 = { item: "foo" };
// We can't omit a column if it's not nullable
await expect(table.add([data4])).rejects.toThrow("Invalid user input");
// But we can alter columns to make them nullable
await table.alterColumns([{ path: "price", nullable: true }]);
await table.add([data4]);
// All four rows are expected to round-trip through toJSON() unchanged.
res = (await table.query().limit(10).toArray()).map((r) => r.toJSON());
expect(res).toEqual([data1, data2, data3, data4]);
});
// Checks inserts into a table whose columns are all declared non-nullable:
// a row with no null values is accepted, a row containing a null is rejected,
// and the same row is accepted once the column has been altered to nullable.
it("should be able to insert nullable data for non-nullable fields", async () => {
const db = await connect(tmpDir.name);
// Both columns are non-nullable (third Field argument is false).
const schema = new arrow.Schema([
new arrow.Field("x", new arrow.Float64(), false),
new arrow.Field("id", new arrow.Utf8(), false),
]);
const table = await db.createEmptyTable("test", schema);
// A fully populated row is accepted and reads back unchanged.
// NOTE(review): presumably the schema inferred from a plain object marks its
// fields nullable, which is what the test title refers to — confirm.
const data1 = { x: 4.1, id: "foo" };
await table.add([data1]);
const res = (await table.query().toArray())[0];
expect(res.x).toEqual(data1.x);
expect(res.id).toEqual(data1.id);
// A null in the non-nullable column "x" must be rejected.
const data2 = { x: null, id: "bar" };
await expect(table.add([data2])).rejects.toThrow(
"declared as non-nullable but contains null values",
);
// But we can alter columns to make them nullable
await table.alterColumns([{ path: "x", nullable: true }]);
await table.add([data2]);
// The table now holds both rows; the second one has a null "x".
const res2 = await table.query().toArray();
expect(res2.length).toBe(2);
expect(res2[0].x).toEqual(data1.x);
expect(res2[0].id).toEqual(data1.id);
expect(res2[1].x).toBeNull();
expect(res2[1].id).toEqual(data2.id);
});
it("should return the table as an instance of an arrow table", async () => {
const arrowTbl = await table.toArrow();
expect(arrowTbl).toBeInstanceOf(ArrowTable);

View File

@@ -6,12 +6,16 @@ import { withTempDirectory } from "./util.ts";
import * as lancedb from "@lancedb/lancedb";
import "@lancedb/lancedb/embedding/transformers";
import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding";
import { EmbeddingFunction } from "@lancedb/lancedb/embedding";
import { Utf8 } from "apache-arrow";
test("full text search", async () => {
await withTempDirectory(async (databaseDir) => {
const db = await lancedb.connect(databaseDir);
const func = await getRegistry().get("huggingface").create();
console.log(getRegistry());
const func = (await getRegistry()
.get("huggingface")
?.create()) as EmbeddingFunction;
const facts = [
"Albert Einstein was a theoretical physicist.",
@@ -56,4 +60,4 @@ test("full text search", async () => {
expect(actual[0]["text"]).toBe("The human body has 206 bones.");
});
});
}, 100_000);

View File

@@ -19,9 +19,6 @@ import { EmbeddingFunctionConfig, getRegistry } from "./registry";
export { EmbeddingFunction, TextEmbeddingFunction } from "./embedding_function";
// We need to explicitly export '*' so that the `register` decorator actually registers the class.
export * from "./openai";
export * from "./transformers";
export * from "./registry";
/**

View File

@@ -17,8 +17,6 @@ import {
type EmbeddingFunctionConstructor,
} from "./embedding_function";
import "reflect-metadata";
import { OpenAIEmbeddingFunction } from "./openai";
import { TransformersEmbeddingFunction } from "./transformers";
type CreateReturnType<T> = T extends { init: () => Promise<void> }
? Promise<T>
@@ -73,10 +71,6 @@ export class EmbeddingFunctionRegistry {
};
}
get(name: "openai"): EmbeddingFunctionCreate<OpenAIEmbeddingFunction>;
get(
name: "huggingface",
): EmbeddingFunctionCreate<TransformersEmbeddingFunction>;
get<T extends EmbeddingFunction<unknown>>(
name: string,
): EmbeddingFunctionCreate<T> | undefined;

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -10,11 +10,13 @@
"vector database",
"ann"
],
"version": "0.13.0-beta.1",
"version": "0.13.0-beta.2",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",
"./embedding": "./dist/embedding/index.js"
"./embedding": "./dist/embedding/index.js",
"./embedding/openai": "./dist/embedding/openai.js",
"./embedding/transformers": "./dist/embedding/transformers.js"
},
"types": "dist/index.d.ts",
"napi": {

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.16.0-beta.1"
current_version = "0.16.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.16.0-beta.1"
version = "0.16.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -1567,7 +1567,7 @@ class LanceTable(Table):
"append" and "overwrite".
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
One of "error", "drop", "fill", "null".
fill_value: float, default 0.
The value to use when filling vectors. Only used if on_bad_vectors="fill".
@@ -1851,7 +1851,7 @@ class LanceTable(Table):
data but will validate against any schema that's specified.
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
One of "error", "drop", "fill", "null".
fill_value: float, default 0.
The value to use when filling vectors. Only used if on_bad_vectors="fill".
embedding_functions: list of EmbeddingFunctionModel, default None
@@ -2151,13 +2151,11 @@ def _sanitize_schema(
vector column to fixed_size_list(float32) if necessary.
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
One of "error", "drop", "fill", "null".
fill_value: float, default 0.
The value to use when filling vectors. Only used if on_bad_vectors="fill".
"""
if schema is not None:
if data.schema == schema:
return data
# cast the columns to the expected types
data = data.combine_chunks()
for field in schema:
@@ -2177,6 +2175,7 @@ def _sanitize_schema(
vector_column_name=field.name,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
table_schema=schema,
)
return pa.Table.from_arrays(
[data[name] for name in schema.names], schema=schema
@@ -2197,6 +2196,7 @@ def _sanitize_schema(
def _sanitize_vector_column(
data: pa.Table,
vector_column_name: str,
table_schema: Optional[pa.Schema] = None,
on_bad_vectors: str = "error",
fill_value: float = 0.0,
) -> pa.Table:
@@ -2211,12 +2211,16 @@ def _sanitize_vector_column(
The name of the vector column.
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
One of "error", "drop", "fill", "null".
fill_value: float, default 0.0
The value to use when filling vectors. Only used if on_bad_vectors="fill".
"""
# ChunkedArray is annoying to work with, so we combine chunks here
vec_arr = data[vector_column_name].combine_chunks()
if table_schema is not None:
field = table_schema.field(vector_column_name)
else:
field = None
typ = data[vector_column_name].type
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
# if it's a variable size list array,
@@ -2243,7 +2247,11 @@ def _sanitize_vector_column(
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
)
else:
if pc.any(pc.is_null(vec_arr.values, nan_is_null=True)).as_py():
if (
field is not None
and not field.nullable
and pc.any(pc.is_null(vec_arr.values)).as_py()
) or (pc.any(pc.is_nan(vec_arr.values)).as_py()):
data = _sanitize_nans(
data, fill_value, on_bad_vectors, vec_arr, vector_column_name
)
@@ -2287,6 +2295,12 @@ def _sanitize_jagged(data, fill_value, on_bad_vectors, vec_arr, vector_column_na
)
elif on_bad_vectors == "drop":
data = data.filter(correct_ndims)
elif on_bad_vectors == "null":
data = data.set_column(
data.column_names.index(vector_column_name),
vector_column_name,
pc.if_else(correct_ndims, vec_arr, pa.scalar(None)),
)
return data
@@ -2303,7 +2317,8 @@ def _sanitize_nans(
raise ValueError(
f"Vector column {vector_column_name} has NaNs. "
"Set on_bad_vectors='drop' to remove them, or "
"set on_bad_vectors='fill' and fill_value=<value> to replace them."
"set on_bad_vectors='fill' and fill_value=<value> to replace them. "
"Or set on_bad_vectors='null' to replace them with null."
)
elif on_bad_vectors == "fill":
if fill_value is None:
@@ -2323,6 +2338,17 @@ def _sanitize_nans(
np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
not_nulls = np.any(np_arr, axis=1)
data = data.filter(~not_nulls)
elif on_bad_vectors == "null":
# null = pa.nulls(len(vec_arr)).cast(vec_arr.type)
# values = pc.if_else(pc.is_nan(vec_arr.values), fill_value, vec_arr.values)
np_arr = np.isnan(vec_arr.values.to_numpy(zero_copy_only=False))
np_arr = np_arr.reshape(-1, vec_arr.type.list_size)
no_nans = np.any(np_arr, axis=1)
data = data.set_column(
data.column_names.index(vector_column_name),
vector_column_name,
pc.if_else(no_nans, vec_arr, pa.scalar(None)),
)
return data
@@ -2588,7 +2614,7 @@ class AsyncTable:
"append" and "overwrite".
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
One of "error", "drop", "fill", "null".
fill_value: float, default 0.
The value to use when filling vectors. Only used if on_bad_vectors="fill".

View File

@@ -81,14 +81,15 @@ def test_embedding_function(tmp_path):
def test_embedding_with_bad_results(tmp_path):
@register("mock-embedding")
class MockEmbeddingFunction(TextEmbeddingFunction):
@register("null-embedding")
class NullEmbeddingFunction(TextEmbeddingFunction):
def ndims(self):
return 128
def generate_embeddings(
self, texts: Union[List[str], np.ndarray]
) -> list[Union[np.array, None]]:
# Return None, which is bad if field is non-nullable
return [
None if i % 2 == 0 else np.random.randn(self.ndims())
for i in range(len(texts))
@@ -96,13 +97,17 @@ def test_embedding_with_bad_results(tmp_path):
db = lancedb.connect(tmp_path)
registry = EmbeddingFunctionRegistry.get_instance()
model = registry.get("mock-embedding").create()
model = registry.get("null-embedding").create()
class Schema(LanceModel):
text: str = model.SourceField()
vector: Vector(model.ndims()) = model.VectorField()
table = db.create_table("test", schema=Schema, mode="overwrite")
with pytest.raises(ValueError):
# Default on_bad_vectors is "error"
table.add([{"text": "hello world"}])
table.add(
[{"text": "hello world"}, {"text": "bar"}],
on_bad_vectors="drop",
@@ -112,13 +117,33 @@ def test_embedding_with_bad_results(tmp_path):
assert len(table) == 1
assert df.iloc[0]["text"] == "bar"
# table = db.create_table("test2", schema=Schema, mode="overwrite")
# table.add(
# [{"text": "hello world"}, {"text": "bar"}],
# )
# assert len(table) == 2
# tbl = table.to_arrow()
# assert tbl["vector"].null_count == 1
@register("nan-embedding")
class NanEmbeddingFunction(TextEmbeddingFunction):
def ndims(self):
return 128
def generate_embeddings(
self, texts: Union[List[str], np.ndarray]
) -> list[Union[np.array, None]]:
# Return NaN to produce bad vectors
return [
[np.NAN] * 128 if i % 2 == 0 else np.random.randn(self.ndims())
for i in range(len(texts))
]
db = lancedb.connect(tmp_path)
registry = EmbeddingFunctionRegistry.get_instance()
model = registry.get("nan-embedding").create()
table = db.create_table("test2", schema=Schema, mode="overwrite")
table.alter_columns(dict(path="vector", nullable=True))
table.add(
[{"text": "hello world"}, {"text": "bar"}],
on_bad_vectors="null",
)
assert len(table) == 2
tbl = table.to_arrow()
assert tbl["vector"].null_count == 1
def test_with_existing_vectors(tmp_path):

View File

@@ -240,6 +240,121 @@ def test_add(db):
_add(table, schema)
def test_add_subschema(tmp_path):
    """Rows may omit nullable columns; non-nullable columns must be present."""
    vector_type = pa.list_(pa.float32(), 2)
    schema = pa.schema(
        [
            pa.field("vector", vector_type, nullable=True),
            pa.field("item", pa.string(), nullable=True),
            pa.field("price", pa.float64(), nullable=False),
        ]
    )
    table = lancedb.connect(tmp_path).create_table("test", schema=schema)

    # Each row is added on its own and leaves out a different nullable column.
    for row in (
        {"price": 10.0, "item": "foo"},
        {"price": 2.0, "vector": [3.1, 4.1]},
        {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"},
    ):
        table.add([row])

    assert table.to_arrow() == pa.table(
        {
            "vector": [None, [3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", None, "bar"],
            "price": [10.0, 2.0, 3.0],
        },
        schema=schema,
    )

    missing_price = {"item": "foo"}
    # We can't omit a column if it's not nullable
    with pytest.raises(OSError, match="Invalid user input"):
        table.add([missing_price])

    # We can add it if we make the column nullable
    table.alter_columns(dict(path="price", nullable=True))
    table.add([missing_price])

    relaxed_schema = pa.schema(
        [
            pa.field("vector", vector_type, nullable=True),
            pa.field("item", pa.string(), nullable=True),
            pa.field("price", pa.float64(), nullable=True),
        ]
    )
    assert table.to_arrow() == pa.table(
        {
            "vector": [None, [3.1, 4.1], [5.9, 26.5], None],
            "item": ["foo", None, "bar", "foo"],
            "price": [10.0, 2.0, 3.0, None],
        },
        schema=relaxed_schema,
    )
def test_add_nullability(tmp_path):
    """Nullable input is accepted by a non-nullable table only if it has no nulls."""
    vector_type = pa.list_(pa.float32(), 2)

    def make_schema(vector_nullable, id_nullable):
        # Same two columns every time; only the nullability flags vary.
        return pa.schema(
            [
                pa.field("vector", vector_type, nullable=vector_nullable),
                pa.field("id", pa.string(), nullable=id_nullable),
            ]
        )

    strict_schema = make_schema(False, False)
    nullable_schema = make_schema(True, True)
    table = lancedb.connect(tmp_path).create_table("test", schema=strict_schema)

    # We can add nullable schema if it doesn't actually contain nulls
    clean_rows = pa.table(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "id": ["foo", "bar"],
        },
        schema=nullable_schema,
    )
    table.add(clean_rows)
    assert table.to_arrow() == clean_rows.cast(strict_schema)

    # We can't add nullable schema if it contains nulls
    null_row = pa.table(
        {
            "vector": [None],
            "id": ["baz"],
        },
        schema=nullable_schema,
    )
    with pytest.raises(Exception, match="Vector column vector has NaNs"):
        table.add(null_row)

    # But we can make it nullable
    table.alter_columns(dict(path="vector", nullable=True))
    table.add(null_row)

    assert table.to_arrow() == pa.table(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5], None],
            "id": ["foo", "bar", "baz"],
        },
        schema=make_schema(True, False),
    )
def test_add_pydantic_model(db):
# https://github.com/lancedb/lancedb/issues/562

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.13.0-beta.1"
version = "0.13.0-beta.2"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.13.0-beta.1"
version = "0.13.0-beta.2"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -46,6 +46,7 @@ serde = { version = "^1" }
serde_json = { version = "1" }
async-openai = { version = "0.20.0", optional = true }
serde_with = { version = "3.8.1" }
aws-sdk-bedrockruntime = { version = "1.27.0", optional = true }
# For remote feature
reqwest = { version = "0.12.0", features = ["gzip", "json", "stream"], optional = true }
rand = { version = "0.8.3", features = ["small_rng"], optional = true}
@@ -72,11 +73,13 @@ aws-config = { version = "1.0" }
aws-smithy-runtime = { version = "1.3" }
http-body = "1" # Matching reqwest
[features]
default = []
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
fp16kernels = ["lance-linalg/fp16kernels"]
s3-test = []
bedrock = ["dep:aws-sdk-bedrockruntime"]
openai = ["dep:async-openai", "dep:reqwest"]
polars = ["dep:polars-arrow", "dep:polars"]
sentence-transformers = [
@@ -94,3 +97,7 @@ required-features = ["openai"]
[[example]]
name = "sentence_transformers"
required-features = ["sentence-transformers"]
[[example]]
name = "bedrock"
required-features = ["bedrock"]

View File

@@ -0,0 +1,89 @@
use std::{iter::once, sync::Arc};
use arrow_array::{Float64Array, Int32Array, RecordBatch, RecordBatchIterator, StringArray};
use arrow_schema::{DataType, Field, Schema};
use aws_config::Region;
use aws_sdk_bedrockruntime::Client;
use futures::StreamExt;
use lancedb::{
arrow::IntoArrow,
connect,
embeddings::{bedrock::BedrockEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},
query::{ExecutableQuery, QueryBase},
Result,
};
/// Embeds a small product table with Amazon Bedrock and prints the row whose
/// text is closest to the query "something warm".
#[tokio::main]
async fn main() -> Result<()> {
    // The database lives in a scratch directory for the duration of the run.
    let workdir = tempfile::tempdir().unwrap();
    let uri = workdir.path().to_str().unwrap();

    // Create the Bedrock embedding function, with the client pinned to us-east-1.
    let aws_config = aws_config::defaults(aws_config::BehaviorVersion::latest())
        .region(Region::new("us-east-1".to_string()))
        .load()
        .await;
    let bedrock = Arc::new(BedrockEmbeddingFunction::new(Client::new(&aws_config)));

    let db = connect(uri).execute().await?;
    db.embedding_registry()
        .register("bedrock", bedrock.clone())?;

    // Embed the "text" column into "embeddings" while the table is created.
    let definition = EmbeddingDefinition::new("text", "bedrock", Some("embeddings"));
    let table = db
        .create_table("vectors", make_data())
        .add_embedding(definition)?
        .execute()
        .await?;

    // Execute the vector search with an embedded query string.
    let needle = Arc::new(StringArray::from_iter_values(once("something warm")));
    let needle_vector = bedrock.compute_query_embeddings(needle)?;
    let mut stream = table
        .vector_search(needle_vector)?
        .limit(1)
        .execute()
        .await?;

    let batch = stream.next().await.unwrap()?;
    let texts = batch
        .column_by_name("text")
        .unwrap()
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    let text = texts.iter().next().unwrap().unwrap();
    println!("Closest match: {}", text);
    Ok(())
}
fn make_data() -> impl IntoArrow {
let schema = Schema::new(vec![
Field::new("id", DataType::Int32, true),
Field::new("text", DataType::Utf8, false),
Field::new("price", DataType::Float64, false),
]);
let id = Int32Array::from(vec![1, 2, 3, 4]);
let text = StringArray::from_iter_values(vec![
"Black T-Shirt",
"Leather Jacket",
"Winter Parka",
"Hooded Sweatshirt",
]);
let price = Float64Array::from(vec![10.0, 50.0, 100.0, 30.0]);
let schema = Arc::new(schema);
let rb = RecordBatch::try_new(
schema.clone(),
vec![Arc::new(id), Arc::new(text), Arc::new(price)],
)
.unwrap();
Box::new(RecordBatchIterator::new(vec![Ok(rb)], schema))
}

View File

@@ -17,6 +17,9 @@ pub mod openai;
#[cfg(feature = "sentence-transformers")]
pub mod sentence_transformers;
#[cfg(feature = "bedrock")]
pub mod bedrock;
use lance::arrow::RecordBatchExt;
use std::{
borrow::Cow,

View File

@@ -0,0 +1,210 @@
use aws_sdk_bedrockruntime::Client as BedrockClient;
use std::{borrow::Cow, fmt::Formatter, str::FromStr, sync::Arc};
use arrow::array::{AsArray, Float32Builder};
use arrow_array::{Array, ArrayRef, FixedSizeListArray, Float32Array};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use serde_json::{json, Value};
use super::EmbeddingFunction;
use crate::{Error, Result};
use tokio::runtime::Handle;
use tokio::task::block_in_place;
/// Embedding models that can be invoked through Amazon Bedrock.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BedrockEmbeddingModel {
    /// `amazon.titan-embed-text-v1`, producing 1536-dimensional vectors.
    TitanEmbedding,
    /// `cohere.embed-english-v3`, producing 1024-dimensional vectors.
    CohereLarge,
}

impl BedrockEmbeddingModel {
    /// Number of dimensions in the vectors this model returns.
    fn ndims(&self) -> usize {
        match self {
            Self::TitanEmbedding => 1536,
            Self::CohereLarge => 1024,
        }
    }

    /// Model identifier sent to Bedrock in the `invoke_model` request.
    fn model_id(&self) -> &'static str {
        match self {
            Self::TitanEmbedding => "amazon.titan-embed-text-v1",
            Self::CohereLarge => "cohere.embed-english-v3",
        }
    }
}
impl FromStr for BedrockEmbeddingModel {
type Err = Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
match s {
"titan-embed-text-v1" => Ok(Self::TitanEmbedding),
"cohere-embed-english-v3" => Ok(Self::CohereLarge),
_ => Err(Error::InvalidInput {
message: "Invalid model. Available models are: 'titan-embed-text-v1', 'cohere-embed-english-v3'".to_string()
}),
}
}
}
/// Embedding function that computes vectors by invoking a Bedrock model.
pub struct BedrockEmbeddingFunction {
    /// Model used for every embedding request.
    model: BedrockEmbeddingModel,
    /// Bedrock runtime client used to send the requests.
    client: BedrockClient,
}

impl BedrockEmbeddingFunction {
    /// Creates an embedding function that uses the Titan embedding model.
    pub fn new(client: BedrockClient) -> Self {
        Self::with_model(client, BedrockEmbeddingModel::TitanEmbedding)
    }

    /// Creates an embedding function that uses the given `model`.
    pub fn with_model(client: BedrockClient, model: BedrockEmbeddingModel) -> Self {
        Self { model, client }
    }
}
impl EmbeddingFunction for BedrockEmbeddingFunction {
    /// Name reported for this embedding function.
    fn name(&self) -> &str {
        "bedrock"
    }

    /// Source columns are UTF-8 strings.
    fn source_type(&self) -> Result<Cow<DataType>> {
        Ok(Cow::Owned(DataType::Utf8))
    }

    /// Embeddings are fixed-size lists of `f32` sized by the selected model.
    fn dest_type(&self) -> Result<Cow<DataType>> {
        let list_type =
            DataType::new_fixed_size_list(DataType::Float32, self.model.ndims() as i32, false);
        Ok(Cow::Owned(list_type))
    }

    /// Embeds every row of `source` and packs the result as one fixed-size
    /// list per input row.
    fn compute_source_embeddings(&self, source: ArrayRef) -> Result<ArrayRef> {
        // Capture the row count before `source` is moved into `compute_inner`.
        let rows = source.len();
        let values = self.compute_inner(source)?;
        let list_type = self.dest_type()?.into_owned();
        let data = ArrayData::builder(list_type)
            .len(rows)
            .add_child_data(values.into_data())
            .build()?;
        Ok(Arc::new(FixedSizeListArray::from(data)))
    }

    /// Embeds `input` and returns the flat `f32` values without list wrapping.
    fn compute_query_embeddings(&self, input: Arc<dyn Array>) -> Result<Arc<dyn Array>> {
        self.compute_inner(input)
            .map(|values| Arc::new(values) as Arc<dyn Array>)
    }
}
/// Debug output shows the selected model only; the client is left out and the
/// output is marked as non-exhaustive (`..`) so readers can tell a field is missing.
impl std::fmt::Debug for BedrockEmbeddingFunction {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("BedrockEmbeddingFunction")
            .field("model", &self.model)
            // `client` is intentionally omitted from the output.
            .finish_non_exhaustive()
    }
}
impl BedrockEmbeddingFunction {
fn compute_inner(&self, source: Arc<dyn Array>) -> Result<Float32Array> {
if source.is_nullable() {
return Err(Error::InvalidInput {
message: "Expected non-nullable data type".to_string(),
});
}
if !matches!(source.data_type(), DataType::Utf8 | DataType::LargeUtf8) {
return Err(Error::InvalidInput {
message: "Expected Utf8 data type".to_string(),
});
}
let mut builder = Float32Builder::new();
let texts = match source.data_type() {
DataType::Utf8 => source
.as_string::<i32>()
.into_iter()
.map(|s| s.expect("array is non-nullable").to_string())
.collect::<Vec<String>>(),
DataType::LargeUtf8 => source
.as_string::<i64>()
.into_iter()
.map(|s| s.expect("array is non-nullable").to_string())
.collect::<Vec<String>>(),
_ => unreachable!(),
};
for text in texts {
let request_body = match self.model {
BedrockEmbeddingModel::TitanEmbedding => {
json!({
"inputText": text
})
}
BedrockEmbeddingModel::CohereLarge => {
json!({
"texts": [text],
"input_type": "search_document"
})
}
};
let client = self.client.clone();
let model_id = self.model.model_id().to_string();
let request_body = request_body.clone();
let response = block_in_place(move || {
Handle::current().block_on(async move {
client
.invoke_model()
.model_id(model_id)
.body(aws_sdk_bedrockruntime::primitives::Blob::new(
serde_json::to_vec(&request_body).unwrap(),
))
.send()
.await
})
})
.unwrap();
let response_json: Value =
serde_json::from_slice(response.body.as_ref()).map_err(|e| Error::Runtime {
message: format!("Failed to parse response: {}", e),
})?;
let embedding = match self.model {
BedrockEmbeddingModel::TitanEmbedding => response_json["embedding"]
.as_array()
.ok_or_else(|| Error::Runtime {
message: "Missing embedding in response".to_string(),
})?
.iter()
.map(|v| v.as_f64().unwrap() as f32)
.collect::<Vec<f32>>(),
BedrockEmbeddingModel::CohereLarge => response_json["embeddings"][0]
.as_array()
.ok_or_else(|| Error::Runtime {
message: "Missing embeddings in response".to_string(),
})?
.iter()
.map(|v| v.as_f64().unwrap() as f32)
.collect::<Vec<f32>>(),
};
builder.append_slice(&embedding);
}
Ok(builder.finish())
}
}