mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 22:59:57 +00:00
Merge branch 'main' of https://github.com/lancedb/lancedb into yang/relative-lance-dep
This commit is contained in:
234
.github/workflows/npm-publish.yml
vendored
234
.github/workflows/npm-publish.yml
vendored
@@ -226,6 +226,126 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
node/dist/lancedb-vectordb-win32*.tgz
|
node/dist/lancedb-vectordb-win32*.tgz
|
||||||
|
|
||||||
|
node-windows-arm64:
|
||||||
|
name: vectordb win32-arm64-msvc
|
||||||
|
runs-on: windows-4x-arm
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Cache installations
|
||||||
|
id: cache-installs
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
C:\Program Files\Git
|
||||||
|
C:\BuildTools
|
||||||
|
C:\Program Files (x86)\Windows Kits
|
||||||
|
C:\Program Files\7-Zip
|
||||||
|
C:\protoc
|
||||||
|
key: ${{ runner.os }}-arm64-installs-v1
|
||||||
|
restore-keys: |
|
||||||
|
${{ runner.os }}-arm64-installs-
|
||||||
|
- name: Install Git
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
run: |
|
||||||
|
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||||
|
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Git to PATH
|
||||||
|
run: |
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||||
|
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||||
|
shell: powershell
|
||||||
|
- name: Configure Git symlinks
|
||||||
|
run: git config --global core.symlinks true
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.13"
|
||||||
|
- name: Install Visual Studio Build Tools
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
run: |
|
||||||
|
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||||
|
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||||
|
"--installPath", "C:\BuildTools", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Visual Studio Build Tools to PATH
|
||||||
|
run: |
|
||||||
|
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||||
|
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||||
|
|
||||||
|
# Add MSVC runtime libraries to LIB
|
||||||
|
$env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||||
|
Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
||||||
|
|
||||||
|
# Add INCLUDE paths
|
||||||
|
$env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
||||||
|
Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
||||||
|
shell: powershell
|
||||||
|
- name: Install Rust
|
||||||
|
run: |
|
||||||
|
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||||
|
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Rust to PATH
|
||||||
|
run: |
|
||||||
|
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||||
|
shell: powershell
|
||||||
|
|
||||||
|
- uses: Swatinem/rust-cache@v2
|
||||||
|
with:
|
||||||
|
workspaces: rust
|
||||||
|
- name: Install 7-Zip ARM
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
run: |
|
||||||
|
New-Item -Path 'C:\7zip' -ItemType Directory
|
||||||
|
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||||
|
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||||
|
shell: powershell
|
||||||
|
- name: Add 7-Zip to PATH
|
||||||
|
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||||
|
shell: powershell
|
||||||
|
- name: Install Protoc v21.12
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
working-directory: C:\
|
||||||
|
run: |
|
||||||
|
if (Test-Path 'C:\protoc') {
|
||||||
|
Write-Host "Protoc directory exists, skipping installation"
|
||||||
|
return
|
||||||
|
}
|
||||||
|
New-Item -Path 'C:\protoc' -ItemType Directory
|
||||||
|
Set-Location C:\protoc
|
||||||
|
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||||
|
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Protoc to PATH
|
||||||
|
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||||
|
shell: powershell
|
||||||
|
- name: Build Windows native node modules
|
||||||
|
run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
|
||||||
|
- name: Upload Windows ARM64 Artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: node-native-windows-arm64
|
||||||
|
path: |
|
||||||
|
node/dist/*.node
|
||||||
|
|
||||||
nodejs-windows:
|
nodejs-windows:
|
||||||
name: lancedb ${{ matrix.target }}
|
name: lancedb ${{ matrix.target }}
|
||||||
runs-on: windows-2022
|
runs-on: windows-2022
|
||||||
@@ -260,9 +380,119 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
nodejs/dist/*.node
|
nodejs/dist/*.node
|
||||||
|
|
||||||
|
nodejs-windows-arm64:
|
||||||
|
name: lancedb win32-arm64-msvc
|
||||||
|
runs-on: windows-4x-arm
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Cache installations
|
||||||
|
id: cache-installs
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
C:\Program Files\Git
|
||||||
|
C:\BuildTools
|
||||||
|
C:\Program Files (x86)\Windows Kits
|
||||||
|
C:\Program Files\7-Zip
|
||||||
|
C:\protoc
|
||||||
|
key: ${{ runner.os }}-arm64-installs-v1
|
||||||
|
restore-keys: |
|
||||||
|
${{ runner.os }}-arm64-installs-
|
||||||
|
- name: Install Git
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
run: |
|
||||||
|
Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
|
||||||
|
Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Git to PATH
|
||||||
|
run: |
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
|
||||||
|
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
|
||||||
|
shell: powershell
|
||||||
|
- name: Configure Git symlinks
|
||||||
|
run: git config --global core.symlinks true
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.13"
|
||||||
|
- name: Install Visual Studio Build Tools
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
run: |
|
||||||
|
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
|
||||||
|
Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
|
||||||
|
"--installPath", "C:\BuildTools", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.ATL", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
|
||||||
|
"--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Visual Studio Build Tools to PATH
|
||||||
|
run: |
|
||||||
|
$vsPath = "C:\BuildTools\VC\Tools\MSVC"
|
||||||
|
$latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||||
|
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||||
|
|
||||||
|
$env:LIB = ""
|
||||||
|
Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||||
|
shell: powershell
|
||||||
|
- name: Install Rust
|
||||||
|
run: |
|
||||||
|
Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
|
||||||
|
.\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Rust to PATH
|
||||||
|
run: |
|
||||||
|
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
|
||||||
|
shell: powershell
|
||||||
|
|
||||||
|
- uses: Swatinem/rust-cache@v2
|
||||||
|
with:
|
||||||
|
workspaces: rust
|
||||||
|
- name: Install 7-Zip ARM
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
run: |
|
||||||
|
New-Item -Path 'C:\7zip' -ItemType Directory
|
||||||
|
Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
|
||||||
|
Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
|
||||||
|
shell: powershell
|
||||||
|
- name: Add 7-Zip to PATH
|
||||||
|
run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
|
||||||
|
shell: powershell
|
||||||
|
- name: Install Protoc v21.12
|
||||||
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
|
working-directory: C:\
|
||||||
|
run: |
|
||||||
|
if (Test-Path 'C:\protoc') {
|
||||||
|
Write-Host "Protoc directory exists, skipping installation"
|
||||||
|
return
|
||||||
|
}
|
||||||
|
New-Item -Path 'C:\protoc' -ItemType Directory
|
||||||
|
Set-Location C:\protoc
|
||||||
|
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||||
|
& 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
|
||||||
|
shell: powershell
|
||||||
|
- name: Add Protoc to PATH
|
||||||
|
run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||||
|
shell: powershell
|
||||||
|
- name: Build Windows native node modules
|
||||||
|
run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
|
||||||
|
- name: Upload Windows ARM64 Artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: nodejs-native-windows-arm64
|
||||||
|
path: |
|
||||||
|
nodejs/dist/*.node
|
||||||
|
|
||||||
release:
|
release:
|
||||||
name: vectordb NPM Publish
|
name: vectordb NPM Publish
|
||||||
needs: [node, node-macos, node-linux, node-windows]
|
needs: [node, node-macos, node-linux, node-windows, node-windows-arm64]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
@@ -302,7 +532,7 @@ jobs:
|
|||||||
|
|
||||||
release-nodejs:
|
release-nodejs:
|
||||||
name: lancedb NPM Publish
|
name: lancedb NPM Publish
|
||||||
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
|
needs: [nodejs-macos, nodejs-linux, nodejs-windows, nodejs-windows-arm64]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
|||||||
18
.github/workflows/rust.yml
vendored
18
.github/workflows/rust.yml
vendored
@@ -195,8 +195,18 @@ jobs:
|
|||||||
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
|
||||||
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
|
||||||
|
|
||||||
$env:LIB = ""
|
# Add MSVC runtime libraries to LIB
|
||||||
Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
$env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
|
||||||
|
Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
|
||||||
|
|
||||||
|
# Add INCLUDE paths
|
||||||
|
$env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
|
||||||
|
"C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
|
||||||
|
Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
|
||||||
shell: powershell
|
shell: powershell
|
||||||
- name: Install Rust
|
- name: Install Rust
|
||||||
run: |
|
run: |
|
||||||
@@ -225,6 +235,10 @@ jobs:
|
|||||||
if: steps.cache-installs.outputs.cache-hit != 'true'
|
if: steps.cache-installs.outputs.cache-hit != 'true'
|
||||||
working-directory: C:\
|
working-directory: C:\
|
||||||
run: |
|
run: |
|
||||||
|
if (Test-Path 'C:\protoc') {
|
||||||
|
Write-Host "Protoc directory exists, skipping installation"
|
||||||
|
return
|
||||||
|
}
|
||||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
New-Item -Path 'C:\protoc' -ItemType Directory
|
||||||
Set-Location C:\protoc
|
Set-Location C:\protoc
|
||||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||||
|
|||||||
@@ -10,6 +10,7 @@
|
|||||||
[](https://blog.lancedb.com/)
|
[](https://blog.lancedb.com/)
|
||||||
[](https://discord.gg/zMM32dvNtd)
|
[](https://discord.gg/zMM32dvNtd)
|
||||||
[](https://twitter.com/lancedb)
|
[](https://twitter.com/lancedb)
|
||||||
|
[](https://gurubase.io/g/lancedb)
|
||||||
|
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,51 @@
|
|||||||
|
# VoyageAI Embeddings
|
||||||
|
|
||||||
|
Voyage AI provides cutting-edge embedding models and rerankers.
|
||||||
|
|
||||||
|
|
||||||
|
Using the VoyageAI API requires the `voyageai` package, which can be installed with `pip install voyageai`. Voyage AI embeddings are generated for text data and can be used for tasks such as semantic search, clustering, and classification.
|
||||||
|
You also need to set the `VOYAGE_API_KEY` environment variable to use the VoyageAI API.
|
||||||
|
|
||||||
|
Supported models are:
|
||||||
|
|
||||||
|
- voyage-3
|
||||||
|
- voyage-3-lite
|
||||||
|
- voyage-finance-2
|
||||||
|
- voyage-multilingual-2
|
||||||
|
- voyage-law-2
|
||||||
|
- voyage-code-2
|
||||||
|
|
||||||
|
|
||||||
|
Supported parameters (to be passed in `create` method) are:
|
||||||
|
|
||||||
|
| Parameter | Type | Default Value | Description |
|
||||||
|
|---|---|--------|---------|
|
||||||
|
| `name` | `str` | `"voyage-3"` | The model ID of the model to use. Supported base models for Text Embeddings: voyage-3, voyage-3-lite, voyage-finance-2, voyage-multilingual-2, voyage-law-2, voyage-code-2 |
|
||||||
|
| `input_type` | `str` | `None` | Type of the input text. Defaults to `None`. Other options: `query`, `document`. |
|
||||||
|
| `truncation` | `bool` | `True` | Whether to truncate the input texts to fit within the context length. |
|
||||||
|
|
||||||
|
|
||||||
|
Usage Example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import lancedb
|
||||||
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
|
from lancedb.embeddings import EmbeddingFunctionRegistry
|
||||||
|
|
||||||
|
voyageai = (
    EmbeddingFunctionRegistry.get_instance()
    .get("voyageai")
    .create(name="voyage-3")
)
|
||||||
|
|
||||||
|
class TextModel(LanceModel):
|
||||||
|
text: str = voyageai.SourceField()
|
||||||
|
vector: Vector(voyageai.ndims()) = voyageai.VectorField()
|
||||||
|
|
||||||
|
data = [ { "text": "hello world" },
|
||||||
|
{ "text": "goodbye world" }]
|
||||||
|
|
||||||
|
db = lancedb.connect("~/.lancedb")
|
||||||
|
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||||
|
|
||||||
|
tbl.add(data)
|
||||||
|
```
|
||||||
77
docs/src/reranking/voyageai.md
Normal file
77
docs/src/reranking/voyageai.md
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# Voyage AI Reranker
|
||||||
|
|
||||||
|
Voyage AI provides cutting-edge embedding models and rerankers.
|
||||||
|
|
||||||
|
This re-ranker uses the [VoyageAI](https://docs.voyageai.com/docs/) API to rerank the search results. You can use this re-ranker by passing `VoyageAIReranker()` to the `rerank()` method. Note that you'll either need to set the `VOYAGE_API_KEY` environment variable or pass the `api_key` argument to use this re-ranker.
|
||||||
|
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
Supported Query Types: Hybrid, Vector, FTS
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy
|
||||||
|
import lancedb
|
||||||
|
from lancedb.embeddings import get_registry
|
||||||
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
|
from lancedb.rerankers import VoyageAIReranker
|
||||||
|
|
||||||
|
embedder = get_registry().get("sentence-transformers").create()
|
||||||
|
db = lancedb.connect("~/.lancedb")
|
||||||
|
|
||||||
|
class Schema(LanceModel):
|
||||||
|
text: str = embedder.SourceField()
|
||||||
|
vector: Vector(embedder.ndims()) = embedder.VectorField()
|
||||||
|
|
||||||
|
data = [
|
||||||
|
{"text": "hello world"},
|
||||||
|
{"text": "goodbye world"}
|
||||||
|
]
|
||||||
|
tbl = db.create_table("test", schema=Schema, mode="overwrite")
|
||||||
|
tbl.add(data)
|
||||||
|
reranker = VoyageAIReranker(model_name="rerank-2")
|
||||||
|
|
||||||
|
# Run vector search with a reranker
|
||||||
|
result = tbl.search("hello").rerank(reranker=reranker).to_list()
|
||||||
|
|
||||||
|
# Run FTS search with a reranker
|
||||||
|
result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
|
||||||
|
|
||||||
|
# Run hybrid search with a reranker
|
||||||
|
tbl.create_fts_index("text", replace=True)
|
||||||
|
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Accepted Arguments
|
||||||
|
----------------
|
||||||
|
| Argument | Type | Default | Description |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| `model_name` | `str` | (required) | The name of the reranker model to use. Available models are: rerank-2, rerank-2-lite |
|
||||||
|
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
|
||||||
|
| `top_n` | `int` | `None` | The number of results to return. If None, will return all results. |
|
||||||
|
| `api_key` | `str` | `None` | The API key for the Voyage AI API. If not provided, the `VOYAGE_API_KEY` environment variable is used. |
|
||||||
|
| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all" (where supported), will return the relevance score along with the vector and/or FTS scores, depending on the query type. |
|
||||||
|
| `truncation` | `bool` | `True` | Whether to truncate the input to satisfy the "context length limit" on the query and the documents. |
|
||||||
|
|
||||||
|
|
||||||
|
## Supported Scores for each query type
|
||||||
|
You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
|
||||||
|
|
||||||
|
### Hybrid Search
|
||||||
|
|`return_score`| Status | Description |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `relevance` | ✅ Supported | Results only have the `_relevance_score` column |
|
||||||
|
| `all` | ❌ Not Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`) |
|
||||||
|
|
||||||
|
### Vector Search
|
||||||
|
|`return_score`| Status | Description |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `relevance` | ✅ Supported | Results only have the `_relevance_score` column |
|
||||||
|
| `all` | ✅ Supported | Results have vector(`_distance`) along with the relevance score(`_relevance_score`) |
|
||||||
|
|
||||||
|
### FTS Search
|
||||||
|
|`return_score`| Status | Description |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `relevance` | ✅ Supported | Results only have the `_relevance_score` column |
|
||||||
|
| `all` | ✅ Supported | Results have FTS(`score`) along with the relevance score(`_relevance_score`) |
|
||||||
@@ -18,7 +18,7 @@ futures.workspace = true
|
|||||||
lancedb = { path = "../rust/lancedb", features = ["remote"] }
|
lancedb = { path = "../rust/lancedb", features = ["remote"] }
|
||||||
napi = { version = "2.16.8", default-features = false, features = [
|
napi = { version = "2.16.8", default-features = false, features = [
|
||||||
"napi9",
|
"napi9",
|
||||||
"async",
|
"async"
|
||||||
] }
|
] }
|
||||||
napi-derive = "2.16.4"
|
napi-derive = "2.16.4"
|
||||||
# Prevent dynamic linking of lzma, which comes from datafusion
|
# Prevent dynamic linking of lzma, which comes from datafusion
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.12.0",
|
"version": "0.13.0-beta.1",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -27,3 +27,4 @@ from .imagebind import ImageBindEmbeddings
|
|||||||
from .utils import with_embeddings
|
from .utils import with_embeddings
|
||||||
from .jinaai import JinaEmbeddings
|
from .jinaai import JinaEmbeddings
|
||||||
from .watsonx import WatsonxEmbeddings
|
from .watsonx import WatsonxEmbeddings
|
||||||
|
from .voyageai import VoyageAIEmbeddingFunction
|
||||||
|
|||||||
127
python/python/lancedb/embeddings/voyageai.py
Normal file
127
python/python/lancedb/embeddings/voyageai.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# Copyright (c) 2023. LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import ClassVar, List, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ..util import attempt_import_or_raise
|
||||||
|
from .base import TextEmbeddingFunction
|
||||||
|
from .registry import register
|
||||||
|
from .utils import api_key_not_found_help, TEXT
|
||||||
|
|
||||||
|
|
||||||
|
@register("voyageai")
class VoyageAIEmbeddingFunction(TextEmbeddingFunction):
    """
    Text embedding function backed by the VoyageAI embeddings API.

    https://docs.voyageai.com/docs/embeddings

    The API client is created on first use from the ``VOYAGE_API_KEY``
    environment variable and is stored on the class, so every instance
    shares the same client.

    Parameters
    ----------
    name: str
        The name of the model to use. List of acceptable models:

            * voyage-3
            * voyage-3-lite
            * voyage-finance-2
            * voyage-multilingual-2
            * voyage-law-2
            * voyage-code-2

    Examples
    --------
    import lancedb
    from lancedb.pydantic import LanceModel, Vector
    from lancedb.embeddings import EmbeddingFunctionRegistry

    voyageai = (
        EmbeddingFunctionRegistry.get_instance()
        .get("voyageai")
        .create(name="voyage-3")
    )

    class TextModel(LanceModel):
        text: str = voyageai.SourceField()
        vector: Vector(voyageai.ndims()) = voyageai.VectorField()

    data = [{"text": "hello world"}, {"text": "goodbye world"}]

    db = lancedb.connect("~/.lancedb")
    tbl = db.create_table("test", schema=TextModel, mode="overwrite")

    tbl.add(data)
    """

    name: str
    # Shared across all instances; populated lazily by `_init_client`.
    client: ClassVar = None

    def ndims(self):
        """Return the embedding width for the configured model.

        Raises
        ------
        ValueError
            If ``self.name`` is not one of the known model names.
        """
        dims_by_model = {
            "voyage-3-lite": 512,
            "voyage-code-2": 1536,
            "voyage-3": 1024,
            "voyage-finance-2": 1024,
            "voyage-multilingual-2": 1024,
            "voyage-law-2": 1024,
        }
        dims = dims_by_model.get(self.name)
        if dims is None:
            raise ValueError(f"Model {self.name} not supported")
        return dims

    def compute_query_embeddings(self, query: str, *args, **kwargs) -> List[np.array]:
        """Embed a search query; same path as sources but with input type "query"."""
        return self.compute_source_embeddings(query, input_type="query")

    def compute_source_embeddings(self, texts: TEXT, *args, **kwargs) -> List[np.array]:
        """Embed source texts after sanitizing them.

        Only the ``input_type`` keyword is honoured; when it is missing or
        falsy the texts are treated as "document" input.
        """
        sanitized = self.sanitize_input(texts)
        requested_type = kwargs.get("input_type")
        # No explicit type means we were not called via `compute_query_embeddings`.
        effective_type = requested_type if requested_type else "document"
        return self.generate_embeddings(sanitized, input_type=effective_type)

    def generate_embeddings(
        self, texts: Union[List[str], np.ndarray], *args, **kwargs
    ) -> List[np.array]:
        """
        Get the embeddings for the given texts

        Parameters
        ----------
        texts: list[str] or np.ndarray (of str)
            The texts to embed
        input_type: Optional[str]
            Forwarded unchanged to the client's ``embed`` call.
        truncation: Optional[bool]
            Forwarded unchanged to the client's ``embed`` call.

        Returns
        -------
        A list built from the ``embeddings`` attribute of the client response.
        """
        cls = VoyageAIEmbeddingFunction
        cls._init_client()
        response = cls.client.embed(texts=texts, model=self.name, **kwargs)
        return list(response.embeddings)

    @staticmethod
    def _init_client():
        """Create the shared VoyageAI client if it does not exist yet."""
        if VoyageAIEmbeddingFunction.client is not None:
            return

        voyageai = attempt_import_or_raise("voyageai")
        if os.environ.get("VOYAGE_API_KEY") is None:
            # NOTE(review): presumably raises with setup instructions — confirm
            # against `api_key_not_found_help`.
            api_key_not_found_help("voyageai")
        VoyageAIEmbeddingFunction.client = voyageai.Client(
            os.environ["VOYAGE_API_KEY"]
        )
|
||||||
@@ -11,6 +11,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
@@ -495,6 +496,19 @@ class RemoteTable(Table):
|
|||||||
"compact_files() is not supported on the LanceDB cloud"
|
"compact_files() is not supported on the LanceDB cloud"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def optimize(
    self,
    *,
    cleanup_older_than: Optional[timedelta] = None,
    delete_unverified: bool = False,
):
    """Always raise: optimize() is not supported on the LanceDB cloud.

    Both keyword arguments are accepted but never read.

    Raises
    ------
    NotImplementedError
        On every call.
    """
    message = (
        "optimize() is not supported on the LanceDB cloud. "
        "Indices are optimized automatically."
    )
    raise NotImplementedError(message)
|
||||||
|
|
||||||
def count_rows(self, filter: Optional[str] = None) -> int:
|
def count_rows(self, filter: Optional[str] = None) -> int:
|
||||||
return self._loop.run_until_complete(self._table.count_rows(filter))
|
return self._loop.run_until_complete(self._table.count_rows(filter))
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from .openai import OpenaiReranker
|
|||||||
from .jinaai import JinaReranker
|
from .jinaai import JinaReranker
|
||||||
from .rrf import RRFReranker
|
from .rrf import RRFReranker
|
||||||
from .answerdotai import AnswerdotaiRerankers
|
from .answerdotai import AnswerdotaiRerankers
|
||||||
|
from .voyageai import VoyageAIReranker
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Reranker",
|
"Reranker",
|
||||||
@@ -18,4 +19,5 @@ __all__ = [
|
|||||||
"JinaReranker",
|
"JinaReranker",
|
||||||
"RRFReranker",
|
"RRFReranker",
|
||||||
"AnswerdotaiRerankers",
|
"AnswerdotaiRerankers",
|
||||||
|
"VoyageAIReranker",
|
||||||
]
|
]
|
||||||
|
|||||||
133
python/python/lancedb/rerankers/voyageai.py
Normal file
133
python/python/lancedb/rerankers/voyageai.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
# Copyright (c) 2023. LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os
|
||||||
|
from functools import cached_property
|
||||||
|
from typing import Union, Optional
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
|
||||||
|
from ..util import attempt_import_or_raise
|
||||||
|
from .base import Reranker
|
||||||
|
|
||||||
|
|
||||||
|
class VoyageAIReranker(Reranker):
    """
    Reranks the results using the VoyageAI Rerank API.
    https://docs.voyageai.com/docs/reranker

    Parameters
    ----------
    model_name : str
        The name of the reranker model to use. This argument is required
        (there is no default). Available voyageai models are:
        - rerank-2
        - rerank-2-lite
    column : str, default "text"
        The name of the column to use as input to the reranker model.
    top_n : int, default None
        The number of results to return. If None, will return all results.
    return_score : str, default "relevance"
        options are "relevance" or "all". Only "relevance" is supported for now.
    api_key : str, default None
        The API key to use. If None, will use the VOYAGE_API_KEY environment
        variable.
    truncation : Optional[bool], default True
        Forwarded unchanged as the ``truncation`` argument of the VoyageAI
        client's ``rerank`` call.
    """

    def __init__(
        self,
        model_name: str,
        column: str = "text",
        top_n: Optional[int] = None,
        return_score="relevance",
        api_key: Optional[str] = None,
        truncation: Optional[bool] = True,
    ):
        super().__init__(return_score)
        self.model_name = model_name
        self.column = column
        self.top_n = top_n
        self.api_key = api_key
        self.truncation = truncation

    @cached_property
    def _client(self):
        """
        Lazily import ``voyageai`` and build the API client (cached per instance).

        Raises
        ------
        ValueError
            If no API key was passed and VOYAGE_API_KEY is not set.
        """
        voyageai = attempt_import_or_raise("voyageai")
        # An explicitly passed key takes precedence; the environment variable
        # is only the fallback, as the class docstring promises.
        api_key = self.api_key or os.environ.get("VOYAGE_API_KEY")
        if not api_key:
            # Implicit string concatenation keeps source indentation out of
            # the message (a backslash continuation would embed it).
            raise ValueError(
                "VOYAGE_API_KEY not set. Either set it in your environment or "
                "pass it as `api_key` argument to the VoyageAIReranker."
            )
        return voyageai.Client(api_key=api_key)

    def _rerank(self, result_set: pa.Table, query: str):
        """
        Rerank ``result_set`` against ``query`` and append the scores.

        Returns the rows selected by the API, in the order the API returned
        them, with an extra float32 ``_relevance_score`` column.
        """
        if result_set.num_rows == 0:
            # Nothing to rank: skip the API round trip but keep the output
            # schema identical to the non-empty case.
            return result_set.append_column(
                "_relevance_score", pa.array([], type=pa.float32())
            )

        docs = result_set[self.column].to_pylist()
        response = self._client.rerank(
            query=query,
            documents=docs,
            top_k=self.top_n,
            model=self.model_name,
            truncation=self.truncation,
        )
        # Each result exposes `index` (position in `docs`) and
        # `relevance_score`. Building two lists (instead of unpacking
        # zip(*...)) also works when the API returns no results.
        results = response.results
        indices = [result.index for result in results]
        scores = [result.relevance_score for result in results]

        # Explicit integer type so an empty index list is still a valid
        # argument for `take`.
        result_set = result_set.take(pa.array(indices, type=pa.int64()))
        # add the scores
        result_set = result_set.append_column(
            "_relevance_score", pa.array(scores, type=pa.float32())
        )

        return result_set

    def rerank_hybrid(
        self,
        query: str,
        vector_results: pa.Table,
        fts_results: pa.Table,
    ):
        """
        Merge vector and full-text results, then rerank the combined set.

        Raises
        ------
        NotImplementedError
            If the reranker was created with return_score="all".
        """
        if self.score == "all":
            # Fail before merging / calling the remote API, not after.
            raise NotImplementedError(
                "return_score='all' not implemented for voyageai reranker"
            )
        combined_results = self.merge_results(vector_results, fts_results)
        combined_results = self._rerank(combined_results, query)
        if self.score == "relevance":
            combined_results = self._keep_relevance_score(combined_results)
        return combined_results

    def rerank_vector(
        self,
        query: str,
        vector_results: pa.Table,
    ):
        """
        Rerank vector search results; drops ``_distance`` when only the
        relevance score is requested.
        """
        result_set = self._rerank(vector_results, query)
        if self.score == "relevance":
            result_set = result_set.drop_columns(["_distance"])

        return result_set

    def rerank_fts(
        self,
        query: str,
        fts_results: pa.Table,
    ):
        """
        Rerank full-text search results; drops ``_score`` when only the
        relevance score is requested.
        """
        result_set = self._rerank(fts_results, query)
        if self.score == "relevance":
            result_set = result_set.drop_columns(["_score"])

        return result_set
|
||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import inspect
|
import inspect
|
||||||
import time
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
@@ -32,7 +33,7 @@ import pyarrow.fs as pa_fs
|
|||||||
from lance import LanceDataset
|
from lance import LanceDataset
|
||||||
from lance.dependencies import _check_for_hugging_face
|
from lance.dependencies import _check_for_hugging_face
|
||||||
|
|
||||||
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
from .common import DATA, VEC, VECTOR_COLUMN_NAME, sanitize_uri
|
||||||
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
||||||
from .merge import LanceMergeInsertBuilder
|
from .merge import LanceMergeInsertBuilder
|
||||||
from .pydantic import LanceModel, model_to_dict
|
from .pydantic import LanceModel, model_to_dict
|
||||||
@@ -57,6 +58,8 @@ from .util import (
|
|||||||
)
|
)
|
||||||
from .index import lang_mapping
|
from .index import lang_mapping
|
||||||
|
|
||||||
|
from ._lancedb import connect as lancedb_connect
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import PIL
|
import PIL
|
||||||
from lance.dataset import CleanupStats, ReaderLike
|
from lance.dataset import CleanupStats, ReaderLike
|
||||||
@@ -893,6 +896,55 @@ class Table(ABC):
|
|||||||
For most cases, the default should be fine.
|
For most cases, the default should be fine.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
def optimize(
    self,
    *,
    cleanup_older_than: Optional[timedelta] = None,
    delete_unverified: bool = False,
):
    """
    Optimize the on-disk data and indices for better performance.

    Modeled after ``VACUUM`` in PostgreSQL.

    Optimization covers three operations:

     * Compaction: Merges small files into larger ones
     * Prune: Removes old versions of the dataset
     * Index: Optimizes the indices, adding new data to existing indices

    Parameters
    ----------
    cleanup_older_than: timedelta, optional, default 7 days
        All files belonging to versions older than this will be removed.  Set
        to 0 days to remove all versions except the latest.  The latest version
        is never removed.
    delete_unverified: bool, default False
        Files left over from a failed transaction may appear to be part of an
        in-progress operation (e.g. appending new data) and these files will not
        be deleted unless they are at least 7 days old. If delete_unverified is
        True then these files will be deleted regardless of their age.

    Experimental API
    ----------------

    The optimization process is undergoing active development and may change.
    Our goal with these changes is to improve the performance of optimization and
    reduce the complexity.

    That being said, it is essential today to run optimize if you want the best
    performance.  It should be stable and safe to use in production, but it is
    our hope that the API may be simplified (or not even need to be called) in
    the future.

    The frequency with which an application should call optimize is based on
    the frequency of data modifications.  If data is frequently added, deleted,
    or updated then optimize should be run frequently.  A good rule of thumb is
    to run optimize if you have added or modified 100,000 or more records or run
    more than 20 data modification operations.
    """
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def add_columns(self, transforms: Dict[str, str]):
|
def add_columns(self, transforms: Dict[str, str]):
|
||||||
"""
|
"""
|
||||||
@@ -1971,6 +2023,83 @@ class LanceTable(Table):
|
|||||||
"""
|
"""
|
||||||
return self.to_lance().optimize.compact_files(*args, **kwargs)
|
return self.to_lance().optimize.compact_files(*args, **kwargs)
|
||||||
|
|
||||||
|
def optimize(
    self,
    *,
    cleanup_older_than: Optional[timedelta] = None,
    delete_unverified: bool = False,
):
    """
    Optimize the on-disk data and indices for better performance.

    Modeled after ``VACUUM`` in PostgreSQL.

    Optimization covers three operations:

     * Compaction: Merges small files into larger ones
     * Prune: Removes old versions of the dataset
     * Index: Optimizes the indices, adding new data to existing indices

    After the optimization finishes, ``checkout_latest()`` is called on this
    table.

    Parameters
    ----------
    cleanup_older_than: timedelta, optional, default 7 days
        All files belonging to versions older than this will be removed.  Set
        to 0 days to remove all versions except the latest.  The latest version
        is never removed.
    delete_unverified: bool, default False
        Files left over from a failed transaction may appear to be part of an
        in-progress operation (e.g. appending new data) and these files will not
        be deleted unless they are at least 7 days old. If delete_unverified is
        True then these files will be deleted regardless of their age.

    Raises
    ------
    AssertionError
        If called while an asyncio event loop is already running in the
        current thread; use the asynchronous APIs in that case.

    Experimental API
    ----------------

    The optimization process is undergoing active development and may change.
    Our goal with these changes is to improve the performance of optimization and
    reduce the complexity.

    That being said, it is essential today to run optimize if you want the best
    performance.  It should be stable and safe to use in production, but it is
    our hope that the API may be simplified (or not even need to be called) in
    the future.

    The frequency with which an application should call optimize is based on
    the frequency of data modifications.  If data is frequently added, deleted,
    or updated then optimize should be run frequently.  A good rule of thumb is
    to run optimize if you have added or modified 100,000 or more records or run
    more than 20 data modification operations.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run() is safe.
        pass
    else:
        raise AssertionError(
            "Synchronous method called in asynchronous context. "
            "If you are writing an asynchronous application "
            "then please use the asynchronous APIs"
        )

    # Run the work outside the `except` handler so that a failure inside the
    # optimization is not implicitly chained to the unrelated
    # "no running event loop" RuntimeError.
    asyncio.run(
        self._async_optimize(
            cleanup_older_than=cleanup_older_than,
            delete_unverified=delete_unverified,
        )
    )
    self.checkout_latest()
|
||||||
|
|
||||||
|
async def _async_optimize(
    self,
    cleanup_older_than: Optional[timedelta] = None,
    delete_unverified: bool = False,
):
    """
    Run ``optimize`` through the asynchronous table API.

    Opens a new connection to the sanitized URI of this table's connection,
    opens the table with the same name, wraps it in ``AsyncTable`` and awaits
    its ``optimize`` with the given arguments, returning the result.
    """
    uri = sanitize_uri(self._conn.uri)
    connection = await lancedb_connect(uri)
    inner_table = await connection.open_table(self.name)
    async_table = AsyncTable(inner_table)
    return await async_table.optimize(
        cleanup_older_than=cleanup_older_than,
        delete_unverified=delete_unverified,
    )
|
||||||
|
|
||||||
def add_columns(self, transforms: Dict[str, str]):
|
def add_columns(self, transforms: Dict[str, str]):
|
||||||
self._dataset_mut.add_columns(transforms)
|
self._dataset_mut.add_columns(transforms)
|
||||||
|
|
||||||
|
|||||||
@@ -196,6 +196,7 @@ def test_add_optional_vector(tmp_path):
|
|||||||
"ollama",
|
"ollama",
|
||||||
"cohere",
|
"cohere",
|
||||||
"instructor",
|
"instructor",
|
||||||
|
"voyageai",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_embedding_function_safe_model_dump(embedding_type):
|
def test_embedding_function_safe_model_dump(embedding_type):
|
||||||
|
|||||||
@@ -481,3 +481,22 @@ def test_ollama_embedding(tmp_path):
|
|||||||
json.dumps(dumped_model)
|
json.dumps(dumped_model)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
pytest.fail("Failed to JSON serialize the dumped model")
|
pytest.fail("Failed to JSON serialize the dumped model")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
@pytest.mark.skipif(
    os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
)
def test_voyageai_embedding_function():
    """Embed two texts with the "voyageai" function and check the vector size."""
    embedder = get_registry().get("voyageai").create(name="voyage-3", max_retries=0)

    class TextModel(LanceModel):
        text: str = embedder.SourceField()
        vector: Vector(embedder.ndims()) = embedder.VectorField()

    frame = pd.DataFrame({"text": ["hello world", "goodbye world"]})
    connection = lancedb.connect("~/lancedb")
    table = connection.create_table("test", schema=TextModel, mode="overwrite")

    table.add(frame)

    # The stored vector must have the dimensionality the function reports.
    first_vector = table.to_pandas()["vector"][0]
    assert len(first_vector) == embedder.ndims()
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from lancedb.rerankers import (
|
|||||||
OpenaiReranker,
|
OpenaiReranker,
|
||||||
JinaReranker,
|
JinaReranker,
|
||||||
AnswerdotaiRerankers,
|
AnswerdotaiRerankers,
|
||||||
|
VoyageAIReranker,
|
||||||
)
|
)
|
||||||
from lancedb.table import LanceTable
|
from lancedb.table import LanceTable
|
||||||
|
|
||||||
@@ -344,3 +345,14 @@ def test_jina_reranker(tmp_path, use_tantivy):
|
|||||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||||
reranker = JinaReranker()
|
reranker = JinaReranker()
|
||||||
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
_run_test_reranker(reranker, table, "single player experience", None, schema)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
    os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
)
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_voyageai_reranker(tmp_path, use_tantivy):
    """Run the shared reranker checks against VoyageAIReranker ("rerank-2")."""
    # Skip (rather than fail) when the optional dependency is missing.
    pytest.importorskip("voyageai")
    voyage_reranker = VoyageAIReranker(model_name="rerank-2")
    tbl, tbl_schema = get_test_table(tmp_path, use_tantivy)
    _run_test_reranker(
        voyage_reranker, tbl, "single player experience", None, tbl_schema
    )
|
||||||
|
|||||||
@@ -1223,6 +1223,54 @@ async def test_time_travel(db_async: AsyncConnection):
|
|||||||
await table.restore()
|
await table.restore()
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_optimize(db):
    """The synchronous optimize() must fold newly added rows into the index."""
    initial_rows = [
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]
    table = LanceTable.create(db, "test", data=initial_rows)

    table.create_scalar_index("price", index_type="BTREE")
    before = table.to_lance().stats.index_stats("price_idx")
    assert before["num_indexed_rows"] == 2

    # The third row is added after the index was built, so it is only
    # indexed once optimize() has run.
    extra_rows = [{"vector": [2.0, 2.0], "item": "baz", "price": 30.0}]
    table.add(extra_rows)
    assert table.count_rows() == 3

    table.optimize()
    after = table.to_lance().stats.index_stats("price_idx")
    assert after["num_indexed_rows"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_sync_optimize_in_async(db):
    """Calling the synchronous optimize() from a running event loop must fail."""
    table = LanceTable.create(
        db,
        "test",
        data=[
            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
        ],
    )

    table.create_scalar_index("price", index_type="BTREE")
    stats = table.to_lance().stats.index_stats("price_idx")
    assert stats["num_indexed_rows"] == 2

    table.add([{"vector": [2.0, 2.0], "item": "baz", "price": 30.0}])
    assert table.count_rows() == 3

    # pytest.raises fails the test when no exception is raised; a bare
    # try/except with the assertion inside `except` would pass vacuously.
    with pytest.raises(
        AssertionError, match="Synchronous method called in asynchronous context"
    ):
        table.optimize()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_optimize(db_async: AsyncConnection):
|
async def test_optimize(db_async: AsyncConnection):
|
||||||
table = await db_async.create_table(
|
table = await db_async.create_table(
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ pub mod scalar;
|
|||||||
pub mod vector;
|
pub mod vector;
|
||||||
|
|
||||||
/// Supported index types.
|
/// Supported index types.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
pub enum Index {
|
pub enum Index {
|
||||||
Auto,
|
Auto,
|
||||||
/// A `BTree` index is an sorted index on scalar columns.
|
/// A `BTree` index is an sorted index on scalar columns.
|
||||||
|
|||||||
Reference in New Issue
Block a user