diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml index 18b78dbf..db7b6d08 100644 --- a/.github/workflows/npm-publish.yml +++ b/.github/workflows/npm-publish.yml @@ -226,6 +226,126 @@ jobs: path: | node/dist/lancedb-vectordb-win32*.tgz + node-windows-arm64: + name: vectordb win32-arm64-msvc + runs-on: windows-4x-arm + if: startsWith(github.ref, 'refs/tags/v') + steps: + - uses: actions/checkout@v4 + - name: Cache installations + id: cache-installs + uses: actions/cache@v4 + with: + path: | + C:\Program Files\Git + C:\BuildTools + C:\Program Files (x86)\Windows Kits + C:\Program Files\7-Zip + C:\protoc + key: ${{ runner.os }}-arm64-installs-v1 + restore-keys: | + ${{ runner.os }}-arm64-installs- + - name: Install Git + if: steps.cache-installs.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe" + Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait + shell: powershell + - name: Add Git to PATH + run: | + Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin" + $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User") + shell: powershell + - name: Configure Git symlinks + run: git config --global core.symlinks true + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Visual Studio Build Tools + if: steps.cache-installs.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe" + Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", ` + "--installPath", "C:\BuildTools", ` + "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", ` + "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", ` + "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", ` + "--add", "Microsoft.VisualStudio.Component.VC.ATL", ` + "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", ` + "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait + shell: powershell + - name: Add Visual Studio Build Tools to PATH + run: | + $vsPath = "C:\BuildTools\VC\Tools\MSVC" + $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name + Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64" + Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64" + Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64" + Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64" + Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin" + + # Add MSVC runtime libraries to LIB + $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" + + "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" + + "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64" + Add-Content $env:GITHUB_ENV "LIB=$env:LIB" + + # Add INCLUDE paths + $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" + + "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" + + "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" + + "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared" + Add-Content 
$env:GITHUB_ENV "INCLUDE=$env:INCLUDE" + shell: powershell + - name: Install Rust + run: | + Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe + .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc + shell: powershell + - name: Add Rust to PATH + run: | + Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" + shell: powershell + + - uses: Swatinem/rust-cache@v2 + with: + workspaces: rust + - name: Install 7-Zip ARM + if: steps.cache-installs.outputs.cache-hit != 'true' + run: | + New-Item -Path 'C:\7zip' -ItemType Directory + Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe + Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait + shell: powershell + - name: Add 7-Zip to PATH + run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip" + shell: powershell + - name: Install Protoc v21.12 + if: steps.cache-installs.outputs.cache-hit != 'true' + working-directory: C:\ + run: | + if (Test-Path 'C:\protoc') { + Write-Host "Protoc directory exists, skipping installation" + return + } + New-Item -Path 'C:\protoc' -ItemType Directory + Set-Location C:\protoc + Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip + & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip + shell: powershell + - name: Add Protoc to PATH + run: Add-Content $env:GITHUB_PATH "C:\protoc\bin" + shell: powershell + - name: Build Windows native node modules + run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc + - name: Upload Windows ARM64 Artifacts + uses: actions/upload-artifact@v4 + with: + name: node-native-windows-arm64 + path: | + node/dist/*.node + nodejs-windows: name: lancedb ${{ matrix.target }} runs-on: windows-2022 @@ -260,9 +380,119 @@ jobs: path: | nodejs/dist/*.node + nodejs-windows-arm64: + name: lancedb win32-arm64-msvc + runs-on: windows-4x-arm + if: startsWith(github.ref, 'refs/tags/v') + steps: + - uses: actions/checkout@v4 + - name: Cache installations + id: cache-installs + uses: actions/cache@v4 + with: + path: | + C:\Program Files\Git + C:\BuildTools + C:\Program Files (x86)\Windows Kits + C:\Program Files\7-Zip + C:\protoc + key: ${{ runner.os }}-arm64-installs-v1 + restore-keys: | + ${{ runner.os }}-arm64-installs- + - name: Install Git + if: steps.cache-installs.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe" + Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait + shell: powershell + - name: Add Git to PATH + run: | + Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin" + $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User") + shell: powershell + - name: Configure Git symlinks + run: git config --global core.symlinks true + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Install Visual Studio Build Tools + if: steps.cache-installs.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe" + Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", ` + "--installPath", "C:\BuildTools", ` + "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", ` + "--add", 
"Microsoft.VisualStudio.Component.VC.Tools.x86.x64", ` + "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", ` + "--add", "Microsoft.VisualStudio.Component.VC.ATL", ` + "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", ` + "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait + shell: powershell + - name: Add Visual Studio Build Tools to PATH + run: | + $vsPath = "C:\BuildTools\VC\Tools\MSVC" + $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name + Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64" + Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64" + Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64" + Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64" + Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin" + + $env:LIB = "" + Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64" + shell: powershell + - name: Install Rust + run: | + Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe + .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc + shell: powershell + - name: Add Rust to PATH + run: | + Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" + shell: powershell + + - uses: Swatinem/rust-cache@v2 + with: + workspaces: rust + - name: Install 7-Zip ARM + if: steps.cache-installs.outputs.cache-hit != 'true' + run: | + New-Item -Path 'C:\7zip' -ItemType Directory + Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe + Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait + shell: powershell + - name: Add 7-Zip to PATH + run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip" + shell: powershell + - name: Install Protoc v21.12 + if: steps.cache-installs.outputs.cache-hit != 'true' + working-directory: C:\ + run: | + if (Test-Path 'C:\protoc') { + Write-Host "Protoc directory exists, skipping installation" + return + } + New-Item -Path 'C:\protoc' -ItemType Directory + Set-Location C:\protoc + Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip + & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip + shell: powershell + - name: Add Protoc to PATH + run: Add-Content $env:GITHUB_PATH "C:\protoc\bin" + shell: powershell + - name: Build Windows native node modules + run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc + - name: Upload Windows ARM64 Artifacts + uses: actions/upload-artifact@v4 + with: + name: nodejs-native-windows-arm64 + path: | + nodejs/dist/*.node + release: name: vectordb NPM Publish - needs: [node, node-macos, node-linux, node-windows] + needs: [node, node-macos, node-linux, node-windows, node-windows-arm64] runs-on: ubuntu-latest # Only runs on tags that matches the make-release action if: startsWith(github.ref, 'refs/tags/v') @@ -302,7 +532,7 @@ jobs: release-nodejs: name: lancedb NPM Publish - needs: [nodejs-macos, nodejs-linux, nodejs-windows] + needs: [nodejs-macos, nodejs-linux, nodejs-windows, nodejs-windows-arm64] runs-on: ubuntu-latest # Only runs on tags that matches the make-release action if: startsWith(github.ref, 'refs/tags/v') diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ab02b499..29e47ecc 
100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -195,8 +195,18 @@ jobs: Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64" Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin" - $env:LIB = "" - Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64" + # Add MSVC runtime libraries to LIB + $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" + + "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" + + "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64" + Add-Content $env:GITHUB_ENV "LIB=$env:LIB" + + # Add INCLUDE paths + $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" + + "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" + + "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" + + "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared" + Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE" shell: powershell - name: Install Rust run: | @@ -225,6 +235,10 @@ jobs: if: steps.cache-installs.outputs.cache-hit != 'true' working-directory: C:\ run: | + if (Test-Path 'C:\protoc') { + Write-Host "Protoc directory exists, skipping installation" + return + } New-Item -Path 'C:\protoc' -ItemType Directory Set-Location C:\protoc Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip diff --git a/README.md b/README.md index fa1218f1..fc0aa217 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ [![Blog](https://img.shields.io/badge/Blog-12100E?style=for-the-badge&logoColor=white)](https://blog.lancedb.com/) [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/zMM32dvNtd) [![Twitter](https://img.shields.io/badge/Twitter-%231DA1F2.svg?style=for-the-badge&logo=Twitter&logoColor=white)](https://twitter.com/lancedb) +[![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20LanceDB%20Guru-006BFF?style=for-the-badge)](https://gurubase.io/g/lancedb)

diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
new file mode 100644
index 00000000..41a6be31
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
@@ -0,0 +1,51 @@
+# VoyageAI Embeddings
+
+Voyage AI provides cutting-edge embedding models and rerankers.
+
+Using the VoyageAI API requires the `voyageai` package, which can be installed with `pip install voyageai`. Voyage AI embeddings generate vector embeddings for text data, which can then be used for tasks such as semantic search, clustering, and classification.
+You also need to set the `VOYAGE_API_KEY` environment variable to use the VoyageAI API.
+
+Supported models are:
+
+- voyage-3
+- voyage-3-lite
+- voyage-finance-2
+- voyage-multilingual-2
+- voyage-law-2
+- voyage-code-2
+
+Supported parameters (to be passed to the `create` method) are:
+
+| Parameter | Type | Default Value | Description |
+|---|---|--------|---------|
+| `name` | `str` | `"voyage-3"` | The model ID of the model to use. Supported base models for text embeddings: voyage-3, voyage-3-lite, voyage-finance-2, voyage-multilingual-2, voyage-law-2, voyage-code-2 |
+| `input_type` | `str` | `None` | Type of the input text. Defaults to `None`. Other options: `query`, `document`. |
+| `truncation` | `bool` | `True` | Whether to truncate the input texts to fit within the context length. |
+
+Usage Example:
+
+```python
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import EmbeddingFunctionRegistry
+
+voyageai = (
+    EmbeddingFunctionRegistry.get_instance()
+    .get("voyageai")
+    .create(name="voyage-3")
+)
+
+class TextModel(LanceModel):
+    text: str = voyageai.SourceField()
+    vector: Vector(voyageai.ndims()) = voyageai.VectorField()
+
+data = [{"text": "hello world"},
+        {"text": "goodbye world"}]
+
+db = lancedb.connect("~/.lancedb")
+tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+tbl.add(data)
+```
\ No newline at end of file
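The example above stops at ingestion. Because the embedding function is attached to the schema, a plain-text query is embedded automatically at search time; here is a minimal follow-up sketch using the standard `search()` / `limit()` / `to_pandas()` query-builder calls (the query string and `limit` value are only illustrative):

```python
# Semantic search against the table created above: the query text is embedded
# with the same VoyageAI model that was registered on the schema.
results = tbl.search("a friendly greeting").limit(1).to_pandas()
print(results["text"].tolist())  # likely ["hello world"]
```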
diff --git a/docs/src/reranking/voyageai.md b/docs/src/reranking/voyageai.md
new file mode 100644
index 00000000..4021729a
--- /dev/null
+++ b/docs/src/reranking/voyageai.md
@@ -0,0 +1,77 @@
+# Voyage AI Reranker
+
+Voyage AI provides cutting-edge embedding models and rerankers.
+
+This re-ranker uses the [VoyageAI](https://docs.voyageai.com/docs/) API to rerank the search results. You can use this re-ranker by passing `VoyageAIReranker()` to the `rerank()` method. Note that you'll either need to set the `VOYAGE_API_KEY` environment variable or pass the `api_key` argument to use this re-ranker.
+
+!!! note
+    Supported Query Types: Hybrid, Vector, FTS
+
+```python
+import lancedb
+from lancedb.embeddings import get_registry
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.rerankers import VoyageAIReranker
+
+embedder = get_registry().get("sentence-transformers").create()
+db = lancedb.connect("~/.lancedb")
+
+class Schema(LanceModel):
+    text: str = embedder.SourceField()
+    vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+data = [
+    {"text": "hello world"},
+    {"text": "goodbye world"},
+]
+tbl = db.create_table("test", schema=Schema, mode="overwrite")
+tbl.add(data)
+tbl.create_fts_index("text", replace=True)  # needed for the FTS and hybrid searches below
+reranker = VoyageAIReranker(model_name="rerank-2")
+
+# Run vector search with a reranker
+result = tbl.search("hello").rerank(reranker=reranker).to_list()
+
+# Run FTS search with a reranker
+result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
+
+# Run hybrid search with a reranker
+result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
+```
+
+## Accepted Arguments
+
+| Argument | Type | Default | Description |
+| --- | --- | --- | --- |
+| `model_name` | `str` | (required) | The name of the reranker model to use. Available models are: rerank-2, rerank-2-lite |
+| `column` | `str` | `"text"` | The name of the column to use as input to the reranker model. |
+| `top_n` | `int` | `None` | The number of results to return. If `None`, all results are returned. |
+| `api_key` | `str` | `None` | The API key for the Voyage AI API. If not provided, the `VOYAGE_API_KEY` environment variable is used. |
+| `return_score` | `str` | `"relevance"` | The type of score to return: `"relevance"` or `"all"`. If `"relevance"`, only the `_relevance_score` column is returned. If `"all"`, the relevance score is returned along with the vector and/or FTS scores, depending on the query type. |
+| `truncation` | `bool` | `True` | Whether to truncate the input to satisfy the context length limit on the query and the documents. |
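As a quick illustration of the arguments above (the values are purely illustrative; `rerank-2-lite` is the lighter of the two listed models), the reranker can be configured like this, with the `return_score` behaviour per query type covered in the tables that follow:

```python
# Keep only the top 5 reranked rows and return the underlying scores as well.
# Note: return_score="all" is not supported for hybrid search (see below).
reranker = VoyageAIReranker(
    model_name="rerank-2-lite",
    column="text",
    top_n=5,
    return_score="all",
    truncation=True,
)
```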
+## Supported Scores for each query type
+
+You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
+
+### Hybrid Search
+
+| `return_score` | Status | Description |
+| --- | --- | --- |
+| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
+| `all` | ❌ Not Supported | Would return the vector (`_distance`) and FTS (`score`) scores along with the hybrid relevance score (`_relevance_score`) |
+
+### Vector Search
+
+| `return_score` | Status | Description |
+| --- | --- | --- |
+| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
+| `all` | ✅ Supported | Returns the vector score (`_distance`) along with the relevance score (`_relevance_score`) |
+
+### FTS Search
+
+| `return_score` | Status | Description |
+| --- | --- | --- |
+| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
+| `all` | ✅ Supported | Returns the FTS score (`score`) along with the relevance score (`_relevance_score`) |
\ No newline at end of file
diff --git a/nodejs/Cargo.toml b/nodejs/Cargo.toml
index ba7af8da..f2a79408 100644
--- a/nodejs/Cargo.toml
+++ b/nodejs/Cargo.toml
@@ -18,7 +18,7 @@ futures.workspace = true
 lancedb = { path = "../rust/lancedb", features = ["remote"] }
 napi = { version = "2.16.8", default-features = false, features = [
   "napi9",
-  "async",
+  "async"
 ] }
 napi-derive = "2.16.4"
 # Prevent dynamic linking of lzma, which comes from datafusion
diff --git a/nodejs/npm/win32-arm64-msvc/package.json b/nodejs/npm/win32-arm64-msvc/package.json
index 0478cef7..e35e6e08 100644
--- a/nodejs/npm/win32-arm64-msvc/package.json
+++ b/nodejs/npm/win32-arm64-msvc/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.12.0",
+  "version": "0.13.0-beta.1",
   "os": [
     "win32"
   ],
diff --git a/python/python/lancedb/embeddings/__init__.py b/python/python/lancedb/embeddings/__init__.py
index 76da3ab4..afa127d7 100644
--- a/python/python/lancedb/embeddings/__init__.py
+++ b/python/python/lancedb/embeddings/__init__.py
@@ -27,3 +27,4 @@ from .imagebind import ImageBindEmbeddings
 from .utils import with_embeddings
 from .jinaai import JinaEmbeddings
 from .watsonx import WatsonxEmbeddings
+from .voyageai import VoyageAIEmbeddingFunction
diff --git a/python/python/lancedb/embeddings/voyageai.py b/python/python/lancedb/embeddings/voyageai.py
new file mode 100644
index 00000000..161c5e43
--- /dev/null
+++ b/python/python/lancedb/embeddings/voyageai.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2023. LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import ClassVar, List, Union
+
+import numpy as np
+
+from ..util import attempt_import_or_raise
+from .base import TextEmbeddingFunction
+from .registry import register
+from .utils import api_key_not_found_help, TEXT
+
+
+@register("voyageai")
+class VoyageAIEmbeddingFunction(TextEmbeddingFunction):
+    """
+    An embedding function that uses the VoyageAI API
+
+    https://docs.voyageai.com/docs/embeddings
+
+    Parameters
+    ----------
+    name: str
+        The name of the model to use.
List of acceptable models: + + * voyage-3 + * voyage-3-lite + * voyage-finance-2 + * voyage-multilingual-2 + * voyage-law-2 + * voyage-code-2 + + + Examples + -------- + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import EmbeddingFunctionRegistry + + voyageai = EmbeddingFunctionRegistry + .get_instance() + .get("voyageai") + .create(name="voyage-3") + + class TextModel(LanceModel): + text: str = voyageai.SourceField() + vector: Vector(voyageai.ndims()) = voyageai.VectorField() + + data = [ { "text": "hello world" }, + { "text": "goodbye world" }] + + db = lancedb.connect("~/.lancedb") + tbl = db.create_table("test", schema=TextModel, mode="overwrite") + + tbl.add(data) + + """ + + name: str + client: ClassVar = None + + def ndims(self): + if self.name == "voyage-3-lite": + return 512 + elif self.name == "voyage-code-2": + return 1536 + elif self.name in [ + "voyage-3", + "voyage-finance-2", + "voyage-multilingual-2", + "voyage-law-2", + ]: + return 1024 + else: + raise ValueError(f"Model {self.name} not supported") + + def compute_query_embeddings(self, query: str, *args, **kwargs) -> List[np.array]: + return self.compute_source_embeddings(query, input_type="query") + + def compute_source_embeddings(self, texts: TEXT, *args, **kwargs) -> List[np.array]: + texts = self.sanitize_input(texts) + input_type = ( + kwargs.get("input_type") or "document" + ) # assume source input type if not passed by `compute_query_embeddings` + return self.generate_embeddings(texts, input_type=input_type) + + def generate_embeddings( + self, texts: Union[List[str], np.ndarray], *args, **kwargs + ) -> List[np.array]: + """ + Get the embeddings for the given texts + + Parameters + ---------- + texts: list[str] or np.ndarray (of str) + The texts to embed + input_type: Optional[str] + + truncation: Optional[bool] + """ + VoyageAIEmbeddingFunction._init_client() + rs = VoyageAIEmbeddingFunction.client.embed( + texts=texts, model=self.name, **kwargs + ) + + return [emb for emb in rs.embeddings] + + @staticmethod + def _init_client(): + if VoyageAIEmbeddingFunction.client is None: + voyageai = attempt_import_or_raise("voyageai") + if os.environ.get("VOYAGE_API_KEY") is None: + api_key_not_found_help("voyageai") + VoyageAIEmbeddingFunction.client = voyageai.Client( + os.environ["VOYAGE_API_KEY"] + ) diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index a375d30c..2165c725 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from datetime import timedelta import asyncio import logging from functools import cached_property @@ -495,6 +496,19 @@ class RemoteTable(Table): "compact_files() is not supported on the LanceDB cloud" ) + def optimize( + self, + *, + cleanup_older_than: Optional[timedelta] = None, + delete_unverified: bool = False, + ): + """optimize() is not supported on the LanceDB cloud. + Indices are optimized automatically.""" + raise NotImplementedError( + "optimize() is not supported on the LanceDB cloud. " + "Indices are optimized automatically." 
+        )
+
     def count_rows(self, filter: Optional[str] = None) -> int:
         return self._loop.run_until_complete(self._table.count_rows(filter))
 
diff --git a/python/python/lancedb/rerankers/__init__.py b/python/python/lancedb/rerankers/__init__.py
index 93903a16..c3e27331 100644
--- a/python/python/lancedb/rerankers/__init__.py
+++ b/python/python/lancedb/rerankers/__init__.py
@@ -7,6 +7,7 @@ from .openai import OpenaiReranker
 from .jinaai import JinaReranker
 from .rrf import RRFReranker
 from .answerdotai import AnswerdotaiRerankers
+from .voyageai import VoyageAIReranker
 
 __all__ = [
     "Reranker",
@@ -18,4 +19,5 @@ __all__ = [
     "JinaReranker",
     "RRFReranker",
     "AnswerdotaiRerankers",
+    "VoyageAIReranker",
 ]
diff --git a/python/python/lancedb/rerankers/voyageai.py b/python/python/lancedb/rerankers/voyageai.py
new file mode 100644
index 00000000..d04a5ad4
--- /dev/null
+++ b/python/python/lancedb/rerankers/voyageai.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2023. LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from functools import cached_property
+from typing import Union, Optional
+
+import pyarrow as pa
+
+from ..util import attempt_import_or_raise
+from .base import Reranker
+
+
+class VoyageAIReranker(Reranker):
+    """
+    Reranks the results using the VoyageAI Rerank API.
+    https://docs.voyageai.com/docs/reranker
+
+    Parameters
+    ----------
+    model_name : str
+        The name of the reranker model to use. Available VoyageAI models are:
+        - rerank-2
+        - rerank-2-lite
+    column : str, default "text"
+        The name of the column to use as input to the reranker model.
+    top_n : int, default None
+        The number of results to return. If None, will return all results.
+    return_score : str, default "relevance"
+        Options are "relevance" or "all". "all" additionally returns the underlying
+        vector and/or FTS scores, but is not supported for hybrid search.
+    api_key : str, default None
+        The API key to use. If None, the VOYAGE_API_KEY environment variable is used.
+    truncation : Optional[bool], default True
+        Whether to truncate the input to satisfy the context length limit.
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        column: str = "text",
+        top_n: Optional[int] = None,
+        return_score="relevance",
+        api_key: Optional[str] = None,
+        truncation: Optional[bool] = True,
+    ):
+        super().__init__(return_score)
+        self.model_name = model_name
+        self.column = column
+        self.top_n = top_n
+        self.api_key = api_key
+        self.truncation = truncation
+
+    @cached_property
+    def _client(self):
+        voyageai = attempt_import_or_raise("voyageai")
+        if os.environ.get("VOYAGE_API_KEY") is None and self.api_key is None:
+            raise ValueError(
+                "VOYAGE_API_KEY not set. Either set it in your environment or "
+                "pass it as `api_key` argument to the VoyageAIReranker."
+ ) + return voyageai.Client( + api_key=os.environ.get("VOYAGE_API_KEY") or self.api_key, + ) + + def _rerank(self, result_set: pa.Table, query: str): + docs = result_set[self.column].to_pylist() + response = self._client.rerank( + query=query, + documents=docs, + top_k=self.top_n, + model=self.model_name, + truncation=self.truncation, + ) + results = ( + response.results + ) # returns list (text, idx, relevance) attributes sorted descending by score + indices, scores = list( + zip(*[(result.index, result.relevance_score) for result in results]) + ) # tuples + result_set = result_set.take(list(indices)) + # add the scores + result_set = result_set.append_column( + "_relevance_score", pa.array(scores, type=pa.float32()) + ) + + return result_set + + def rerank_hybrid( + self, + query: str, + vector_results: pa.Table, + fts_results: pa.Table, + ): + combined_results = self.merge_results(vector_results, fts_results) + combined_results = self._rerank(combined_results, query) + if self.score == "relevance": + combined_results = self._keep_relevance_score(combined_results) + elif self.score == "all": + raise NotImplementedError( + "return_score='all' not implemented for voyageai reranker" + ) + return combined_results + + def rerank_vector( + self, + query: str, + vector_results: pa.Table, + ): + result_set = self._rerank(vector_results, query) + if self.score == "relevance": + result_set = result_set.drop_columns(["_distance"]) + + return result_set + + def rerank_fts( + self, + query: str, + fts_results: pa.Table, + ): + result_set = self._rerank(fts_results, query) + if self.score == "relevance": + result_set = result_set.drop_columns(["_score"]) + + return result_set diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 18e2c266..6403c88f 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -3,6 +3,7 @@ from __future__ import annotations +import asyncio import inspect import time from abc import ABC, abstractmethod @@ -32,7 +33,7 @@ import pyarrow.fs as pa_fs from lance import LanceDataset from lance.dependencies import _check_for_hugging_face -from .common import DATA, VEC, VECTOR_COLUMN_NAME +from .common import DATA, VEC, VECTOR_COLUMN_NAME, sanitize_uri from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry from .merge import LanceMergeInsertBuilder from .pydantic import LanceModel, model_to_dict @@ -57,6 +58,8 @@ from .util import ( ) from .index import lang_mapping +from ._lancedb import connect as lancedb_connect + if TYPE_CHECKING: import PIL from lance.dataset import CleanupStats, ReaderLike @@ -893,6 +896,55 @@ class Table(ABC): For most cases, the default should be fine. """ + @abstractmethod + def optimize( + self, + *, + cleanup_older_than: Optional[timedelta] = None, + delete_unverified: bool = False, + ): + """ + Optimize the on-disk data and indices for better performance. + + Modeled after ``VACUUM`` in PostgreSQL. + + Optimization covers three operations: + + * Compaction: Merges small files into larger ones + * Prune: Removes old versions of the dataset + * Index: Optimizes the indices, adding new data to existing indices + + Parameters + ---------- + cleanup_older_than: timedelta, optional default 7 days + All files belonging to versions older than this will be removed. Set + to 0 days to remove all versions except the latest. The latest version + is never removed. 
+        delete_unverified: bool, default False
+            Files leftover from a failed transaction may appear to be part of an
+            in-progress operation (e.g. appending new data) and these files will not
+            be deleted unless they are at least 7 days old. If delete_unverified is True
+            then these files will be deleted regardless of their age.
+
+        Experimental API
+        ----------------
+
+        The optimization process is undergoing active development and may change.
+        Our goal with these changes is to improve the performance of optimization and
+        reduce the complexity.
+
+        That being said, it is essential today to run optimize if you want the best
+        performance. It should be stable and safe to use in production, but it is our
+        hope that the API may be simplified (or not even need to be called) in the
+        future.
+
+        How often an application should call optimize depends on the frequency of
+        data modifications. If data is frequently added, deleted, or updated then
+        optimize should be run frequently. A good rule of thumb is to run optimize if
+        you have added or modified 100,000 or more records or run more than 20 data
+        modification operations.
+        """
+
     @abstractmethod
     def add_columns(self, transforms: Dict[str, str]):
         """
@@ -1971,6 +2023,83 @@ class LanceTable(Table):
         """
         return self.to_lance().optimize.compact_files(*args, **kwargs)
 
+    def optimize(
+        self,
+        *,
+        cleanup_older_than: Optional[timedelta] = None,
+        delete_unverified: bool = False,
+    ):
+        """
+        Optimize the on-disk data and indices for better performance.
+
+        Modeled after ``VACUUM`` in PostgreSQL.
+
+        Optimization covers three operations:
+
+        * Compaction: Merges small files into larger ones
+        * Prune: Removes old versions of the dataset
+        * Index: Optimizes the indices, adding new data to existing indices
+
+        Parameters
+        ----------
+        cleanup_older_than: timedelta, optional default 7 days
+            All files belonging to versions older than this will be removed. Set
+            to 0 days to remove all versions except the latest. The latest version
+            is never removed.
+        delete_unverified: bool, default False
+            Files leftover from a failed transaction may appear to be part of an
+            in-progress operation (e.g. appending new data) and these files will not
+            be deleted unless they are at least 7 days old. If delete_unverified is True
+            then these files will be deleted regardless of their age.
+
+        Experimental API
+        ----------------
+
+        The optimization process is undergoing active development and may change.
+        Our goal with these changes is to improve the performance of optimization and
+        reduce the complexity.
+
+        That being said, it is essential today to run optimize if you want the best
+        performance. It should be stable and safe to use in production, but it is our
+        hope that the API may be simplified (or not even need to be called) in the
+        future.
+
+        How often an application should call optimize depends on the frequency of
+        data modifications. If data is frequently added, deleted, or updated then
+        optimize should be run frequently. A good rule of thumb is to run optimize if
+        you have added or modified 100,000 or more records or run more than 20 data
+        modification operations.
+        """
+        try:
+            asyncio.get_running_loop()
+            raise AssertionError(
+                "Synchronous method called in asynchronous context. 
" + "If you are writing an asynchronous application " + "then please use the asynchronous APIs" + ) + + except RuntimeError: + asyncio.run( + self._async_optimize( + cleanup_older_than=cleanup_older_than, + delete_unverified=delete_unverified, + ) + ) + self.checkout_latest() + + async def _async_optimize( + self, + cleanup_older_than: Optional[timedelta] = None, + delete_unverified: bool = False, + ): + conn = await lancedb_connect( + sanitize_uri(self._conn.uri), + ) + table = AsyncTable(await conn.open_table(self.name)) + return await table.optimize( + cleanup_older_than=cleanup_older_than, delete_unverified=delete_unverified + ) + def add_columns(self, transforms: Dict[str, str]): self._dataset_mut.add_columns(transforms) diff --git a/python/python/tests/test_embeddings.py b/python/python/tests/test_embeddings.py index e48fb209..a9f939ee 100644 --- a/python/python/tests/test_embeddings.py +++ b/python/python/tests/test_embeddings.py @@ -196,6 +196,7 @@ def test_add_optional_vector(tmp_path): "ollama", "cohere", "instructor", + "voyageai", ], ) def test_embedding_function_safe_model_dump(embedding_type): diff --git a/python/python/tests/test_embeddings_slow.py b/python/python/tests/test_embeddings_slow.py index 9e17ca66..58f9ff98 100644 --- a/python/python/tests/test_embeddings_slow.py +++ b/python/python/tests/test_embeddings_slow.py @@ -481,3 +481,22 @@ def test_ollama_embedding(tmp_path): json.dumps(dumped_model) except TypeError: pytest.fail("Failed to JSON serialize the dumped model") + + +@pytest.mark.slow +@pytest.mark.skipif( + os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set" +) +def test_voyageai_embedding_function(): + voyageai = get_registry().get("voyageai").create(name="voyage-3", max_retries=0) + + class TextModel(LanceModel): + text: str = voyageai.SourceField() + vector: Vector(voyageai.ndims()) = voyageai.VectorField() + + df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) + db = lancedb.connect("~/lancedb") + tbl = db.create_table("test", schema=TextModel, mode="overwrite") + + tbl.add(df) + assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims() diff --git a/python/python/tests/test_rerankers.py b/python/python/tests/test_rerankers.py index f2f7c6cc..4e1c6898 100644 --- a/python/python/tests/test_rerankers.py +++ b/python/python/tests/test_rerankers.py @@ -16,6 +16,7 @@ from lancedb.rerankers import ( OpenaiReranker, JinaReranker, AnswerdotaiRerankers, + VoyageAIReranker, ) from lancedb.table import LanceTable @@ -344,3 +345,14 @@ def test_jina_reranker(tmp_path, use_tantivy): table, schema = get_test_table(tmp_path, use_tantivy) reranker = JinaReranker() _run_test_reranker(reranker, table, "single player experience", None, schema) + + +@pytest.mark.skipif( + os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set" +) +@pytest.mark.parametrize("use_tantivy", [True, False]) +def test_voyageai_reranker(tmp_path, use_tantivy): + pytest.importorskip("voyageai") + reranker = VoyageAIReranker(model_name="rerank-2") + table, schema = get_test_table(tmp_path, use_tantivy) + _run_test_reranker(reranker, table, "single player experience", None, schema) diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index bdf22ddf..7ed367cb 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -1223,6 +1223,54 @@ async def test_time_travel(db_async: AsyncConnection): await table.restore() +def test_sync_optimize(db): + table = LanceTable.create( + db, + "test", + 
data=[ + {"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, + {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}, + ], + ) + + table.create_scalar_index("price", index_type="BTREE") + stats = table.to_lance().stats.index_stats("price_idx") + assert stats["num_indexed_rows"] == 2 + + table.add([{"vector": [2.0, 2.0], "item": "baz", "price": 30.0}]) + assert table.count_rows() == 3 + table.optimize() + stats = table.to_lance().stats.index_stats("price_idx") + assert stats["num_indexed_rows"] == 3 + + +@pytest.mark.asyncio +async def test_sync_optimize_in_async(db): + table = LanceTable.create( + db, + "test", + data=[ + {"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, + {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}, + ], + ) + + table.create_scalar_index("price", index_type="BTREE") + stats = table.to_lance().stats.index_stats("price_idx") + assert stats["num_indexed_rows"] == 2 + + table.add([{"vector": [2.0, 2.0], "item": "baz", "price": 30.0}]) + assert table.count_rows() == 3 + try: + table.optimize() + except Exception as e: + assert ( + "Synchronous method called in asynchronous context. " + "If you are writing an asynchronous application " + "then please use the asynchronous APIs" in str(e) + ) + + @pytest.mark.asyncio async def test_optimize(db_async: AsyncConnection): table = await db_async.create_table( diff --git a/rust/lancedb/src/index.rs b/rust/lancedb/src/index.rs index 6ec6249e..432e01c2 100644 --- a/rust/lancedb/src/index.rs +++ b/rust/lancedb/src/index.rs @@ -29,6 +29,7 @@ pub mod scalar; pub mod vector; /// Supported index types. +#[derive(Debug, Clone)] pub enum Index { Auto, /// A `BTree` index is an sorted index on scalar columns.