Mirror of https://github.com/lancedb/lancedb.git, synced 2025-12-24 05:49:57 +00:00

Compare commits: python-v0.… → python-v0.… (27 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 6e5927ce6d |  |
|  | 6c1f32ac11 |  |
|  | 4fdf084777 |  |
|  | 1fad24fcd8 |  |
|  | 6ef20b85ca |  |
|  | 35bacdd57e |  |
|  | a5ebe5a6c4 |  |
|  | bf03ad1b4a |  |
|  | 2a9e3e2084 |  |
|  | f298f15360 |  |
|  | 679b031b99 |  |
|  | f50b5d532b |  |
|  | fe655a15f0 |  |
|  | 9d0af794d0 |  |
|  | 048a2d10f8 |  |
|  | c78a9849b4 |  |
|  | c663085203 |  |
|  | 8b628854d5 |  |
|  | a8d8c17b2a |  |
|  | 3c487e5fc7 |  |
|  | d6219d687c |  |
|  | 239f725b32 |  |
|  | 5f261cf2d8 |  |
|  | 79eaa52184 |  |
|  | bd82e1f66d |  |
|  | ba34c3bee1 |  |
|  | d4d0873e2b |  |
```diff
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.14.0-beta.1"
+current_version = "0.14.1-beta.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
```
.github/workflows/docs.yml (vendored, 4 changes)

```diff
@@ -72,9 +72,9 @@ jobs:
       - name: Setup Pages
         uses: actions/configure-pages@v2
       - name: Upload artifact
-        uses: actions/upload-pages-artifact@v1
+        uses: actions/upload-pages-artifact@v3
        with:
           path: "docs/site"
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v1
+        uses: actions/deploy-pages@v4
```
.github/workflows/npm-publish.yml (vendored, 214 changes)

```diff
@@ -143,7 +143,7 @@ jobs:
   node-linux-musl:
     name: vectordb (${{ matrix.config.arch }}-unknown-linux-musl)
-    runs-on: ${{ matrix.config.runner }}
+    runs-on: ubuntu-latest
+    container: alpine:edge
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
@@ -152,10 +152,7 @@ jobs:
       matrix:
         config:
           - arch: x86_64
-            runner: ubuntu-latest
           - arch: aarch64
-            # For successful fat LTO builds, we need a large runner to avoid OOM errors.
-            runner: buildjet-16vcpu-ubuntu-2204-arm
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -249,7 +246,7 @@ jobs:
   nodejs-linux-musl:
     name: lancedb (${{ matrix.config.arch }}-unknown-linux-musl
-    runs-on: ${{ matrix.config.runner }}
+    runs-on: ubuntu-latest
+    container: alpine:edge
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
@@ -258,10 +255,7 @@ jobs:
       matrix:
         config:
           - arch: x86_64
-            runner: ubuntu-latest
           - arch: aarch64
-            # For successful fat LTO builds, we need a large runner to avoid OOM errors.
-            runner: buildjet-16vcpu-ubuntu-2204-arm
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -342,6 +336,7 @@ jobs:
   node-windows-arm64:
     name: vectordb ${{ matrix.config.arch }}-pc-windows-msvc
     if: startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    container: alpine:edge
     strategy:
@@ -384,110 +379,6 @@ jobs:
           path: |
             node/dist/lancedb-vectordb-win32*.tgz

-  # TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
-  # node-windows-arm64:
-  #   name: vectordb win32-arm64-msvc
-  #   runs-on: windows-4x-arm
-  #   if: startsWith(github.ref, 'refs/tags/v')
-  #   steps:
-  #   - uses: actions/checkout@v4
-  #   - name: Install Git
-  #     run: |
-  #       Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
-  #       Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
-  #     shell: powershell
-  #   - name: Add Git to PATH
-  #     run: |
-  #       Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
-  #       $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-  #     shell: powershell
-  #   - name: Configure Git symlinks
-  #     run: git config --global core.symlinks true
-  #   - uses: actions/checkout@v4
-  #   - uses: actions/setup-python@v5
-  #     with:
-  #       python-version: "3.13"
-  #   - name: Install Visual Studio Build Tools
-  #     run: |
-  #       Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
-  #       Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
-  #         "--installPath", "C:\BuildTools", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
-  #         "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
-  #     shell: powershell
-  #   - name: Add Visual Studio Build Tools to PATH
-  #     run: |
-  #       $vsPath = "C:\BuildTools\VC\Tools\MSVC"
-  #       $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
-  #       Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
-  #       Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
-  #       Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
-  #       Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
-  #       Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
-  #
-  #       # Add MSVC runtime libraries to LIB
-  #       $env:LIB = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\lib\arm64;" +
-  #         "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;" +
-  #         "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
-  #       Add-Content $env:GITHUB_ENV "LIB=$env:LIB"
-  #
-  #       # Add INCLUDE paths
-  #       $env:INCLUDE = "C:\BuildTools\VC\Tools\MSVC\$latestVersion\include;" +
-  #         "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\ucrt;" +
-  #         "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\um;" +
-  #         "C:\Program Files (x86)\Windows Kits\10\Include\10.0.22621.0\shared"
-  #       Add-Content $env:GITHUB_ENV "INCLUDE=$env:INCLUDE"
-  #     shell: powershell
-  #   - name: Install Rust
-  #     run: |
-  #       Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
-  #       .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
-  #     shell: powershell
-  #   - name: Add Rust to PATH
-  #     run: |
-  #       Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
-  #     shell: powershell
-
-  #   - uses: Swatinem/rust-cache@v2
-  #     with:
-  #       workspaces: rust
-  #   - name: Install 7-Zip ARM
-  #     run: |
-  #       New-Item -Path 'C:\7zip' -ItemType Directory
-  #       Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
-  #       Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
-  #     shell: powershell
-  #   - name: Add 7-Zip to PATH
-  #     run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
-  #     shell: powershell
-  #   - name: Install Protoc v21.12
-  #     working-directory: C:\
-  #     run: |
-  #       if (Test-Path 'C:\protoc') {
-  #         Write-Host "Protoc directory exists, skipping installation"
-  #         return
-  #       }
-  #       New-Item -Path 'C:\protoc' -ItemType Directory
-  #       Set-Location C:\protoc
-  #       Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
-  #       & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
-  #     shell: powershell
-  #   - name: Add Protoc to PATH
-  #     run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
-  #     shell: powershell
-  #   - name: Build Windows native node modules
-  #     run: .\ci\build_windows_artifacts.ps1 aarch64-pc-windows-msvc
-  #   - name: Upload Windows ARM64 Artifacts
-  #     uses: actions/upload-artifact@v4
-  #     with:
-  #       name: node-native-windows-arm64
-  #       path: |
-  #         node/dist/*.node
-
   nodejs-windows:
     name: lancedb ${{ matrix.target }}
     runs-on: windows-2022
@@ -524,6 +415,8 @@ jobs:
   nodejs-windows-arm64:
     name: lancedb ${{ matrix.config.arch }}-pc-windows-msvc
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    container: alpine:edge
     strategy:
@@ -568,100 +461,6 @@ jobs:
           path: |
             nodejs/dist/*.node

-  # TODO: re-enable once working https://github.com/lancedb/lancedb/pull/1831
-  # nodejs-windows-arm64:
-  #   name: lancedb win32-arm64-msvc
-  #   runs-on: windows-4x-arm
-  #   if: startsWith(github.ref, 'refs/tags/v')
-  #   steps:
-  #   - uses: actions/checkout@v4
-  #   - name: Install Git
-  #     run: |
-  #       Invoke-WebRequest -Uri "https://github.com/git-for-windows/git/releases/download/v2.44.0.windows.1/Git-2.44.0-64-bit.exe" -OutFile "git-installer.exe"
-  #       Start-Process -FilePath "git-installer.exe" -ArgumentList "/VERYSILENT", "/NORESTART" -Wait
-  #     shell: powershell
-  #   - name: Add Git to PATH
-  #     run: |
-  #       Add-Content $env:GITHUB_PATH "C:\Program Files\Git\bin"
-  #       $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-  #     shell: powershell
-  #   - name: Configure Git symlinks
-  #     run: git config --global core.symlinks true
-  #   - uses: actions/checkout@v4
-  #   - uses: actions/setup-python@v5
-  #     with:
-  #       python-version: "3.13"
-  #   - name: Install Visual Studio Build Tools
-  #     run: |
-  #       Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_buildtools.exe" -OutFile "vs_buildtools.exe"
-  #       Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--quiet", "--wait", "--norestart", "--nocache", `
-  #         "--installPath", "C:\BuildTools", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.Tools.ARM64", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", `
-  #         "--add", "Microsoft.VisualStudio.Component.Windows11SDK.22621", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.ATL", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.ATLMFC", `
-  #         "--add", "Microsoft.VisualStudio.Component.VC.Llvm.Clang" -Wait
-  #     shell: powershell
-  #   - name: Add Visual Studio Build Tools to PATH
-  #     run: |
-  #       $vsPath = "C:\BuildTools\VC\Tools\MSVC"
-  #       $latestVersion = (Get-ChildItem $vsPath | Sort-Object {[version]$_.Name} -Descending)[0].Name
-  #       Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\arm64"
-  #       Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\MSVC\$latestVersion\bin\Hostx64\x64"
-  #       Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\arm64"
-  #       Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64"
-  #       Add-Content $env:GITHUB_PATH "C:\BuildTools\VC\Tools\Llvm\x64\bin"
-  #
-  #       $env:LIB = ""
-  #       Add-Content $env:GITHUB_ENV "LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\arm64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\arm64"
-  #     shell: powershell
-  #   - name: Install Rust
-  #     run: |
-  #       Invoke-WebRequest https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
-  #       .\rustup-init.exe -y --default-host aarch64-pc-windows-msvc
-  #     shell: powershell
-  #   - name: Add Rust to PATH
-  #     run: |
-  #       Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
-  #     shell: powershell
-
-  #   - uses: Swatinem/rust-cache@v2
-  #     with:
-  #       workspaces: rust
-  #   - name: Install 7-Zip ARM
-  #     run: |
-  #       New-Item -Path 'C:\7zip' -ItemType Directory
-  #       Invoke-WebRequest https://7-zip.org/a/7z2408-arm64.exe -OutFile C:\7zip\7z-installer.exe
-  #       Start-Process -FilePath C:\7zip\7z-installer.exe -ArgumentList '/S' -Wait
-  #     shell: powershell
-  #   - name: Add 7-Zip to PATH
-  #     run: Add-Content $env:GITHUB_PATH "C:\Program Files\7-Zip"
-  #     shell: powershell
-  #   - name: Install Protoc v21.12
-  #     working-directory: C:\
-  #     run: |
-  #       if (Test-Path 'C:\protoc') {
-  #         Write-Host "Protoc directory exists, skipping installation"
-  #         return
-  #       }
-  #       New-Item -Path 'C:\protoc' -ItemType Directory
-  #       Set-Location C:\protoc
-  #       Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
-  #       & 'C:\Program Files\7-Zip\7z.exe' x protoc.zip
-  #     shell: powershell
-  #   - name: Add Protoc to PATH
-  #     run: Add-Content $env:GITHUB_PATH "C:\protoc\bin"
-  #     shell: powershell
-  #   - name: Build Windows native node modules
-  #     run: .\ci\build_windows_artifacts_nodejs.ps1 aarch64-pc-windows-msvc
-  #   - name: Upload Windows ARM64 Artifacts
-  #     uses: actions/upload-artifact@v4
-  #     with:
-  #       name: nodejs-native-windows-arm64
-  #       path: |
-  #         nodejs/dist/*.node
-
   release:
     name: vectordb NPM Publish
     needs: [node, node-macos, node-linux-gnu, node-linux-musl, node-windows, node-windows-arm64]
@@ -762,6 +561,7 @@ jobs:
       SLACK_WEBHOOK_URL: ${{ secrets.ACTION_MONITORING_SLACK }}

   update-package-lock:
+    if: startsWith(github.ref, 'refs/tags/v')
     needs: [release]
     runs-on: ubuntu-latest
     permissions:
@@ -779,6 +579,7 @@ jobs:
       github_token: ${{ secrets.GITHUB_TOKEN }}

   update-package-lock-nodejs:
+    if: startsWith(github.ref, 'refs/tags/v')
     needs: [release-nodejs]
     runs-on: ubuntu-latest
     permissions:
@@ -796,6 +597,7 @@ jobs:
       github_token: ${{ secrets.GITHUB_TOKEN }}

   gh-release:
+    if: startsWith(github.ref, 'refs/tags/v')
     runs-on: ubuntu-latest
     permissions:
       contents: write
```
Cargo.toml (36 changes)

```diff
@@ -23,27 +23,27 @@ rust-version = "1.80.0" # TO
 [workspace.dependencies]
 lance = { "version" = "=0.20.0", "features" = [
     "dynamodb",
-], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
-lance-io = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
-lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
-lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
-lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
-lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
-lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
-lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
+] }
+lance-io = "0.20.0"
+lance-index = "0.20.0"
+lance-linalg = "0.20.0"
+lance-table = "0.20.0"
+lance-testing = "0.20.0"
+lance-datafusion = "0.20.0"
+lance-encoding = "0.20.0"
 # Note that this one does not include pyarrow
-arrow = { version = "52.2", optional = false }
-arrow-array = "52.2"
-arrow-data = "52.2"
-arrow-ipc = "52.2"
-arrow-ord = "52.2"
-arrow-schema = "52.2"
-arrow-arith = "52.2"
-arrow-cast = "52.2"
+arrow = { version = "53.2", optional = false }
+arrow-array = "53.2"
+arrow-data = "53.2"
+arrow-ipc = "53.2"
+arrow-ord = "53.2"
+arrow-schema = "53.2"
+arrow-arith = "53.2"
+arrow-cast = "53.2"
 async-trait = "0"
 chrono = "0.4.35"
-datafusion-common = "41.0"
-datafusion-physical-plan = "41.0"
+datafusion-common = "42.0"
+datafusion-physical-plan = "42.0"
 env_logger = "0.10"
 half = { "version" = "=2.4.1", default-features = false, features = [
     "num-traits",
```
docs: OpenAI embeddings function

````diff
@@ -6,6 +6,7 @@ LanceDB registers the OpenAI embeddings function in the registry by default, as
 |---|---|---|---|
 | `name` | `str` | `"text-embedding-ada-002"` | The name of the model. |
 | `dim` | `int` | Model default | For OpenAI's newer text-embedding-3 models, a dimensionality smaller than the default 1536 can be specified. |
+| `use_azure` | `bool` | `False` | Set to true to use the Azure OpenAI SDK. |


 ```python
````
docs: storage guide

````diff
@@ -27,10 +27,13 @@ LanceDB OSS supports object stores such as AWS S3 (and compatible stores), Azure

+    Azure Blob Storage:
+
+    <!-- skip-test -->
+    ```python
+    import lancedb
+    db = lancedb.connect("az://bucket/path")
+    ```
+    Note that for Azure, storage credentials must be configured. See [below](#azure-blob-storage) for more details.

 === "TypeScript"
@@ -87,11 +90,6 @@ In most cases, when running in the respective cloud and permissions are set up c
 export TIMEOUT=60s
 ```

-!!! note "`storage_options` availability"
-
-    The `storage_options` parameter is only available in Python *async* API and JavaScript API.
-    It is not yet supported in the Python synchronous API.
-
 If you only want this to apply to one particular connection, you can pass the `storage_options` argument when opening the connection:

 === "Python"
````
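For context, Azure connections generally need credentials supplied via `storage_options`. A minimal sketch using the async API — the account name and the environment variable are hypothetical placeholders, not values from this changeset:

```python
import os

import lancedb


async def connect_to_azure():
    # Pass Azure credentials through storage_options; "myaccount" and the
    # AZURE_STORAGE_ACCOUNT_KEY variable are illustrative names only.
    db = await lancedb.connect_async(
        "az://bucket/path",
        storage_options={
            "azure_storage_account_name": "myaccount",
            "azure_storage_account_key": os.environ["AZURE_STORAGE_ACCOUNT_KEY"],
        },
    )
    return db
```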
docs: guides/tables — adds a "Changing schemas" section (@@ -790,6 +790,101 @@; everything below the first paragraph is new)

Use the `drop_table()` method on the database to remove a table.

This permanently removes the table and is not recoverable, unlike deleting rows.
If the table does not exist an exception is raised.

## Changing schemas

While tables must have a schema specified when they are created, you can
change the schema over time. There are three methods to alter the schema of
a table:

* `add_columns`: Add new columns to the table
* `alter_columns`: Alter the name, nullability, or data type of a column
* `drop_columns`: Drop columns from the table

### Adding new columns

You can add new columns to the table with the `add_columns` method. New columns
are filled with values based on a SQL expression. For example, you can add a new
column `y` to the table and fill it with the value of `x + 1`.

=== "Python"

    ```python
    table.add_columns({"double_price": "price * 2"})
    ```
    **API Reference:** [lancedb.table.Table.add_columns][]

=== "Typescript"

    ```typescript
    --8<-- "nodejs/examples/basic.test.ts:add_columns"
    ```
    **API Reference:** [lancedb.Table.addColumns](../js/classes/Table.md/#addcolumns)

If you want to fill the new column with nulls, you can use `cast(NULL as <data_type>)`
as the SQL expression; this fills the column with nulls while controlling its data
type. Available data types are based on the
[DataFusion data types](https://datafusion.apache.org/user-guide/sql/data_types.html).
You can use any of the SQL types, such as `BIGINT`:

```sql
cast(NULL as BIGINT)
```

Using Arrow data types and the `arrow_typeof` function is not yet supported.

<!-- TODO: we could provide a better formula for filling with nulls:
https://github.com/lancedb/lance/issues/3175
-->

### Altering existing columns

You can alter the name, nullability, or data type of a column with the
`alter_columns` method.

Changing the name or nullability of a column just updates the metadata, so it is
a fast operation. Changing the data type of a column requires rewriting the
column, which can be a heavy operation.

=== "Python"

    ```python
    import pyarrow as pa
    table.alter_columns({"path": "double_price", "rename": "dbl_price",
                         "data_type": pa.float32(), "nullable": False})
    ```
    **API Reference:** [lancedb.table.Table.alter_columns][]

=== "Typescript"

    ```typescript
    --8<-- "nodejs/examples/basic.test.ts:alter_columns"
    ```
    **API Reference:** [lancedb.Table.alterColumns](../js/classes/Table.md/#altercolumns)

### Dropping columns

You can drop columns from the table with the `drop_columns` method. This
removes the column from the schema.

<!-- TODO: Provide guidance on how to reduce disk usage once optimize helps here
waiting on: https://github.com/lancedb/lance/issues/3177
-->

=== "Python"

    ```python
    table.drop_columns(["dbl_price"])
    ```
    **API Reference:** [lancedb.table.Table.drop_columns][]

=== "Typescript"

    ```typescript
    --8<-- "nodejs/examples/basic.test.ts:drop_columns"
    ```
    **API Reference:** [lancedb.Table.dropColumns](../js/classes/Table.md/#dropcolumns)
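A small end-to-end sketch of the NULL-fill pattern described above (illustrative only; `my_table` and `pending_count` are hypothetical names):

```python
import lancedb

db = lancedb.connect("./.lancedb")
table = db.open_table("my_table")  # assumes this table already exists

# Add a nullable BIGINT column filled with NULLs, then drop it again.
table.add_columns({"pending_count": "cast(NULL as BIGINT)"})
table.drop_columns(["pending_count"])
```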
## Handling bad vectors

In LanceDB Python, you can use the `on_bad_vectors` parameter to choose how
Java `pom.xml` files (version bump)

```diff
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.14.0-beta.1</version>
+    <version>0.14.1-beta.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
```

```diff
@@ -6,7 +6,7 @@
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.14.0-beta.1</version>
+  <version>0.14.1-beta.0</version>
   <packaging>pom</packaging>

   <name>LanceDB Parent</name>
```
node/package-lock.json (generated, 20 changes)

```diff
@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.14.0-beta.1",
+  "version": "0.14.1-beta.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.14.0-beta.1",
+      "version": "0.14.1-beta.0",
       "cpu": [
         "x64",
         "arm64"
@@ -52,14 +52,14 @@
       "uuid": "^9.0.0"
     },
     "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.14.0-beta.1",
-      "@lancedb/vectordb-darwin-x64": "0.14.0-beta.1",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.14.0-beta.1",
-      "@lancedb/vectordb-linux-arm64-musl": "0.14.0-beta.1",
-      "@lancedb/vectordb-linux-x64-gnu": "0.14.0-beta.1",
-      "@lancedb/vectordb-linux-x64-musl": "0.14.0-beta.1",
-      "@lancedb/vectordb-win32-arm64-msvc": "0.14.0-beta.1",
-      "@lancedb/vectordb-win32-x64-msvc": "0.14.0-beta.1"
+      "@lancedb/vectordb-darwin-arm64": "0.14.1-beta.0",
+      "@lancedb/vectordb-darwin-x64": "0.14.1-beta.0",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.14.1-beta.0",
+      "@lancedb/vectordb-linux-arm64-musl": "0.14.1-beta.0",
+      "@lancedb/vectordb-linux-x64-gnu": "0.14.1-beta.0",
+      "@lancedb/vectordb-linux-x64-musl": "0.14.1-beta.0",
+      "@lancedb/vectordb-win32-arm64-msvc": "0.14.1-beta.0",
+      "@lancedb/vectordb-win32-x64-msvc": "0.14.1-beta.0"
     },
     "peerDependencies": {
       "@apache-arrow/ts": "^14.0.2",
```
vectordb `package.json`

```diff
@@ -1,7 +1,8 @@
 {
   "name": "vectordb",
-  "version": "0.14.0-beta.1",
+  "version": "0.14.1-beta.0",
   "description": " Serverless, low-latency vector database for AI applications",
+  "private": false,
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
   "scripts": {
@@ -91,13 +92,13 @@
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.14.0-beta.1",
-    "@lancedb/vectordb-darwin-arm64": "0.14.0-beta.1",
-    "@lancedb/vectordb-linux-x64-gnu": "0.14.0-beta.1",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.14.0-beta.1",
-    "@lancedb/vectordb-linux-x64-musl": "0.14.0-beta.1",
-    "@lancedb/vectordb-linux-arm64-musl": "0.14.0-beta.1",
-    "@lancedb/vectordb-win32-x64-msvc": "0.14.0-beta.1",
-    "@lancedb/vectordb-win32-arm64-msvc": "0.14.0-beta.1"
+    "@lancedb/vectordb-darwin-x64": "0.14.1-beta.0",
+    "@lancedb/vectordb-darwin-arm64": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-x64-gnu": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-x64-musl": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-musl": "0.14.1-beta.0",
+    "@lancedb/vectordb-win32-x64-msvc": "0.14.1-beta.0",
+    "@lancedb/vectordb-win32-arm64-msvc": "0.14.1-beta.0"
   }
 }
```
lancedb-nodejs `Cargo.toml`

```diff
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.14.0-beta.1"
+version = "0.14.1-beta.0"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
```
nodejs schema-evolution test (TypeScript)

```diff
@@ -825,6 +825,18 @@ describe("schema evolution", function () {
       new Field("price", new Float64(), true),
     ]);
     expect(await table.schema()).toEqual(expectedSchema);
+
+    await table.alterColumns([{ path: "new_id", dataType: "int32" }]);
+    const expectedSchema2 = new Schema([
+      new Field("new_id", new Int32(), true),
+      new Field(
+        "vector",
+        new FixedSizeList(2, new Field("item", new Float32(), true)),
+        true,
+      ),
+      new Field("price", new Float64(), true),
+    ]);
+    expect(await table.schema()).toEqual(expectedSchema2);
   });

   it("can drop a column from the schema", async function () {
```
nodejs/examples/basic.test.ts

```diff
@@ -116,6 +116,26 @@ test("basic table examples", async () => {
     await tbl.add(data);
     // --8<-- [end:add_data]
   }

+  {
+    // --8<-- [start:add_columns]
+    await tbl.addColumns([{ name: "double_price", valueSql: "price * 2" }]);
+    // --8<-- [end:add_columns]
+    // --8<-- [start:alter_columns]
+    await tbl.alterColumns([
+      {
+        path: "double_price",
+        rename: "dbl_price",
+        dataType: "float",
+        nullable: true,
+      },
+    ]);
+    // --8<-- [end:alter_columns]
+    // --8<-- [start:drop_columns]
+    await tbl.dropColumns(["dbl_price"]);
+    // --8<-- [end:drop_columns]
+  }
+
   {
     // --8<-- [start:vector_search]
     const res = await tbl.search([100, 100]).limit(2).toArray();
```
npm platform packages (8 files)

The same version bump is applied to each of the eight `@lancedb/lancedb-*` platform packages — darwin-arm64, darwin-x64, linux-arm64-gnu, linux-arm64-musl, linux-x64-gnu, linux-x64-musl, win32-arm64-msvc, and win32-x64-msvc. One representative diff:

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.14.0-beta.1",
+  "version": "0.14.1-beta.0",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",
```
nodejs/package-lock.json (generated, 4 changes)

```diff
@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.13.0",
+  "version": "0.14.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.13.0",
+      "version": "0.14.0",
       "cpu": [
         "x64",
         "arm64"
```
nodejs `package.json`

```diff
@@ -10,7 +10,8 @@
     "vector database",
     "ann"
   ],
-  "version": "0.14.0-beta.1",
+  "private": false,
+  "version": "0.14.1-beta.0",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
@@ -30,7 +31,8 @@
       "aarch64-unknown-linux-gnu",
       "x86_64-unknown-linux-musl",
       "aarch64-unknown-linux-musl",
-      "x86_64-pc-windows-msvc"
+      "x86_64-pc-windows-msvc",
+      "aarch64-pc-windows-msvc"
     ]
   }
 },
```
nodejs native binding (Rust): `alter_columns` gains data-type support

```diff
@@ -178,16 +178,20 @@ impl Table {
     #[napi(catch_unwind)]
     pub async fn alter_columns(&self, alterations: Vec<ColumnAlteration>) -> napi::Result<()> {
         for alteration in &alterations {
-            if alteration.rename.is_none() && alteration.nullable.is_none() {
+            if alteration.rename.is_none()
+                && alteration.nullable.is_none()
+                && alteration.data_type.is_none()
+            {
                 return Err(napi::Error::from_reason(
-                    "Alteration must have a 'rename' or 'nullable' field.",
+                    "Alteration must have a 'rename', 'dataType', or 'nullable' field.",
                 ));
             }
         }
         let alterations = alterations
             .into_iter()
-            .map(LanceColumnAlteration::from)
-            .collect::<Vec<_>>();
+            .map(LanceColumnAlteration::try_from)
+            .collect::<std::result::Result<Vec<_>, String>>()
+            .map_err(napi::Error::from_reason)?;

         self.inner_ref()?
             .alter_columns(&alterations)
@@ -433,24 +437,43 @@ pub struct ColumnAlteration {
     /// The new name of the column. If not provided then the name will not be changed.
     /// This must be distinct from the names of all other columns in the table.
     pub rename: Option<String>,
+    /// A new data type for the column. If not provided then the data type will not be changed.
+    /// Changing data types is limited to casting to the same general type. For example, these
+    /// changes are valid:
+    /// * `int32` -> `int64` (integers)
+    /// * `double` -> `float` (floats)
+    /// * `string` -> `large_string` (strings)
+    /// But these changes are not:
+    /// * `int32` -> `double` (mix integers and floats)
+    /// * `string` -> `int32` (mix strings and integers)
+    pub data_type: Option<String>,
     /// Set the new nullability. Note that a nullable column cannot be made non-nullable.
     pub nullable: Option<bool>,
 }

-impl From<ColumnAlteration> for LanceColumnAlteration {
-    fn from(js: ColumnAlteration) -> Self {
+impl TryFrom<ColumnAlteration> for LanceColumnAlteration {
+    type Error = String;
+    fn try_from(js: ColumnAlteration) -> std::result::Result<Self, Self::Error> {
         let ColumnAlteration {
             path,
             rename,
             nullable,
+            data_type,
         } = js;
-        Self {
+        let data_type = if let Some(data_type) = data_type {
+            Some(
+                lancedb::utils::string_to_datatype(&data_type)
+                    .ok_or_else(|| format!("Invalid data type: {}", data_type))?,
+            )
+        } else {
+            None
+        };
+        Ok(Self {
             path,
             rename,
             nullable,
-            // TODO: wire up this field
-            data_type: None,
-        }
+            data_type,
+        })
     }
 }
```
Python package version config

```diff
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.17.0-beta.3"
+current_version = "0.17.1-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
```
lancedb-python `Cargo.toml`

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.17.0-beta.3"
+version = "0.17.1-beta.1"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -14,23 +14,18 @@ name = "_lancedb"
 crate-type = ["cdylib"]

 [dependencies]
-arrow = { version = "52.1", features = ["pyarrow"] }
+arrow = { version = "53.2", features = ["pyarrow"] }
 lancedb = { path = "../rust/lancedb", default-features = false }
 env_logger.workspace = true
-pyo3 = { version = "0.21", features = [
+pyo3 = { version = "0.22.2", features = [
     "extension-module",
     "abi3-py39",
     "gil-refs"
 ] }
-# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
-# pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
-pyo3-asyncio-0-21 = { version = "0.21.0", features = [
-    "attributes",
-    "tokio-runtime"
-] }
+pyo3-async-runtimes = { version = "0.22", features = ["attributes", "tokio-runtime"] }
 pin-project = "1.1.5"
 futures.workspace = true
-tokio = { version = "1.36.0", features = ["sync"] }
+tokio = { version = "1.40", features = ["sync"] }

 [build-dependencies]
 pyo3-build-config = { version = "0.20.3", features = [
```
Python `pyproject.toml` dependencies

```diff
@@ -3,7 +3,7 @@ name = "lancedb"
 # version in Cargo.toml
 dependencies = [
     "deprecation",
-    "pylance==0.20.0b3",
+    "pylance==0.20.0",
     "tqdm>=4.27.0",
     "pydantic>=1.10",
     "packaging",
```
Python sync `connect()` gains `storage_options`

```diff
@@ -36,6 +36,7 @@ def connect(
     read_consistency_interval: Optional[timedelta] = None,
     request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
     client_config: Union[ClientConfig, Dict[str, Any], None] = None,
+    storage_options: Optional[Dict[str, str]] = None,
     **kwargs: Any,
 ) -> DBConnection:
     """Connect to a LanceDB database.
@@ -67,6 +68,9 @@ def connect(
         Configuration options for the LanceDB Cloud HTTP client. If a dict, then
         the keys are the attributes of the ClientConfig class. If None, then the
         default configuration is used.
+    storage_options: dict, optional
+        Additional options for the storage backend. See available options at
+        https://lancedb.github.io/lancedb/guides/storage/

     Examples
     --------
@@ -106,12 +110,17 @@ def connect(
             # TODO: remove this (deprecation warning downstream)
             request_thread_pool=request_thread_pool,
             client_config=client_config,
+            storage_options=storage_options,
             **kwargs,
         )

     if kwargs:
         raise ValueError(f"Unknown keyword arguments: {kwargs}")
-    return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)
+    return LanceDBConnection(
+        uri,
+        read_consistency_interval=read_consistency_interval,
+        storage_options=storage_options,
+    )


 async def connect_async(
```
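A minimal sketch of the newly supported call, assuming an S3 bucket; the bucket name, region, and timeout values are illustrative:

```python
import lancedb

# storage_options now flows through the synchronous connect() as well.
db = lancedb.connect(
    "s3://my-bucket/lancedb",
    storage_options={
        "region": "us-east-1",
        "timeout": "60s",  # same option the docs set via the TIMEOUT env var
    },
)
```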
Python native stubs (`_lancedb.pyi`): new `FTSQuery` and `HybridQuery`

```diff
@@ -79,9 +79,21 @@ class Query:
     def limit(self, limit: int): ...
     def offset(self, offset: int): ...
     def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
-    def nearest_to_text(self, query: dict) -> Query: ...
+    def nearest_to_text(self, query: dict) -> FTSQuery: ...
     async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...

+class FTSQuery:
+    def where(self, filter: str): ...
+    def select(self, columns: List[str]): ...
+    def limit(self, limit: int): ...
+    def offset(self, offset: int): ...
+    def fast_search(self): ...
+    def with_row_id(self): ...
+    def postfilter(self): ...
+    def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
+    async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
+    async def explain_plan(self) -> str: ...
+
 class VectorQuery:
     async def execute(self) -> RecordBatchStream: ...
     def where(self, filter: str): ...
@@ -95,6 +107,24 @@ class VectorQuery:
     def refine_factor(self, refine_factor: int): ...
     def nprobes(self, nprobes: int): ...
     def bypass_vector_index(self): ...
+    def nearest_to_text(self, query: dict) -> HybridQuery: ...
+
+class HybridQuery:
+    def where(self, filter: str): ...
+    def select(self, columns: List[str]): ...
+    def limit(self, limit: int): ...
+    def offset(self, offset: int): ...
+    def fast_search(self): ...
+    def with_row_id(self): ...
+    def postfilter(self): ...
+    def distance_type(self, distance_type: str): ...
+    def refine_factor(self, refine_factor: int): ...
+    def nprobes(self, nprobes: int): ...
+    def bypass_vector_index(self): ...
+    def to_vector_query(self) -> VectorQuery: ...
+    def to_fts_query(self) -> FTSQuery: ...
+    def get_limit(self) -> int: ...
+    def get_with_row_id(self) -> bool: ...

 class CompactionStats:
     fragments_removed: int
```
python lancedb/db.py: sync connection now built on the async implementation

```diff
@@ -13,34 +13,29 @@

 from __future__ import annotations

-import asyncio
-import os
 from abc import abstractmethod
 from pathlib import Path
 from typing import TYPE_CHECKING, Dict, Iterable, List, Literal, Optional, Union

-import pyarrow as pa
 from overrides import EnforceOverrides, override
-from pyarrow import fs

-from lancedb.common import data_to_reader, validate_schema
+from lancedb.common import data_to_reader, sanitize_uri, validate_schema
+from lancedb.background_loop import BackgroundEventLoop

+from ._lancedb import connect as lancedb_connect
 from .table import (
     AsyncTable,
     LanceTable,
     Table,
     _table_path,
     sanitize_create_table,
 )
 from .util import (
-    fs_from_uri,
-    get_uri_location,
     get_uri_scheme,
     validate_table_name,
 )

 if TYPE_CHECKING:
+    import pyarrow as pa
     from .pydantic import LanceModel
     from datetime import timedelta
@@ -48,6 +43,8 @@ if TYPE_CHECKING:
     from .common import DATA, URI
     from .embeddings import EmbeddingFunctionConfig

+LOOP = BackgroundEventLoop()
+

 class DBConnection(EnforceOverrides):
     """An active LanceDB connection interface."""
@@ -180,6 +177,7 @@ class DBConnection(EnforceOverrides):
         control over how data is saved, either provide the PyArrow schema to
         convert to or else provide a [PyArrow Table](pyarrow.Table) directly.

+        >>> import pyarrow as pa
         >>> custom_schema = pa.schema([
         ...    pa.field("vector", pa.list_(pa.float32(), 2)),
         ...    pa.field("lat", pa.float32()),
@@ -327,7 +325,11 @@ class LanceDBConnection(DBConnection):
     """

     def __init__(
-        self, uri: URI, *, read_consistency_interval: Optional[timedelta] = None
+        self,
+        uri: URI,
+        *,
+        read_consistency_interval: Optional[timedelta] = None,
+        storage_options: Optional[Dict[str, str]] = None,
     ):
         if not isinstance(uri, Path):
             scheme = get_uri_scheme(uri)
@@ -338,9 +340,27 @@ class LanceDBConnection(DBConnection):
             uri = uri.expanduser().absolute()
             Path(uri).mkdir(parents=True, exist_ok=True)
         self._uri = str(uri)
-
         self._entered = False
         self.read_consistency_interval = read_consistency_interval
+        self.storage_options = storage_options
+
+        if read_consistency_interval is not None:
+            read_consistency_interval_secs = read_consistency_interval.total_seconds()
+        else:
+            read_consistency_interval_secs = None
+
+        async def do_connect():
+            return await lancedb_connect(
+                sanitize_uri(uri),
+                None,
+                None,
+                None,
+                read_consistency_interval_secs,
+                None,
+                storage_options,
+            )
+
+        self._conn = AsyncConnection(LOOP.run(do_connect()))

     def __repr__(self) -> str:
         val = f"{self.__class__.__name__}({self._uri}"
@@ -364,32 +384,7 @@ class LanceDBConnection(DBConnection):
         Iterator of str.
             A list of table names.
         """
-        try:
-            asyncio.get_running_loop()
-            # User application is async. Soon we will just tell them to use the
-            # async version. Until then fallback to the old sync implementation.
-            try:
-                filesystem = fs_from_uri(self.uri)[0]
-            except pa.ArrowInvalid:
-                raise NotImplementedError("Unsupported scheme: " + self.uri)
-
-            try:
-                loc = get_uri_location(self.uri)
-                paths = filesystem.get_file_info(fs.FileSelector(loc))
-            except FileNotFoundError:
-                # It is ok if the file does not exist since it will be created
-                paths = []
-            tables = [
-                os.path.splitext(file_info.base_name)[0]
-                for file_info in paths
-                if file_info.extension == "lance"
-            ]
-            tables.sort()
-            return tables
-        except RuntimeError:
-            # User application is sync. It is safe to use the async implementation
-            # under the hood.
-            return asyncio.run(self._async_get_table_names(page_token, limit))
+        return LOOP.run(self._conn.table_names(start_after=page_token, limit=limit))

     def __len__(self) -> int:
         return len(self.table_names())
@@ -461,19 +456,16 @@ class LanceDBConnection(DBConnection):
             If True, ignore if the table does not exist.
         """
         try:
-            table_uri = _table_path(self.uri, name)
-            filesystem, path = fs_from_uri(table_uri)
-            filesystem.delete_dir(path)
-        except FileNotFoundError:
+            LOOP.run(self._conn.drop_table(name))
+        except ValueError as e:
             if not ignore_missing:
-                raise
+                raise e
+            if f"Table '{name}' was not found" not in str(e):
+                raise e

     @override
     def drop_database(self):
-        dummy_table_uri = _table_path(self.uri, "dummy")
-        uri = dummy_table_uri.removesuffix("dummy.lance")
-        filesystem, path = fs_from_uri(uri)
-        filesystem.delete_dir(path)
+        LOOP.run(self._conn.drop_database())


 class AsyncConnection(object):
@@ -689,6 +681,7 @@ class AsyncConnection(object):
         control over how data is saved, either provide the PyArrow schema to
         convert to or else provide a [PyArrow Table](pyarrow.Table) directly.

+        >>> import pyarrow as pa
         >>> custom_schema = pa.schema([
         ...    pa.field("vector", pa.list_(pa.float32(), 2)),
         ...    pa.field("lat", pa.float32()),
```
Python OpenAI embeddings: Azure support

```diff
@@ -48,6 +48,9 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
     organization: Optional[str] = None
     api_key: Optional[str] = None

+    # Set true to use Azure OpenAI API
+    use_azure: bool = False
+
     def ndims(self):
         return self._ndims

@@ -123,4 +126,8 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
             kwargs["organization"] = self.organization
         if self.api_key:
             kwargs["api_key"] = self.api_key
-        return openai.OpenAI(**kwargs)
+
+        if self.use_azure:
+            return openai.AzureOpenAI(**kwargs)
+        else:
+            return openai.OpenAI(**kwargs)
```
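A sketch of how the new flag could be used via the embedding registry; the exact Azure endpoint configuration (handled by the OpenAI SDK's usual environment variables) is an assumption here:

```python
from lancedb.embeddings import get_registry

# With use_azure=True the function constructs openai.AzureOpenAI(**kwargs)
# instead of openai.OpenAI(**kwargs); endpoint/api-version are assumed to
# come from the SDK's environment variables.
func = get_registry().get("openai").create(
    name="text-embedding-ada-002",
    use_azure=True,
)
```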
Python index config: `FTS` options are now forwarded to the index

```diff
@@ -110,7 +110,16 @@ class FTS:
         remove_stop_words: bool = False,
         ascii_folding: bool = False,
     ):
-        self._inner = LanceDbIndex.fts(with_position=with_position)
+        self._inner = LanceDbIndex.fts(
+            with_position=with_position,
+            base_tokenizer=base_tokenizer,
+            language=language,
+            max_token_length=max_token_length,
+            lower_case=lower_case,
+            stem=stem,
+            remove_stop_words=remove_stop_words,
+            ascii_folding=ascii_folding,
+        )


 class HnswPq:
```
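With the fix, all tokenizer options passed to `FTS` actually reach the underlying index instead of being dropped. A sketch with illustrative values:

```python
from lancedb.index import FTS

# Every option below is now forwarded to LanceDbIndex.fts().
config = FTS(
    with_position=False,
    base_tokenizer="simple",
    language="English",
    max_token_length=40,
    lower_case=True,
    stem=True,
    remove_stop_words=True,
    ascii_folding=True,
)
# await table.create_index("text", config=config)  # on an AsyncTable
```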
@@ -26,6 +26,7 @@ from typing import (
|
||||
Union,
|
||||
)
|
||||
|
||||
import asyncio
|
||||
import deprecation
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
@@ -44,6 +45,8 @@ if TYPE_CHECKING:
|
||||
import polars as pl
|
||||
|
||||
from ._lancedb import Query as LanceQuery
|
||||
from ._lancedb import FTSQuery as LanceFTSQuery
|
||||
from ._lancedb import HybridQuery as LanceHybridQuery
|
||||
from ._lancedb import VectorQuery as LanceVectorQuery
|
||||
from .common import VEC
|
||||
from .pydantic import LanceModel
|
||||
@@ -1124,35 +1127,55 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
fts_results = fts_future.result()
|
||||
vector_results = vector_future.result()
|
||||
|
||||
# convert to ranks first if needed
|
||||
if self._norm == "rank":
|
||||
vector_results = self._rank(vector_results, "_distance")
|
||||
fts_results = self._rank(fts_results, "_score")
|
||||
return self._combine_hybrid_results(
|
||||
fts_results=fts_results,
|
||||
vector_results=vector_results,
|
||||
norm=self._norm,
|
||||
fts_query=self._fts_query._query,
|
||||
reranker=self._reranker,
|
||||
limit=self._limit,
|
||||
with_row_ids=self._with_row_id,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _combine_hybrid_results(
|
||||
fts_results: pa.Table,
|
||||
vector_results: pa.Table,
|
||||
norm: str,
|
||||
fts_query: str,
|
||||
reranker,
|
||||
limit: int,
|
||||
with_row_ids: bool,
|
||||
) -> pa.Table:
|
||||
if norm == "rank":
|
||||
vector_results = LanceHybridQueryBuilder._rank(vector_results, "_distance")
|
||||
fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")
|
||||
|
||||
# normalize the scores to be between 0 and 1, 0 being most relevant
|
||||
vector_results = self._normalize_scores(vector_results, "_distance")
|
||||
vector_results = LanceHybridQueryBuilder._normalize_scores(
|
||||
vector_results, "_distance"
|
||||
)
|
||||
|
||||
# In fts higher scores represent relevance. Not inverting them here as
|
||||
# rerankers might need to preserve this score to support `return_score="all"`
|
||||
fts_results = self._normalize_scores(fts_results, "_score")
|
||||
fts_results = LanceHybridQueryBuilder._normalize_scores(fts_results, "_score")
|
||||
|
||||
results = self._reranker.rerank_hybrid(
|
||||
self._fts_query._query, vector_results, fts_results
|
||||
)
|
||||
results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)
|
||||
|
||||
check_reranker_result(results)
|
||||
|
||||
# apply limit after reranking
|
||||
results = results.slice(length=self._limit)
|
||||
results = results.slice(length=limit)
|
||||
|
||||
if not self._with_row_id:
|
||||
if not with_row_ids:
|
||||
results = results.drop(["_rowid"])
|
||||
|
||||
return results
|
||||
|
||||
def to_batches(self):
|
||||
raise NotImplementedError("to_batches not yet supported on a hybrid query")
|
||||
|
||||
def _rank(self, results: pa.Table, column: str, ascending: bool = True):
|
||||
@staticmethod
|
||||
def _rank(results: pa.Table, column: str, ascending: bool = True):
|
||||
if len(results) == 0:
|
||||
return results
|
||||
# Get the _score column from results
|
||||
@@ -1169,7 +1192,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
)
|
||||
return results
|
||||
|
||||
def _normalize_scores(self, results: pa.Table, column: str, invert=False):
|
||||
@staticmethod
|
||||
def _normalize_scores(results: pa.Table, column: str, invert=False):
|
||||
if len(results) == 0:
|
||||
return results
|
||||
# Get the _score column from results
|
||||
@@ -1635,7 +1659,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
|
||||
def nearest_to_text(
|
||||
self, query: str, columns: Union[str, List[str]] = []
|
||||
) -> AsyncQuery:
|
||||
) -> AsyncFTSQuery:
|
||||
"""
|
||||
Find the documents that are most relevant to the given text query.
|
||||
|
||||
@@ -1658,8 +1682,90 @@ class AsyncQuery(AsyncQueryBase):
|
||||
"""
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
return self
|
||||
return AsyncFTSQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
|
||||
|
||||
class AsyncFTSQuery(AsyncQueryBase):
|
||||
"""A query for full text search for LanceDB."""
|
||||
|
||||
def __init__(self, inner: LanceFTSQuery):
|
||||
super().__init__(inner)
|
||||
self._inner = inner
|
||||
|
||||
def get_query(self):
|
||||
self._inner.get_query()
|
||||
|
||||
def nearest_to(
|
||||
self,
|
||||
query_vector: Union[VEC, Tuple, List[VEC]],
|
||||
) -> AsyncHybridQuery:
|
||||
"""
|
||||
In addition doing text search on the LanceDB Table, also
|
||||
find the nearest vectors to the given query vector.
|
||||
|
||||
This converts the query from a FTS Query to a Hybrid query. Results
|
||||
from the vector search will be combined with results from the FTS query.
|
||||
|
||||
This method will attempt to convert the input to the query vector
|
||||
expected by the embedding model. If the input cannot be converted
|
||||
then an error will be thrown.
|
||||
|
||||
By default, there is no embedding model, and the input should be
|
||||
something that can be converted to a pyarrow array of floats. This
|
||||
includes lists, numpy arrays, and tuples.
|
||||
|
||||
If there is only one vector column (a column whose data type is a
|
||||
fixed size list of floats) then the column does not need to be specified.
|
||||
If there is more than one vector column you must use
|
||||
[AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
|
||||
which column you would like to compare with.
|
||||
|
||||
If no index has been created on the vector column then a vector query
|
||||
will perform a distance comparison between the query vector and every
|
||||
vector in the database and then sort the results. This is sometimes
|
||||
called a "flat search"
|
||||
|
||||
For small databases, with tens of thousands of vectors or less, this can
|
||||
be reasonably fast. In larger databases you should create a vector index
|
||||
on the column. If there is a vector index then an "approximate" nearest
|
||||
neighbor search (frequently called an ANN search) will be performed. This
|
||||
search is much faster, but the results will be approximate.
|
||||
|
||||
The query can be further parameterized using the returned builder. There
|
||||
are various ANN search parameters that will let you fine tune your recall
|
||||
accuracy vs search latency.
|
||||
|
||||
Hybrid searches always have a [limit][]. If `limit` has not been called then
|
||||
a default `limit` of 10 will be used.
|
||||
|
||||
Typically, a single vector is passed in as the query. However, you can also
|
||||
pass in multiple vectors. This can be useful if you want to find the nearest
|
||||
vectors to multiple query vectors. This is not expected to be faster than
|
||||
making multiple queries concurrently; it is just a convenience method.
|
||||
If multiple vectors are passed in then an additional column `query_index`
|
||||
will be added to the results. This column will contain the index of the
|
||||
query vector that the result is nearest to.
|
||||
"""
|
||||
if query_vector is None:
|
||||
raise ValueError("query_vector can not be None")
|
||||
|
||||
if (
|
||||
isinstance(query_vector, list)
|
||||
and len(query_vector) > 0
|
||||
and not isinstance(query_vector[0], (float, int))
|
||||
):
|
||||
# multiple have been passed
|
||||
query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
|
||||
new_self = self._inner.nearest_to(query_vectors[0])
|
||||
for v in query_vectors[1:]:
|
||||
new_self.add_query_vector(v)
|
||||
return AsyncHybridQuery(new_self)
|
||||
else:
|
||||
return AsyncHybridQuery(
|
||||
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
|
||||
)
|
||||
|
||||
|
||||
class AsyncVectorQuery(AsyncQueryBase):
|
||||
@@ -1796,3 +1902,160 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
"""
|
||||
self._inner.bypass_vector_index()
|
||||
return self
|
||||
|
||||
def nearest_to_text(
|
||||
self, query: str, columns: Union[str, List[str]] = []
|
||||
) -> AsyncHybridQuery:
|
||||
"""
|
||||
Find the documents that are most relevant to the given text query,
|
||||
in addition to vector search.
|
||||
|
||||
This converts the vector query into a hybrid query.
|
||||
|
||||
This search will perform a full text search on the table and return
|
||||
the most relevant documents, combined with the vector query results.
|
||||
The text relevance is determined by BM25.
|
||||
|
||||
The columns to search must be with native FTS index
|
||||
(Tantivy-based can't work with this method).
|
||||
|
||||
By default, all indexed columns are searched,
|
||||
now only one column can be searched at a time.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query: str
|
||||
The text query to search for.
|
||||
columns: str or list of str, default None
|
||||
The columns to search in. If None, all indexed columns are searched.
|
||||
For now only one column can be searched at a time.
|
||||
"""
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
return AsyncHybridQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
|
||||
|
||||
class AsyncHybridQuery(AsyncQueryBase):
|
||||
"""
|
||||
A query builder that performs hybrid vector and full text search.
|
||||
Results are combined and reranked based on the specified reranker.
|
||||
By default, the results are reranked using the RRFReranker, which
|
||||
uses reciprocal rank fusion score for reranking.
|
||||
|
||||
To make the vector and fts results comparable, the scores are normalized.
|
||||
Instead of normalizing scores, the `normalize` parameter can be set to "rank"
|
||||
in the `rerank` method to convert the scores to ranks and then normalize them.
|
||||
"""
|
||||
|
||||
def __init__(self, inner: LanceHybridQuery):
|
||||
super().__init__(inner)
|
||||
self._inner = inner
|
||||
self._norm = "score"
|
||||
self._reranker = RRFReranker()
|
||||
|
||||
def rerank(
|
||||
self, reranker: Reranker = RRFReranker(), normalize: str = "score"
|
||||
) -> AsyncHybridQuery:
|
||||
"""
|
||||
Rerank the hybrid search results using the specified reranker. The reranker
|
||||
must be an instance of Reranker class.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
reranker: Reranker, default RRFReranker()
|
||||
The reranker to use. Must be an instance of Reranker class.
|
||||
normalize: str, default "score"
|
||||
The method to normalize the scores. Can be "rank" or "score". If "rank",
|
||||
the scores are converted to ranks and then normalized. If "score", the
|
||||
scores are normalized directly.
|
||||
Returns
|
||||
-------
|
||||
AsyncHybridQuery
|
||||
The AsyncHybridQuery object.
|
||||
"""
|
||||
if normalize not in ["rank", "score"]:
|
||||
raise ValueError("normalize must be 'rank' or 'score'.")
|
||||
if reranker and not isinstance(reranker, Reranker):
|
||||
raise ValueError("reranker must be an instance of Reranker class.")
|
||||
|
||||
self._norm = normalize
|
||||
self._reranker = reranker
|
||||
|
||||
return self
|
||||
|
||||
    async def to_batches(self):
        raise NotImplementedError("to_batches not yet supported on a hybrid query")

    async def to_arrow(self) -> pa.Table:
        fts_query = AsyncFTSQuery(self._inner.to_fts_query())
        vec_query = AsyncVectorQuery(self._inner.to_vector_query())

        # save the row ID choice that was made on the query builder and force it
        # to actually fetch the row ids because we need this for reranking
        with_row_ids = self._inner.get_with_row_id()
        fts_query.with_row_id()
        vec_query.with_row_id()

        fts_results, vector_results = await asyncio.gather(
            fts_query.to_arrow(),
            vec_query.to_arrow(),
        )

        return LanceHybridQueryBuilder._combine_hybrid_results(
            fts_results=fts_results,
            vector_results=vector_results,
            norm=self._norm,
            fts_query=fts_query.get_query(),
            reranker=self._reranker,
            limit=self._inner.get_limit(),
            with_row_ids=with_row_ids,
        )

    async def explain_plan(self, verbose: Optional[bool] = False):
        """Return the execution plan for this query.

        The output includes both the vector and FTS search plans.

        Examples
        --------
        >>> import asyncio
        >>> from lancedb import connect_async
        >>> from lancedb.index import FTS
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", [{"vector": [99, 99], "text": "hello world"}])
        ...     await table.create_index("text", config=FTS(with_position=False))
        ...     query = [100, 100]
        ...     plan = await table.query().nearest_to([1, 2]).nearest_to_text("hello").explain_plan(True)
        ...     print(plan)
        >>> asyncio.run(doctest_example())  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        Vector Search Plan:
        ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
        Take: columns="vector, _rowid, _distance, (text)"
        CoalesceBatchesExec: target_batch_size=1024
        GlobalLimitExec: skip=0, fetch=10
        FilterExec: _distance@2 IS NOT NULL
        SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
        KNNVectorDistance: metric=l2
        LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
        FTS Search Plan:
        LanceScan: uri=..., projection=[vector, text], row_id=false, row_addr=false, ordered=true

        Parameters
        ----------
        verbose : bool, default False
            Use a verbose output format.

        Returns
        -------
        plan
        """  # noqa: E501

        results = ["Vector Search Plan:"]
        results.append(await self._inner.to_vector_query().explain_plan(verbose))
        results.append("FTS Search Plan:")
        results.append(await self._inner.to_fts_query().explain_plan(verbose))

        return "\n".join(results)

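For reference, reciprocal rank fusion, which the default RRFReranker is named after, scores each row as a sum of 1/(k + rank) over the ranked lists it appears in. A minimal sketch of the idea (k=60 is the conventional constant from the RRF paper, not necessarily the value lancedb uses):

def rrf_scores(fts_ids, vec_ids, k=60):
    # Each ranked list contributes 1 / (k + rank) to a row's fused score.
    scores = {}
    for ids in (fts_ids, vec_ids):
        for rank, row_id in enumerate(ids, start=1):
            scores[row_id] = scores.get(row_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

# e.g. rrf_scores([3, 0, 1], [0, 3, 2]) puts rows 0 and 3 first,
# since they rank highly in both lists.
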
@@ -20,19 +20,16 @@ import warnings

 from lancedb import connect_async
 from lancedb.remote import ClientConfig
-from lancedb.remote.background_loop import BackgroundEventLoop
 import pyarrow as pa
 from overrides import override

 from ..common import DATA
-from ..db import DBConnection
+from ..db import DBConnection, LOOP
 from ..embeddings import EmbeddingFunctionConfig
 from ..pydantic import LanceModel
 from ..table import Table
 from ..util import validate_table_name

-LOOP = BackgroundEventLoop()


class RemoteDBConnection(DBConnection):
    """A connection to a remote LanceDB database."""
@@ -47,9 +44,9 @@ class RemoteDBConnection(DBConnection):
        client_config: Union[ClientConfig, Dict[str, Any], None] = None,
        connection_timeout: Optional[float] = None,
        read_timeout: Optional[float] = None,
+        storage_options: Optional[Dict[str, str]] = None,
    ):
        """Connect to a remote LanceDB database."""

        if isinstance(client_config, dict):
            client_config = ClientConfig(**client_config)
        elif client_config is None:
@@ -97,6 +94,7 @@ class RemoteDBConnection(DBConnection):
                region=region,
                host_override=host_override,
                client_config=client_config,
+                storage_options=storage_options,
            )
        )


@@ -78,7 +78,7 @@ class RemoteTable(Table):

    def list_versions(self):
        """List all versions of the table"""
-        return self._loop.run_until_complete(self._table.list_versions())
+        return LOOP.run(self._table.list_versions())

    def to_arrow(self) -> pa.Table:
        """to_arrow() is not yet supported on LanceDB cloud."""
@@ -89,10 +89,10 @@ class RemoteTable(Table):
        return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")

    def checkout(self, version):
-        return self._loop.run_until_complete(self._table.checkout(version))
+        return LOOP.run(self._table.checkout(version))

    def checkout_latest(self):
-        return self._loop.run_until_complete(self._table.checkout_latest())
+        return LOOP.run(self._table.checkout_latest())

    def list_indices(self):
        """List all the indices on the table"""
@@ -138,8 +138,25 @@ class RemoteTable(Table):
        *,
        replace: bool = False,
        with_position: bool = True,
+        # tokenizer configs:
+        base_tokenizer: str = "simple",
+        language: str = "English",
+        max_token_length: Optional[int] = 40,
+        lower_case: bool = True,
+        stem: bool = False,
+        remove_stop_words: bool = False,
+        ascii_folding: bool = False,
    ):
-        config = FTS(with_position=with_position)
+        config = FTS(
+            with_position=with_position,
+            base_tokenizer=base_tokenizer,
+            language=language,
+            max_token_length=max_token_length,
+            lower_case=lower_case,
+            stem=stem,
+            remove_stop_words=remove_stop_words,
+            ascii_folding=ascii_folding,
+        )
        LOOP.run(self._table.create_index(column, config=config, replace=replace))

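A sketch of the expanded tokenizer surface (hedged: this assumes the method shown is RemoteTable.create_fts_index and that `table` is an open remote table handle):

table.create_fts_index(
    "text",
    with_position=False,
    base_tokenizer="simple",
    language="English",
    lower_case=True,
    ascii_folding=True,
)
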
    def create_index(
@@ -490,19 +507,13 @@ class RemoteTable(Table):
        return LOOP.run(self._table.count_rows(filter))

    def add_columns(self, transforms: Dict[str, str]):
-        raise NotImplementedError(
-            "add_columns() is not yet supported on the LanceDB cloud"
-        )
+        return LOOP.run(self._table.add_columns(transforms))

-    def alter_columns(self, alterations: Iterable[Dict[str, str]]):
-        raise NotImplementedError(
-            "alter_columns() is not yet supported on the LanceDB cloud"
-        )
+    def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
+        return LOOP.run(self._table.alter_columns(*alterations))

    def drop_columns(self, columns: Iterable[str]):
-        raise NotImplementedError(
-            "drop_columns() is not yet supported on the LanceDB cloud"
-        )
+        return LOOP.run(self._table.drop_columns(columns))

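With the NotImplementedError stubs replaced, schema evolution now round-trips to LanceDB Cloud through the background event loop. A hedged sketch (the URI, API key, and table are placeholders):

import lancedb

db = lancedb.connect("db://my_database", api_key="...")  # placeholder credentials
table = db.open_table("my_table")

# These synchronous calls now delegate to the async table via LOOP.run.
table.add_columns({"double_id": "id * 2"})
table.alter_columns({"path": "double_id", "rename": "id_x2"})
table.drop_columns(["id_x2"])
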
def add_index(tbl: pa.Table, i: int) -> pa.Table:

@@ -967,8 +967,6 @@ class Table(ABC):
        """
        Add new columns with defined values.

-        This is not yet available in LanceDB Cloud.

        Parameters
        ----------
        transforms: Dict[str, str]
@@ -978,20 +976,21 @@ class Table(ABC):
        """

    @abstractmethod
-    def alter_columns(self, alterations: Iterable[Dict[str, str]]):
+    def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
        """
        Alter column names and nullability.

-        This is not yet available in LanceDB Cloud.

        Parameters
        ----------
        alterations : Iterable[Dict[str, Any]]
            A sequence of dictionaries, each with the following keys:
            - "path": str
                The column path to alter. For a top-level column, this is the name.
                For a nested column, this is the dot-separated path, e.g. "a.b.c".
-            - "name": str, optional
+            - "rename": str, optional
                The new name of the column. If not specified, the column name is
                not changed.
            - "data_type": pyarrow.DataType, optional
                The new data type of the column. Existing values will be cast
                to this type. If not specified, the column data type is not changed.
            - "nullable": bool, optional
                Whether the column should be nullable. If not specified, the column
                nullability is not changed. Only non-nullable columns can be changed
@@ -1004,8 +1003,6 @@ class Table(ABC):
        """
        Drop columns from the table.

-        This is not yet available in LanceDB Cloud.

        Parameters
        ----------
        columns : Iterable[str]
@@ -1080,13 +1077,16 @@ class _LanceLatestDatasetRef(_LanceDatasetRef):
    index_cache_size: Optional[int] = None
    read_consistency_interval: Optional[timedelta] = None
    last_consistency_check: Optional[float] = None
+    storage_options: Optional[Dict[str, str]] = None
    _dataset: Optional[LanceDataset] = None

    @property
    def dataset(self) -> LanceDataset:
        if not self._dataset:
            self._dataset = lance.dataset(
-                self.uri, index_cache_size=self.index_cache_size
+                self.uri,
+                index_cache_size=self.index_cache_size,
+                storage_options=self.storage_options,
            )
            self.last_consistency_check = time.monotonic()
        elif self.read_consistency_interval is not None:
@@ -1117,13 +1117,17 @@ class _LanceTimeTravelRef(_LanceDatasetRef):
    uri: str
    version: int
    index_cache_size: Optional[int] = None
+    storage_options: Optional[Dict[str, str]] = None
    _dataset: Optional[LanceDataset] = None

    @property
    def dataset(self) -> LanceDataset:
        if not self._dataset:
            self._dataset = lance.dataset(
-                self.uri, version=self.version, index_cache_size=self.index_cache_size
+                self.uri,
+                version=self.version,
+                index_cache_size=self.index_cache_size,
+                storage_options=self.storage_options,
            )
        return self._dataset

@@ -1172,24 +1176,27 @@ class LanceTable(Table):
                uri=self._dataset_uri,
                version=version,
                index_cache_size=index_cache_size,
+                storage_options=connection.storage_options,
            )
        else:
            self._ref = _LanceLatestDatasetRef(
                uri=self._dataset_uri,
                read_consistency_interval=connection.read_consistency_interval,
                index_cache_size=index_cache_size,
+                storage_options=connection.storage_options,
            )

    @classmethod
    def open(cls, db, name, **kwargs):
        tbl = cls(db, name, **kwargs)
-        fs, path = fs_from_uri(tbl._dataset_path)
-        file_info = fs.get_file_info(path)
-        if file_info.type != pa.fs.FileType.Directory:
-            raise FileNotFoundError(
-                f"Table {name} does not exist."
-                f"Please first call db.create_table({name}, data)"
-            )
+
+        # check the dataset exists
+        try:
+            tbl.version
+        except ValueError as e:
+            if "Not found:" in str(e):
+                raise FileNotFoundError(f"Table {name} does not exist")
+            raise e

        return tbl

@@ -1617,11 +1624,7 @@ class LanceTable(Table):
            on_bad_vectors=on_bad_vectors,
            fill_value=fill_value,
        )
-        # Access the dataset_mut property to ensure that the dataset is mutable.
-        self._ref.dataset_mut
-        self._ref.dataset = lance.write_dataset(
-            data, self._dataset_uri, schema=self.schema, mode=mode
-        )
+        self._ref.dataset_mut.insert(data, mode=mode, schema=self.schema)

    def merge(
        self,
@@ -1905,7 +1908,13 @@ class LanceTable(Table):

        empty = pa.Table.from_batches([], schema=schema)
        try:
-            lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
+            lance.write_dataset(
+                empty,
+                tbl._dataset_uri,
+                schema=schema,
+                mode=mode,
+                storage_options=db.storage_options,
+            )
        except OSError as err:
            if "Dataset already exists" in str(err) and exist_ok:
                if tbl.schema != schema:
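These changes thread storage_options from the connection down to every lance.dataset / lance.write_dataset call. A hedged sketch of the effect (bucket and region are placeholders):

import lancedb

# Credentials supplied once at connect time now reach the underlying
# dataset reads and writes, including empty-table creation.
db = lancedb.connect(
    "s3://my-bucket/lancedb",
    storage_options={"aws_region": "us-east-1"},
)
table = db.create_table("my_table", [{"id": 1, "vector": [0.1, 0.2]}])
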
@@ -2923,6 +2932,53 @@ class AsyncTable:

        return await self._inner.update(updates_sql, where)

    async def add_columns(self, transforms: Dict[str, str]):
        """
        Add new columns with defined values.

        Parameters
        ----------
        transforms: Dict[str, str]
            A map of column name to a SQL expression to use to calculate the
            value of the new column. These expressions will be evaluated for
            each row in the table, and can reference existing columns.
        """
        await self._inner.add_columns(list(transforms.items()))

    async def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
        """
        Alter column names and nullability.

        Parameters
        ----------
        alterations : Iterable[Dict[str, Any]]
            A sequence of dictionaries, each with the following keys:
            - "path": str
                The column path to alter. For a top-level column, this is the name.
                For a nested column, this is the dot-separated path, e.g. "a.b.c".
            - "rename": str, optional
                The new name of the column. If not specified, the column name is
                not changed.
            - "data_type": pyarrow.DataType, optional
                The new data type of the column. Existing values will be cast
                to this type. If not specified, the column data type is not changed.
            - "nullable": bool, optional
                Whether the column should be nullable. If not specified, the column
                nullability is not changed. Only non-nullable columns can be changed
                to nullable. Currently, you cannot change a nullable column to
                non-nullable.
        """
        await self._inner.alter_columns(alterations)

    async def drop_columns(self, columns: Iterable[str]):
        """
        Drop columns from the table.

        Parameters
        ----------
        columns : Iterable[str]
            The names of the columns to drop.
        """
        await self._inner.drop_columns(columns)

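A minimal async sketch tying the three methods above together (table name and columns are hypothetical; compare the async tests later in this diff):

import asyncio
import pyarrow as pa
import lancedb

async def evolve_schema():
    db = await lancedb.connect_async("./.lancedb")
    table = await db.create_table("demo", pa.table({"id": [0, 1]}))
    await table.add_columns({"new_col": "id + 2"})
    await table.alter_columns(
        {"path": "new_col", "rename": "renamed", "nullable": True}
    )
    await table.drop_columns(["renamed"])

asyncio.run(evolve_schema())
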
    async def version(self) -> int:
        """
        Retrieve the version of the table

python/python/tests/test_hybrid_query.py (new file, 111 lines)
@@ -0,0 +1,111 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors

import lancedb

import pyarrow as pa
import pytest
import pytest_asyncio

from lancedb.index import FTS
from lancedb.table import AsyncTable


@pytest_asyncio.fixture
async def table(tmpdir_factory) -> AsyncTable:
    tmp_path = str(tmpdir_factory.mktemp("data"))
    db = await lancedb.connect_async(tmp_path)
    data = pa.table(
        {
            "text": pa.array(["a", "b", "cat", "dog"]),
            "vector": pa.array(
                [[0.1, 0.1], [2, 2], [-0.1, -0.1], [0.5, -0.5]],
                type=pa.list_(pa.float32(), list_size=2),
            ),
        }
    )
    table = await db.create_table("test", data)
    await table.create_index("text", config=FTS(with_position=False))
    return table


@pytest.mark.asyncio
async def test_async_hybrid_query(table: AsyncTable):
    result = await (
        table.query().nearest_to([0.0, 0.4]).nearest_to_text("dog").limit(2).to_arrow()
    )
    assert len(result) == 2
    # ensure we get results that would match well for text and vector
    assert result["text"].to_pylist() == ["a", "dog"]

    # ensure there is no rowid by default
    assert "_rowid" not in result


@pytest.mark.asyncio
async def test_async_hybrid_query_with_row_ids(table: AsyncTable):
    result = await (
        table.query()
        .nearest_to([0.0, 0.4])
        .nearest_to_text("dog")
        .limit(2)
        .with_row_id()
        .to_arrow()
    )
    assert len(result) == 2
    # ensure we get results that would match well for text and vector
    assert result["text"].to_pylist() == ["a", "dog"]
    assert result["_rowid"].to_pylist() == [0, 3]


@pytest.mark.asyncio
async def test_async_hybrid_query_filters(table: AsyncTable):
    # test that query params are passed down from the regular builder to
    # child vector/fts builders
    result = await (
        table.query()
        .where("text not in ('a', 'dog')")
        .nearest_to([0.3, 0.3])
        .nearest_to_text("*a*")
        .limit(2)
        .to_arrow()
    )
    assert len(result) == 2
    # ensure we get results that would match well for text and vector
    assert result["text"].to_pylist() == ["cat", "b"]


@pytest.mark.asyncio
async def test_async_hybrid_query_default_limit(table: AsyncTable):
    # add 100 new rows
    new_rows = []
    for i in range(100):
        if i < 2:
            new_rows.append({"text": "close_vec", "vector": [0.1, 0.1]})
        else:
            new_rows.append({"text": "far_vec", "vector": [5 * i, 5 * i]})
    await table.add(new_rows)
    result = await (
        table.query().nearest_to_text("dog").nearest_to([0.1, 0.1]).to_arrow()
    )

    # assert we got the default limit of 10
    assert len(result) == 10

    # assert we got the closest vectors and the text searched for
    texts = result["text"].to_pylist()
    assert texts.count("close_vec") == 2
    assert texts.count("dog") == 1
    assert texts.count("a") == 1


@pytest.mark.asyncio
async def test_explain_plan(table: AsyncTable):
    plan = await (
        table.query().nearest_to_text("dog").nearest_to([0.1, 0.1]).explain_plan(True)
    )

    assert "Vector Search Plan" in plan
    assert "KNNVectorDistance" in plan
    assert "FTS Search Plan" in plan
    assert "LanceScan" in plan
@@ -229,6 +229,44 @@ def test_table_add_in_threadpool():
        future.result()


def test_table_create_indices():
    def handler(request):
        if request.path == "/v1/table/test/create_index/":
            request.send_response(200)
            request.end_headers()
        elif request.path == "/v1/table/test/create/?mode=create":
            request.send_response(200)
            request.send_header("Content-Type", "application/json")
            request.end_headers()
            request.wfile.write(b"{}")
        elif request.path == "/v1/table/test/describe/":
            request.send_response(200)
            request.send_header("Content-Type", "application/json")
            request.end_headers()
            payload = json.dumps(
                dict(
                    version=1,
                    schema=dict(
                        fields=[
                            dict(name="id", type={"type": "int64"}, nullable=False),
                        ]
                    ),
                )
            )
            request.wfile.write(payload.encode())
        else:
            request.send_response(404)
            request.end_headers()

    with mock_lancedb_connection(handler) as db:
        # Parameters are well-tested through local and async tests.
        # This is a smoke-test.
        table = db.create_table("test", [{"id": 1}])
        table.create_scalar_index("id")
        table.create_fts_index("text")
        table.create_scalar_index("vector")


@contextlib.contextmanager
def query_test_table(query_handler):
    def handler(request):

@@ -30,6 +30,7 @@ class MockDB:
    def __init__(self, uri: Path):
        self.uri = str(uri)
        self.read_consistency_interval = None
+        self.storage_options = None

    @functools.cached_property
    def is_managed_remote(self) -> bool:
@@ -1292,6 +1293,19 @@ def test_add_columns(tmp_path):
    assert table.to_arrow().column_names == ["id", "new_col"]
    assert table.to_arrow()["new_col"].to_pylist() == [2, 3]

    table.add_columns({"null_int": "cast(null as bigint)"})
    assert table.schema.field("null_int").type == pa.int64()


@pytest.mark.asyncio
async def test_add_columns_async(db_async: AsyncConnection):
    data = pa.table({"id": [0, 1]})
    table = await db_async.create_table("my_table", data=data)
    await table.add_columns({"new_col": "id + 2"})
    data = await table.to_arrow()
    assert data.column_names == ["id", "new_col"]
    assert data["new_col"].to_pylist() == [2, 3]


def test_alter_columns(tmp_path):
    db = lancedb.connect(tmp_path)
@@ -1301,6 +1315,18 @@ def test_alter_columns(tmp_path):
    assert table.to_arrow().column_names == ["new_id"]


@pytest.mark.asyncio
async def test_alter_columns_async(db_async: AsyncConnection):
    data = pa.table({"id": [0, 1]})
    table = await db_async.create_table("my_table", data=data)
    await table.alter_columns({"path": "id", "rename": "new_id"})
    assert (await table.to_arrow()).column_names == ["new_id"]
    await table.alter_columns(dict(path="new_id", data_type=pa.int16(), nullable=True))
    data = await table.to_arrow()
    assert data.column(0).type == pa.int16()
    assert data.schema.field(0).nullable


def test_drop_columns(tmp_path):
    db = lancedb.connect(tmp_path)
    data = pa.table({"id": [0, 1], "category": ["a", "b"]})
@@ -1309,6 +1335,14 @@ def test_drop_columns(tmp_path):
    assert table.to_arrow().column_names == ["id"]


@pytest.mark.asyncio
async def test_drop_columns_async(db_async: AsyncConnection):
    data = pa.table({"id": [0, 1], "category": ["a", "b"]})
    table = await db_async.create_table("my_table", data=data)
    await table.drop_columns(["category"])
    assert (await table.to_arrow()).column_names == ["id"]


@pytest.mark.asyncio
async def test_time_travel(db_async: AsyncConnection):
    # Setup

@@ -10,7 +10,7 @@ use arrow::{
use futures::stream::StreamExt;
use lancedb::arrow::SendableRecordBatchStream;
use pyo3::{pyclass, pymethods, Bound, PyAny, PyObject, PyRef, PyResult, Python};
-use pyo3_asyncio_0_21::tokio::future_into_py;
+use pyo3_async_runtimes::tokio::future_into_py;

use crate::error::PythonErrorExt;

@@ -9,7 +9,7 @@ use pyo3::{
    exceptions::{PyRuntimeError, PyValueError},
    pyclass, pyfunction, pymethods, Bound, FromPyObject, PyAny, PyRef, PyResult, Python,
};
-use pyo3_asyncio_0_21::tokio::future_into_py;
+use pyo3_async_runtimes::tokio::future_into_py;

use crate::{error::PythonErrorExt, table::Table};

@@ -58,6 +58,7 @@ impl Connection {
        self.inner.take();
    }

+    #[pyo3(signature = (start_after=None, limit=None))]
    pub fn table_names(
        self_: PyRef<'_, Self>,
        start_after: Option<String>,
@@ -74,6 +75,7 @@ impl Connection {
        future_into_py(self_.py(), async move { op.execute().await.infer_error() })
    }

+    #[pyo3(signature = (name, mode, data, storage_options=None, data_storage_version=None, enable_v2_manifest_paths=None))]
    pub fn create_table<'a>(
        self_: PyRef<'a, Self>,
        name: String,
@@ -111,6 +113,7 @@ impl Connection {
        })
    }

+    #[pyo3(signature = (name, mode, schema, storage_options=None, data_storage_version=None, enable_v2_manifest_paths=None))]
    pub fn create_empty_table<'a>(
        self_: PyRef<'a, Self>,
        name: String,
@@ -198,6 +201,7 @@ impl Connection {
}

#[pyfunction]
+#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None))]
#[allow(clippy::too_many_arguments)]
pub fn connect(
    py: Python,

@@ -138,7 +138,9 @@ fn http_from_rust_error(
    status_code: Option<u16>,
) -> PyResult<PyErr> {
    let message = err.to_string();
-    let http_err_cls = py.import("lancedb.remote.errors")?.getattr("HttpError")?;
+    let http_err_cls = py
+        .import_bound("lancedb.remote.errors")?
+        .getattr("HttpError")?;
    let py_err = http_err_cls.call1((message, request_id, status_code))?;

    // Reset the traceback since it doesn't provide additional information.
@@ -149,5 +151,5 @@ fn http_from_rust_error(
        py_err.setattr(intern!(py, "__cause__"), cause_err)?;
    }

-    Ok(PyErr::from_value(py_err))
+    Ok(PyErr::from_value_bound(py_err))
}

@@ -47,6 +47,7 @@ impl Index {

#[pymethods]
impl Index {
+    #[pyo3(signature = (distance_type=None, num_partitions=None, num_sub_vectors=None, max_iterations=None, sample_rate=None))]
    #[staticmethod]
    pub fn ivf_pq(
        distance_type: Option<String>,
@@ -106,6 +107,7 @@ impl Index {
        })
    }

+    #[pyo3(signature = (with_position=None, base_tokenizer=None, language=None, max_token_length=None, lower_case=None, stem=None, remove_stop_words=None, ascii_folding=None))]
    #[allow(clippy::too_many_arguments)]
    #[staticmethod]
    pub fn fts(
@@ -146,6 +148,7 @@ impl Index {
        }
    }

+    #[pyo3(signature = (distance_type=None, num_partitions=None, num_sub_vectors=None, max_iterations=None, sample_rate=None, m=None, ef_construction=None))]
    #[staticmethod]
    pub fn hnsw_pq(
        distance_type: Option<String>,
@@ -184,6 +187,7 @@ impl Index {
        })
    }

+    #[pyo3(signature = (distance_type=None, num_partitions=None, max_iterations=None, sample_rate=None, m=None, ef_construction=None))]
    #[staticmethod]
    pub fn hnsw_sq(
        distance_type: Option<String>,

@@ -16,7 +16,11 @@ use arrow::RecordBatchStream;
use connection::{connect, Connection};
use env_logger::Env;
use index::{Index, IndexConfig};
-use pyo3::{pymodule, types::PyModule, wrap_pyfunction, PyResult, Python};
+use pyo3::{
+    pymodule,
+    types::{PyModule, PyModuleMethods},
+    wrap_pyfunction, Bound, PyResult, Python,
+};
use query::{Query, VectorQuery};
use table::Table;

@@ -29,7 +33,7 @@ pub mod table;
pub mod util;

#[pymodule]
-pub fn _lancedb(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    let env = Env::new()
        .filter_or("LANCEDB_LOG", "warn")
        .write_style("LANCEDB_LOG_STYLE");

@@ -18,7 +18,8 @@ use arrow::pyarrow::FromPyArrow;
use lancedb::index::scalar::FullTextSearchQuery;
use lancedb::query::QueryExecutionOptions;
use lancedb::query::{
-    ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
+    ExecutableQuery, HasQuery, Query as LanceDbQuery, QueryBase, Select,
+    VectorQuery as LanceDbVectorQuery,
};
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
@@ -29,7 +30,7 @@ use pyo3::PyAny;
use pyo3::PyRef;
use pyo3::PyResult;
use pyo3::{pyclass, PyErr};
-use pyo3_asyncio_0_21::tokio::future_into_py;
+use pyo3_async_runtimes::tokio::future_into_py;

use crate::arrow::RecordBatchStream;
use crate::error::PythonErrorExt;
@@ -87,7 +88,7 @@ impl Query {
        Ok(VectorQuery { inner })
    }

-    pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<()> {
+    pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<FTSQuery> {
        let query_text = query
            .get_item("query")?
            .ok_or(PyErr::new::<PyRuntimeError, _>(
@@ -100,11 +101,14 @@ impl Query {
            .transpose()?;

        let fts_query = FullTextSearchQuery::new(query_text).columns(columns);
-        self.inner = self.inner.clone().full_text_search(fts_query);
-
-        Ok(())
+        Ok(FTSQuery {
+            fts_query,
+            inner: self.inner.clone(),
+        })
    }

    #[pyo3(signature = (max_batch_length=None))]
    pub fn execute(
        self_: PyRef<'_, Self>,
        max_batch_length: Option<u32>,
@@ -132,6 +136,87 @@ impl Query {
}

#[pyclass]
#[derive(Clone)]
pub struct FTSQuery {
    inner: LanceDbQuery,
    fts_query: FullTextSearchQuery,
}

#[pymethods]
impl FTSQuery {
    pub fn r#where(&mut self, predicate: String) {
        self.inner = self.inner.clone().only_if(predicate);
    }

    pub fn select(&mut self, columns: Vec<(String, String)>) {
        self.inner = self.inner.clone().select(Select::dynamic(&columns));
    }

    pub fn limit(&mut self, limit: u32) {
        self.inner = self.inner.clone().limit(limit as usize);
    }

    pub fn offset(&mut self, offset: u32) {
        self.inner = self.inner.clone().offset(offset as usize);
    }

    pub fn fast_search(&mut self) {
        self.inner = self.inner.clone().fast_search();
    }

    pub fn with_row_id(&mut self) {
        self.inner = self.inner.clone().with_row_id();
    }

    pub fn postfilter(&mut self) {
        self.inner = self.inner.clone().postfilter();
    }

    #[pyo3(signature = (max_batch_length=None))]
    pub fn execute(
        self_: PyRef<'_, Self>,
        max_batch_length: Option<u32>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_
            .inner
            .clone()
            .full_text_search(self_.fts_query.clone());

        future_into_py(self_.py(), async move {
            let mut opts = QueryExecutionOptions::default();
            if let Some(max_batch_length) = max_batch_length {
                opts.max_batch_length = max_batch_length;
            }
            let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
            Ok(RecordBatchStream::new(inner_stream))
        })
    }

    pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult<HybridQuery> {
        let vector_query = Query::new(self.inner.clone()).nearest_to(vector)?;
        Ok(HybridQuery {
            inner_fts: self.clone(),
            inner_vec: vector_query,
        })
    }

    pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner.clone();
        future_into_py(self_.py(), async move {
            inner
                .explain_plan(verbose)
                .await
                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
        })
    }

    pub fn get_query(&self) -> String {
        self.fts_query.query.clone()
    }
}

#[pyclass]
#[derive(Clone)]
pub struct VectorQuery {
    inner: LanceDbVectorQuery,
}
@@ -203,6 +288,7 @@ impl VectorQuery {
        self.inner = self.inner.clone().bypass_vector_index()
    }

+    #[pyo3(signature = (max_batch_length=None))]
    pub fn execute(
        self_: PyRef<'_, Self>,
        max_batch_length: Option<u32>,
@@ -227,4 +313,105 @@ impl VectorQuery {
                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
        })
    }

    pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<HybridQuery> {
        let fts_query = Query::new(self.inner.mut_query().clone()).nearest_to_text(query)?;
        Ok(HybridQuery {
            inner_vec: self.clone(),
            inner_fts: fts_query,
        })
    }
}

#[pyclass]
pub struct HybridQuery {
    inner_vec: VectorQuery,
    inner_fts: FTSQuery,
}

#[pymethods]
impl HybridQuery {
    pub fn r#where(&mut self, predicate: String) {
        self.inner_vec.r#where(predicate.clone());
        self.inner_fts.r#where(predicate);
    }

    pub fn select(&mut self, columns: Vec<(String, String)>) {
        self.inner_vec.select(columns.clone());
        self.inner_fts.select(columns);
    }

    pub fn limit(&mut self, limit: u32) {
        self.inner_vec.limit(limit);
        self.inner_fts.limit(limit);
    }

    pub fn offset(&mut self, offset: u32) {
        self.inner_vec.offset(offset);
        self.inner_fts.offset(offset);
    }

    pub fn fast_search(&mut self) {
        self.inner_vec.fast_search();
        self.inner_fts.fast_search();
    }

    pub fn with_row_id(&mut self) {
        self.inner_fts.with_row_id();
        self.inner_vec.with_row_id();
    }

    pub fn postfilter(&mut self) {
        self.inner_vec.postfilter();
        self.inner_fts.postfilter();
    }

    pub fn add_query_vector(&mut self, vector: Bound<'_, PyAny>) -> PyResult<()> {
        self.inner_vec.add_query_vector(vector)
    }

    pub fn column(&mut self, column: String) {
        self.inner_vec.column(column);
    }

    pub fn distance_type(&mut self, distance_type: String) -> PyResult<()> {
        self.inner_vec.distance_type(distance_type)
    }

    pub fn refine_factor(&mut self, refine_factor: u32) {
        self.inner_vec.refine_factor(refine_factor);
    }

    pub fn nprobes(&mut self, nprobe: u32) {
        self.inner_vec.nprobes(nprobe);
    }

    pub fn ef(&mut self, ef: u32) {
        self.inner_vec.ef(ef);
    }

    pub fn bypass_vector_index(&mut self) {
        self.inner_vec.bypass_vector_index();
    }

    pub fn to_vector_query(&mut self) -> PyResult<VectorQuery> {
        Ok(VectorQuery {
            inner: self.inner_vec.inner.clone(),
        })
    }

    pub fn to_fts_query(&mut self) -> PyResult<FTSQuery> {
        Ok(FTSQuery {
            inner: self.inner_fts.inner.clone(),
            fts_query: self.inner_fts.fts_query.clone(),
        })
    }

    pub fn get_limit(&mut self) -> Option<u32> {
        self.inner_fts.inner.limit.map(|i| i as u32)
    }

    pub fn get_with_row_id(&mut self) -> bool {
        self.inner_fts.inner.with_row_id
    }
}

@@ -1,17 +1,21 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use arrow::{
    datatypes::DataType,
    ffi_stream::ArrowArrayStreamReader,
    pyarrow::{FromPyArrow, ToPyArrow},
};
use lancedb::table::{
-    AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
+    AddDataMode, ColumnAlteration, Duration, NewColumnTransform, OptimizeAction, OptimizeOptions,
+    Table as LanceDbTable,
};
use pyo3::{
    exceptions::{PyRuntimeError, PyValueError},
    pyclass, pymethods,
-    types::{IntoPyDict, PyDict, PyDictMethods, PyString},
+    types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods},
    Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
};
-use pyo3_asyncio_0_21::tokio::future_into_py;
+use pyo3_async_runtimes::tokio::future_into_py;

use crate::{
    error::PythonErrorExt,
@@ -137,9 +141,10 @@ impl Table {
        })
    }

+    #[pyo3(signature = (updates, r#where=None))]
    pub fn update<'a>(
        self_: PyRef<'a, Self>,
-        updates: &PyDict,
+        updates: &Bound<'_, PyDict>,
        r#where: Option<String>,
    ) -> PyResult<Bound<'a, PyAny>> {
        let mut op = self_.inner_ref()?.update();
@@ -147,10 +152,8 @@ impl Table {
            op = op.only_if(only_if);
        }
        for (column_name, value) in updates.into_iter() {
-            let column_name: &PyString = column_name.downcast()?;
-            let column_name = column_name.to_str()?.to_string();
-            let value: &PyString = value.downcast()?;
-            let value = value.to_str()?.to_string();
+            let column_name: String = column_name.extract()?;
+            let value: String = value.extract()?;
            op = op.column(column_name, value);
        }
        future_into_py(self_.py(), async move {
@@ -159,6 +162,7 @@ impl Table {
        })
    }

+    #[pyo3(signature = (filter=None))]
    pub fn count_rows(
        self_: PyRef<'_, Self>,
        filter: Option<String>,
@@ -169,6 +173,7 @@ impl Table {
        })
    }

+    #[pyo3(signature = (column, index=None, replace=None))]
    pub fn create_index<'a>(
        self_: PyRef<'a, Self>,
        column: String,
@@ -263,7 +268,8 @@ impl Table {
                .unwrap();

                let tup: Vec<(&String, &String)> = v.metadata.iter().collect();
-                dict.set_item("metadata", tup.into_py_dict(py)).unwrap();
+                dict.set_item("metadata", tup.into_py_dict_bound(py))
+                    .unwrap();
                dict.to_object(py)
            })
            .collect::<Vec<_>>()
@@ -299,6 +305,7 @@ impl Table {
        Query::new(self.inner_ref().unwrap().query())
    }

+    #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
    pub fn optimize(
        self_: PyRef<'_, Self>,
        cleanup_since_ms: Option<u64>,
@@ -406,6 +413,72 @@ impl Table {
            .infer_error()
        })
    }

    pub fn add_columns(
        self_: PyRef<'_, Self>,
        definitions: Vec<(String, String)>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let definitions = NewColumnTransform::SqlExpressions(definitions);

        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {
            inner.add_columns(definitions, None).await.infer_error()?;
            Ok(())
        })
    }

    pub fn alter_columns<'a>(
        self_: PyRef<'a, Self>,
        alterations: Vec<Bound<PyDict>>,
    ) -> PyResult<Bound<'a, PyAny>> {
        let alterations = alterations
            .iter()
            .map(|alteration| {
                let path = alteration
                    .get_item("path")?
                    .ok_or_else(|| PyValueError::new_err("Missing path"))?
                    .extract()?;
                let rename = {
                    // We prefer rename, but support name for backwards compatibility
                    let rename = if let Ok(Some(rename)) = alteration.get_item("rename") {
                        Some(rename)
                    } else {
                        alteration.get_item("name")?
                    };
                    rename.map(|name| name.extract()).transpose()?
                };
                let nullable = alteration
                    .get_item("nullable")?
                    .map(|val| val.extract())
                    .transpose()?;
                let data_type = alteration
                    .get_item("data_type")?
                    .map(|val| DataType::from_pyarrow_bound(&val))
                    .transpose()?;
                Ok(ColumnAlteration {
                    path,
                    rename,
                    nullable,
                    data_type,
                })
            })
            .collect::<PyResult<Vec<_>>>()?;

        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {
            inner.alter_columns(&alterations).await.infer_error()?;
            Ok(())
        })
    }

    pub fn drop_columns(self_: PyRef<Self>, columns: Vec<String>) -> PyResult<Bound<PyAny>> {
        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {
            let column_refs = columns.iter().map(String::as_str).collect::<Vec<&str>>();
            inner.drop_columns(&column_refs).await.infer_error()?;
            Ok(())
        })
    }
}

#[derive(FromPyObject)]

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
-version = "0.14.0-beta.1"
+version = "0.14.1-beta.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
-version = "0.14.0-beta.1"
+version = "0.14.1-beta.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

@@ -625,7 +625,7 @@ impl ConnectBuilder {

    /// Set the LanceDB Cloud client configuration.
    ///
-    /// ```
+    /// ```no_run
    /// # use lancedb::connect;
    /// # use lancedb::remote::*;
    /// connect("db://my_database")

@@ -53,7 +53,10 @@ pub struct LabelListIndexBuilder {}
/// A full text search index is an index on a string column that allows for full text search
#[derive(Debug, Clone)]
pub struct FtsIndexBuilder {
-    pub(crate) with_position: bool,
+    /// Whether to store the position of the tokens
+    /// This is used for phrase queries
+    pub with_position: bool,
+
+    pub tokenizer_configs: TokenizerConfig,
}

@@ -30,7 +30,7 @@
//!
//! LanceDB runs in process, to use it in your Rust project, put the following in your `Cargo.toml`:
//!
-//! ```ignore
+//! ```shell
//! cargo install lancedb
//! ```
//!

@@ -348,7 +348,7 @@ pub trait QueryBase {
    ///
    /// The filter should be supplied as an SQL query string. For example:
    ///
-    /// ```ignore
+    /// ```sql
    /// x > 10
    /// y > 0 AND y < 100
    /// x > 5 OR y = 'test'
@@ -364,8 +364,18 @@ pub trait QueryBase {
    ///
    /// This method is only valid on tables that have a full text search index.
    ///
-    /// ```ignore
-    /// query.full_text_search(FullTextSearchQuery::new("hello world"))
-    /// ```
+    /// ```
+    /// use lance_index::scalar::FullTextSearchQuery;
+    /// use lancedb::query::{QueryBase, ExecutableQuery};
+    ///
+    /// # use lancedb::Table;
+    /// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
+    /// let results = table.query()
+    ///     .full_text_search(FullTextSearchQuery::new("hello world".into()))
+    ///     .execute()
+    ///     .await?;
+    /// # Ok(())
+    /// # }
+    /// ```
    fn full_text_search(self, query: FullTextSearchQuery) -> Self;

@@ -563,7 +573,7 @@ pub struct Query {
    parent: Arc<dyn TableInternal>,

    /// limit the number of rows to return.
-    pub(crate) limit: Option<usize>,
+    pub limit: Option<usize>,

    /// Offset of the query.
    pub(crate) offset: Option<usize>,
@@ -586,7 +596,7 @@ pub struct Query {
    /// If set to true, the query will return the `_rowid` meta column.
    ///
    /// By default, this is false.
-    pub(crate) with_row_id: bool,
+    pub with_row_id: bool,

    /// If set to false, the filter will be applied after the vector search.
    pub(crate) prefilter: bool,

@@ -271,7 +271,7 @@ impl From<StorageOptions> for RemoteOptions {
                filtered.insert(opt.to_string(), v.to_string());
            }
        }
-        RemoteOptions::new(filtered)
+        Self::new(filtered)
    }
}

@@ -17,7 +17,7 @@ use datafusion_physical_plan::{ExecutionPlan, SendableRecordBatchStream};
use futures::TryStreamExt;
use http::header::CONTENT_TYPE;
use http::StatusCode;
-use lance::arrow::json::JsonSchema;
+use lance::arrow::json::{JsonDataType, JsonSchema};
use lance::dataset::scanner::DatasetRecordBatchStream;
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
use lance_datafusion::exec::OneShotExec;
@@ -570,7 +570,19 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
            Index::BTree(_) => ("BTREE", None),
            Index::Bitmap(_) => ("BITMAP", None),
            Index::LabelList(_) => ("LABEL_LIST", None),
-            Index::FTS(_) => ("FTS", None),
+            Index::FTS(fts) => {
+                let with_position = fts.with_position;
+                let configs = serde_json::to_value(fts.tokenizer_configs).map_err(|e| {
+                    Error::InvalidInput {
+                        message: format!("failed to serialize FTS index params {:?}", e),
+                    }
+                })?;
+                for (key, value) in configs.as_object().unwrap() {
+                    body[key] = value.clone();
+                }
+                body["with_position"] = serde_json::Value::Bool(with_position);
+                ("FTS", None)
+            }
            Index::Auto => {
                let schema = self.schema().await?;
                let field = schema
@@ -643,25 +655,80 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
    }
    async fn add_columns(
        &self,
-        _transforms: NewColumnTransform,
+        transforms: NewColumnTransform,
        _read_columns: Option<Vec<String>>,
    ) -> Result<()> {
        self.check_mutable().await?;
-        Err(Error::NotSupported {
-            message: "add_columns is not yet supported.".into(),
-        })
+        match transforms {
+            NewColumnTransform::SqlExpressions(expressions) => {
+                let body = expressions
+                    .into_iter()
+                    .map(|(name, expression)| {
+                        serde_json::json!({
+                            "name": name,
+                            "expression": expression,
+                        })
+                    })
+                    .collect::<Vec<_>>();
+                let body = serde_json::json!({ "new_columns": body });
+                let request = self
+                    .client
+                    .post(&format!("/v1/table/{}/add_columns/", self.name))
+                    .json(&body);
+                let (request_id, response) = self.client.send(request, false).await?;
+                self.check_table_response(&request_id, response).await?;
+                Ok(())
+            }
+            _ => {
+                return Err(Error::NotSupported {
+                    message: "Only SQL expressions are supported for adding columns".into(),
+                });
+            }
+        }
    }

-    async fn alter_columns(&self, _alterations: &[ColumnAlteration]) -> Result<()> {
+    async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()> {
        self.check_mutable().await?;
-        Err(Error::NotSupported {
-            message: "alter_columns is not yet supported.".into(),
-        })
+        let body = alterations
+            .iter()
+            .map(|alteration| {
+                let mut value = serde_json::json!({
+                    "path": alteration.path,
+                });
+                if let Some(rename) = &alteration.rename {
+                    value["rename"] = serde_json::Value::String(rename.clone());
+                }
+                if let Some(data_type) = &alteration.data_type {
+                    let json_data_type = JsonDataType::try_from(data_type).unwrap();
+                    let json_data_type = serde_json::to_value(&json_data_type).unwrap();
+                    value["data_type"] = json_data_type;
+                }
+                if let Some(nullable) = &alteration.nullable {
+                    value["nullable"] = serde_json::Value::Bool(*nullable);
+                }
+                value
+            })
+            .collect::<Vec<_>>();
+        let body = serde_json::json!({ "alterations": body });
+        let request = self
+            .client
+            .post(&format!("/v1/table/{}/alter_columns/", self.name))
+            .json(&body);
+        let (request_id, response) = self.client.send(request, false).await?;
+        self.check_table_response(&request_id, response).await?;
+        Ok(())
    }

-    async fn drop_columns(&self, _columns: &[&str]) -> Result<()> {
+    async fn drop_columns(&self, columns: &[&str]) -> Result<()> {
        self.check_mutable().await?;
-        Err(Error::NotSupported {
-            message: "drop_columns is not yet supported.".into(),
-        })
+        let body = serde_json::json!({ "columns": columns });
+        let request = self
+            .client
+            .post(&format!("/v1/table/{}/drop_columns/", self.name))
+            .json(&body);
+        let (request_id, response) = self.client.send(request, false).await?;
+        self.check_table_response(&request_id, response).await?;
+        Ok(())
    }

    async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
@@ -844,7 +911,17 @@ mod tests {
            Box::pin(table.update().column("a", "a + 1").execute().map_ok(|_| ())),
            Box::pin(table.add(example_data()).execute().map_ok(|_| ())),
            Box::pin(table.merge_insert(&["test"]).execute(example_data())),
-            Box::pin(table.delete("false")), // TODO: other endpoints.
+            Box::pin(table.delete("false")),
+            Box::pin(table.add_columns(
+                NewColumnTransform::SqlExpressions(vec![("x".into(), "y".into())]),
+                None,
+            )),
+            Box::pin(async {
+                let alterations = vec![ColumnAlteration::new("x".into()).rename("y".into())];
+                table.alter_columns(&alterations).await
+            }),
+            Box::pin(table.drop_columns(&["a"])),
+            // TODO: other endpoints.
        ];

        for result in results {
@@ -1431,6 +1508,7 @@ mod tests {
        ];

        for (index_type, distance_type, index) in cases {
+            let params = index.clone();
            let table = Table::new_with_handler("my_table", move |request| {
                assert_eq!(request.method(), "POST");
                assert_eq!(request.url().path(), "/v1/table/my_table/create_index/");
@@ -1447,6 +1525,17 @@ mod tests {
                if let Some(distance_type) = distance_type {
                    expected_body["metric_type"] = distance_type.to_lowercase().into();
                }
+                if let Index::FTS(fts) = &params {
+                    expected_body["with_position"] = fts.with_position.into();
+                    expected_body["base_tokenizer"] = "simple".into();
+                    expected_body["language"] = "English".into();
+                    expected_body["max_token_length"] = 40.into();
+                    expected_body["lower_case"] = true.into();
+                    expected_body["stem"] = false.into();
+                    expected_body["remove_stop_words"] = false.into();
+                    expected_body["ascii_folding"] = false.into();
+                }

                assert_eq!(body, expected_body);

                http::Response::builder().status(200).body("{}").unwrap()
@@ -1799,4 +1888,114 @@ mod tests {
            .await;
        assert!(matches!(res, Err(Error::NotSupported { .. })));
    }

    #[tokio::test]
    async fn test_add_columns() {
        let table = Table::new_with_handler("my_table", |request| {
            assert_eq!(request.method(), "POST");
            assert_eq!(request.url().path(), "/v1/table/my_table/add_columns/");
            assert_eq!(
                request.headers().get("Content-Type").unwrap(),
                JSON_CONTENT_TYPE
            );

            let body = request.body().unwrap().as_bytes().unwrap();
            let body = std::str::from_utf8(body).unwrap();
            let value: serde_json::Value = serde_json::from_str(body).unwrap();
            let new_columns = value.get("new_columns").unwrap().as_array().unwrap();
            assert!(new_columns.len() == 2);

            let col_name = new_columns[0]["name"].as_str().unwrap();
            let expression = new_columns[0]["expression"].as_str().unwrap();
            assert_eq!(col_name, "b");
            assert_eq!(expression, "a + 1");

            let col_name = new_columns[1]["name"].as_str().unwrap();
            let expression = new_columns[1]["expression"].as_str().unwrap();
            assert_eq!(col_name, "x");
            assert_eq!(expression, "cast(NULL as int32)");

            http::Response::builder().status(200).body("{}").unwrap()
        });

        table
            .add_columns(
                NewColumnTransform::SqlExpressions(vec![
                    ("b".into(), "a + 1".into()),
                    ("x".into(), "cast(NULL as int32)".into()),
                ]),
                None,
            )
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn test_alter_columns() {
        let table = Table::new_with_handler("my_table", |request| {
            assert_eq!(request.method(), "POST");
            assert_eq!(request.url().path(), "/v1/table/my_table/alter_columns/");
            assert_eq!(
                request.headers().get("Content-Type").unwrap(),
                JSON_CONTENT_TYPE
            );

            let body = request.body().unwrap().as_bytes().unwrap();
            let body = std::str::from_utf8(body).unwrap();
            let value: serde_json::Value = serde_json::from_str(body).unwrap();
            let alterations = value.get("alterations").unwrap().as_array().unwrap();
            assert!(alterations.len() == 2);

            let path = alterations[0]["path"].as_str().unwrap();
            let data_type = alterations[0]["data_type"]["type"].as_str().unwrap();
            assert_eq!(path, "b.c");
            assert_eq!(data_type, "int32");

            let path = alterations[1]["path"].as_str().unwrap();
            let nullable = alterations[1]["nullable"].as_bool().unwrap();
            let rename = alterations[1]["rename"].as_str().unwrap();
            assert_eq!(path, "x");
            assert!(nullable);
            assert_eq!(rename, "y");

            http::Response::builder().status(200).body("{}").unwrap()
        });

        table
            .alter_columns(&[
                ColumnAlteration::new("b.c".into()).cast_to(DataType::Int32),
                ColumnAlteration::new("x".into())
                    .rename("y".into())
                    .set_nullable(true),
            ])
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn test_drop_columns() {
        let table = Table::new_with_handler("my_table", |request| {
            assert_eq!(request.method(), "POST");
            assert_eq!(request.url().path(), "/v1/table/my_table/drop_columns/");
            assert_eq!(
                request.headers().get("Content-Type").unwrap(),
                JSON_CONTENT_TYPE
            );

            let body = request.body().unwrap().as_bytes().unwrap();
            let body = std::str::from_utf8(body).unwrap();
            let value: serde_json::Value = serde_json::from_str(body).unwrap();
            let columns = value.get("columns").unwrap().as_array().unwrap();
            assert!(columns.len() == 2);

            let col1 = columns[0].as_str().unwrap();
            let col2 = columns[1].as_str().unwrap();
            assert_eq!(col1, "a");
            assert_eq!(col2, "b");

            http::Response::builder().status(200).body("{}").unwrap()
        });

        table.drop_columns(&["a", "b"]).await.unwrap();
    }
}

@@ -14,7 +14,6 @@

//! LanceDB Table APIs

-use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;

@@ -37,7 +36,8 @@ pub use lance::dataset::ColumnAlteration;
pub use lance::dataset::NewColumnTransform;
pub use lance::dataset::ReadParams;
use lance::dataset::{
-    Dataset, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode, WriteParams,
+    Dataset, InsertBuilder, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode,
+    WriteParams,
};
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
use lance::io::WrappingObjectStore;
@@ -1046,12 +1046,6 @@ pub struct NativeTable {
    name: String,
    uri: String,
    pub(crate) dataset: dataset::DatasetConsistencyWrapper,

    // the object store wrapper to use on write path
    store_wrapper: Option<Arc<dyn WrappingObjectStore>>,

-    storage_options: HashMap<String, String>,

    // This comes from the connection options. We store here so we can pass down
    // to the dataset when we recreate it (for example, in checkout_latest).
    read_consistency_interval: Option<std::time::Duration>,
@@ -1117,13 +1111,6 @@ impl NativeTable {
            None => params,
        };

-        let storage_options = params
-            .store_options
-            .clone()
-            .unwrap_or_default()
-            .storage_options
-            .unwrap_or_default();

        let dataset = DatasetBuilder::from_uri(uri)
            .with_read_params(params)
            .load()
@@ -1141,8 +1128,6 @@ impl NativeTable {
            name: name.to_string(),
            uri: uri.to_string(),
            dataset,
            store_wrapper: write_store_wrapper,
-            storage_options,
            read_consistency_interval,
        })
    }
@@ -1191,12 +1176,6 @@ impl NativeTable {
            Some(wrapper) => params.patch_with_store_wrapper(wrapper)?,
            None => params,
        };
-        let storage_options = params
-            .store_params
-            .clone()
-            .unwrap_or_default()
-            .storage_options
-            .unwrap_or_default();

        let dataset = Dataset::write(batches, uri, Some(params))
            .await
@@ -1210,8 +1189,6 @@ impl NativeTable {
            name: name.to_string(),
            uri: uri.to_string(),
            dataset: DatasetConsistencyWrapper::new_latest(dataset, read_consistency_interval),
            store_wrapper: write_store_wrapper,
-            storage_options,
            read_consistency_interval,
        })
    }
@@ -1758,10 +1735,13 @@ impl TableInternal for NativeTable {
        add: AddDataBuilder<NoData>,
        data: Box<dyn RecordBatchReader + Send>,
    ) -> Result<()> {
-        let data =
-            MaybeEmbedded::try_new(data, self.table_definition().await?, add.embedding_registry)?;
+        let data = Box::new(MaybeEmbedded::try_new(
+            data,
+            self.table_definition().await?,
+            add.embedding_registry,
+        )?) as Box<dyn RecordBatchReader + Send>;

-        let mut lance_params = add.write_options.lance_write_params.unwrap_or(WriteParams {
+        let lance_params = add.write_options.lance_write_params.unwrap_or(WriteParams {
            mode: match add.mode {
                AddDataMode::Append => WriteMode::Append,
                AddDataMode::Overwrite => WriteMode::Overwrite,
@@ -1769,27 +1749,15 @@ impl TableInternal for NativeTable {
            ..Default::default()
        });

-        // Bring storage options from table
-        let storage_options = lance_params
-            .store_params
-            .get_or_insert(Default::default())
-            .storage_options
-            .get_or_insert(Default::default());
-        for (key, value) in self.storage_options.iter() {
-            if !storage_options.contains_key(key) {
-                storage_options.insert(key.clone(), value.clone());
-            }
-        }
-
-        // patch the params if we have a write store wrapper
-        let lance_params = match self.store_wrapper.clone() {
-            Some(wrapper) => lance_params.patch_with_store_wrapper(wrapper)?,
-            None => lance_params,
-        };
+        let dataset = {
+            // Limited scope for the mutable borrow of self.dataset avoids deadlock.
+            let ds = self.dataset.get_mut().await?;
+            InsertBuilder::new(Arc::new(ds.clone()))
+                .with_params(&lance_params)
+                .execute_stream(data)
+                .await?
+        };

-        self.dataset.ensure_mutable().await?;
-        let dataset = Dataset::write(data, &self.uri, Some(lance_params)).await?;
        self.dataset.set_latest(dataset).await;
        Ok(())
    }

@@ -15,6 +15,7 @@
use std::sync::Arc;

use arrow_schema::{DataType, Schema};
+use lance::arrow::json::JsonDataType;
use lance::dataset::{ReadParams, WriteParams};
use lance::io::{ObjectStoreParams, WrappingObjectStore};
use lazy_static::lazy_static;
@@ -175,6 +176,15 @@ pub fn supported_vector_data_type(dtype: &DataType) -> bool {
    }
}

/// Note: this is temporary until we get a proper datatype conversion in Lance.
pub fn string_to_datatype(s: &str) -> Option<DataType> {
    let data_type = serde_json::Value::String(s.to_string());
    let json_type =
        serde_json::Value::Object([("type".to_string(), data_type)].iter().cloned().collect());
    let json_type: JsonDataType = serde_json::from_value(json_type).ok()?;
    (&json_type).try_into().ok()
}

#[cfg(test)]
mod tests {
    use super::*;
@@ -239,4 +249,11 @@ mod tests {
        assert!(validate_table_name("my@table").is_err());
        assert!(validate_table_name("name with space").is_err());
    }

    #[test]
    fn test_string_to_datatype() {
        let string = "int32";
        let expected = DataType::Int32;
        assert_eq!(string_to_datatype(string), Some(expected));
    }
}