Mirror of https://github.com/lancedb/lancedb.git (synced 2025-12-26 06:39:57 +00:00)

Compare commits: v0.1.10-py ... v0.1.11-py (34 commits)
| SHA1 |
|---|
| 2704a4522c |
| 030f07e7f0 |
| 72afa06b7a |
| 088e745e1d |
| 7a57cddb2c |
| 8ff5f88916 |
| 028a6e433d |
| 04c6814fb1 |
| c62e4ca1eb |
| aecc5fc42b |
| 2fdcb307eb |
| ad18826579 |
| a8a50591d7 |
| 6dfe7fabc2 |
| 2b108e1c80 |
| 8c9edafccc |
| 0590413b96 |
| bd2d40a927 |
| 08944bf4fd |
| 826dc90151 |
| 08cc483ec9 |
| ff1d206182 |
| c385c55629 |
| 0a03f7ca5a |
| 88be978e87 |
| 98b12caa06 |
| 091dffb171 |
| ace6aa883a |
| 80c25f9896 |
| caf22fdb71 |
| 0e7ae5dfbf |
| b261e27222 |
| 9f603f73a9 |
| 9ef846929b |
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.10
+current_version = 0.1.13
 commit = True
 message = Bump version: {current_version} → {new_version}
 tag = True

2  .github/workflows/docs_test.yml  (vendored)

@@ -81,7 +81,7 @@ jobs:
       run: |
         cd docs/test/node_modules/vectordb
         npm ci
-        npm run build
+        npm run build-release
         npm run tsc
     - name: Create test files
       run: |

12  .github/workflows/node.yml  (vendored)

@@ -67,8 +67,12 @@ jobs:
     - name: Build
       run: |
         npm ci
-        npm run build
         npm run tsc
+        npm run build
+        npm run pack-build
+        npm install --no-save ./dist/vectordb-*.tgz
+        # Remove index.node to test with dependency installed
+        rm index.node
     - name: Test
       run: npm run test
   macos:
@@ -94,8 +98,12 @@ jobs:
     - name: Build
       run: |
         npm ci
-        npm run build
         npm run tsc
+        npm run build
+        npm run pack-build
+        npm install --no-save ./dist/vectordb-*.tgz
+        # Remove index.node to test with dependency installed
+        rm index.node
     - name: Test
       run: |
         npm run test

171  .github/workflows/npm-publish.yml  (vendored, new file)

@@ -0,0 +1,171 @@
name: NPM Publish

on:
  release:
    types: [ published ]

jobs:
  node:
    runs-on: ubuntu-latest
    # Only runs on tags that matches the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    defaults:
      run:
        shell: bash
        working-directory: node
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - uses: actions/setup-node@v3
        with:
          node-version: 20
          cache: 'npm'
          cache-dependency-path: node/package-lock.json
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          npm ci
          npm run tsc
          npm pack
      - name: Upload Linux Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: node-package
          path: |
            node/vectordb-*.tgz

  node-macos:
    runs-on: macos-12
    # Only runs on tags that matches the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    strategy:
      fail-fast: false
      matrix:
        target: [x86_64-apple-darwin, aarch64-apple-darwin]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Install system dependencies
        run: brew install protobuf
      - name: Install npm dependencies
        run: |
          cd node
          npm ci
      - name: Install rustup target
        if: ${{ matrix.target == 'aarch64-apple-darwin' }}
        run: rustup target add aarch64-apple-darwin
      - name: Build MacOS native node modules
        run: bash ci/build_macos_artifacts.sh ${{ matrix.target }}
      - name: Upload Darwin Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: darwin-native
          path: |
            node/dist/vectordb-darwin*.tgz

  node-linux:
    name: node-linux (${{ matrix.arch }}-unknown-linux-${{ matrix.libc }})
    runs-on: ubuntu-latest
    # Only runs on tags that matches the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    strategy:
      fail-fast: false
      matrix:
        libc:
          - gnu
          # TODO: re-enable musl once we have refactored to pre-built containers
          # Right now we have to build node from source which is too expensive.
          # - musl
        arch:
          - x86_64
          # Building on aarch64 is too slow for now
          # - aarch64
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Change owner to root (for npm)
        # The docker container is run as root, so we need the files to be owned by root
        # Otherwise npm is a nightmare: https://github.com/npm/cli/issues/3773
        run: sudo chown -R root:root .
      - name: Set up QEMU
        if: ${{ matrix.arch == 'aarch64' }}
        uses: docker/setup-qemu-action@v2
        with:
          platforms: arm64
      - name: Build Linux GNU native node modules
        if: ${{ matrix.libc == 'gnu' }}
        run: |
          docker run \
            -v $(pwd):/io -w /io \
            rust:1.70-bookworm \
            bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-gnu
      - name: Build musl Linux native node modules
        if: ${{ matrix.libc == 'musl' }}
        run: |
          docker run --platform linux/arm64/v8 \
            -v $(pwd):/io -w /io \
            quay.io/pypa/musllinux_1_1_${{ matrix.arch }} \
            bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-musl
      - name: Upload Linux Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: linux-native
          path: |
            node/dist/vectordb-linux*.tgz

  node-windows:
    runs-on: windows-2022
    # Only runs on tags that matches the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    strategy:
      fail-fast: false
      matrix:
        target: [x86_64-pc-windows-msvc]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Install Protoc v21.12
        working-directory: C:\
        run: |
          New-Item -Path 'C:\protoc' -ItemType Directory
          Set-Location C:\protoc
          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
          7z x protoc.zip
          Add-Content $env:GITHUB_PATH "C:\protoc\bin"
        shell: powershell
      - name: Install npm dependencies
        run: |
          cd node
          npm ci
      - name: Build Windows native node modules
        run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
      - name: Upload Windows Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: windows-native
          path: |
            node/dist/vectordb-win32*.tgz

  release:
    needs: [node, node-macos, node-linux]
    runs-on: ubuntu-latest
    # Only runs on tags that matches the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: actions/download-artifact@v3
      - name: Display structure of downloaded files
        run: ls -R
      - uses: actions/setup-node@v3
        with:
          node-version: 20
      - name: Publish to NPM
        env:
          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
        run: |
          mv */*.tgz .
          for filename in *.tgz; do
            npm publish $filename
          done

22  .github/workflows/rust.yml  (vendored)

@@ -6,6 +6,7 @@ on:
       - main
   pull_request:
     paths:
+      - Cargo.toml
       - rust/**
       - .github/workflows/rust.yml
 
@@ -65,3 +66,24 @@ jobs:
       run: cargo build --all-features
     - name: Run tests
       run: cargo test --all-features
+  windows:
+    runs-on: windows-2022
+    steps:
+      - uses: actions/checkout@v3
+      - uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: rust
+      - name: Install Protoc v21.12
+        working-directory: C:\
+        run: |
+          New-Item -Path 'C:\protoc' -ItemType Directory
+          Set-Location C:\protoc
+          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
+          7z x protoc.zip
+          Add-Content $env:GITHUB_PATH "C:\protoc\bin"
+        shell: powershell
+      - name: Run tests
+        run: |
+          $env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT
+          cargo build
+          cargo test

2  .gitignore  (vendored)

@@ -5,6 +5,8 @@
 .DS_Store
 venv
+
+.vscode
 rust/target
 rust/Cargo.lock
 

12  Cargo.toml

@@ -6,9 +6,11 @@ members = [
 resolver = "2"
 
 [workspace.dependencies]
-lance = "0.5.3"
-arrow-array = "40.0"
-arrow-data = "40.0"
-arrow-schema = "40.0"
-arrow-ipc = "40.0"
+lance = "=0.5.8"
+arrow-array = "42.0"
+arrow-data = "42.0"
+arrow-schema = "42.0"
+arrow-ipc = "42.0"
+half = { "version" = "2.2.1", default-features = false }
 object_store = "0.6.1"
 

72  ci/build_linux_artifacts.sh  (new file)

@@ -0,0 +1,72 @@
#!/bin/bash
# Builds the Linux artifacts (node binaries).
# Usage: ./build_linux_artifacts.sh [target]
# Targets supported:
# - x86_64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-musl
# - x86_64-unknown-linux-musl

# TODO: refactor this into a Docker container we can pull

set -e

setup_dependencies() {
    echo "Installing system dependencies..."
    if [[ $1 == *musl ]]; then
        # musllinux
        apk add openssl-dev
    else
        # rust / debian
        apt update
        apt install -y libssl-dev protobuf-compiler
    fi
}

install_node() {
    echo "Installing node..."
    curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
    source "$HOME"/.bashrc

    if [[ $1 == *musl ]]; then
        # This node version is 15, we need 16 or higher:
        # apk add nodejs-current npm
        # So instead we install from source (nvm doesn't provide binaries for musl):
        nvm install -s --no-progress 17
    else
        nvm install --no-progress 17 # latest that supports glibc 2.17
    fi
}

build_node_binary() {
    echo "Building node library for $1..."
    pushd node

    npm ci

    if [[ $1 == *musl ]]; then
        # This is needed for cargo to allow build cdylibs with musl
        export RUSTFLAGS="-C target-feature=-crt-static"
    fi

    # Cargo can run out of memory while pulling dependencies, especially when running
    # in QEMU. This is a workaround for that.
    export CARGO_NET_GIT_FETCH_WITH_CLI=true

    # We don't pass in target, since the native target here already matches
    # We need to pass OPENSSL_LIB_DIR and OPENSSL_INCLUDE_DIR for static build to work https://github.com/sfackler/rust-openssl/issues/877
    OPENSSL_STATIC=1 OPENSSL_LIB_DIR=/usr/lib/x86_64-linux-gnu OPENSSL_INCLUDE_DIR=/usr/include/openssl/ npm run build-release
    npm run pack-build

    popd
}

TARGET=${1:-x86_64-unknown-linux-gnu}
# Others:
# aarch64-unknown-linux-gnu
# x86_64-unknown-linux-musl
# aarch64-unknown-linux-musl

setup_dependencies $TARGET
install_node $TARGET
build_node_binary $TARGET

33  ci/build_macos_artifacts.sh  (new file)

@@ -0,0 +1,33 @@
# Builds the macOS artifacts (node binaries).
# Usage: ./ci/build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin

prebuild_rust() {
    # Building here for the sake of easier debugging.
    pushd rust/ffi/node
    echo "Building rust library for $1"
    export RUST_BACKTRACE=1
    cargo build --release --target $1
    popd
}

build_node_binaries() {
    pushd node
    echo "Building node library for $1"
    npm run build-release -- --target $1
    npm run pack-build -- --target $1
    popd
}

if [ -n "$1" ]; then
    targets=$1
else
    targets="x86_64-apple-darwin aarch64-apple-darwin"
fi

echo "Building artifacts for targets: $targets"
for target in $targets
do
    prebuild_rust $target
    build_node_binaries $target
done

41  ci/build_windows_artifacts.ps1  (new file)

@@ -0,0 +1,41 @@
# Builds the Windows artifacts (node binaries).
# Usage: .\ci\build_windows_artifacts.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc

function Prebuild-Rust {
    param (
        [string]$target
    )

    # Building here for the sake of easier debugging.
    Push-Location -Path "rust/ffi/node"
    Write-Host "Building rust library for $target"
    $env:RUST_BACKTRACE=1
    cargo build --release --target $target
    Pop-Location
}

function Build-NodeBinaries {
    param (
        [string]$target
    )

    Push-Location -Path "node"
    Write-Host "Building node library for $target"
    npm run build-release -- --target $target
    npm run pack-build -- --target $target
    Pop-Location
}

$targets = $args[0]
if (-not $targets) {
    $targets = "x86_64-pc-windows-msvc"
}

Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
    Prebuild-Rust $target
    Build-NodeBinaries $target
}
@@ -50,13 +50,19 @@ markdown_extensions:
   - pymdownx.superfences
   - pymdownx.tabbed:
       alternate_style: true
+  - md_in_html
 
 nav:
   - Home: index.md
   - Basics: basic.md
   - Embeddings: embedding.md
   - Python full-text search: fts.md
-  - Python integrations: integrations.md
+  - Python integrations:
+      - Pandas and PyArrow: python/arrow.md
+      - DuckDB: python/duckdb.md
+      - LangChain 🦜️🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
+      - LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
+      - Pydantic: python/pydantic.md
   - Python examples:
       - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
       - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
@@ -65,6 +71,7 @@ nav:
   - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
   - Javascript examples:
       - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
+      - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
   - References:
       - Vector Search: search.md
       - SQL filters: sql.md
@@ -1,7 +1,7 @@
 # ANN (Approximate Nearest Neighbor) Indexes
 
 You can create an index over your vector data to make search faster.
-Vector indexes are faster but less accurate than exhaustive search.
+Vector indexes are faster but less accurate than exhaustive search (KNN or Flat Search).
 LanceDB provides many parameters to fine-tune the index's size, the speed of queries, and the accuracy of results.
 
 Currently, LanceDB does *not* automatically create the ANN index.
@@ -10,7 +10,18 @@ If you can live with <100ms latency, skipping index creation is a simpler workfl
 
 In the future we will look to automatically create and configure the ANN index.
 
-## Creating an ANN Index
+## Types of Index
+
+Lance supports multiple index types; the most widely used one is `IVF_PQ`.
+
+* `IVF_PQ`: uses an **Inverted File Index (IVF)** to first divide the dataset into `N` partitions,
+  and then uses **Product Quantization** to compress the vectors in each partition.
+* `DISKANN` (**Experimental**): organizes the vectors as an on-disk graph, where the vertices approximately
+  represent the nearest neighbors of each vector.
+
+## Creating an IVF_PQ Index
+
+Lance supports the `IVF_PQ` index type by default.
 
 === "Python"
     Creating indexes is done via the [create_index](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.create_index) method.
@@ -45,15 +56,18 @@ In the future we will look to automatically create and configure the ANN index.
     await table.createIndex({ type: 'ivf_pq', column: 'vector', num_partitions: 256, num_sub_vectors: 96 })
     ```
 
-Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index
-creation by providing the following parameters:
-- **metric** (default: "L2"): The distance metric to use. By default we use euclidean distance. We also support "cosine" distance.
-- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table
-with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional.
-A higher number leads to faster queries, but it makes index generation slower.
-- **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes
-search more accurate, but also makes the index larger and slower to build.
+- **metric** (default: "L2"): The distance metric to use. By default it uses euclidean distance ("`L2`").
+  We also support "cosine" and "dot" distance.
+- **num_partitions** (default: 256): The number of partitions of the index.
+- **num_sub_vectors** (default: 96): The number of sub-vectors (M) that will be created during Product Quantization (PQ).
+  A D-dimensional vector is divided into `M` sub-vectors of size `D/M`, each of which is represented by
+  a single PQ code.
+
+<figure markdown>
+  ![IVF_PQ](../assets/ivf_pq.png)
+  <figcaption>IVF_PQ index with <code>num_partitions=2, num_sub_vectors=4</code></figcaption>
+</figure>
 
 ## Querying an ANN Index
 
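For reference, a minimal Python sketch of the `create_index` call these parameters belong to (the table name and vector dimension are illustrative, not taken from the diff):

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")
table = db.open_table("my_vectors")

# Build an IVF_PQ index using the parameters documented above.
# num_sub_vectors should divide the vector dimension evenly (e.g. 1536 / 96 = 16).
table.create_index(
    metric="L2",          # or "cosine" / "dot"
    num_partitions=256,   # number of IVF partitions
    num_sub_vectors=96,   # number of PQ sub-vectors (M)
)
```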
@@ -138,3 +152,31 @@ You can select the columns returned by the query using a select clause.
         .select(["id"])
         .execute()
     ```
+
+## FAQ
+
+### When is it necessary to create an ANN vector index?
+
+`LanceDB` has manually tuned SIMD code for computing vector distances.
+In our benchmarks, computing 100K pairs of 1K-dimension vectors takes less than 20ms.
+For small datasets (<100K rows) or applications that can accept 100ms latency, vector indices are usually not necessary.
+
+For large-scale or higher-dimension vectors, it is beneficial to create a vector index.
+
+### How big is my index, and how much memory will it take?
+
+In LanceDB, all vector indices are disk-based, meaning that when responding to a vector query, only the relevant pages from the index file are loaded from disk and cached in memory. Additionally, each sub-vector is usually encoded into a 1-byte PQ code.
+
+For example, with a 1024-dimension dataset, if we choose `num_sub_vectors=64`, each sub-vector has `1024 / 64 = 16` float32 numbers.
+Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` times of space reduction.
+
+### How do I choose `num_partitions` and `num_sub_vectors` for an `IVF_PQ` index?
+
+`num_partitions` decides how many partitions the first-level `IVF` index uses.
+A higher number of partitions can lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
+On the `SIFT-1M` dataset, our benchmarks show that keeping each partition at 1K-4K rows leads to a good latency / recall trade-off.
+
+`num_sub_vectors` decides how many Product Quantization codes to generate for each vector. Because
+Product Quantization is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
+less space distortion, and thus better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
+more PQ computation, and thus higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for better SIMD efficiency.
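To make the FAQ's sizing arithmetic concrete, here is a small sketch in plain Python (not a LanceDB API) that reproduces the numbers above; the 1K-4K rows-per-partition heuristic is the one stated in the FAQ:

```python
def pq_compression_ratio(dim: int, num_sub_vectors: int) -> float:
    """Each sub-vector of dim/num_sub_vectors float32 values (4 bytes each)
    is encoded into a single 1-byte PQ code."""
    floats_per_sub_vector = dim / num_sub_vectors
    return floats_per_sub_vector * 4 / 1

def suggested_num_partitions(num_rows: int, rows_per_partition: int = 2048) -> int:
    """Aim for roughly 1K-4K rows per IVF partition."""
    return max(1, round(num_rows / rows_per_partition))

print(pq_compression_ratio(1024, 64))       # 64.0, matching the FAQ example
print(suggested_num_partitions(1_000_000))  # ~488; a power of two such as 512 is conventional
```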

BIN  docs/src/assets/ivf_pq.png  (new binary file, not shown; size after: 266 KiB)

@@ -46,7 +46,7 @@ You can also use an external API like OpenAI to generate embeddings
 
     def embed_func(c):
         rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
         return [record["embedding"] for record in rs["data"]]
     ```
 
 === "Javascript"
@@ -126,7 +126,7 @@ belong in the same latent space and your results will be nonsensical.
 === "Javascript"
     ```javascript
     const results = await table
-      .search('What's the best pizza topping?')
+      .search("What's the best pizza topping?")
       .limit(10)
       .execute()
     ```

121  docs/src/examples/transformerjs_embedding_search_nodejs.md  (new file)

@@ -0,0 +1,121 @@
# Vector embedding search using TransformersJS

## Embed and query data from LanceDB using TransformersJS

<img id="splash" width="400" alt="transformersjs" src="https://github.com/lancedb/lancedb/assets/43097991/88a31e30-3d6f-4eef-9216-4b7c688f1b4f">

This example shows how to use the [transformers.js](https://github.com/xenova/transformers.js) library to perform vector embedding search using LanceDB's Javascript API.

### Setting up
First, install the dependencies:
```bash
npm install vectordb
npm i @xenova/transformers
```

We will also be using the [all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) model to make it compatible with Transformers.js

Within our `index.js` file we will import the necessary libraries and define our model and database:

```javascript
const lancedb = require('vectordb')
const { pipeline } = await import('@xenova/transformers')
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
```

### Creating the embedding function

Next, we will create a function that will take in a string and return the vector embedding of that string. We will use the `pipe` function we defined earlier to get the vector embedding of the string.

```javascript
// Define the function. `sourceColumn` is required for LanceDB to know
// which column to use as input.
const embed_fun = {}
embed_fun.sourceColumn = 'text'
embed_fun.embed = async function (batch) {
    let result = []
    // Given a batch of strings, we will use the `pipe` function to get
    // the vector embedding of each string.
    for (let text of batch) {
        // 'mean' pooling and normalizing allows the embeddings to share the
        // same length.
        const res = await pipe(text, { pooling: 'mean', normalize: true })
        result.push(Array.from(res['data']))
    }
    return (result)
}
```

### Creating the database

Now, we will create the LanceDB database and add the embedding function we defined earlier.

```javascript
// Link a folder and create a table with data
const db = await lancedb.connect('data/sample-lancedb')

// You can also import any other data, but make sure that you have a column
// for the embedding function to use.
const data = [
    { id: 1, text: 'Cherry', type: 'fruit' },
    { id: 2, text: 'Carrot', type: 'vegetable' },
    { id: 3, text: 'Potato', type: 'vegetable' },
    { id: 4, text: 'Apple', type: 'fruit' },
    { id: 5, text: 'Banana', type: 'fruit' }
]

// Create the table with the embedding function
const table = await db.createTable('food_table', data, "create", embed_fun)
```

### Performing the search

Now, we can perform the search using the `search` function. LanceDB automatically uses the embedding function we defined earlier to get the vector embedding of the query string.

```javascript
// Query the table
const results = await table
    .search("a sweet fruit to eat")
    .metricType("cosine")
    .limit(2)
    .execute()
console.log(results.map(r => r.text))
```
```bash
[ 'Banana', 'Cherry' ]
```

Output of `results`:
```bash
[
  {
    vector: Float32Array(384) [
      -0.057455405592918396,
      0.03617725893855095,
      -0.0367760956287384,
      ... 381 more items
    ],
    id: 5,
    text: 'Banana',
    type: 'fruit',
    score: 0.4919965863227844
  },
  {
    vector: Float32Array(384) [
      0.0009714411571621895,
      0.008223623037338257,
      0.009571489877998829,
      ... 381 more items
    ],
    id: 1,
    text: 'Cherry',
    type: 'fruit',
    score: 0.5540297031402588
  }
]
```

### Wrapping it up

In this example, we showed how to use the `transformers.js` library to perform vector embedding search using LanceDB's Javascript API. You can find the full code for this example on [Github](https://github.com/lancedb/lancedb/blob/main/node/examples/js-transformers/index.js)!

@@ -67,6 +67,6 @@ LanceDB's core is written in Rust 🦀 and is built using <a href="https://githu
 * [`Embedding Functions`](embedding.md) - functions for working with embeddings.
 * [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
 * [`Full text search`](fts.md) - [EXPERIMENTAL] full-text search API
-* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
+* [`Ecosystem Integrations`](python/integration.md) - integrating LanceDB with python data tooling ecosystem.
 * [`Python API Reference`](python/python.md) - detailed documentation for the LanceDB Python SDK.
 * [`Node API Reference`](javascript/modules.md) - detailed documentation for the LanceDB Python SDK.

(deleted file)

@@ -1,116 +0,0 @@
# Integrations

Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, PyArrow and DuckDB.

## Pandas and PyArrow

First, we need to connect to a `LanceDB` database.

```py
import lancedb

db = lancedb.connect("data/sample-lancedb")
```

And write a `Pandas DataFrame` to LanceDB directly.

```py
import pandas as pd

data = pd.DataFrame({
    "vector": [[3.1, 4.1], [5.9, 26.5]],
    "item": ["foo", "bar"],
    "price": [10.0, 20.0]
})
table = db.create_table("pd_table", data=data)
```

You will find detailed instructions of creating dataset and index in [Basic Operations](basic.md) and [Indexing](ann_indexes.md)
sections.

We can now perform similarity searches via `LanceDB`.

```py
# Open the table previously created.
table = db.open_table("pd_table")

query_vector = [100, 100]
# Pandas DataFrame
df = table.search(query_vector).limit(1).to_df()
print(df)
```

```
        vector item  price        score
0  [5.9, 26.5]  bar   20.0  14257.05957
```

If you have a simple filter, it's faster to provide a where clause to `LanceDB`'s search query.
If you have more complex criteria, you can always apply the filter to the resulting pandas `DataFrame` from the search query.

```python
# Apply the filter via LanceDB
results = table.search([100, 100]).where("price < 15").to_df()
assert len(results) == 1
assert results["item"].iloc[0] == "foo"

# Apply the filter via Pandas
df = results = table.search([100, 100]).to_df()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
```

## DuckDB

`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow).

Let us start with installing `duckdb` and `lancedb`.

```shell
pip install duckdb lancedb
```

We will re-use the dataset created previously

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")
table = db.open_table("pd_table")
arrow_table = table.to_arrow()
```

`DuckDB` can directly query the `arrow_table`:

```python
import duckdb

duckdb.query("SELECT * FROM arrow_table")
```

```
┌─────────────┬─────────┬────────┐
│   vector    │  item   │ price  │
│   float[]   │ varchar │ double │
├─────────────┼─────────┼────────┤
│ [3.1, 4.1]  │ foo     │ 10.0   │
│ [5.9, 26.5] │ bar     │ 20.0   │
└─────────────┴─────────┴────────┘
```
```python
duckdb.query("SELECT mean(price) FROM arrow_table")
```

```
Out[16]:
┌─────────────┐
│ mean(price) │
│   double    │
├─────────────┤
│    15.0     │
└─────────────┘
```

101  docs/src/python/arrow.md  (new file)

@@ -0,0 +1,101 @@
# Pandas and PyArrow

Built on top of [Apache Arrow](https://arrow.apache.org/),
`LanceDB` is easy to integrate with the Python ecosystem, including [Pandas](https://pandas.pydata.org/)
and PyArrow.

## Create dataset

First, we need to connect to a `LanceDB` database.

```py
import lancedb

db = lancedb.connect("data/sample-lancedb")
```

Afterwards, we write a `Pandas DataFrame` to LanceDB directly.

```py
import pandas as pd

data = pd.DataFrame({
    "vector": [[3.1, 4.1], [5.9, 26.5]],
    "item": ["foo", "bar"],
    "price": [10.0, 20.0]
})
table = db.create_table("pd_table", data=data)
```

Similar to [`pyarrow.write_dataset()`](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html),
[db.create_table()](../python/#lancedb.db.DBConnection.create_table) accepts a wide range of forms of data.

For example, if you have a dataset that is larger than memory, you can create the table with an `Iterator[pyarrow.RecordBatch]`
to lazily generate the data:

```py
from typing import Iterable
import pyarrow as pa
import lancedb

def make_batches() -> Iterable[pa.RecordBatch]:
    for i in range(5):
        yield pa.RecordBatch.from_arrays(
            [
                pa.array([[3.1, 4.1], [5.9, 26.5]]),
                pa.array(["foo", "bar"]),
                pa.array([10.0, 20.0]),
            ],
            ["vector", "item", "price"])

schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32())),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])

table = db.create_table("iterable_table", data=make_batches(), schema=schema)
```

You will find detailed instructions on creating datasets in the
[Basic Operations](../basic.md) and [API](../python/#lancedb.db.DBConnection.create_table)
sections.

## Vector Search

We can now perform similarity search via the `LanceDB` Python API.

```py
# Open the table previously created.
table = db.open_table("pd_table")

query_vector = [100, 100]
# Pandas DataFrame
df = table.search(query_vector).limit(1).to_df()
print(df)
```

```
        vector item  price        score
0  [5.9, 26.5]  bar   20.0  14257.05957
```

If you have a simple filter, it's faster to provide a `where` clause to `LanceDB`'s search query.
If you have more complex criteria, you can always apply the filter to the resulting Pandas `DataFrame`.

```python
# Apply the filter via LanceDB
results = table.search([100, 100]).where("price < 15").to_df()
assert len(results) == 1
assert results["item"].iloc[0] == "foo"

# Apply the filter via Pandas
df = results = table.search([100, 100]).to_df()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
```

56  docs/src/python/duckdb.md  (new file)

@@ -0,0 +1,56 @@
# DuckDB

`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow).

Let us start with installing `duckdb` and `lancedb`.

```shell
pip install duckdb lancedb
```

We will re-use [the dataset created previously](./arrow.md):

```python
import pandas as pd
import lancedb

db = lancedb.connect("data/sample-lancedb")
data = pd.DataFrame({
    "vector": [[3.1, 4.1], [5.9, 26.5]],
    "item": ["foo", "bar"],
    "price": [10.0, 20.0]
})
table = db.create_table("pd_table", data=data)
arrow_table = table.to_arrow()
```

`DuckDB` can directly query the `arrow_table`:

```python
import duckdb

duckdb.query("SELECT * FROM arrow_table")
```

```
┌─────────────┬─────────┬────────┐
│   vector    │  item   │ price  │
│   float[]   │ varchar │ double │
├─────────────┼─────────┼────────┤
│ [3.1, 4.1]  │ foo     │ 10.0   │
│ [5.9, 26.5] │ bar     │ 20.0   │
└─────────────┴─────────┴────────┘
```

```py
duckdb.query("SELECT mean(price) FROM arrow_table")
```

```
┌─────────────┐
│ mean(price) │
│   double    │
├─────────────┤
│    15.0     │
└─────────────┘
```

7  docs/src/python/integration.md  (new file)

@@ -0,0 +1,7 @@
# Integration

Built on top of [Apache Arrow](https://arrow.apache.org/),
`LanceDB` is easy to integrate with the Python ecosystem.

* [Pandas and Arrow Integration](./arrow.md)
* [DuckDB Integration](./duckdb.md)

35  docs/src/python/pydantic.md  (new file)

@@ -0,0 +1,35 @@
# Pydantic

[Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python.

## Schema

LanceDB supports creating an Apache Arrow Schema from a
[Pydantic BaseModel](https://docs.pydantic.dev/latest/api/main/#pydantic.main.BaseModel)
via the [pydantic_to_schema()](python.md#lancedb.pydantic.pydantic_to_schema) method.

::: lancedb.pydantic.pydantic_to_schema

## Vector Field

LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a
vector Field in a Pydantic Model.

::: lancedb.pydantic.vector

## Type Conversion

LanceDB automatically converts Pydantic fields to
[Apache Arrow DataType](https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html#pyarrow.DataType).

Currently supported type conversions:

| Pydantic Field Type | PyArrow Data Type |
| ------------------- | ----------------- |
| `int` | `pyarrow.int64` |
| `float` | `pyarrow.float64` |
| `bool` | `pyarrow.bool` |
| `str` | `pyarrow.utf8()` |
| `list` | `pyarrow.List` |
| `BaseModel` | `pyarrow.Struct` |
| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
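A minimal sketch of the integration described above (the model and field names are illustrative; it assumes the `vector()` and `pydantic_to_schema()` helpers documented in this section):

```python
from pydantic import BaseModel
from lancedb.pydantic import pydantic_to_schema, vector

class Document(BaseModel):
    id: int                 # converted to pyarrow.int64
    text: str               # converted to pyarrow.utf8()
    embedding: vector(384)  # converted to pyarrow.FixedSizeList(float32, 384)

# Derive an Apache Arrow schema from the Pydantic model.
schema = pydantic_to_schema(Document)
print(schema)
```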

@@ -43,3 +43,17 @@ pip install lancedb
 ::: lancedb.fts.populate_index
 
 ::: lancedb.fts.search_index
+
+## Utilities
+
+::: lancedb.vector
+
+## Integrations
+
+### Pydantic
+
+::: lancedb.pydantic.pydantic_to_schema
+
+::: lancedb.pydantic.vector
@@ -25,9 +25,9 @@ Currently, we support the following metrics:
 
 ### Flat Search
 
-If there is no [vector index is created](ann_indexes.md), LanceDB will just brute-force scan
-the vector column and compute the distance.
+Without a vector index, LanceDB scans (`Flat Search`) the entire vector column
+and computes the distance for each vector in order to find the closest matches.
 
 <!-- Setup Code
 ```python
@@ -79,39 +79,43 @@ await db_setup.createTable('my_vectors', data)
 const tbl = await db.openTable("my_vectors")
 
 const results_1 = await tbl.search(Array(1536).fill(1.2))
-    .limit(20)
+    .limit(10)
     .execute()
 ```
 
 
-<!-- Commenting out for now since metricType fails for JS on Ubuntu 22.04.
 By default, `l2` will be used as `Metric` type. You can customize the metric type
 as well.
--->
 
-<!--
 === "Python"
--->
-<!-- ```python
+    ```python
     df = tbl.search(np.random.random((1536))) \
         .metric("cosine") \
         .limit(10) \
         .to_df()
     ```
--->
-<!--
-=== "JavaScript"
--->
-
-<!-- ```javascript
+=== "JavaScript"
+
+    ```javascript
     const results_2 = await tbl.search(Array(1536).fill(1.2))
         .metricType("cosine")
-        .limit(20)
+        .limit(10)
         .execute()
     ```
--->
 
-### Search with Vector Index.
+### Approximate Nearest Neighbor (ANN) Search with Vector Index.
+
+To accelerate vector retrievals, it is common to build vector indices.
+A vector index is a data structure specifically designed to efficiently organize and
+search vector data based on their similarity or distance metrics.
+By constructing a vector index, you can reduce the search space and avoid the need
+for brute-force scanning of the entire vector column.
+
+However, fast vector search using indices often entails making a trade-off with accuracy to some extent.
+This is why it is often called **Approximate Nearest Neighbors (ANN)** search, while Flat Search (KNN)
+always returns 100% recall.
 
 See [ANN Index](ann_indexes.md) for more details.
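Because flat search is exact, its top-k results can serve as ground truth for the ANN trade-off described above. A rough sketch in plain Python (the table, query vector, and `id` column are hypothetical; it assumes the same `search` call automatically goes through the index once one has been created):

```python
k = 10
# Before create_index: flat search, exact results.
exact_ids = set(table.search(query_vector).limit(k).to_df()["id"])
# After create_index: the identical call now uses the ANN index.
ann_ids = set(table.search(query_vector).limit(k).to_df()["id"])

# Fraction of the true top-k that the ANN search recovered.
recall = len(exact_ids & ann_ids) / k
print(f"recall@{k} = {recall:.2f}")
```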
@@ -7,6 +7,7 @@ const excludedFiles = [
   "../src/embedding.md",
   "../src/examples/serverless_lancedb_with_s3_and_lambda.md",
   "../src/examples/serverless_qa_bot_with_modal_and_langchain.md",
+  "../src/examples/transformerjs_embedding_search_nodejs.md",
   "../src/examples/youtube_transcript_bot_with_nodejs.md",
 ];
 const nodePrefix = "javascript";

4  node/.npmignore  (new file)

@@ -0,0 +1,4 @@
gen_test_data.py
index.node
dist/lancedb*.tgz
vectordb*.tgz

@@ -8,6 +8,10 @@ A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb)
 npm install vectordb
 ```
 
+This will download the appropriate native library for your platform. We currently
+support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
+yet support Windows or musl-based Linux (such as Alpine Linux).
+
 ## Usage
 
 ### Basic Example
@@ -26,12 +30,34 @@ The [examples](./examples) folder contains complete examples.
 
 ## Development
 
-Run the tests with
+To build everything fresh:
+
+```bash
+npm install
+npm run tsc
+npm run build
+```
+
+Then you should be able to run the tests with:
 
 ```bash
 npm test
 ```
+
+### Rebuilding Rust library
+
+```bash
+npm run build
+```
+
+### Rebuilding Typescript
+
+```bash
+npm run tsc
+```
+
+### Fix lints
+
 To run the linter and have it automatically fix all errors
 
 ```bash

66  node/examples/js-transformers/index.js  (new file)

@@ -0,0 +1,66 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

async function example() {

  const lancedb = require('vectordb')

  // Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2)
  const { pipeline } = await import('@xenova/transformers')
  const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');

  // Create embedding function from pipeline which returns a list of vectors from batch
  // sourceColumn is the name of the column in the data to be embedded
  //
  // Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector
  const embed_fun = {}
  embed_fun.sourceColumn = 'text'
  embed_fun.embed = async function (batch) {
    let result = []
    for (let text of batch) {
      const res = await pipe(text, { pooling: 'mean', normalize: true })
      result.push(Array.from(res['data']))
    }
    return (result)
  }

  // Link a folder and create a table with data
  const db = await lancedb.connect('data/sample-lancedb')

  const data = [
    { id: 1, text: 'Cherry', type: 'fruit' },
    { id: 2, text: 'Carrot', type: 'vegetable' },
    { id: 3, text: 'Potato', type: 'vegetable' },
    { id: 4, text: 'Apple', type: 'fruit' },
    { id: 5, text: 'Banana', type: 'fruit' }
  ]

  const table = await db.createTable('food_table', data, "create", embed_fun)

  // Query the table
  const results = await table
    .search("a sweet fruit to eat")
    .metricType("cosine")
    .limit(2)
    .execute()
  console.log(results.map(r => r.text))

}

example().then(_ => { console.log("Done!") })

16  node/examples/js-transformers/package.json  (new file)

@@ -0,0 +1,16 @@
{
  "name": "vectordb-example-js-transformers",
  "version": "1.0.0",
  "description": "Example for using transformers.js with lancedb",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "@xenova/transformers": "^2.4.1",
    "vectordb": "^0.1.12"
  }
}

@@ -12,29 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+const { currentTarget } = require('@neon-rs/load');
+
 let nativeLib;
 
-function getPlatformLibrary() {
-  if (process.platform === "darwin" && process.arch == "arm64") {
-    return require('./aarch64-apple-darwin.node');
-  } else if (process.platform === "darwin" && process.arch == "x64") {
-    return require('./x86_64-apple-darwin.node');
-  } else if (process.platform === "linux" && process.arch == "x64") {
-    return require('./x86_64-unknown-linux-gnu.node');
-  } else {
-    throw new Error(`vectordb: unsupported platform ${process.platform}_${process.arch}. Please file a bug report at https://github.com/lancedb/lancedb/issues`)
-  }
-}
-
 try {
-  nativeLib = require('./index.node')
+  nativeLib = require(`vectordb-${currentTarget()}`);
 } catch (e) {
-  if (e.code === "MODULE_NOT_FOUND") {
-    nativeLib = getPlatformLibrary();
-  } else {
-    throw new Error('vectordb: failed to load native library. Please file a bug report at https://github.com/lancedb/lancedb/issues');
+  try {
+    // Might be developing locally, so try that. But don't expose that error
+    // to the user.
+    nativeLib = require("./index.node");
+  } catch {
+    throw new Error(`vectordb: failed to load native library.
+You may need to run \`npm install vectordb-${currentTarget()}\`.
+
+If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
+
+Source error: ${e}`);
   }
 }
 
-module.exports = nativeLib
+// Dynamic require for runtime.
+module.exports = nativeLib;
101
node/package-lock.json
generated
101
node/package-lock.json
generated
@@ -1,18 +1,29 @@
 {
   "name": "vectordb",
-  "version": "0.1.9",
+  "version": "0.1.13",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.1.9",
+      "version": "0.1.13",
+      "cpu": [
+        "x64",
+        "arm64"
+      ],
       "license": "Apache-2.0",
+      "os": [
+        "darwin",
+        "linux",
+        "win32"
+      ],
       "dependencies": {
         "@apache-arrow/ts": "^12.0.0",
+        "@neon-rs/load": "^0.0.74",
         "apache-arrow": "^12.0.0"
       },
       "devDependencies": {
+        "@neon-rs/cli": "^0.0.74",
         "@types/chai": "^4.3.4",
         "@types/chai-as-promised": "^7.1.5",
         "@types/mocha": "^10.0.1",
@@ -37,6 +48,13 @@
         "typedoc": "^0.24.7",
         "typedoc-plugin-markdown": "^3.15.3",
         "typescript": "*"
+      },
+      "optionalDependencies": {
+        "vectordb-darwin-arm64": "0.1.13",
+        "vectordb-darwin-x64": "0.1.13",
+        "vectordb-linux-arm64-gnu": "0.1.13",
+        "vectordb-linux-x64-gnu": "0.1.13",
+        "vectordb-win32-x64-msvc": "0.1.13"
       }
     },
     "node_modules/@apache-arrow/ts": {
@@ -204,6 +222,20 @@
         "@jridgewell/sourcemap-codec": "^1.4.10"
       }
     },
+    "node_modules/@neon-rs/cli": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
+      "integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
+      "dev": true,
+      "bin": {
+        "neon": "index.js"
+      }
+    },
+    "node_modules/@neon-rs/load": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
+      "integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
+    },
     "node_modules/@nodelib/fs.scandir": {
       "version": "2.1.5",
       "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -4256,6 +4288,42 @@
       "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
       "dev": true
     },
+    "node_modules/vectordb-darwin-arm64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
+      "integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/vectordb-darwin-x64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
+      "integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/vectordb-linux-x64-gnu": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
+      "integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
     "node_modules/vscode-oniguruma": {
       "version": "1.7.0",
       "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",
@@ -4601,6 +4669,17 @@
         "@jridgewell/sourcemap-codec": "^1.4.10"
       }
     },
+    "@neon-rs/cli": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
+      "integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
+      "dev": true
+    },
+    "@neon-rs/load": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
+      "integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
+    },
     "@nodelib/fs.scandir": {
       "version": "2.1.5",
       "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -7540,6 +7619,24 @@
       "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
       "dev": true
     },
+    "vectordb-darwin-arm64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
+      "integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
+      "optional": true
+    },
+    "vectordb-darwin-x64": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
+      "integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
+      "optional": true
+    },
+    "vectordb-linux-x64-gnu": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
+      "integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
+      "optional": true
+    },
     "vscode-oniguruma": {
       "version": "1.7.0",
       "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",
@@ -1,16 +1,18 @@
 {
   "name": "vectordb",
-  "version": "0.1.10",
+  "version": "0.1.13",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
   "scripts": {
     "tsc": "tsc -b",
-    "build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json-render-diagnostics",
+    "build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
     "build-release": "npm run build -- --release",
-    "test": "npm run tsc; mocha -recursive dist/test",
+    "test": "npm run tsc && mocha -recursive dist/test",
     "lint": "eslint src --ext .js,.ts",
-    "clean": "rm -rf node_modules *.node dist/"
+    "clean": "rm -rf node_modules *.node dist/",
+    "pack-build": "neon pack-build",
+    "check-npm": "printenv && which node && which npm && npm --version"
   },
   "repository": {
     "type": "git",
@@ -25,6 +27,7 @@
   "author": "Lance Devs",
   "license": "Apache-2.0",
   "devDependencies": {
+    "@neon-rs/cli": "^0.0.74",
     "@types/chai": "^4.3.4",
     "@types/chai-as-promised": "^7.1.5",
     "@types/mocha": "^10.0.1",
@@ -52,6 +55,32 @@
   },
   "dependencies": {
     "@apache-arrow/ts": "^12.0.0",
+    "@neon-rs/load": "^0.0.74",
     "apache-arrow": "^12.0.0"
+  },
+  "os": [
+    "darwin",
+    "linux",
+    "win32"
+  ],
+  "cpu": [
+    "x64",
+    "arm64"
+  ],
+  "neon": {
+    "targets": {
+      "x86_64-apple-darwin": "vectordb-darwin-x64",
+      "aarch64-apple-darwin": "vectordb-darwin-arm64",
+      "x86_64-unknown-linux-gnu": "vectordb-linux-x64-gnu",
+      "aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu",
+      "x86_64-pc-windows-msvc": "vectordb-win32-x64-msvc"
+    }
+  },
+  "optionalDependencies": {
+    "vectordb-darwin-arm64": "0.1.13",
+    "vectordb-darwin-x64": "0.1.13",
+    "vectordb-linux-arm64-gnu": "0.1.13",
+    "vectordb-linux-x64-gnu": "0.1.13",
+    "vectordb-win32-x64-msvc": "0.1.13"
   }
 }
@@ -27,13 +27,38 @@ const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, t
 export type { EmbeddingFunction }
 export { OpenAIEmbeddingFunction } from './embedding/openai'

+export interface AwsCredentials {
+  accessKeyId: string
+
+  secretKey: string
+
+  sessionToken?: string
+}
+
+export interface ConnectionOptions {
+  uri: string
+  awsCredentials?: AwsCredentials
+}
+
 /**
  * Connect to a LanceDB instance at the given URI
  * @param uri The uri of the database.
  */
-export async function connect (uri: string): Promise<Connection> {
-  const db = await databaseNew(uri)
-  return new LocalConnection(db, uri)
+export async function connect (uri: string): Promise<Connection>
+export async function connect (opts: Partial<ConnectionOptions>): Promise<Connection>
+export async function connect (arg: string | Partial<ConnectionOptions>): Promise<Connection> {
+  let opts: ConnectionOptions
+  if (typeof arg === 'string') {
+    opts = { uri: arg }
+  } else {
+    // opts = { uri: arg.uri, awsCredentials = arg.awsCredentials }
+    opts = Object.assign({
+      uri: '',
+      awsCredentials: undefined
+    }, arg)
+  }
+  const db = await databaseNew(opts.uri)
+  return new LocalConnection(db, opts)
 }

 /**
@@ -126,16 +151,16 @@ export interface Table<T = number[]> {
  * A connection to a LanceDB database.
  */
 export class LocalConnection implements Connection {
-  private readonly _uri: string
+  private readonly _options: ConnectionOptions
   private readonly _db: any

-  constructor (db: any, uri: string) {
-    this._uri = uri
+  constructor (db: any, options: ConnectionOptions) {
+    this._options = options
     this._db = db
   }

   get uri (): string {
-    return this._uri
+    return this._options.uri
   }

   /**
@@ -158,12 +183,13 @@ export class LocalConnection implements Connection {
    * @param embeddings An embedding function to use on this Table
    */
   async openTable<T> (name: string, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
+  async openTable<T> (name: string, embeddings?: EmbeddingFunction<T>): Promise<Table<T>>
   async openTable<T> (name: string, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
     const tbl = await databaseOpenTable.call(this._db, name)
     if (embeddings !== undefined) {
-      return new LocalTable(tbl, name, embeddings)
+      return new LocalTable(tbl, name, this._options, embeddings)
     } else {
-      return new LocalTable(tbl, name)
+      return new LocalTable(tbl, name, this._options)
     }
   }

@@ -186,15 +212,27 @@ export class LocalConnection implements Connection {
    * @param embeddings An embedding function to use on this Table
    */
   async createTable<T> (name: string, data: Array<Record<string, unknown>>, mode: WriteMode, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
+  async createTable<T> (name: string, data: Array<Record<string, unknown>>, mode: WriteMode, embeddings?: EmbeddingFunction<T>): Promise<Table<T>>
   async createTable<T> (name: string, data: Array<Record<string, unknown>>, mode: WriteMode, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
     if (mode === undefined) {
       mode = WriteMode.Create
     }
-    const tbl = await tableCreate.call(this._db, name, await fromRecordsToBuffer(data, embeddings), mode.toLowerCase())
+
+    const createArgs = [this._db, name, await fromRecordsToBuffer(data, embeddings), mode.toLowerCase()]
+    if (this._options.awsCredentials !== undefined) {
+      createArgs.push(this._options.awsCredentials.accessKeyId)
+      createArgs.push(this._options.awsCredentials.secretKey)
+      if (this._options.awsCredentials.sessionToken !== undefined) {
+        createArgs.push(this._options.awsCredentials.sessionToken)
+      }
+    }
+
+    const tbl = await tableCreate.call(...createArgs)
+
     if (embeddings !== undefined) {
-      return new LocalTable(tbl, name, embeddings)
+      return new LocalTable(tbl, name, this._options, embeddings)
     } else {
-      return new LocalTable(tbl, name)
+      return new LocalTable(tbl, name, this._options)
     }
   }

@@ -217,18 +255,21 @@ export class LocalTable<T = number[]> implements Table<T> {
   private readonly _tbl: any
   private readonly _name: string
   private readonly _embeddings?: EmbeddingFunction<T>
+  private readonly _options: ConnectionOptions

-  constructor (tbl: any, name: string)
+  constructor (tbl: any, name: string, options: ConnectionOptions)
   /**
    * @param tbl
    * @param name
+   * @param options
    * @param embeddings An embedding function to use when interacting with this table
    */
-  constructor (tbl: any, name: string, embeddings: EmbeddingFunction<T>)
-  constructor (tbl: any, name: string, embeddings?: EmbeddingFunction<T>) {
+  constructor (tbl: any, name: string, options: ConnectionOptions, embeddings: EmbeddingFunction<T>)
+  constructor (tbl: any, name: string, options: ConnectionOptions, embeddings?: EmbeddingFunction<T>) {
     this._tbl = tbl
     this._name = name
     this._embeddings = embeddings
+    this._options = options
   }

   get name (): string {
@@ -250,7 +291,15 @@ export class LocalTable<T = number[]> implements Table<T> {
    * @return The number of rows added to the table
    */
   async add (data: Array<Record<string, unknown>>): Promise<number> {
-    return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Append.toString())
+    const callArgs = [this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Append.toString()]
+    if (this._options.awsCredentials !== undefined) {
+      callArgs.push(this._options.awsCredentials.accessKeyId)
+      callArgs.push(this._options.awsCredentials.secretKey)
+      if (this._options.awsCredentials.sessionToken !== undefined) {
+        callArgs.push(this._options.awsCredentials.sessionToken)
+      }
+    }
+    return tableAdd.call(...callArgs)
   }

   /**
@@ -260,6 +309,14 @@ export class LocalTable<T = number[]> implements Table<T> {
    * @return The number of rows added to the table
    */
   async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
+    const callArgs = [this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Overwrite.toString()]
+    if (this._options.awsCredentials !== undefined) {
+      callArgs.push(this._options.awsCredentials.accessKeyId)
+      callArgs.push(this._options.awsCredentials.secretKey)
+      if (this._options.awsCredentials.sessionToken !== undefined) {
+        callArgs.push(this._options.awsCredentials.sessionToken)
+      }
+    }
     return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Overwrite.toString())
   }

@@ -18,26 +18,48 @@ import { describe } from 'mocha'
 import { assert } from 'chai'

 import * as lancedb from '../index'
+import { type ConnectionOptions } from '../index'

 describe('LanceDB S3 client', function () {
   if (process.env.TEST_S3_BASE_URL != null) {
     const baseUri = process.env.TEST_S3_BASE_URL
     it('should have a valid url', async function () {
-      const uri = `${baseUri}/valid_url`
-      const table = await createTestDB(uri, 2, 20)
-      const con = await lancedb.connect(uri)
-      assert.equal(con.uri, uri)
+      const opts = { uri: `${baseUri}/valid_url` }
+      const table = await createTestDB(opts, 2, 20)
+      const con = await lancedb.connect(opts)
+      assert.equal(con.uri, opts.uri)

       const results = await table.search([0.1, 0.3]).limit(5).execute()
       assert.equal(results.length, 5)
-    })
+    }).timeout(10_000)
+  } else {
+    describe.skip('Skip S3 test', function () {})
+  }
+
+  if (process.env.TEST_S3_BASE_URL != null && process.env.TEST_AWS_ACCESS_KEY_ID != null && process.env.TEST_AWS_SECRET_ACCESS_KEY != null) {
+    const baseUri = process.env.TEST_S3_BASE_URL
+    it('use custom credentials', async function () {
+      const opts: ConnectionOptions = {
+        uri: `${baseUri}/custom_credentials`,
+        awsCredentials: {
+          accessKeyId: process.env.TEST_AWS_ACCESS_KEY_ID as string,
+          secretKey: process.env.TEST_AWS_SECRET_ACCESS_KEY as string
+        }
+      }
+      const table = await createTestDB(opts, 2, 20)
+      const con = await lancedb.connect(opts)
+      assert.equal(con.uri, opts.uri)
+
+      const results = await table.search([0.1, 0.3]).limit(5).execute()
+      assert.equal(results.length, 5)
+    }).timeout(10_000)
   } else {
     describe.skip('Skip S3 test', function () {})
   }
 })

-async function createTestDB (uri: string, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
-  const con = await lancedb.connect(uri)
+async function createTestDB (opts: ConnectionOptions, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
+  const con = await lancedb.connect(opts)

   const data = []
   for (let i = 0; i < numRows; i++) {
@@ -18,7 +18,7 @@ import * as chai from 'chai'
 import * as chaiAsPromised from 'chai-as-promised'

 import * as lancedb from '../index'
-import { type EmbeddingFunction, MetricType, Query, WriteMode } from '../index'
+import { type AwsCredentials, type EmbeddingFunction, MetricType, Query, WriteMode } from '../index'

 const expect = chai.expect
 const assert = chai.assert
@@ -32,6 +32,22 @@ describe('LanceDB client', function () {
     assert.equal(con.uri, uri)
   })

+  it('should accept an options object', async function () {
+    const uri = await createTestDB()
+    const con = await lancedb.connect({ uri })
+    assert.equal(con.uri, uri)
+  })
+
+  it('should accept custom aws credentials', async function () {
+    const uri = await createTestDB()
+    const awsCredentials: AwsCredentials = {
+      accessKeyId: '',
+      secretKey: ''
+    }
+    const con = await lancedb.connect({ uri, awsCredentials })
+    assert.equal(con.uri, uri)
+  })
+
   it('should return the existing table names', async function () {
     const uri = await createTestDB()
     const con = await lancedb.connect(uri)
@@ -15,6 +15,7 @@ from typing import Optional

 from .db import URI, DBConnection, LanceDBConnection
 from .remote.db import RemoteDBConnection
+from .schema import vector


 def connect(
@@ -13,11 +13,12 @@

 from __future__ import annotations

-import functools
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple, Union

+import pandas as pd
 import pyarrow as pa
 from pyarrow import fs

@@ -38,8 +39,10 @@ class DBConnection(ABC):
     def create_table(
         self,
         name: str,
-        data: DATA = None,
-        schema: pa.Schema = None,
+        data: Optional[
+            Union[List[dict], dict, pd.DataFrame, pa.Table, Iterable[pa.RecordBatch]],
+        ] = None,
+        schema: Optional[pa.Schema] = None,
         mode: str = "create",
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
@@ -51,7 +54,7 @@
         name: str
             The name of the table.
         data: list, tuple, dict, pd.DataFrame; optional
-            The data to insert into the table.
+            The data to initialize the table. User must provide at least one of `data` or `schema`.
         schema: pyarrow.Schema; optional
             The schema of the table.
         mode: str; default "create"
@@ -64,16 +67,16 @@
         fill_value: float
             The value to use when filling vectors. Only used if on_bad_vectors="fill".

-        Note
-        ----
-        The vector index won't be created by default.
-        To create the index, call the `create_index` method on the table.
-
         Returns
        -------
         LanceTable
             A reference to the newly created table.

+        !!! note
+
+            The vector index won't be created by default.
+            To create the index, call the `create_index` method on the table.
+
         Examples
         --------

@@ -119,7 +122,7 @@

         Data is converted to Arrow before being written to disk. For maximum
         control over how data is saved, either provide the PyArrow schema to
-        convert to or else provide a PyArrow table directly.
+        convert to or else provide a [PyArrow Table](pyarrow.Table) directly.

         >>> custom_schema = pa.schema([
         ...   pa.field("vector", pa.list_(pa.float32(), 2)),
@@ -138,6 +141,30 @@
         vector: [[[1.1,1.2],[0.2,1.8]]]
         lat: [[45.5,40.1]]
         long: [[-122.7,-74.1]]
+
+
+        It is also possible to create a table from `[Iterable[pa.RecordBatch]]`:
+
+
+        >>> import pyarrow as pa
+        >>> def make_batches():
+        ...     for i in range(5):
+        ...         yield pa.RecordBatch.from_arrays(
+        ...             [
+        ...                 pa.array([[3.1, 4.1], [5.9, 26.5]]),
+        ...                 pa.array(["foo", "bar"]),
+        ...                 pa.array([10.0, 20.0]),
+        ...             ],
+        ...             ["vector", "item", "price"],
+        ...         )
+        >>> schema=pa.schema([
+        ...   pa.field("vector", pa.list_(pa.float32())),
+        ...   pa.field("item", pa.utf8()),
+        ...   pa.field("price", pa.float32()),
+        ... ])
+        >>> db.create_table("table4", make_batches(), schema=schema)
+        LanceTable(table4)
+
         """
         raise NotImplementedError

@@ -252,7 +279,7 @@ class LanceDBConnection(DBConnection):
     def create_table(
         self,
         name: str,
-        data: DATA = None,
+        data: Optional[Union[List[dict], dict, pd.DataFrame]] = None,
         schema: pa.Schema = None,
         mode: str = "create",
         on_bad_vectors: str = "error",
@@ -260,114 +287,22 @@
     ) -> LanceTable:
         """Create a table in the database.

-        Parameters
-        ----------
-        name: str
-            The name of the table.
-        data: list, tuple, dict, pd.DataFrame; optional
-            The data to insert into the table.
-        schema: pyarrow.Schema; optional
-            The schema of the table.
-        mode: str; default "create"
-            The mode to use when creating the table. Can be either "create" or "overwrite".
-            By default, if the table already exists, an exception is raised.
-            If you want to overwrite the table, use mode="overwrite".
-        on_bad_vectors: str, default "error"
-            What to do if any of the vectors are not the same size or contains NaNs.
-            One of "error", "drop", "fill".
-        fill_value: float
-            The value to use when filling vectors. Only used if on_bad_vectors="fill".
-
-        Note
-        ----
-        The vector index won't be created by default.
-        To create the index, call the `create_index` method on the table.
-
-        Returns
-        -------
-        LanceTable
-            A reference to the newly created table.
-
-        Examples
-        --------
-
-        Can create with list of tuples or dictionaries:
-
-        >>> import lancedb
-        >>> db = lancedb.connect("./.lancedb")
-        >>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
-        ...         {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
-        >>> db.create_table("my_table", data)
-        LanceTable(my_table)
-        >>> db["my_table"].head()
-        pyarrow.Table
-        vector: fixed_size_list<item: float>[2]
-          child 0, item: float
-        lat: double
-        long: double
-        ----
-        vector: [[[1.1,1.2],[0.2,1.8]]]
-        lat: [[45.5,40.1]]
-        long: [[-122.7,-74.1]]
-
-        You can also pass a pandas DataFrame:
-
-        >>> import pandas as pd
-        >>> data = pd.DataFrame({
-        ...    "vector": [[1.1, 1.2], [0.2, 1.8]],
-        ...    "lat": [45.5, 40.1],
-        ...    "long": [-122.7, -74.1]
-        ... })
-        >>> db.create_table("table2", data)
-        LanceTable(table2)
-        >>> db["table2"].head()
-        pyarrow.Table
-        vector: fixed_size_list<item: float>[2]
-          child 0, item: float
-        lat: double
-        long: double
-        ----
-        vector: [[[1.1,1.2],[0.2,1.8]]]
-        lat: [[45.5,40.1]]
-        long: [[-122.7,-74.1]]
-
-        Data is converted to Arrow before being written to disk. For maximum
-        control over how data is saved, either provide the PyArrow schema to
-        convert to or else provide a PyArrow table directly.
-
-        >>> custom_schema = pa.schema([
-        ...   pa.field("vector", pa.list_(pa.float32(), 2)),
-        ...   pa.field("lat", pa.float32()),
-        ...   pa.field("long", pa.float32())
-        ... ])
-        >>> db.create_table("table3", data, schema = custom_schema)
-        LanceTable(table3)
-        >>> db["table3"].head()
-        pyarrow.Table
-        vector: fixed_size_list<item: float>[2]
-          child 0, item: float
-        lat: float
-        long: float
-        ----
-        vector: [[[1.1,1.2],[0.2,1.8]]]
-        lat: [[45.5,40.1]]
-        long: [[-122.7,-74.1]]
+        See
+        ---
+        DBConnection.create_table
         """
         if mode.lower() not in ["create", "overwrite"]:
             raise ValueError("mode must be either 'create' or 'overwrite'")

-        if data is not None:
-            tbl = LanceTable.create(
-                self,
-                name,
-                data,
-                schema,
-                mode=mode,
-                on_bad_vectors=on_bad_vectors,
-                fill_value=fill_value,
-            )
-        else:
-            tbl = LanceTable.open(self, name)
+        tbl = LanceTable.create(
+            self,
+            name,
+            data,
+            schema,
+            mode=mode,
+            on_bad_vectors=on_bad_vectors,
+            fill_value=fill_value,
+        )
         return tbl

     def open_table(self, name: str) -> LanceTable:
212 python/lancedb/pydantic.py Normal file
@@ -0,0 +1,212 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""Pydantic adapter for LanceDB"""
+
+from __future__ import annotations
+
+import inspect
+import sys
+import types
+from abc import ABC, abstractmethod
+from typing import Any, List, Type, Union, _GenericAlias
+
+import pyarrow as pa
+import pydantic
+from pydantic_core import CoreSchema, core_schema
+
+
+class FixedSizeListMixin(ABC):
+    @staticmethod
+    @abstractmethod
+    def dim() -> int:
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def value_arrow_type() -> pa.DataType:
+        raise NotImplementedError
+
+
+def vector(
+    dim: int, value_type: pa.DataType = pa.float32()
+) -> Type[FixedSizeListMixin]:
+    """Pydantic Vector Type.
+
+    !!! warning
+        Experimental feature.
+
+    Parameters
+    ----------
+    dim : int
+        The dimension of the vector.
+    value_type : pyarrow.DataType, optional
+        The value type of the vector, by default pa.float32()
+
+    Examples
+    --------
+
+    >>> import pydantic
+    >>> from lancedb.pydantic import vector
+    ...
+    >>> class MyModel(pydantic.BaseModel):
+    ...     id: int
+    ...     url: str
+    ...     embeddings: vector(768)
+    >>> schema = pydantic_to_schema(MyModel)
+    >>> assert schema == pa.schema([
+    ...     pa.field("id", pa.int64(), False),
+    ...     pa.field("url", pa.utf8(), False),
+    ...     pa.field("embeddings", pa.list_(pa.float32(), 768), False)
+    ... ])
+    """
+
+    # TODO: make a public parameterized type.
+    class FixedSizeList(list, FixedSizeListMixin):
+        @staticmethod
+        def dim() -> int:
+            return dim
+
+        @staticmethod
+        def value_arrow_type() -> pa.DataType:
+            return value_type
+
+        @classmethod
+        def __get_pydantic_core_schema__(
+            cls, _source_type: Any, _handler: pydantic.GetCoreSchemaHandler
+        ) -> CoreSchema:
+            return core_schema.no_info_after_validator_function(
+                cls,
+                core_schema.list_schema(
+                    min_length=dim,
+                    max_length=dim,
+                    items_schema=core_schema.float_schema(),
+                ),
+            )
+
+    return FixedSizeList
+
+
+def _py_type_to_arrow_type(py_type: Type[Any]) -> pa.DataType:
+    """Convert Python Type to Arrow DataType.
+
+    Raises
+    ------
+    TypeError
+        If the type is not supported.
+    """
+    if py_type == int:
+        return pa.int64()
+    elif py_type == float:
+        return pa.float64()
+    elif py_type == str:
+        return pa.utf8()
+    elif py_type == bool:
+        return pa.bool_()
+    elif py_type == bytes:
+        return pa.binary()
+    raise TypeError(
+        f"Converting Pydantic type to Arrow Type: unsupported type {py_type}"
+    )
+
+
+def _pydantic_model_to_fields(model: pydantic.BaseModel) -> List[pa.Field]:
+    fields = []
+    for name, field in model.model_fields.items():
+        fields.append(_pydantic_to_field(name, field))
+    return fields
+
+
+def _pydantic_to_arrow_type(field: pydantic.fields.FieldInfo) -> pa.DataType:
+    """Convert a Pydantic FieldInfo to Arrow DataType"""
+    if isinstance(field.annotation, _GenericAlias) or (
+        sys.version_info > (3, 9) and isinstance(field.annotation, types.GenericAlias)
+    ):
+        origin = field.annotation.__origin__
+        args = field.annotation.__args__
+        if origin == list:
+            child = args[0]
+            return pa.list_(_py_type_to_arrow_type(child))
+        elif origin == Union:
+            if len(args) == 2 and args[1] == type(None):
+                return _py_type_to_arrow_type(args[0])
+    elif inspect.isclass(field.annotation):
+        if issubclass(field.annotation, pydantic.BaseModel):
+            # Struct
+            fields = _pydantic_model_to_fields(field.annotation)
+            return pa.struct(fields)
+        elif issubclass(field.annotation, FixedSizeListMixin):
+            return pa.list_(field.annotation.value_arrow_type(), field.annotation.dim())
+    return _py_type_to_arrow_type(field.annotation)
+
+
+def is_nullable(field: pydantic.fields.FieldInfo) -> bool:
+    """Check if a Pydantic FieldInfo is nullable."""
+    if isinstance(field.annotation, _GenericAlias):
+        origin = field.annotation.__origin__
+        args = field.annotation.__args__
+        if origin == Union:
+            if len(args) == 2 and args[1] == type(None):
+                return True
+    return False
+
+
+def _pydantic_to_field(name: str, field: pydantic.fields.FieldInfo) -> pa.Field:
+    """Convert a Pydantic field to a PyArrow Field."""
+    dt = _pydantic_to_arrow_type(field)
+    return pa.field(name, dt, is_nullable(field))
+
+
+def pydantic_to_schema(model: Type[pydantic.BaseModel]) -> pa.Schema:
+    """Convert a Pydantic model to a PyArrow Schema.
+
+    Parameters
+    ----------
+    model : Type[pydantic.BaseModel]
+        The Pydantic BaseModel to convert to Arrow Schema.
+
+    Returns
+    -------
+    pyarrow.Schema
+
+    Examples
+    --------
+
+    >>> from typing import List, Optional
+    >>> import pydantic
+    >>> from lancedb.pydantic import pydantic_to_schema
+    ...
+    >>> class InnerModel(pydantic.BaseModel):
+    ...     a: str
+    ...     b: Optional[float]
+    >>>
+    >>> class FooModel(pydantic.BaseModel):
+    ...     id: int
+    ...     s: Optional[str] = None
+    ...     vec: List[float]
+    ...     li: List[int]
+    ...     inner: InnerModel
+    >>> schema = pydantic_to_schema(FooModel)
+    >>> assert schema == pa.schema([
+    ...     pa.field("id", pa.int64(), False),
+    ...     pa.field("s", pa.utf8(), True),
+    ...     pa.field("vec", pa.list_(pa.float64()), False),
+    ...     pa.field("li", pa.list_(pa.int64()), False),
+    ...     pa.field("inner", pa.struct([
+    ...         pa.field("a", pa.utf8(), False),
+    ...         pa.field("b", pa.float64(), True),
+    ...     ]), False),
+    ... ])
+    """
+    fields = _pydantic_model_to_fields(model)
+    return pa.schema(fields)
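A short usage sketch of the new pydantic adapter, mirroring the module's own doctests above (the model and field names are illustrative; requires pydantic v2):

import pyarrow as pa
import pydantic
from lancedb.pydantic import pydantic_to_schema, vector

# A model whose vector field is declared with the new fixed-size type.
class Document(pydantic.BaseModel):
    id: int
    text: str
    embedding: vector(2)  # fixed-size list<float32> of dimension 2

# Derive a PyArrow schema directly from the model.
schema = pydantic_to_schema(Document)
assert schema == pa.schema([
    pa.field("id", pa.int64(), False),
    pa.field("text", pa.utf8(), False),
    pa.field("embedding", pa.list_(pa.float32(), 2), False),
])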
@@ -226,6 +226,7 @@ class LanceQueryBuilder:
             columns=self._columns,
             nprobes=self._nprobes,
             refine_factor=self._refine_factor,
+            vector_column=self._vector_column,
         )
         return self._table._execute_query(query)

22 python/lancedb/remote/arrow.py Normal file
@@ -0,0 +1,22 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import pyarrow as pa
+
+
+def to_ipc_binary(table: pa.Table) -> bytes:
+    """Serialize a PyArrow Table to IPC binary."""
+    sink = pa.BufferOutputStream()
+    with pa.ipc.new_stream(sink, table.schema) as writer:
+        writer.write_table(table)
+    return sink.getvalue().to_pybytes()
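Since to_ipc_binary uses pa.ipc.new_stream, its output is in the Arrow IPC stream format and reads back with open_stream. A minimal round trip (plain PyArrow, assuming the new lancedb.remote.arrow module is importable):

import pyarrow as pa
from lancedb.remote.arrow import to_ipc_binary

table = pa.table({"id": [1, 2, 3]})
payload = to_ipc_binary(table)

# Stream-format bytes deserialize with open_stream, not open_file.
with pa.ipc.open_stream(pa.BufferReader(payload)) as reader:
    assert reader.read_all().equals(table)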
@@ -13,16 +13,19 @@


 import functools
-from typing import Dict
+from typing import Any, Callable, Dict, Optional, Union

 import aiohttp
 import attr
 import pyarrow as pa
+from pydantic import BaseModel

 from lancedb.common import Credential
 from lancedb.remote import VectorQuery, VectorQueryResult
 from lancedb.remote.errors import LanceDBClientError

+ARROW_STREAM_CONTENT_TYPE = "application/vnd.apache.arrow.stream"
+
+
 def _check_not_closed(f):
     @functools.wraps(f)
@@ -34,6 +37,12 @@ def _check_not_closed(f):
     return wrapped


+async def _read_ipc(resp: aiohttp.ClientResponse) -> pa.Table:
+    resp_body = await resp.read()
+    with pa.ipc.open_file(pa.BufferReader(resp_body)) as reader:
+        return reader.read_all()
+
+
 @attr.define(slots=False)
 class RestfulLanceDBClient:
     db_name: str
@@ -52,32 +61,85 @@ class RestfulLanceDBClient:

     @functools.cached_property
     def headers(self) -> Dict[str, str]:
-        return {
+        headers = {
             "x-api-key": self.api_key,
         }
+        if self.region == "local":  # Local test mode
+            headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
+        return headers
+
+    @staticmethod
+    async def _check_status(resp: aiohttp.ClientResponse):
+        if resp.status == 404:
+            raise LanceDBClientError(f"Not found: {await resp.text()}")
+        elif 400 <= resp.status < 500:
+            raise LanceDBClientError(
+                f"Bad Request: {resp.status}, error: {await resp.text()}"
+            )
+        elif 500 <= resp.status < 600:
+            raise LanceDBClientError(
+                f"Internal Server Error: {resp.status}, error: {await resp.text()}"
+            )
+        elif resp.status != 200:
+            raise LanceDBClientError(
+                f"Unknown Error: {resp.status}, error: {await resp.text()}"
+            )
+
+    @_check_not_closed
+    async def get(self, uri: str, params: Union[Dict[str, Any], BaseModel] = None):
+        """Send a GET request and returns the deserialized response payload."""
+        if isinstance(params, BaseModel):
+            params: Dict[str, Any] = params.dict(exclude_none=True)
+        async with self.session.get(uri, params=params, headers=self.headers) as resp:
+            await self._check_status(resp)
+            return await resp.json()
+
+    @_check_not_closed
+    async def post(
+        self,
+        uri: str,
+        data: Union[Dict[str, Any], BaseModel, bytes],
+        params: Optional[Dict[str, Any]] = None,
+        content_type: Optional[str] = None,
+        deserialize: Callable = lambda resp: resp.json(),
+    ) -> Dict[str, Any]:
+        """Send a POST request and returns the deserialized response payload.
+
+        Parameters
+        ----------
+        uri : str
+            The uri to send the POST request to.
+        data: Union[Dict[str, Any], BaseModel]
+
+        """
+        if isinstance(data, BaseModel):
+            data: Dict[str, Any] = data.dict(exclude_none=True)
+        if isinstance(data, bytes):
+            req_kwargs = {"data": data}
+        else:
+            req_kwargs = {"json": data}
+
+        headers = self.headers.copy()
+        if content_type is not None:
+            headers["content-type"] = content_type
+        async with self.session.post(
+            uri,
+            headers=headers,
+            params=params,
+            **req_kwargs,
+        ) as resp:
+            resp: aiohttp.ClientResponse = resp
+            await self._check_status(resp)
+            return await deserialize(resp)
+
+    @_check_not_closed
+    async def list_tables(self):
+        """List all tables in the database."""
+        json = await self.get("/v1/table/", {})
+        return json["tables"]
+
     @_check_not_closed
     async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
-        async with self.session.post(
-            f"/1/table/{table_name}/",
-            json=query.dict(exclude_none=True),
-            headers=self.headers,
-        ) as resp:
-            resp: aiohttp.ClientResponse = resp
-            if 400 <= resp.status < 500:
-                raise LanceDBClientError(
-                    f"Bad Request: {resp.status}, error: {await resp.text()}"
-                )
-            if 500 <= resp.status < 600:
-                raise LanceDBClientError(
-                    f"Internal Server Error: {resp.status}, error: {await resp.text()}"
-                )
-            if resp.status != 200:
-                raise LanceDBClientError(
-                    f"Unknown Error: {resp.status}, error: {await resp.text()}"
-                )
-
-            resp_body = await resp.read()
-            with pa.ipc.open_file(pa.BufferReader(resp_body)) as reader:
-                tbl = reader.read_all()
+        """Query a table."""
+        tbl = await self.post(f"/v1/table/{table_name}/", query, deserialize=_read_ipc)
         return VectorQueryResult(tbl)
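On the response side, _read_ipc expects the Arrow IPC file format (note the asymmetry with the stream-format request payload). A self-contained round trip showing the same read pattern the helper uses, using only plain PyArrow:

import pyarrow as pa

table = pa.table({"id": [1, 2], "vector": [[0.1, 0.2], [0.3, 0.4]]})

# Serialize with the IPC *file* format, the shape a query response takes.
sink = pa.BufferOutputStream()
with pa.ipc.new_file(sink, table.schema) as writer:
    writer.write_table(table)

# _read_ipc does exactly this with the response body bytes.
with pa.ipc.open_file(pa.BufferReader(sink.getvalue())) as reader:
    assert reader.read_all().equals(table)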
@@ -11,6 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import asyncio
+import uuid
 from typing import List
 from urllib.parse import urlparse

@@ -18,9 +20,11 @@ import pyarrow as pa

 from lancedb.common import DATA
 from lancedb.db import DBConnection
-from lancedb.table import Table
+from lancedb.schema import schema_to_json
+from lancedb.table import Table, _sanitize_data

-from .client import RestfulLanceDBClient
+from .arrow import to_ipc_binary
+from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient


 class RemoteDBConnection(DBConnection):
@@ -34,12 +38,18 @@ class RemoteDBConnection(DBConnection):
         self.db_name = parsed.netloc
         self.api_key = api_key
         self._client = RestfulLanceDBClient(self.db_name, region, api_key)
+        try:
+            self._loop = asyncio.get_running_loop()
+        except RuntimeError:
+            self._loop = asyncio.get_event_loop()

     def __repr__(self) -> str:
         return f"RemoveConnect(name={self.db_name})"

     def table_names(self) -> List[str]:
-        raise NotImplementedError
+        """List the names of all tables in the database."""
+        result = self._loop.run_until_complete(self._client.list_tables())
+        return result

     def open_table(self, name: str) -> Table:
         """Open a Lance Table in the database.
@@ -64,8 +74,31 @@ class RemoteDBConnection(DBConnection):
         name: str,
         data: DATA = None,
         schema: pa.Schema = None,
-        mode: str = "create",
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
     ) -> Table:
-        raise NotImplementedError
+        if data is None and schema is None:
+            raise ValueError("Either data or schema must be provided.")
+        if data is not None:
+            data = _sanitize_data(
+                data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
+            )
+        else:
+            if schema is None:
+                raise ValueError("Either data or schema must be provided")
+            data = pa.Table.from_pylist([], schema=schema)
+
+        from .table import RemoteTable
+
+        data = to_ipc_binary(data)
+        request_id = uuid.uuid4().hex
+
+        self._loop.run_until_complete(
+            self._client.post(
+                f"/v1/table/{name}/create",
+                data=data,
+                params={"request_id": request_id},
+                content_type=ARROW_STREAM_CONTENT_TYPE,
+            )
+        )
+        return RemoteTable(self, name)
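A hedged sketch of the remote path this enables: the rows are serialized to Arrow IPC and POSTed with a fresh request id. The db:// database name, API key, and region below are placeholders, and connect() routing db:// URIs to RemoteDBConnection is assumed from the surrounding code:

import lancedb

# Placeholders: substitute a real LanceDB Cloud database, key, and region.
db = lancedb.connect("db://my-database", api_key="sk-...", region="us-east-1")
tbl = db.create_table("items", data=[{"vector": [0.1, 0.2], "item": "foo"}])
print(db.table_names())  # now served by GET /v1/table/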
@@ -11,7 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import asyncio
+import uuid
+from functools import cached_property
 from typing import Union

 import pyarrow as pa
@@ -19,7 +20,10 @@ import pyarrow as pa
 from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME

 from ..query import LanceQueryBuilder, Query
-from ..table import Query, Table
+from ..schema import json_to_schema
+from ..table import Query, Table, _sanitize_data
+from .arrow import to_ipc_binary
+from .client import ARROW_STREAM_CONTENT_TYPE
 from .db import RemoteDBConnection


@@ -31,8 +35,14 @@ class RemoteTable(Table):
     def __repr__(self) -> str:
         return f"RemoteTable({self._conn.db_name}.{self.name})"

+    @cached_property
     def schema(self) -> pa.Schema:
-        raise NotImplementedError
+        """Return the schema of the table."""
+        resp = self._conn._loop.run_until_complete(
+            self._conn._client.get(f"/v1/table/{self._name}/describe")
+        )
+        schema = json_to_schema(resp["schema"])
+        return schema

     def to_arrow(self) -> pa.Table:
         raise NotImplementedError
@@ -54,7 +64,22 @@ class RemoteTable(Table):
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
     ) -> int:
-        raise NotImplementedError
+        data = _sanitize_data(
+            data, self.schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
+        )
+        payload = to_ipc_binary(data)
+
+        request_id = uuid.uuid4().hex
+
+        self._conn._loop.run_until_complete(
+            self._conn._client.post(
+                f"/v1/table/{self._name}/insert",
+                data=payload,
+                params={"request_id": request_id, "mode": mode},
+                content_type=ARROW_STREAM_CONTENT_TYPE,
+            )
+        )
+        return len(data)

     def search(
         self, query: Union[VEC, str], vector_column: str = VECTOR_COLUMN_NAME
@@ -62,9 +87,5 @@ class RemoteTable(Table):
         return LanceQueryBuilder(self, query, vector_column)

     def _execute_query(self, query: Query) -> pa.Table:
-        try:
-            loop = asyncio.get_running_loop()
-        except RuntimeError:
-            loop = asyncio.get_event_loop()
         result = self._conn._client.query(self._name, query)
-        return loop.run_until_complete(result).to_arrow()
+        return self._conn._loop.run_until_complete(result).to_arrow()
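Continuing the connection sketch above, a RemoteTable now resolves its schema once per instance and can append rows; the field names remain illustrative:

# tbl is the RemoteTable from the sketch after the remote/db.py diff.
print(tbl.schema)  # fetched via GET /v1/table/items/describe, then cached

# add() streams Arrow IPC bytes to /insert and reports rows sent.
n = tbl.add([{"vector": [0.3, 0.4], "item": "bar"}])
assert n == 1  # the length of the sanitized Arrow table that was posted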
45 python/lancedb/schema.py Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Schema related utilities."""
|
||||||
|
|
||||||
|
from typing import Any, Dict, Type
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
from lance import json_to_schema, schema_to_json
|
||||||
|
|
||||||
|
|
||||||
|
def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType:
|
||||||
|
"""A help function to create a vector type.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dimension: The dimension of the vector.
|
||||||
|
value_type: pa.DataType, optional
|
||||||
|
The type of the value in the vector.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
A PyArrow DataType for vectors.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
>>> import pyarrow as pa
|
||||||
|
>>> import lancedb
|
||||||
|
>>> schema = pa.schema([
|
||||||
|
... pa.field("id", pa.int64()),
|
||||||
|
... pa.field("vector", lancedb.vector(756)),
|
||||||
|
... ])
|
||||||
|
"""
|
||||||
|
return pa.list_(value_type, dimension)
|
||||||
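Together with the schema-only create_table path added later in this release, the helper makes it easy to declare a fixed-size vector column up front. A small usage sketch; the path and table name are illustrative:

import lancedb
import pyarrow as pa

schema = pa.schema([
    pa.field("id", pa.int64()),
    pa.field("vector", lancedb.vector(2)),  # fixed-size list of 2 float32 values
])
db = lancedb.connect("/tmp/lancedb-demo")
tbl = db.create_table("vectors", schema=schema)  # empty table, schema only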
@@ -16,7 +16,7 @@ from __future__ import annotations
 import os
 from abc import ABC, abstractmethod
 from functools import cached_property
-from typing import List, Union
+from typing import Iterable, List, Union

 import lance
 import numpy as np
@@ -44,7 +44,7 @@ def _sanitize_data(data, schema, on_bad_vectors, fill_value):
         data = _sanitize_schema(
             data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
         )
-    if not isinstance(data, pa.Table):
+    if not isinstance(data, (pa.Table, Iterable)):
         raise TypeError(f"Unsupported data type: {type(data)}")
     return data
@@ -483,7 +483,7 @@ class LanceTable(Table):
             if schema is None:
                 raise ValueError("Either data or schema must be provided")
             data = pa.Table.from_pylist([], schema=schema)
-        lance.write_dataset(data, tbl._dataset_uri, mode=mode)
+        lance.write_dataset(data, tbl._dataset_uri, schema=schema, mode=mode)
         return LanceTable(db, name)

     @classmethod
@@ -1,7 +1,7 @@
 [project]
 name = "lancedb"
-version = "0.1.10"
-dependencies = ["pylance~=0.5.0", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic", "attr"]
+version = "0.1.11"
+dependencies = ["pylance~=0.5.8", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic>=2", "attr"]
 description = "lancedb"
 authors = [
     { name = "LanceDB Devs", email = "dev@lancedb.com" },
@@ -13,6 +13,7 @@

 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest

 import lancedb
@@ -75,6 +76,32 @@ def test_ingest_pd(tmp_path):
     assert db.open_table("test").name == db["test"].name


+def test_ingest_record_batch_iterator(tmp_path):
+    def batch_reader():
+        for i in range(5):
+            yield pa.RecordBatch.from_arrays(
+                [
+                    pa.array([[3.1, 4.1], [5.9, 26.5]]),
+                    pa.array(["foo", "bar"]),
+                    pa.array([10.0, 20.0]),
+                ],
+                ["vector", "item", "price"],
+            )
+
+    db = lancedb.connect(tmp_path)
+    tbl = db.create_table(
+        "test",
+        batch_reader(),
+        schema=pa.schema(
+            [
+                pa.field("vector", pa.list_(pa.float32())),
+                pa.field("item", pa.utf8()),
+                pa.field("price", pa.float32()),
+            ]
+        ),
+    )
+
+
 def test_create_mode(tmp_path):
     db = lancedb.connect(tmp_path)
     data = pd.DataFrame(
@@ -131,6 +158,9 @@ def test_empty_or_nonexistent_table(tmp_path):
     with pytest.raises(Exception):
         db.open_table("does_not_exist")

+    schema = pa.schema([pa.field("a", pa.int32())])
+    db.create_table("test", schema=schema)
+

 def test_replace_index(tmp_path):
     db = lancedb.connect(uri=tmp_path)
155
python/tests/test_pydantic.py
Normal file
@@ -0,0 +1,155 @@
+# Copyright 2023 LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import sys
+from typing import List, Optional
+
+import pyarrow as pa
+import pydantic
+import pytest
+
+from lancedb.pydantic import pydantic_to_schema, vector
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 9),
+    reason="using native type alias requires python3.9 or higher",
+)
+def test_pydantic_to_arrow():
+    class StructModel(pydantic.BaseModel):
+        a: str
+        b: Optional[float]
+
+    class TestModel(pydantic.BaseModel):
+        id: int
+        s: str
+        vec: list[float]
+        li: List[int]
+        opt: Optional[str] = None
+        st: StructModel
+        # d: dict
+
+    m = TestModel(
+        id=1, s="hello", vec=[1.0, 2.0, 3.0], li=[2, 3, 4], st=StructModel(a="a", b=1.0)
+    )
+
+    schema = pydantic_to_schema(TestModel)
+
+    expect_schema = pa.schema(
+        [
+            pa.field("id", pa.int64(), False),
+            pa.field("s", pa.utf8(), False),
+            pa.field("vec", pa.list_(pa.float64()), False),
+            pa.field("li", pa.list_(pa.int64()), False),
+            pa.field("opt", pa.utf8(), True),
+            pa.field(
+                "st",
+                pa.struct(
+                    [pa.field("a", pa.utf8(), False), pa.field("b", pa.float64(), True)]
+                ),
+                False,
+            ),
+        ]
+    )
+    assert schema == expect_schema
+
+
+def test_pydantic_to_arrow_py38():
+    class StructModel(pydantic.BaseModel):
+        a: str
+        b: Optional[float]
+
+    class TestModel(pydantic.BaseModel):
+        id: int
+        s: str
+        vec: List[float]
+        li: List[int]
+        opt: Optional[str] = None
+        st: StructModel
+        # d: dict
+
+    m = TestModel(
+        id=1, s="hello", vec=[1.0, 2.0, 3.0], li=[2, 3, 4], st=StructModel(a="a", b=1.0)
+    )
+
+    schema = pydantic_to_schema(TestModel)
+
+    expect_schema = pa.schema(
+        [
+            pa.field("id", pa.int64(), False),
+            pa.field("s", pa.utf8(), False),
+            pa.field("vec", pa.list_(pa.float64()), False),
+            pa.field("li", pa.list_(pa.int64()), False),
+            pa.field("opt", pa.utf8(), True),
+            pa.field(
+                "st",
+                pa.struct(
+                    [pa.field("a", pa.utf8(), False), pa.field("b", pa.float64(), True)]
+                ),
+                False,
+            ),
+        ]
+    )
+    assert schema == expect_schema
+
+
+def test_fixed_size_list_field():
+    class TestModel(pydantic.BaseModel):
+        vec: vector(16)
+        li: List[int]
+
+    data = TestModel(vec=list(range(16)), li=[1, 2, 3])
+    assert json.loads(data.model_dump_json()) == {
+        "vec": list(range(16)),
+        "li": [1, 2, 3],
+    }
+
+    schema = pydantic_to_schema(TestModel)
+    assert schema == pa.schema(
+        [
+            pa.field("vec", pa.list_(pa.float32(), 16), False),
+            pa.field("li", pa.list_(pa.int64()), False),
+        ]
+    )
+
+    json_schema = TestModel.model_json_schema()
+    assert json_schema == {
+        "properties": {
+            "vec": {
+                "items": {"type": "number"},
+                "maxItems": 16,
+                "minItems": 16,
+                "title": "Vec",
+                "type": "array",
+            },
+            "li": {"items": {"type": "integer"}, "title": "Li", "type": "array"},
+        },
+        "required": ["vec", "li"],
+        "title": "TestModel",
+        "type": "object",
+    }
+
+
+def test_fixed_size_list_validation():
+    class TestModel(pydantic.BaseModel):
+        vec: vector(8)
+
+    with pytest.raises(pydantic.ValidationError):
+        TestModel(vec=range(9))
+
+    with pytest.raises(pydantic.ValidationError):
+        TestModel(vec=range(7))
+
+    TestModel(vec=range(8))
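The new lancedb.pydantic module lets a Pydantic model double as the table schema. A short sketch combining pydantic_to_schema with the schema-only table creation shown earlier; the model and field names are illustrative:

import pydantic

from lancedb.pydantic import pydantic_to_schema, vector


class Document(pydantic.BaseModel):
    text: str
    vec: vector(8)  # fixed-size list field, validated to exactly 8 items


schema = pydantic_to_schema(Document)
# schema can now be passed to db.create_table("docs", schema=schema)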
@@ -119,6 +119,7 @@ def test_query_builder_with_different_vector_column():
             columns=["b"],
             nprobes=20,
             refine_factor=None,
+            vector_column="foo_vector",
         )
     )
@@ -1,6 +1,6 @@
 [package]
 name = "vectordb-node"
-version = "0.1.10"
+version = "0.1.13"
 description = "Serverless, low-latency vector database for AI applications"
 license = "Apache-2.0"
 edition = "2018"
@@ -15,7 +15,11 @@ arrow-ipc = { workspace = true }
 arrow-schema = { workspace = true }
 once_cell = "1"
 futures = "0.3"
+half = { workspace = true }
 lance = { workspace = true }
 vectordb = { path = "../../vectordb" }
 tokio = { version = "1.23", features = ["rt-multi-thread"] }
 neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }
+object_store = { workspace = true, features = ["aws"] }
+async-trait = "0"
+env_logger = "0"
@@ -13,7 +13,6 @@
 // limitations under the License.

 use std::io::Cursor;
-use std::ops::Deref;
 use std::sync::Arc;

 use arrow_array::cast::as_list_array;
@@ -25,10 +24,13 @@ use lance::arrow::{FixedSizeListArrayExt, RecordBatchExt};
 pub(crate) fn convert_record_batch(record_batch: RecordBatch) -> RecordBatch {
     let column = record_batch
         .column_by_name("vector")
+        .cloned()
         .expect("vector column is missing");
-    let arr = as_list_array(column.deref());
+    // TODO: we should just consume the underlying js buffer in the future instead of copying this arrow around a bunch of times
+    let arr = as_list_array(column.as_ref());
     let list_size = arr.values().len() / record_batch.num_rows();
-    let r = FixedSizeListArray::try_new(arr.values(), list_size as i32).unwrap();
+    let r =
+        FixedSizeListArray::try_new_from_values(arr.values().to_owned(), list_size as i32).unwrap();

     let schema = Arc::new(Schema::new(vec![Field::new(
         "vector",
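convert_record_batch reshapes the incoming variable-size list column into a FixedSizeListArray by dividing the flattened value count by the row count. A Python analogue of the same reshaping, for illustration only (the production path stays in Rust):

import pyarrow as pa

batch = pa.record_batch([pa.array([[3.1, 4.1], [5.9, 26.5]])], names=["vector"])
col = batch.column(0)                    # variable-size ListArray
dim = len(col.values) // batch.num_rows  # infer the fixed dimension
fixed = pa.FixedSizeListArray.from_arrays(col.values, dim)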
@@ -17,19 +17,23 @@ use std::convert::TryFrom;
 use std::ops::Deref;
 use std::sync::{Arc, Mutex};

-use arrow_array::{Float32Array, RecordBatchIterator, RecordBatchReader};
+use arrow_array::{Float32Array, RecordBatchIterator};
 use arrow_ipc::writer::FileWriter;
+use async_trait::async_trait;
 use futures::{TryFutureExt, TryStreamExt};
-use lance::dataset::{WriteMode, WriteParams};
+use lance::dataset::{ReadParams, WriteMode, WriteParams};
 use lance::index::vector::MetricType;
+use lance::io::object_store::ObjectStoreParams;
 use neon::prelude::*;
 use neon::types::buffer::TypedArray;
+use object_store::aws::{AwsCredential, AwsCredentialProvider};
+use object_store::CredentialProvider;
 use once_cell::sync::OnceCell;
 use tokio::runtime::Runtime;

 use vectordb::database::Database;
 use vectordb::error::Error;
-use vectordb::table::Table;
+use vectordb::table::{OpenTableParams, Table};

 use crate::arrow::arrow_buffer_to_record_batch;
@@ -49,8 +53,38 @@ struct JsTable {

 impl Finalize for JsTable {}

+// TODO: object_store didn't export this type so I copied it.
+// Make a request to object_store to export this type
+#[derive(Debug)]
+pub struct StaticCredentialProvider<T> {
+    credential: Arc<T>,
+}
+
+impl<T> StaticCredentialProvider<T> {
+    pub fn new(credential: T) -> Self {
+        Self {
+            credential: Arc::new(credential),
+        }
+    }
+}
+
+#[async_trait]
+impl<T> CredentialProvider for StaticCredentialProvider<T>
+where
+    T: std::fmt::Debug + Send + Sync,
+{
+    type Credential = T;
+
+    async fn get_credential(&self) -> object_store::Result<Arc<T>> {
+        Ok(Arc::clone(&self.credential))
+    }
+}
+
 fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
     static RUNTIME: OnceCell<Runtime> = OnceCell::new();
+    static LOG: OnceCell<()> = OnceCell::new();
+
+    LOG.get_or_init(|| env_logger::init());
+
     RUNTIME.get_or_try_init(|| Runtime::new().or_else(|err| cx.throw_error(err.to_string())))
 }
@@ -97,19 +131,74 @@ fn database_table_names(mut cx: FunctionContext) -> JsResult<JsPromise> {
     Ok(promise)
 }

+fn get_aws_creds<T>(
+    cx: &mut FunctionContext,
+    arg_starting_location: i32,
+) -> Result<Option<AwsCredentialProvider>, NeonResult<T>> {
+    let secret_key_id = cx
+        .argument_opt(arg_starting_location)
+        .map(|arg| arg.downcast_or_throw::<JsString, FunctionContext>(cx).ok())
+        .flatten()
+        .map(|v| v.value(cx));
+
+    let secret_key = cx
+        .argument_opt(arg_starting_location + 1)
+        .map(|arg| arg.downcast_or_throw::<JsString, FunctionContext>(cx).ok())
+        .flatten()
+        .map(|v| v.value(cx));
+
+    let temp_token = cx
+        .argument_opt(arg_starting_location + 2)
+        .map(|arg| arg.downcast_or_throw::<JsString, FunctionContext>(cx).ok())
+        .flatten()
+        .map(|v| v.value(cx));
+
+    match (secret_key_id, secret_key, temp_token) {
+        (Some(key_id), Some(key), optional_token) => Ok(Some(Arc::new(
+            StaticCredentialProvider::new(AwsCredential {
+                key_id: key_id,
+                secret_key: key,
+                token: optional_token,
+            }),
+        ))),
+        (None, None, None) => Ok(None),
+        _ => Err(cx.throw_error("Invalid credentials configuration")),
+    }
+}
+
 fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let db = cx
         .this()
         .downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
     let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
+
+    let aws_creds = match get_aws_creds(&mut cx, 1) {
+        Ok(creds) => creds,
+        Err(err) => return err,
+    };
+
+    let param = ReadParams {
+        store_options: Some(ObjectStoreParams {
+            aws_credentials: aws_creds,
+            ..ObjectStoreParams::default()
+        }),
+        ..ReadParams::default()
+    };
+
     let rt = runtime(&mut cx)?;
     let channel = cx.channel();
     let database = db.database.clone();

     let (deferred, promise) = cx.promise();
     rt.spawn(async move {
-        let table_rst = database.open_table(&table_name).await;
+        let table_rst = database
+            .open_table_with_params(
+                &table_name,
+                OpenTableParams {
+                    open_table_params: param,
+                },
+            )
+            .await;

         deferred.settle_with(&channel, move |mut cx| {
             let table = Arc::new(Mutex::new(
@@ -241,8 +330,6 @@ fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
         "create" => WriteMode::Create,
         _ => return cx.throw_error("Table::create only supports 'overwrite' and 'create' modes"),
     };
-    let mut params = WriteParams::default();
-    params.mode = mode;

     let rt = runtime(&mut cx)?;
     let channel = cx.channel();
@@ -250,11 +337,22 @@ fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let (deferred, promise) = cx.promise();
     let database = db.database.clone();

+    let aws_creds = match get_aws_creds(&mut cx, 3) {
+        Ok(creds) => creds,
+        Err(err) => return err,
+    };
+
+    let params = WriteParams {
+        store_params: Some(ObjectStoreParams {
+            aws_credentials: aws_creds,
+            ..ObjectStoreParams::default()
+        }),
+        mode: mode,
+        ..WriteParams::default()
+    };
+
     rt.block_on(async move {
-        let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchIterator::new(
-            batches.into_iter().map(Ok),
-            schema,
-        ));
+        let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema);
         let table_rst = database
             .create_table(&table_name, batch_reader, Some(params))
             .await;
@@ -289,16 +387,27 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let table = js_table.table.clone();
     let write_mode = write_mode_map.get(write_mode.as_str()).cloned();

+    let aws_creds = match get_aws_creds(&mut cx, 2) {
+        Ok(creds) => creds,
+        Err(err) => return err,
+    };
+
+    let params = WriteParams {
+        store_params: Some(ObjectStoreParams {
+            aws_credentials: aws_creds,
+            ..ObjectStoreParams::default()
+        }),
+        mode: write_mode.unwrap_or(WriteMode::Append),
+        ..WriteParams::default()
+    };
+
     rt.block_on(async move {
-        let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchIterator::new(
-            batches.into_iter().map(Ok),
-            schema,
-        ));
-        let add_result = table.lock().unwrap().add(batch_reader, write_mode).await;
+        let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema);
+        let add_result = table.lock().unwrap().add(batch_reader, Some(params)).await;

         deferred.settle_with(&channel, move |mut cx| {
-            let added = add_result.or_else(|err| cx.throw_error(err.to_string()))?;
-            Ok(cx.number(added as f64))
+            let _added = add_result.or_else(|err| cx.throw_error(err.to_string()))?;
+            Ok(cx.boolean(true))
         });
     });
     Ok(promise)
@@ -1,6 +1,6 @@
 [package]
 name = "vectordb"
-version = "0.1.10"
+version = "0.1.13"
 edition = "2021"
 description = "Serverless, low-latency vector database for AI applications"
 license = "Apache-2.0"
@@ -13,6 +13,7 @@ arrow-data = { workspace = true }
 arrow-schema = { workspace = true }
 object_store = { workspace = true }
 snafu = "0.7.4"
+half = { workspace = true }
 lance = { workspace = true }
 tokio = { version = "1.23", features = ["rt-multi-thread"] }
@@ -27,6 +27,7 @@ pub struct Database {
     object_store: ObjectStore,

     pub(crate) uri: String,
+    pub(crate) base_path: object_store::path::Path,
 }

 const LANCE_EXTENSION: &str = "lance";
@@ -43,12 +44,13 @@ impl Database {
     ///
     /// * A [Database] object.
     pub async fn connect(uri: &str) -> Result<Database> {
-        let (object_store, _) = ObjectStore::from_uri(uri).await?;
+        let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
         if object_store.is_local() {
             Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
         }
         Ok(Database {
             uri: uri.to_string(),
+            base_path,
             object_store,
         })
     }
@@ -70,7 +72,7 @@ impl Database {
     pub async fn table_names(&self) -> Result<Vec<String>> {
         let f = self
             .object_store
-            .read_dir(self.uri.as_str())
+            .read_dir(self.base_path.clone())
             .await?
             .iter()
             .map(|fname| Path::new(fname))
@@ -100,7 +102,7 @@ impl Database {
     pub async fn create_table(
         &self,
         name: &str,
-        batches: Box<dyn RecordBatchReader>,
+        batches: impl RecordBatchReader + Send + 'static,
         params: Option<WriteParams>,
     ) -> Result<Table> {
         Table::create(&self.uri, name, batches, params).await
@@ -141,8 +143,9 @@ impl Database {
     /// # Arguments
     /// * `name` - The name of the table.
     pub async fn drop_table(&self, name: &str) -> Result<()> {
-        let dir_name = format!("{}/{}.{}", self.uri, name, LANCE_EXTENSION);
-        self.object_store.remove_dir_all(dir_name).await?;
+        let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
+        let full_path = self.base_path.child(dir_name.clone());
+        self.object_store.remove_dir_all(full_path).await?;
         Ok(())
     }
 }
@@ -173,10 +173,8 @@ mod tests {

     #[tokio::test]
     async fn test_setters_getters() {
-        let mut batches: Box<dyn RecordBatchReader> = make_test_batches();
-        let ds = Dataset::write(&mut batches, "memory://foo", None)
-            .await
-            .unwrap();
+        let batches = make_test_batches();
+        let ds = Dataset::write(batches, "memory://foo", None).await.unwrap();

         let vector = Float32Array::from_iter_values([0.1, 0.2]);
         let query = Query::new(Arc::new(ds), vector.clone());
@@ -202,10 +200,8 @@ mod tests {

     #[tokio::test]
     async fn test_execute() {
-        let mut batches: Box<dyn RecordBatchReader> = make_test_batches();
-        let ds = Dataset::write(&mut batches, "memory://foo", None)
-            .await
-            .unwrap();
+        let batches = make_test_batches();
+        let ds = Dataset::write(batches, "memory://foo", None).await.unwrap();

         let vector = Float32Array::from_iter_values([0.1; 128]);
         let query = Query::new(Arc::new(ds), vector.clone());
@@ -213,7 +209,7 @@ mod tests {
         assert_eq!(result.is_ok(), true);
     }

-    fn make_test_batches() -> Box<dyn RecordBatchReader> {
+    fn make_test_batches() -> impl RecordBatchReader + Send + 'static {
         let dim: usize = 128;
         let schema = Arc::new(ArrowSchema::new(vec![
             ArrowField::new("key", DataType::Int32, false),
@@ -227,11 +223,11 @@ mod tests {
             ),
             ArrowField::new("uri", DataType::Utf8, true),
         ]));
-        Box::new(RecordBatchIterator::new(
+        RecordBatchIterator::new(
             vec![RecordBatch::new_empty(schema.clone())]
                 .into_iter()
                 .map(Ok),
             schema,
-        ))
+        )
     }
 }
@@ -16,14 +16,15 @@ use std::path::Path;
 use std::sync::Arc;

 use arrow_array::{Float32Array, RecordBatchReader};
+use arrow_schema::SchemaRef;
 use lance::dataset::{Dataset, ReadParams, WriteParams};
 use lance::index::IndexType;
 use snafu::prelude::*;

 use crate::error::{Error, InvalidTableNameSnafu, Result};
 use crate::index::vector::VectorIndexBuilder;
-use crate::WriteMode;
 use crate::query::Query;
+use crate::WriteMode;

 pub const VECTOR_COLUMN_NAME: &str = "vector";
 pub const LANCE_FILE_EXTENSION: &str = "lance";
@@ -117,7 +118,7 @@ impl Table {
     pub async fn create(
         base_uri: &str,
         name: &str,
-        mut batches: Box<dyn RecordBatchReader>,
+        batches: impl RecordBatchReader + Send + 'static,
         params: Option<WriteParams>,
     ) -> Result<Self> {
         let base_path = Path::new(base_uri);
@@ -127,7 +128,7 @@ impl Table {
             .to_str()
             .context(InvalidTableNameSnafu { name })?
             .to_string();
-        let dataset = Dataset::write(&mut batches, &uri, params)
+        let dataset = Dataset::write(batches, &uri, params)
             .await
             .map_err(|e| match e {
                 lance::Error::DatasetAlreadyExists { .. } => Error::TableAlreadyExists {
@@ -144,6 +145,16 @@ impl Table {
         })
     }

+    /// Schema of this Table.
+    pub fn schema(&self) -> SchemaRef {
+        Arc::new(self.dataset.schema().into())
+    }
+
+    /// Version of this Table
+    pub fn version(&self) -> u64 {
+        self.dataset.version().version
+    }
+
     /// Create index on the table.
     pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
         use lance::index::DatasetIndexExt;
@@ -176,14 +187,16 @@ impl Table {
     /// * The number of rows added
     pub async fn add(
         &mut self,
-        mut batches: Box<dyn RecordBatchReader>,
-        write_mode: Option<WriteMode>,
-    ) -> Result<usize> {
-        let mut params = WriteParams::default();
-        params.mode = write_mode.unwrap_or(WriteMode::Append);
+        batches: impl RecordBatchReader + Send + 'static,
+        params: Option<WriteParams>,
+    ) -> Result<()> {
+        let params = params.unwrap_or(WriteParams {
+            mode: WriteMode::Append,
+            ..WriteParams::default()
+        });

-        self.dataset = Arc::new(Dataset::write(&mut batches, &self.uri, Some(params)).await?);
-        Ok(batches.count())
+        self.dataset = Arc::new(Dataset::write(batches, &self.uri, Some(params)).await?);
+        Ok(())
     }

     /// Creates a new Query object that can be executed.
@@ -207,12 +220,12 @@ impl Table {
     /// Merge new data into this table.
     pub async fn merge(
         &mut self,
-        mut batches: Box<dyn RecordBatchReader>,
+        batches: impl RecordBatchReader + Send + 'static,
         left_on: &str,
         right_on: &str,
     ) -> Result<()> {
         let mut dataset = self.dataset.as_ref().clone();
-        dataset.merge(&mut batches, left_on, right_on).await?;
+        dataset.merge(batches, left_on, right_on).await?;
         self.dataset = Arc::new(dataset);
         Ok(())
     }
@@ -253,8 +266,8 @@ mod tests {
         let dataset_path = tmp_dir.path().join("test.lance");
         let uri = tmp_dir.path().to_str().unwrap();

-        let mut batches: Box<dyn RecordBatchReader> = make_test_batches();
-        Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
+        let batches = make_test_batches();
+        Dataset::write(batches, dataset_path.to_str().unwrap(), None)
             .await
             .unwrap();
@@ -272,6 +285,7 @@ mod tests {
     }

     #[test]
+    #[cfg(not(windows))]
     fn test_object_store_path() {
         use std::path::Path as StdPath;
         let p = StdPath::new("s3://bucket/path/to/file");
@@ -284,11 +298,11 @@ mod tests {
         let tmp_dir = tempdir().unwrap();
         let uri = tmp_dir.path().to_str().unwrap();

-        let batches: Box<dyn RecordBatchReader> = make_test_batches();
+        let batches = make_test_batches();
         let _ = batches.schema().clone();
         Table::create(&uri, "test", batches, None).await.unwrap();

-        let batches: Box<dyn RecordBatchReader> = make_test_batches();
+        let batches = make_test_batches();
         let result = Table::create(&uri, "test", batches, None).await;
         assert!(matches!(
             result.unwrap_err(),
@@ -301,12 +315,12 @@ mod tests {
         let tmp_dir = tempdir().unwrap();
         let uri = tmp_dir.path().to_str().unwrap();

-        let batches: Box<dyn RecordBatchReader> = make_test_batches();
+        let batches = make_test_batches();
         let schema = batches.schema().clone();
         let mut table = Table::create(&uri, "test", batches, None).await.unwrap();
         assert_eq!(table.count_rows().await.unwrap(), 10);

-        let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchIterator::new(
+        let new_batches = RecordBatchIterator::new(
             vec![RecordBatch::try_new(
                 schema.clone(),
                 vec![Arc::new(Int32Array::from_iter_values(100..110))],
@@ -315,7 +329,7 @@ mod tests {
             .into_iter()
             .map(Ok),
             schema.clone(),
-        ));
+        );

         table.add(new_batches, None).await.unwrap();
         assert_eq!(table.count_rows().await.unwrap(), 20);
@@ -327,12 +341,12 @@ mod tests {
         let tmp_dir = tempdir().unwrap();
         let uri = tmp_dir.path().to_str().unwrap();

-        let batches: Box<dyn RecordBatchReader> = make_test_batches();
+        let batches = make_test_batches();
         let schema = batches.schema().clone();
         let mut table = Table::create(uri, "test", batches, None).await.unwrap();
         assert_eq!(table.count_rows().await.unwrap(), 10);

-        let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchIterator::new(
+        let new_batches = RecordBatchIterator::new(
             vec![RecordBatch::try_new(
                 schema.clone(),
                 vec![Arc::new(Int32Array::from_iter_values(100..110))],
@@ -341,12 +355,14 @@ mod tests {
             .into_iter()
             .map(Ok),
             schema.clone(),
-        ));
+        );

-        table
-            .add(new_batches, Some(WriteMode::Overwrite))
-            .await
-            .unwrap();
+        let param: WriteParams = WriteParams {
+            mode: WriteMode::Overwrite,
+            ..Default::default()
+        };
+
+        table.add(new_batches, Some(param)).await.unwrap();
         assert_eq!(table.count_rows().await.unwrap(), 10);
         assert_eq!(table.name, "test");
     }
@@ -357,8 +373,8 @@ mod tests {
         let dataset_path = tmp_dir.path().join("test.lance");
         let uri = tmp_dir.path().to_str().unwrap();

-        let mut batches: Box<dyn RecordBatchReader> = make_test_batches();
-        Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
+        let batches = make_test_batches();
+        Dataset::write(batches, dataset_path.to_str().unwrap(), None)
             .await
             .unwrap();
@@ -369,7 +385,7 @@ mod tests {
         assert_eq!(vector, query.query_vector);
     }

-    #[derive(Default)]
+    #[derive(Default, Debug)]
     struct NoOpCacheWrapper {
         called: AtomicBool,
     }
@@ -396,8 +412,8 @@ mod tests {
         let dataset_path = tmp_dir.path().join("test.lance");
         let uri = tmp_dir.path().to_str().unwrap();

-        let mut batches: Box<dyn RecordBatchReader> = make_test_batches();
-        Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
+        let batches = make_test_batches();
+        Dataset::write(batches, dataset_path.to_str().unwrap(), None)
             .await
             .unwrap();
@@ -417,15 +433,15 @@ mod tests {
         assert!(wrapper.called());
     }

-    fn make_test_batches() -> Box<dyn RecordBatchReader> {
+    fn make_test_batches() -> impl RecordBatchReader + Send + Sync + 'static {
         let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)]));
-        Box::new(RecordBatchIterator::new(
+        RecordBatchIterator::new(
             vec![RecordBatch::try_new(
                 schema.clone(),
                 vec![Arc::new(Int32Array::from_iter_values(0..10))],
             )],
             schema,
-        ))
+        )
     }

     #[tokio::test]
@@ -465,9 +481,7 @@ mod tests {
             schema,
         );

-        let reader: Box<dyn RecordBatchReader + Send> = Box::new(batches);
-        let mut table = Table::create(uri, "test", reader, None).await.unwrap();
+        let mut table = Table::create(uri, "test", batches, None).await.unwrap();

         let mut i = IvfPQIndexBuilder::new();

         let index_builder = i