mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 13:59:58 +00:00
Compare commits
12 Commits
python-v0.
...
tuning/dat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
99d1a06a44 | ||
|
|
f23641d703 | ||
|
|
e9e0a37ca8 | ||
|
|
c37a28abbd | ||
|
|
98c1e635b3 | ||
|
|
9992b927fd | ||
|
|
80d501011c | ||
|
|
6e3a9d08e0 | ||
|
|
268d8e057b | ||
|
|
dfc518b8fb | ||
|
|
98acf34ae8 | ||
|
|
25988d23cd |
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.4.13
|
||||
current_version = 0.4.14
|
||||
commit = True
|
||||
message = Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
8
.github/workflows/docs_test.yml
vendored
8
.github/workflows/docs_test.yml
vendored
@@ -18,7 +18,7 @@ on:
|
||||
env:
|
||||
# Disable full debug symbol generation to speed up CI build and keep memory down
|
||||
# "1" means line tables only, which is useful for panic tracebacks.
|
||||
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=native -C target-feature=+f16c,+avx2,+fma"
|
||||
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma"
|
||||
RUST_BACKTRACE: "1"
|
||||
|
||||
jobs:
|
||||
@@ -28,6 +28,8 @@ jobs:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Print CPU capabilities
|
||||
run: cat /proc/cpuinfo
|
||||
- name: Install dependencies needed for ubuntu
|
||||
run: |
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
@@ -39,7 +41,7 @@ jobs:
|
||||
cache: "pip"
|
||||
cache-dependency-path: "docs/test/requirements.txt"
|
||||
- name: Rust cache
|
||||
uses: swatinem/rust-cache@v2
|
||||
uses: swatinem/rust-cache@v2
|
||||
- name: Build Python
|
||||
working-directory: docs/test
|
||||
run:
|
||||
@@ -64,6 +66,8 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- name: Print CPU capabilities
|
||||
run: cat /proc/cpuinfo
|
||||
- name: Set up Node
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
|
||||
3
.github/workflows/node.yml
vendored
3
.github/workflows/node.yml
vendored
@@ -20,7 +20,8 @@ env:
|
||||
# "1" means line tables only, which is useful for panic tracebacks.
|
||||
#
|
||||
# Use native CPU to accelerate tests if possible, especially for f16
|
||||
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=native -C target-feature=+f16c,+avx2,+fma"
|
||||
# target-cpu=haswell fixes failing ci build
|
||||
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma"
|
||||
RUST_BACKTRACE: "1"
|
||||
|
||||
jobs:
|
||||
|
||||
191
.github/workflows/npm-publish.yml
vendored
191
.github/workflows/npm-publish.yml
vendored
@@ -2,7 +2,7 @@ name: NPM Publish
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [ published ]
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
node:
|
||||
@@ -19,7 +19,7 @@ jobs:
|
||||
- uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: 20
|
||||
cache: 'npm'
|
||||
cache: "npm"
|
||||
cache-dependency-path: node/package-lock.json
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
npm run tsc
|
||||
npm pack
|
||||
- name: Upload Linux Artifacts
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: node-package
|
||||
path: |
|
||||
@@ -61,12 +61,41 @@ jobs:
|
||||
- name: Build MacOS native node modules
|
||||
run: bash ci/build_macos_artifacts.sh ${{ matrix.config.arch }}
|
||||
- name: Upload Darwin Artifacts
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: native-darwin
|
||||
name: node-native-darwin
|
||||
path: |
|
||||
node/dist/lancedb-vectordb-darwin*.tgz
|
||||
|
||||
nodejs-macos:
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- arch: x86_64-apple-darwin
|
||||
runner: macos-13
|
||||
- arch: aarch64-apple-darwin
|
||||
# macos-14 runners are arm64 (Apple silicon).
|
||||
runner: macos-14
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
# Only runs on tags that match the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install system dependencies
|
||||
run: brew install protobuf
|
||||
- name: Install npm dependencies
|
||||
run: |
|
||||
cd nodejs
|
||||
npm ci
|
||||
- name: Build MacOS native nodejs modules
|
||||
run: bash ci/build_macos_artifacts_nodejs.sh ${{ matrix.config.arch }}
|
||||
- name: Upload Darwin Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: nodejs-native-darwin-${{ matrix.config.arch }}
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
|
||||
node-linux:
|
||||
name: node-linux (${{ matrix.config.arch}}-unknown-linux-gnu
|
||||
@@ -103,12 +132,63 @@ jobs:
|
||||
run: |
|
||||
bash ci/build_linux_artifacts.sh ${{ matrix.config.arch }}
|
||||
- name: Upload Linux Artifacts
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: native-linux
|
||||
name: node-native-linux
|
||||
path: |
|
||||
node/dist/lancedb-vectordb-linux*.tgz
|
||||
|
||||
nodejs-linux:
|
||||
name: nodejs-linux (${{ matrix.config.arch}}-unknown-linux-gnu
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
# Only runs on tags that match the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- arch: x86_64
|
||||
runner: ubuntu-latest
|
||||
- arch: aarch64
|
||||
# For successful fat LTO builds, we need a large runner to avoid OOM errors.
|
||||
runner: buildjet-16vcpu-ubuntu-2204-arm
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
# Buildjet aarch64 runners have only 1.5 GB RAM per core, vs 3.5 GB per core for
|
||||
# x86_64 runners. To avoid OOM errors on ARM, we create a swap file.
|
||||
- name: Configure aarch64 build
|
||||
if: ${{ matrix.config.arch == 'aarch64' }}
|
||||
run: |
|
||||
free -h
|
||||
sudo fallocate -l 16G /swapfile
|
||||
sudo chmod 600 /swapfile
|
||||
sudo mkswap /swapfile
|
||||
sudo swapon /swapfile
|
||||
echo "/swapfile swap swap defaults 0 0" >> sudo /etc/fstab
|
||||
# print info
|
||||
swapon --show
|
||||
free -h
|
||||
- name: Build Linux Artifacts
|
||||
run: |
|
||||
bash ci/build_linux_artifacts_nodejs.sh ${{ matrix.config.arch }}
|
||||
- name: Upload Linux Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: nodejs-native-linux-${{ matrix.config.arch }}
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
# The generic files are the same in all distros so we just pick
|
||||
# one to do the upload.
|
||||
- name: Upload Generic Artifacts
|
||||
if: ${{ matrix.config.arch == 'x86_64' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: nodejs-dist
|
||||
path: |
|
||||
nodejs/dist/*
|
||||
!nodejs/dist/*.node
|
||||
|
||||
node-windows:
|
||||
runs-on: windows-2022
|
||||
# Only runs on tags that match the make-release action
|
||||
@@ -136,25 +216,60 @@ jobs:
|
||||
- name: Build Windows native node modules
|
||||
run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
|
||||
- name: Upload Windows Artifacts
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: native-windows
|
||||
name: node-native-windows
|
||||
path: |
|
||||
node/dist/lancedb-vectordb-win32*.tgz
|
||||
|
||||
nodejs-windows:
|
||||
runs-on: windows-2022
|
||||
# Only runs on tags that match the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
target: [x86_64-pc-windows-msvc]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Protoc v21.12
|
||||
working-directory: C:\
|
||||
run: |
|
||||
New-Item -Path 'C:\protoc' -ItemType Directory
|
||||
Set-Location C:\protoc
|
||||
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
|
||||
7z x protoc.zip
|
||||
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
|
||||
shell: powershell
|
||||
- name: Install npm dependencies
|
||||
run: |
|
||||
cd nodejs
|
||||
npm ci
|
||||
- name: Build Windows native node modules
|
||||
run: .\ci\build_windows_artifacts_nodejs.ps1 ${{ matrix.target }}
|
||||
- name: Upload Windows Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: nodejs-native-windows
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
|
||||
release:
|
||||
needs: [node, node-macos, node-linux, node-windows]
|
||||
runs-on: ubuntu-latest
|
||||
# Only runs on tags that match the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
steps:
|
||||
- uses: actions/download-artifact@v3
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: node-*
|
||||
- name: Display structure of downloaded files
|
||||
run: ls -R
|
||||
- uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: 20
|
||||
registry-url: 'https://registry.npmjs.org'
|
||||
registry-url: "https://registry.npmjs.org"
|
||||
- name: Publish to NPM
|
||||
env:
|
||||
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
|
||||
@@ -164,6 +279,45 @@ jobs:
|
||||
npm publish $filename
|
||||
done
|
||||
|
||||
release-nodejs:
|
||||
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
|
||||
runs-on: ubuntu-latest
|
||||
# Only runs on tags that match the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
working-directory: nodejs
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: nodejs-dist
|
||||
path: nodejs/dist
|
||||
- uses: actions/download-artifact@v4
|
||||
name: Download arch-specific binaries
|
||||
with:
|
||||
pattern: nodejs-*
|
||||
path: nodejs/nodejs-artifacts
|
||||
merge-multiple: true
|
||||
- name: Display structure of downloaded files
|
||||
run: find .
|
||||
- uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: 20
|
||||
registry-url: "https://registry.npmjs.org"
|
||||
- name: Install napi-rs
|
||||
run: npm install -g @napi-rs/cli
|
||||
- name: Prepare artifacts
|
||||
run: npx napi artifacts -d nodejs-artifacts
|
||||
- name: Display structure of staged files
|
||||
run: find npm
|
||||
- name: Publish to NPM
|
||||
env:
|
||||
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
|
||||
run: npm publish --access public
|
||||
|
||||
update-package-lock:
|
||||
needs: [release]
|
||||
runs-on: ubuntu-latest
|
||||
@@ -178,3 +332,18 @@ jobs:
|
||||
- uses: ./.github/workflows/update_package_lock
|
||||
with:
|
||||
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
|
||||
|
||||
update-package-lock-nodejs:
|
||||
needs: [release-nodejs]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
persist-credentials: false
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: ./.github/workflows/update_package_lock_nodejs
|
||||
with:
|
||||
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
|
||||
|
||||
33
.github/workflows/update_package_lock_nodejs/action.yml
vendored
Normal file
33
.github/workflows/update_package_lock_nodejs/action.yml
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
name: update_package_lock_nodejs
|
||||
description: "Update nodejs's package-lock.json"
|
||||
|
||||
inputs:
|
||||
github_token:
|
||||
required: true
|
||||
description: "github token for the repo"
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: 20
|
||||
- name: Set git configs
|
||||
shell: bash
|
||||
run: |
|
||||
git config user.name 'Lance Release'
|
||||
git config user.email 'lance-dev@lancedb.com'
|
||||
- name: Update package-lock.json file
|
||||
working-directory: ./nodejs
|
||||
run: |
|
||||
npm install
|
||||
git add package-lock.json
|
||||
git commit -m "Updating package-lock.json"
|
||||
shell: bash
|
||||
- name: Push changes
|
||||
if: ${{ inputs.dry_run }} == "false"
|
||||
uses: ad-m/github-push-action@master
|
||||
with:
|
||||
github_token: ${{ inputs.github_token }}
|
||||
branch: main
|
||||
tags: true
|
||||
19
.github/workflows/update_package_lock_run_nodejs.yml
vendored
Normal file
19
.github/workflows/update_package_lock_run_nodejs.yml
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
name: Update NodeJs package-lock.json
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
persist-credentials: false
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: ./.github/workflows/update_package_lock_nodejs
|
||||
with:
|
||||
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -34,6 +34,7 @@ python/dist
|
||||
node/dist
|
||||
node/examples/**/package-lock.json
|
||||
node/examples/**/dist
|
||||
nodejs/lancedb/native*
|
||||
dist
|
||||
|
||||
## Rust
|
||||
|
||||
@@ -39,3 +39,5 @@ pin-project = "1.0.7"
|
||||
snafu = "0.7.4"
|
||||
url = "2"
|
||||
num-traits = "0.2"
|
||||
regex = "1.10"
|
||||
lazy_static = "1"
|
||||
|
||||
21
ci/build_linux_artifacts_nodejs.sh
Executable file
21
ci/build_linux_artifacts_nodejs.sh
Executable file
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
ARCH=${1:-x86_64}
|
||||
|
||||
# We pass down the current user so that when we later mount the local files
|
||||
# into the container, the files are accessible by the current user.
|
||||
pushd ci/manylinux_nodejs
|
||||
docker build \
|
||||
-t lancedb-nodejs-manylinux \
|
||||
--build-arg="ARCH=$ARCH" \
|
||||
--build-arg="DOCKER_USER=$(id -u)" \
|
||||
--progress=plain \
|
||||
.
|
||||
popd
|
||||
|
||||
# We turn on memory swap to avoid OOM killer
|
||||
docker run \
|
||||
-v $(pwd):/io -w /io \
|
||||
--memory-swap=-1 \
|
||||
lancedb-nodejs-manylinux \
|
||||
bash ci/manylinux_nodejs/build.sh $ARCH
|
||||
34
ci/build_macos_artifacts_nodejs.sh
Normal file
34
ci/build_macos_artifacts_nodejs.sh
Normal file
@@ -0,0 +1,34 @@
|
||||
# Builds the macOS artifacts (nodejs binaries).
|
||||
# Usage: ./ci/build_macos_artifacts_nodejs.sh [target]
|
||||
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
|
||||
set -e
|
||||
|
||||
prebuild_rust() {
|
||||
# Building here for the sake of easier debugging.
|
||||
pushd rust/lancedb
|
||||
echo "Building rust library for $1"
|
||||
export RUST_BACKTRACE=1
|
||||
cargo build --release --target $1
|
||||
popd
|
||||
}
|
||||
|
||||
build_node_binaries() {
|
||||
pushd nodejs
|
||||
echo "Building nodejs library for $1"
|
||||
export RUST_TARGET=$1
|
||||
npm run build-release
|
||||
popd
|
||||
}
|
||||
|
||||
if [ -n "$1" ]; then
|
||||
targets=$1
|
||||
else
|
||||
targets="x86_64-apple-darwin aarch64-apple-darwin"
|
||||
fi
|
||||
|
||||
echo "Building artifacts for targets: $targets"
|
||||
for target in $targets
|
||||
do
|
||||
prebuild_rust $target
|
||||
build_node_binaries $target
|
||||
done
|
||||
41
ci/build_windows_artifacts_nodejs.ps1
Normal file
41
ci/build_windows_artifacts_nodejs.ps1
Normal file
@@ -0,0 +1,41 @@
|
||||
# Builds the Windows artifacts (nodejs binaries).
|
||||
# Usage: .\ci\build_windows_artifacts_nodejs.ps1 [target]
|
||||
# Targets supported:
|
||||
# - x86_64-pc-windows-msvc
|
||||
# - i686-pc-windows-msvc
|
||||
|
||||
function Prebuild-Rust {
|
||||
param (
|
||||
[string]$target
|
||||
)
|
||||
|
||||
# Building here for the sake of easier debugging.
|
||||
Push-Location -Path "rust/lancedb"
|
||||
Write-Host "Building rust library for $target"
|
||||
$env:RUST_BACKTRACE=1
|
||||
cargo build --release --target $target
|
||||
Pop-Location
|
||||
}
|
||||
|
||||
function Build-NodeBinaries {
|
||||
param (
|
||||
[string]$target
|
||||
)
|
||||
|
||||
Push-Location -Path "nodejs"
|
||||
Write-Host "Building nodejs library for $target"
|
||||
$env:RUST_TARGET=$target
|
||||
npm run build-release
|
||||
Pop-Location
|
||||
}
|
||||
|
||||
$targets = $args[0]
|
||||
if (-not $targets) {
|
||||
$targets = "x86_64-pc-windows-msvc"
|
||||
}
|
||||
|
||||
Write-Host "Building artifacts for targets: $targets"
|
||||
foreach ($target in $targets) {
|
||||
Prebuild-Rust $target
|
||||
Build-NodeBinaries $target
|
||||
}
|
||||
31
ci/manylinux_nodejs/Dockerfile
Normal file
31
ci/manylinux_nodejs/Dockerfile
Normal file
@@ -0,0 +1,31 @@
|
||||
# Manylinux Dockerfile with Rust, Node, and Lance dependencies installed.
|
||||
# This container allows building the node modules native libraries in an
|
||||
# environment with a very old glibc, so that we are compatible with a wide
|
||||
# range of linux distributions.
|
||||
ARG ARCH=x86_64
|
||||
|
||||
FROM quay.io/pypa/manylinux2014_${ARCH}
|
||||
|
||||
ARG ARCH=x86_64
|
||||
ARG DOCKER_USER=default_user
|
||||
|
||||
# Install static openssl
|
||||
COPY install_openssl.sh install_openssl.sh
|
||||
RUN ./install_openssl.sh ${ARCH} > /dev/null
|
||||
|
||||
# Protobuf is also installed as root.
|
||||
COPY install_protobuf.sh install_protobuf.sh
|
||||
RUN ./install_protobuf.sh ${ARCH}
|
||||
|
||||
ENV DOCKER_USER=${DOCKER_USER}
|
||||
# Create a group and user
|
||||
RUN echo ${ARCH} && adduser --user-group --create-home --uid ${DOCKER_USER} build_user
|
||||
|
||||
# We switch to the user to install Rust and Node, since those like to be
|
||||
# installed at the user level.
|
||||
USER ${DOCKER_USER}
|
||||
|
||||
COPY prepare_manylinux_node.sh prepare_manylinux_node.sh
|
||||
RUN cp /prepare_manylinux_node.sh $HOME/ && \
|
||||
cd $HOME && \
|
||||
./prepare_manylinux_node.sh ${ARCH}
|
||||
18
ci/manylinux_nodejs/build.sh
Executable file
18
ci/manylinux_nodejs/build.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
# Builds the nodejs module for manylinux. Invoked by ci/build_linux_artifacts_nodejs.sh.
|
||||
set -e
|
||||
ARCH=${1:-x86_64}
|
||||
|
||||
if [ "$ARCH" = "x86_64" ]; then
|
||||
export OPENSSL_LIB_DIR=/usr/local/lib64/
|
||||
else
|
||||
export OPENSSL_LIB_DIR=/usr/local/lib/
|
||||
fi
|
||||
export OPENSSL_STATIC=1
|
||||
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
||||
|
||||
source $HOME/.bashrc
|
||||
|
||||
cd nodejs
|
||||
npm ci
|
||||
npm run build-release
|
||||
26
ci/manylinux_nodejs/install_openssl.sh
Executable file
26
ci/manylinux_nodejs/install_openssl.sh
Executable file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
# Builds openssl from source so we can statically link to it
|
||||
|
||||
# this is to avoid the error we get with the system installation:
|
||||
# /usr/bin/ld: <library>: version node not found for symbol SSLeay@@OPENSSL_1.0.1
|
||||
# /usr/bin/ld: failed to set dynamic section sizes: Bad value
|
||||
set -e
|
||||
|
||||
git clone -b OpenSSL_1_1_1u \
|
||||
--single-branch \
|
||||
https://github.com/openssl/openssl.git
|
||||
|
||||
pushd openssl
|
||||
|
||||
if [[ $1 == x86_64* ]]; then
|
||||
ARCH=linux-x86_64
|
||||
else
|
||||
# gnu target
|
||||
ARCH=linux-aarch64
|
||||
fi
|
||||
|
||||
./Configure no-shared $ARCH
|
||||
|
||||
make
|
||||
|
||||
make install
|
||||
15
ci/manylinux_nodejs/install_protobuf.sh
Executable file
15
ci/manylinux_nodejs/install_protobuf.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
# Installs protobuf compiler. Should be run as root.
|
||||
set -e
|
||||
|
||||
if [[ $1 == x86_64* ]]; then
|
||||
ARCH=x86_64
|
||||
else
|
||||
# gnu target
|
||||
ARCH=aarch_64
|
||||
fi
|
||||
|
||||
PB_REL=https://github.com/protocolbuffers/protobuf/releases
|
||||
PB_VERSION=23.1
|
||||
curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
|
||||
unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
|
||||
21
ci/manylinux_nodejs/prepare_manylinux_node.sh
Executable file
21
ci/manylinux_nodejs/prepare_manylinux_node.sh
Executable file
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
install_node() {
|
||||
echo "Installing node..."
|
||||
|
||||
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
|
||||
|
||||
source "$HOME"/.bashrc
|
||||
|
||||
nvm install --no-progress 16
|
||||
}
|
||||
|
||||
install_rust() {
|
||||
echo "Installing rust..."
|
||||
curl https://sh.rustup.rs -sSf | bash -s -- -y
|
||||
export PATH="$PATH:/root/.cargo/bin"
|
||||
}
|
||||
|
||||
install_node
|
||||
install_rust
|
||||
326
docs/mkdocs.yml
326
docs/mkdocs.yml
@@ -38,178 +38,180 @@ theme:
|
||||
custom_dir: overrides
|
||||
|
||||
plugins:
|
||||
- search
|
||||
- autorefs
|
||||
- mkdocstrings:
|
||||
handlers:
|
||||
python:
|
||||
paths: [../python]
|
||||
options:
|
||||
docstring_style: numpy
|
||||
heading_level: 4
|
||||
show_source: true
|
||||
show_symbol_type_in_heading: true
|
||||
show_signature_annotations: true
|
||||
members_order: source
|
||||
import:
|
||||
# for cross references
|
||||
- https://arrow.apache.org/docs/objects.inv
|
||||
- https://pandas.pydata.org/docs/objects.inv
|
||||
- mkdocs-jupyter
|
||||
- ultralytics:
|
||||
verbose: True
|
||||
enabled: True
|
||||
default_image: "assets/lancedb_and_lance.png" # Default image for all pages
|
||||
add_image: True # Automatically add meta image
|
||||
add_keywords: True # Add page keywords in the header tag
|
||||
add_share_buttons: True # Add social share buttons
|
||||
add_authors: False # Display page authors
|
||||
add_desc: False
|
||||
add_dates: False
|
||||
- search
|
||||
- autorefs
|
||||
- mkdocstrings:
|
||||
handlers:
|
||||
python:
|
||||
paths: [../python]
|
||||
options:
|
||||
docstring_style: numpy
|
||||
heading_level: 3
|
||||
show_source: true
|
||||
show_symbol_type_in_heading: true
|
||||
show_signature_annotations: true
|
||||
show_root_heading: true
|
||||
members_order: source
|
||||
import:
|
||||
# for cross references
|
||||
- https://arrow.apache.org/docs/objects.inv
|
||||
- https://pandas.pydata.org/docs/objects.inv
|
||||
- mkdocs-jupyter
|
||||
- ultralytics:
|
||||
verbose: True
|
||||
enabled: True
|
||||
default_image: "assets/lancedb_and_lance.png" # Default image for all pages
|
||||
add_image: True # Automatically add meta image
|
||||
add_keywords: True # Add page keywords in the header tag
|
||||
add_share_buttons: True # Add social share buttons
|
||||
add_authors: False # Display page authors
|
||||
add_desc: False
|
||||
add_dates: False
|
||||
|
||||
markdown_extensions:
|
||||
- admonition
|
||||
- footnotes
|
||||
- pymdownx.details
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
line_spans: __span
|
||||
pygments_lang_class: true
|
||||
- pymdownx.inlinehilite
|
||||
- pymdownx.snippets:
|
||||
base_path: ..
|
||||
dedent_subsections: true
|
||||
- pymdownx.superfences
|
||||
- pymdownx.tabbed:
|
||||
alternate_style: true
|
||||
- md_in_html
|
||||
- attr_list
|
||||
- admonition
|
||||
- footnotes
|
||||
- pymdownx.details
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
line_spans: __span
|
||||
pygments_lang_class: true
|
||||
- pymdownx.inlinehilite
|
||||
- pymdownx.snippets:
|
||||
base_path: ..
|
||||
dedent_subsections: true
|
||||
- pymdownx.superfences
|
||||
- pymdownx.tabbed:
|
||||
alternate_style: true
|
||||
- md_in_html
|
||||
- attr_list
|
||||
|
||||
nav:
|
||||
- Home:
|
||||
- LanceDB: index.md
|
||||
- 🏃🏼‍♂️ Quick start: basic.md
|
||||
- 📚 Concepts:
|
||||
- Vector search: concepts/vector_search.md
|
||||
- Indexing: concepts/index_ivfpq.md
|
||||
- Storage: concepts/storage.md
|
||||
- Data management: concepts/data_management.md
|
||||
- 🔨 Guides:
|
||||
- Working with tables: guides/tables.md
|
||||
- Building an ANN index: ann_indexes.md
|
||||
- Vector Search: search.md
|
||||
- Full-text search: fts.md
|
||||
- Hybrid search:
|
||||
- Overview: hybrid_search/hybrid_search.md
|
||||
- Comparing Rerankers: hybrid_search/eval.md
|
||||
- Airbnb financial data example: notebooks/hybrid_search.ipynb
|
||||
- Filtering: sql.md
|
||||
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
|
||||
- Configuring Storage: guides/storage.md
|
||||
- 🧬 Managing embeddings:
|
||||
- Overview: embeddings/index.md
|
||||
- Embedding functions: embeddings/embedding_functions.md
|
||||
- Available models: embeddings/default_embedding_functions.md
|
||||
- User-defined embedding functions: embeddings/custom_embedding_function.md
|
||||
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
|
||||
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
|
||||
- 🔌 Integrations:
|
||||
- Tools and data formats: integrations/index.md
|
||||
- Pandas and PyArrow: python/pandas_and_pyarrow.md
|
||||
- Polars: python/polars_arrow.md
|
||||
- DuckDB: python/duckdb.md
|
||||
- LangChain 🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
|
||||
- LangChain JS/TS 🔗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
|
||||
- LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
|
||||
- Pydantic: python/pydantic.md
|
||||
- Voxel51: integrations/voxel51.md
|
||||
- PromptTools: integrations/prompttools.md
|
||||
- 🎯 Examples:
|
||||
- Overview: examples/index.md
|
||||
- 🐍 Python:
|
||||
- Overview: examples/examples_python.md
|
||||
- Home:
|
||||
- LanceDB: index.md
|
||||
- 🏃🏼‍♂️ Quick start: basic.md
|
||||
- 📚 Concepts:
|
||||
- Vector search: concepts/vector_search.md
|
||||
- Indexing: concepts/index_ivfpq.md
|
||||
- Storage: concepts/storage.md
|
||||
- Data management: concepts/data_management.md
|
||||
- 🔨 Guides:
|
||||
- Working with tables: guides/tables.md
|
||||
- Building an ANN index: ann_indexes.md
|
||||
- Vector Search: search.md
|
||||
- Full-text search: fts.md
|
||||
- Hybrid search:
|
||||
- Overview: hybrid_search/hybrid_search.md
|
||||
- Comparing Rerankers: hybrid_search/eval.md
|
||||
- Airbnb financial data example: notebooks/hybrid_search.ipynb
|
||||
- Filtering: sql.md
|
||||
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
|
||||
- Configuring Storage: guides/storage.md
|
||||
- Sync -> Async Migration Guide: migration.md
|
||||
- 🧬 Managing embeddings:
|
||||
- Overview: embeddings/index.md
|
||||
- Embedding functions: embeddings/embedding_functions.md
|
||||
- Available models: embeddings/default_embedding_functions.md
|
||||
- User-defined embedding functions: embeddings/custom_embedding_function.md
|
||||
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
|
||||
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
|
||||
- 🔌 Integrations:
|
||||
- Tools and data formats: integrations/index.md
|
||||
- Pandas and PyArrow: python/pandas_and_pyarrow.md
|
||||
- Polars: python/polars_arrow.md
|
||||
- DuckDB: python/duckdb.md
|
||||
- LangChain 🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
|
||||
- LangChain JS/TS 🔗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
|
||||
- LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
|
||||
- Pydantic: python/pydantic.md
|
||||
- Voxel51: integrations/voxel51.md
|
||||
- PromptTools: integrations/prompttools.md
|
||||
- 🎯 Examples:
|
||||
- Overview: examples/index.md
|
||||
- 🐍 Python:
|
||||
- Overview: examples/examples_python.md
|
||||
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
|
||||
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
|
||||
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
|
||||
- Example - Calculate CLIP Embeddings with Roboflow Inference: examples/image_embeddings_roboflow.md
|
||||
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
|
||||
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
|
||||
- 👾 JavaScript:
|
||||
- Overview: examples/examples_js.md
|
||||
- Serverless Website Chatbot: examples/serverless_website_chatbot.md
|
||||
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
|
||||
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
|
||||
- 🦀 Rust:
|
||||
- Overview: examples/examples_rust.md
|
||||
- 🔧 CLI & Config: cli_config.md
|
||||
- 💭 FAQs: faq.md
|
||||
- ⚙️ API reference:
|
||||
- 🐍 Python: python/python.md
|
||||
- 👾 JavaScript: javascript/modules.md
|
||||
- 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
|
||||
- ☁️ LanceDB Cloud:
|
||||
- Overview: cloud/index.md
|
||||
- API reference:
|
||||
- 🐍 Python: python/saas-python.md
|
||||
- 👾 JavaScript: javascript/saas-modules.md
|
||||
|
||||
- Quick start: basic.md
|
||||
- Concepts:
|
||||
- Vector search: concepts/vector_search.md
|
||||
- Indexing: concepts/index_ivfpq.md
|
||||
- Storage: concepts/storage.md
|
||||
- Data management: concepts/data_management.md
|
||||
- Guides:
|
||||
- Working with tables: guides/tables.md
|
||||
- Building an ANN index: ann_indexes.md
|
||||
- Vector Search: search.md
|
||||
- Full-text search: fts.md
|
||||
- Hybrid search:
|
||||
- Overview: hybrid_search/hybrid_search.md
|
||||
- Comparing Rerankers: hybrid_search/eval.md
|
||||
- Airbnb financial data example: notebooks/hybrid_search.ipynb
|
||||
- Filtering: sql.md
|
||||
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
|
||||
- Configuring Storage: guides/storage.md
|
||||
- Sync -> Async Migration Guide: migration.md
|
||||
- Managing Embeddings:
|
||||
- Overview: embeddings/index.md
|
||||
- Embedding functions: embeddings/embedding_functions.md
|
||||
- Available models: embeddings/default_embedding_functions.md
|
||||
- User-defined embedding functions: embeddings/custom_embedding_function.md
|
||||
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
|
||||
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
|
||||
- Integrations:
|
||||
- Overview: integrations/index.md
|
||||
- Pandas and PyArrow: python/pandas_and_pyarrow.md
|
||||
- Polars: python/polars_arrow.md
|
||||
- DuckDB: python/duckdb.md
|
||||
- LangChain 🦜️🔗↗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
|
||||
- LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
|
||||
- LlamaIndex 🦙↗: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
|
||||
- Pydantic: python/pydantic.md
|
||||
- Voxel51: integrations/voxel51.md
|
||||
- PromptTools: integrations/prompttools.md
|
||||
- Examples:
|
||||
- examples/index.md
|
||||
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
|
||||
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
|
||||
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
|
||||
- Example - Calculate CLIP Embeddings with Roboflow Inference: examples/image_embeddings_roboflow.md
|
||||
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
|
||||
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
|
||||
- 👾 JavaScript:
|
||||
- Overview: examples/examples_js.md
|
||||
- Serverless Website Chatbot: examples/serverless_website_chatbot.md
|
||||
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
|
||||
- YouTube Transcript Search (JS): examples/youtube_transcript_bot_with_nodejs.md
|
||||
- Serverless Chatbot from any website: examples/serverless_website_chatbot.md
|
||||
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
|
||||
- 🦀 Rust:
|
||||
- Overview: examples/examples_rust.md
|
||||
- 🔧 CLI & Config: cli_config.md
|
||||
- 💭 FAQs: faq.md
|
||||
- ⚙️ API reference:
|
||||
- 🐍 Python: python/python.md
|
||||
- 👾 JavaScript: javascript/modules.md
|
||||
- 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
|
||||
- ☁️ LanceDB Cloud:
|
||||
- Overview: cloud/index.md
|
||||
- API reference:
|
||||
- 🐍 Python: python/saas-python.md
|
||||
- 👾 JavaScript: javascript/saas-modules.md
|
||||
|
||||
|
||||
- Quick start: basic.md
|
||||
- Concepts:
|
||||
- Vector search: concepts/vector_search.md
|
||||
- Indexing: concepts/index_ivfpq.md
|
||||
- Storage: concepts/storage.md
|
||||
- Data management: concepts/data_management.md
|
||||
- Guides:
|
||||
- Working with tables: guides/tables.md
|
||||
- Building an ANN index: ann_indexes.md
|
||||
- Vector Search: search.md
|
||||
- Full-text search: fts.md
|
||||
- Hybrid search:
|
||||
- Overview: hybrid_search/hybrid_search.md
|
||||
- Comparing Rerankers: hybrid_search/eval.md
|
||||
- Airbnb financial data example: notebooks/hybrid_search.ipynb
|
||||
- Filtering: sql.md
|
||||
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
|
||||
- Configuring Storage: guides/storage.md
|
||||
- Managing Embeddings:
|
||||
- Overview: embeddings/index.md
|
||||
- Embedding functions: embeddings/embedding_functions.md
|
||||
- Available models: embeddings/default_embedding_functions.md
|
||||
- User-defined embedding functions: embeddings/custom_embedding_function.md
|
||||
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
|
||||
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
|
||||
- Integrations:
|
||||
- Overview: integrations/index.md
|
||||
- Pandas and PyArrow: python/pandas_and_pyarrow.md
|
||||
- Polars: python/polars_arrow.md
|
||||
- DuckDB : python/duckdb.md
|
||||
- LangChain 🦜️🔗↗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
|
||||
- LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
|
||||
- LlamaIndex 🦙↗: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
|
||||
- Pydantic: python/pydantic.md
|
||||
- Voxel51: integrations/voxel51.md
|
||||
- PromptTools: integrations/prompttools.md
|
||||
- Examples:
|
||||
- examples/index.md
|
||||
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
|
||||
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
|
||||
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
|
||||
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
|
||||
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
|
||||
- YouTube Transcript Search (JS): examples/youtube_transcript_bot_with_nodejs.md
|
||||
- Serverless Chatbot from any website: examples/serverless_website_chatbot.md
|
||||
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
|
||||
- API reference:
|
||||
- Overview: api_reference.md
|
||||
- Python: python/python.md
|
||||
- Javascript: javascript/modules.md
|
||||
- Rust: https://docs.rs/lancedb/latest/lancedb/index.html
|
||||
- LanceDB Cloud:
|
||||
- Overview: cloud/index.md
|
||||
- API reference:
|
||||
- 🐍 Python: python/saas-python.md
|
||||
- 👾 JavaScript: javascript/saas-modules.md
|
||||
- API reference:
|
||||
- Overview: api_reference.md
|
||||
- Python: python/python.md
|
||||
- Javascript: javascript/modules.md
|
||||
- Rust: https://docs.rs/lancedb/latest/lancedb/index.html
|
||||
- LanceDB Cloud:
|
||||
- Overview: cloud/index.md
|
||||
- API reference:
|
||||
- 🐍 Python: python/saas-python.md
|
||||
- 👾 JavaScript: javascript/saas-modules.md
|
||||
|
||||
extra_css:
|
||||
- styles/global.css
|
||||
|
||||
@@ -48,11 +48,20 @@
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
uri = "data/sample-lancedb"
|
||||
db = lancedb.connect(uri)
|
||||
```
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_basic.py:imports"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:connect"
|
||||
|
||||
--8<-- "python/python/tests/docs/test_basic.py:connect_async"
|
||||
```
|
||||
|
||||
!!! note "Asynchronous Python API"
|
||||
|
||||
The asynchronous Python API is new and has some slight differences compared
|
||||
to the synchronous API. Feel free to start using the asynchronous version.
|
||||
Once all features have migrated we will start to move the synchronous API to
|
||||
use the same syntax as the asynchronous API. To help with this migration we
|
||||
have created a [migration guide](migration.md) detailing the differences.
|
||||
|
||||
=== "Typescript"
|
||||
|
||||
@@ -82,15 +91,14 @@ If you need a reminder of the uri, you can call `db.uri()`.
|
||||
### Create a table from initial data
|
||||
|
||||
If you have data to insert into the table at creation time, you can simultaneously create a
|
||||
table and insert the data into it. The schema of the data will be used as the schema of the
|
||||
table and insert the data into it. The schema of the data will be used as the schema of the
|
||||
table.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
tbl = db.create_table("my_table",
|
||||
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_table"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_table_async"
|
||||
```
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
@@ -100,10 +108,8 @@ table.
|
||||
You can also pass in a pandas DataFrame directly:
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
df = pd.DataFrame([{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
tbl = db.create_table("table_from_df", data=df)
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_table_pandas"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_table_async_pandas"
|
||||
```
|
||||
|
||||
=== "Typescript"
|
||||
@@ -138,15 +144,14 @@ table.
|
||||
|
||||
Sometimes you may not have the data to insert into the table at creation time.
|
||||
In this case, you can create an empty table and specify the schema, so that you can add
|
||||
data to the table at a later time (as long as it conforms to the schema). This is
|
||||
data to the table at a later time (as long as it conforms to the schema). This is
|
||||
similar to a `CREATE TABLE` statement in SQL.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
import pyarrow as pa
|
||||
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
|
||||
tbl = db.create_table("empty_table", schema=schema)
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_empty_table"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_empty_table_async"
|
||||
```
|
||||
|
||||
=== "Typescript"
|
||||
@@ -168,7 +173,8 @@ Once created, you can open a table as follows:
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
tbl = db.open_table("my_table")
|
||||
--8<-- "python/python/tests/docs/test_basic.py:open_table"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:open_table_async"
|
||||
```
|
||||
|
||||
=== "Typescript"
|
||||
@@ -188,7 +194,8 @@ If you forget the name of your table, you can always get a listing of all table
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
print(db.table_names())
|
||||
--8<-- "python/python/tests/docs/test_basic.py:table_names"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:table_names_async"
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
@@ -210,15 +217,8 @@ After a table has been created, you can always add more data to it as follows:
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
|
||||
# Option 1: Add a list of dicts to a table
|
||||
data = [{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
||||
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}]
|
||||
tbl.add(data)
|
||||
|
||||
# Option 2: Add a pandas DataFrame to a table
|
||||
df = pd.DataFrame(data)
|
||||
tbl.add(data)
|
||||
--8<-- "python/python/tests/docs/test_basic.py:add_data"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:add_data_async"
|
||||
```
|
||||
|
||||
=== "Typescript"
|
||||
@@ -240,7 +240,8 @@ Once you've embedded the query, you can find its nearest neighbors as follows:
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
tbl.search([100, 100]).limit(2).to_pandas()
|
||||
--8<-- "python/python/tests/docs/test_basic.py:vector_search"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:vector_search_async"
|
||||
```
|
||||
|
||||
This returns a pandas DataFrame with the results.
|
||||
@@ -274,7 +275,8 @@ LanceDB allows you to create an ANN index on a table as follows:
|
||||
=== "Python"
|
||||
|
||||
```py
|
||||
tbl.create_index()
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_index"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:create_index_async"
|
||||
```
|
||||
|
||||
=== "Typescript"
|
||||
@@ -286,15 +288,15 @@ LanceDB allows you to create an ANN index on a table as follows:
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
--8<-- "rust/lancedb/examples/simple.rs:create_index"
|
||||
--8<-- "rust/lancedb/examples/simple.rs:create_index"
|
||||
```
|
||||
|
||||
!!! note "Why do I need to create an index manually?"
|
||||
LanceDB does not automatically create the ANN index for two reasons. The first is that it's optimized
|
||||
for really fast retrievals via a disk-based index, and the second is that data and query workloads can
|
||||
be very diverse, so there's no one-size-fits-all index configuration. LanceDB provides many parameters
|
||||
to fine-tune index size, query latency and accuracy. See the section on
|
||||
[ANN indexes](ann_indexes.md) for more details.
|
||||
LanceDB does not automatically create the ANN index for two reasons. The first is that it's optimized
|
||||
for really fast retrievals via a disk-based index, and the second is that data and query workloads can
|
||||
be very diverse, so there's no one-size-fits-all index configuration. LanceDB provides many parameters
|
||||
to fine-tune index size, query latency and accuracy. See the section on
|
||||
[ANN indexes](ann_indexes.md) for more details.
|
||||
|
||||
## Delete rows from a table
|
||||
|
||||
@@ -305,7 +307,8 @@ This can delete any number of rows that match the filter.
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
tbl.delete('item = "fizz"')
|
||||
--8<-- "python/python/tests/docs/test_basic.py:delete_rows"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:delete_rows_async"
|
||||
```
|
||||
|
||||
=== "Typescript"
|
||||
@@ -322,7 +325,7 @@ This can delete any number of rows that match the filter.
|
||||
|
||||
The deletion predicate is a SQL expression that supports the same expressions
|
||||
as the `where()` clause (`only_if()` in Rust) on a search. They can be as
|
||||
simple or complex as needed. To see what expressions are supported, see the
|
||||
simple or complex as needed. To see what expressions are supported, see the
|
||||
[SQL filters](sql.md) section.
|
||||
|
||||
=== "Python"
|
||||
@@ -344,7 +347,8 @@ Use the `drop_table()` method on the database to remove a table.
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
db.drop_table("my_table")
|
||||
--8<-- "python/python/tests/docs/test_basic.py:drop_table"
|
||||
--8<-- "python/python/tests/docs/test_basic.py:drop_table_async"
|
||||
```
|
||||
|
||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||
|
||||
@@ -19,27 +19,163 @@ Allows you to set parameters when registering a `sentence-transformers` object.
|
||||
| `normalize` | `bool` | `True` | Whether to normalize the input text before feeding it to the model |
|
||||
|
||||
|
||||
```python
|
||||
db = lancedb.connect("/tmp/db")
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
func = registry.get("sentence-transformers").create(device="cpu")
|
||||
??? "Check out available sentence-transformer models here!"
|
||||
```markdown
|
||||
- sentence-transformers/all-MiniLM-L12-v2
|
||||
- sentence-transformers/paraphrase-mpnet-base-v2
|
||||
- sentence-transformers/gtr-t5-base
|
||||
- sentence-transformers/LaBSE
|
||||
- sentence-transformers/all-MiniLM-L6-v2
|
||||
- sentence-transformers/bert-base-nli-max-tokens
|
||||
- sentence-transformers/bert-base-nli-mean-tokens
|
||||
- sentence-transformers/bert-base-nli-stsb-mean-tokens
|
||||
- sentence-transformers/bert-base-wikipedia-sections-mean-tokens
|
||||
- sentence-transformers/bert-large-nli-cls-token
|
||||
- sentence-transformers/bert-large-nli-max-tokens
|
||||
- sentence-transformers/bert-large-nli-mean-tokens
|
||||
- sentence-transformers/bert-large-nli-stsb-mean-tokens
|
||||
- sentence-transformers/distilbert-base-nli-max-tokens
|
||||
- sentence-transformers/distilbert-base-nli-mean-tokens
|
||||
- sentence-transformers/distilbert-base-nli-stsb-mean-tokens
|
||||
- sentence-transformers/distilroberta-base-msmarco-v1
|
||||
- sentence-transformers/distilroberta-base-msmarco-v2
|
||||
- sentence-transformers/nli-bert-base-cls-pooling
|
||||
- sentence-transformers/nli-bert-base-max-pooling
|
||||
- sentence-transformers/nli-bert-base
|
||||
- sentence-transformers/nli-bert-large-cls-pooling
|
||||
- sentence-transformers/nli-bert-large-max-pooling
|
||||
- sentence-transformers/nli-bert-large
|
||||
- sentence-transformers/nli-distilbert-base-max-pooling
|
||||
- sentence-transformers/nli-distilbert-base
|
||||
- sentence-transformers/nli-roberta-base
|
||||
- sentence-transformers/nli-roberta-large
|
||||
- sentence-transformers/roberta-base-nli-mean-tokens
|
||||
- sentence-transformers/roberta-base-nli-stsb-mean-tokens
|
||||
- sentence-transformers/roberta-large-nli-mean-tokens
|
||||
- sentence-transformers/roberta-large-nli-stsb-mean-tokens
|
||||
- sentence-transformers/stsb-bert-base
|
||||
- sentence-transformers/stsb-bert-large
|
||||
- sentence-transformers/stsb-distilbert-base
|
||||
- sentence-transformers/stsb-roberta-base
|
||||
- sentence-transformers/stsb-roberta-large
|
||||
- sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens
|
||||
- sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens
|
||||
- sentence-transformers/xlm-r-base-en-ko-nli-ststb
|
||||
- sentence-transformers/xlm-r-bert-base-nli-mean-tokens
|
||||
- sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens
|
||||
- sentence-transformers/xlm-r-large-en-ko-nli-ststb
|
||||
- sentence-transformers/bert-base-nli-cls-token
|
||||
- sentence-transformers/all-distilroberta-v1
|
||||
- sentence-transformers/multi-qa-MiniLM-L6-dot-v1
|
||||
- sentence-transformers/multi-qa-distilbert-cos-v1
|
||||
- sentence-transformers/multi-qa-distilbert-dot-v1
|
||||
- sentence-transformers/multi-qa-mpnet-base-cos-v1
|
||||
- sentence-transformers/multi-qa-mpnet-base-dot-v1
|
||||
- sentence-transformers/nli-distilroberta-base-v2
|
||||
- sentence-transformers/all-MiniLM-L6-v1
|
||||
- sentence-transformers/all-mpnet-base-v1
|
||||
- sentence-transformers/all-mpnet-base-v2
|
||||
- sentence-transformers/all-roberta-large-v1
|
||||
- sentence-transformers/allenai-specter
|
||||
- sentence-transformers/average_word_embeddings_glove.6B.300d
|
||||
- sentence-transformers/average_word_embeddings_glove.840B.300d
|
||||
- sentence-transformers/average_word_embeddings_komninos
|
||||
- sentence-transformers/average_word_embeddings_levy_dependency
|
||||
- sentence-transformers/clip-ViT-B-32-multilingual-v1
|
||||
- sentence-transformers/clip-ViT-B-32
|
||||
- sentence-transformers/distilbert-base-nli-stsb-quora-ranking
|
||||
- sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking
|
||||
- sentence-transformers/distilroberta-base-paraphrase-v1
|
||||
- sentence-transformers/distiluse-base-multilingual-cased-v1
|
||||
- sentence-transformers/distiluse-base-multilingual-cased-v2
|
||||
- sentence-transformers/distiluse-base-multilingual-cased
|
||||
- sentence-transformers/facebook-dpr-ctx_encoder-multiset-base
|
||||
- sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base
|
||||
- sentence-transformers/facebook-dpr-question_encoder-multiset-base
|
||||
- sentence-transformers/facebook-dpr-question_encoder-single-nq-base
|
||||
- sentence-transformers/gtr-t5-large
|
||||
- sentence-transformers/gtr-t5-xl
|
||||
- sentence-transformers/gtr-t5-xxl
|
||||
- sentence-transformers/msmarco-MiniLM-L-12-v3
|
||||
- sentence-transformers/msmarco-MiniLM-L-6-v3
|
||||
- sentence-transformers/msmarco-MiniLM-L12-cos-v5
|
||||
- sentence-transformers/msmarco-MiniLM-L6-cos-v5
|
||||
- sentence-transformers/msmarco-bert-base-dot-v5
|
||||
- sentence-transformers/msmarco-bert-co-condensor
|
||||
- sentence-transformers/msmarco-distilbert-base-dot-prod-v3
|
||||
- sentence-transformers/msmarco-distilbert-base-tas-b
|
||||
- sentence-transformers/msmarco-distilbert-base-v2
|
||||
- sentence-transformers/msmarco-distilbert-base-v3
|
||||
- sentence-transformers/msmarco-distilbert-base-v4
|
||||
- sentence-transformers/msmarco-distilbert-cos-v5
|
||||
- sentence-transformers/msmarco-distilbert-dot-v5
|
||||
- sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned
|
||||
- sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch
|
||||
- sentence-transformers/msmarco-distilroberta-base-v2
|
||||
- sentence-transformers/msmarco-roberta-base-ance-firstp
|
||||
- sentence-transformers/msmarco-roberta-base-v2
|
||||
- sentence-transformers/msmarco-roberta-base-v3
|
||||
- sentence-transformers/multi-qa-MiniLM-L6-cos-v1
|
||||
- sentence-transformers/nli-mpnet-base-v2
|
||||
- sentence-transformers/nli-roberta-base-v2
|
||||
- sentence-transformers/nq-distilbert-base-v1
|
||||
- sentence-transformers/paraphrase-MiniLM-L12-v2
|
||||
- sentence-transformers/paraphrase-MiniLM-L3-v2
|
||||
- sentence-transformers/paraphrase-MiniLM-L6-v2
|
||||
- sentence-transformers/paraphrase-TinyBERT-L6-v2
|
||||
- sentence-transformers/paraphrase-albert-base-v2
|
||||
- sentence-transformers/paraphrase-albert-small-v2
|
||||
- sentence-transformers/paraphrase-distilroberta-base-v1
|
||||
- sentence-transformers/paraphrase-distilroberta-base-v2
|
||||
- sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
|
||||
- sentence-transformers/paraphrase-multilingual-mpnet-base-v2
|
||||
- sentence-transformers/paraphrase-xlm-r-multilingual-v1
|
||||
- sentence-transformers/quora-distilbert-base
|
||||
- sentence-transformers/quora-distilbert-multilingual
|
||||
- sentence-transformers/sentence-t5-base
|
||||
- sentence-transformers/sentence-t5-large
|
||||
- sentence-transformers/sentence-t5-xxl
|
||||
- sentence-transformers/sentence-t5-xl
|
||||
- sentence-transformers/stsb-distilroberta-base-v2
|
||||
- sentence-transformers/stsb-mpnet-base-v2
|
||||
- sentence-transformers/stsb-roberta-base-v2
|
||||
- sentence-transformers/stsb-xlm-r-multilingual
|
||||
- sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1
|
||||
- sentence-transformers/clip-ViT-L-14
|
||||
- sentence-transformers/clip-ViT-B-16
|
||||
- sentence-transformers/use-cmlm-multilingual
|
||||
- sentence-transformers/all-MiniLM-L12-v1
|
||||
```
|
||||
|
||||
class Words(LanceModel):
|
||||
text: str = func.SourceField()
|
||||
vector: Vector(func.ndims()) = func.VectorField()
|
||||
!!! info
|
||||
You can also load many other model architectures from the library, for example models from BAAI, Nomic, Salesforce Research, and others.
|
||||
See this HF hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers).
|
||||
|
||||
table = db.create_table("words", schema=Words)
|
||||
table.add(
|
||||
[
|
||||
{"text": "hello world"},
|
||||
{"text": "goodbye world"}
|
||||
]
|
||||
)
|
||||
!!! note "BAAI Embeddings example"
|
||||
Here is an example that uses a BAAI embedding model, one of the [supported models](https://huggingface.co/models?library=sentence-transformers) on the HuggingFace Hub:
|
||||
```python
|
||||
db = lancedb.connect("/tmp/db")
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
model = registry.get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu")
|
||||
|
||||
class Words(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
vector: Vector(model.ndims()) = model.VectorField()
|
||||
|
||||
table = db.create_table("words", schema=Words)
|
||||
table.add(
|
||||
[
|
||||
{"text": "hello world"},
|
||||
{"text": "goodbye world"}
|
||||
]
|
||||
)
|
||||
|
||||
query = "greetings"
|
||||
actual = table.search(query).limit(1).to_pydantic(Words)[0]
|
||||
print(actual.text)
|
||||
```
|
||||
Visit the sentence-transformers [HuggingFace Hub](https://huggingface.co/sentence-transformers) page for more information on the available models.
|
||||
|
||||
query = "greetings"
|
||||
actual = table.search(query).limit(1).to_pydantic(Words)[0]
|
||||
print(actual.text)
|
||||
```
|
||||
|
||||
### OpenAI embeddings
|
||||
LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances:
|
||||
|
||||
150
docs/src/eval/bench_fine_tuned_hybrid.py
Normal file
150
docs/src/eval/bench_fine_tuned_hybrid.py
Normal file
@@ -0,0 +1,150 @@
|
||||
import json
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import os
|
||||
import requests
|
||||
from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext
|
||||
from llama_index.core.schema import TextNode
|
||||
from llama_index.vector_stores.lancedb import LanceDBVectorStore
|
||||
from lancedb.rerankers import CrossEncoderReranker, ColbertReranker, CohereReranker, LinearCombinationReranker
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.embeddings.openai import OpenAIEmbedding
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.embeddings import get_registry
|
||||
from lancedb.embeddings.fine_tuner.dataset import QADataset, TextChunk, DEFAULT_PROMPT_TMPL
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from lancedb.embeddings.fine_tuner.llm import Openai
|
||||
|
||||
import time
|
||||
import lancedb
|
||||
import wandb
|
||||
from pydantic import BaseModel, root_validator
|
||||
from typing import Optional
|
||||
|
||||
# Locations of the pre-generated QA datasets (JSON) used by this benchmark.
TRAIN_DATASET_FPATH = './data/train_dataset.json'
VAL_DATASET_FPATH = './data/val_dataset.json'

# The datasets are only read here, so open them read-only: 'r+' additionally
# requires write permission and fails on read-only checkouts. The encoding is
# pinned because JSON is UTF-8 while open()'s default depends on the locale.
with open(TRAIN_DATASET_FPATH, 'r', encoding='utf-8') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r', encoding='utf-8') as f:
    val_dataset = json.load(f)
|
||||
|
||||
def train_embedding_model(epoch):
    """Fine-tune the ``BAAI/bge-small-en-v1.5`` sentence-transformers model.

    Builds (or loads from the ``qa_dataset_uber`` cache) a QA dataset from the
    Uber 2021 10-K PDF and passes it to ``model.finetune``, which is asked to
    write the result to ``model_airbnb`` and to log to Weights & Biases.

    Args:
        epoch: Number of training epochs forwarded to ``model.finetune``.
    """

    def download_test_files(url):
        """Download ``url`` into the current directory unless it is already there.

        Returns:
            A one-element list holding the local file name.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        filename = os.path.basename(url)
        if not os.path.exists(filename):
            print(f"Downloading {url} to {filename}")
            r = requests.get(url)
            # Without this check an HTTP error page would be saved under the
            # PDF's name and, because of the exists() test above, never be
            # downloaded again.
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        return [filename]

    def get_dataset(url, name):
        """Return the QA dataset cached at ``name``, generating it from ``url`` if missing."""
        # Check the cache first so a cached dataset does not trigger a download
        # and a full PDF parse whose result would be thrown away.
        if os.path.exists(name):
            return QADataset.load(name)

        reader = SimpleDirectoryReader(input_files=download_test_files(url))
        docs = reader.load_data()

        parser = SentenceSplitter()
        nodes = parser.get_nodes_from_documents(docs)

        llm = Openai()

        # convert Llama-index TextNode to TextChunk
        chunks = [TextChunk.from_llama_index_node(node) for node in nodes]

        ds = QADataset.from_llm(chunks, llm, num_questions_per_chunk=2)
        ds.save(name)
        return ds

    train_url = 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf'
    ds = get_dataset(train_url, "qa_dataset_uber")

    model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5")
    # NOTE(review): the output path and run name mention airbnb/lyft although the
    # training data is the Uber filing -- confirm the labels are intentional.
    model.finetune(trainset=ds, valset=None, path="model_airbnb", epochs=epoch, log_wandb=True, run_name="lyft_finetune")
|
||||
|
||||
|
||||
def evaluate(
    dataset,
    embed_model,
    reranker=None,
    top_k=5,
    verbose=False,
):
    """Measure retrieval hits of ``embed_model`` on a QA dataset.

    Args:
        dataset: Mapping with the keys ``corpus`` (id -> text), ``queries``
            (id -> question) and ``relevant_docs`` (query id -> list of ids).
        embed_model: Embedding model; must offer ``get_query_embedding``.
        reranker: Optional LanceDB reranker. When given, the search is issued
            with both the query vector and the query text and then reranked;
            otherwise a plain vector search is run.
        top_k: Number of results requested per query.
        verbose: Accepted for interface compatibility; not read by this function.

    Returns:
        One dict per successfully searched query with the keys ``is_hit``,
        ``retrieved``, ``expected`` and ``query``. Queries whose search raised
        are reported on stdout and left out.
    """
    corpus, queries, relevant_docs = (
        dataset['corpus'],
        dataset['queries'],
        dataset['relevant_docs'],
    )

    vector_store = LanceDBVectorStore(uri="/tmp/lancedb")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=doc_id, text=doc_text) for doc_id, doc_text in corpus.items()]
    # The index object is not used afterwards; presumably building it is what
    # writes the embedded nodes into the LanceDB table opened below.
    index = VectorStoreIndex(
        nodes,
        service_context=service_context,
        show_progress=True,
        storage_context=storage_context,
    )
    tbl = vector_store.connection.open_table(vector_store.table_name)
    tbl.create_fts_index("text", replace=True)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        query_vector = embed_model.get_query_embedding(query)
        try:
            if reranker is not None:
                search = tbl.search((query_vector, query)).rerank(reranker=reranker)
            else:
                search = tbl.search(query_vector)
            rs = search.limit(top_k).to_pandas()
        except Exception as e:
            # Best effort: report the failing query and move on to the next one.
            print(f'Error with query: {query_id} {e}')
            continue

        retrieved_ids = rs['id'].tolist()[:top_k]
        expected_id = relevant_docs[query_id][0]  # assume 1 relevant doc

        # Show the first evaluated query as a quick sanity check.
        if not eval_results:
            print(f"Query: {query}")
            print(f"Expected: {expected_id}")
            print(f"Retrieved: {retrieved_ids}")

        eval_results.append({
            'is_hit': expected_id in retrieved_ids,
            'retrieved': retrieved_ids,
            'expected': expected_id,
            'query': query_id,
        })
    return eval_results
|
||||
|
||||
if __name__ == '__main__':
    train_embedding_model(4)

    # Retrieval strategies to compare; None means vector search without reranking.
    rerankers = {
        "Vector Search": None,
        "Cohere": CohereReranker(),
        "Cross Encoder": CrossEncoderReranker(),
        "Colbert": ColbertReranker(),
        "linear": LinearCombinationReranker(),
    }
    top_ks = [3]
    for top_k in top_ks:
        for name, reranker in rerankers.items():
            # Use HuggingFaceEmbedding("./model_airbnb") here instead to
            # benchmark the model written by train_embedding_model().
            embed_model = OpenAIEmbedding()
            wandb.init(project="Reranker-based", name=name)
            try:
                val_eval_results = evaluate(val_dataset, embed_model, reranker=reranker, top_k=top_k)
                if not val_eval_results:
                    # Every query failed: there is no 'is_hit' column to average.
                    print(f'No evaluation results for {name}; skipping hit rate')
                    continue
                df = pd.DataFrame(val_eval_results)

                hit_rate = df['is_hit'].mean()
                print(f'Hit rate: {hit_rate:.2f}')
                wandb.log({f"openai_base_hit_rate_@{top_k}": hit_rate})
            finally:
                # Always close the run, otherwise a failure leaves it open and
                # the next wandb.init() would collide with it.
                wandb.finish()
|
||||
|
||||
|
||||
71
docs/src/eval/test_fine_tune_from_llm.py
Normal file
71
docs/src/eval/test_fine_tune_from_llm.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import os
|
||||
import json
|
||||
import lancedb
|
||||
import pandas as pd
|
||||
|
||||
from lancedb.embeddings.fine_tuner.llm import Openai
|
||||
from lancedb.embeddings.fine_tuner.dataset import QADataset, TextChunk
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.core.schema import MetadataMode
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
|
||||
# 10-K PDFs from the llama_index example data; the QA datasets below are generated from them.
test_url = 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/lyft_2021.pdf'
|
||||
train_url = 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf'
|
||||
def download_test_files(url):
    """Download ``url`` into the current directory unless it is already there.

    Args:
        url: Address of the file; its base name becomes the local file name.

    Returns:
        A one-element list holding the local file name.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os
    import requests

    # download to cwd
    filename = os.path.basename(url)
    if not os.path.exists(filename):
        print(f"Downloading {url} to {filename}")
        r = requests.get(url)
        # Without this check an HTTP error page would be saved under the PDF's
        # name and, because of the exists() test above, never be replaced.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
    return [filename]
|
||||
|
||||
def get_dataset(url, name):
    """Return the QA dataset cached at ``name``, generating it from ``url`` if missing.

    Args:
        url: Document to download and split into chunks when no cache exists.
        name: Path the dataset is loaded from, or saved to after generation.

    Returns:
        The loaded or newly generated ``QADataset``.
    """
    # Check the cache first so a cached dataset does not trigger a download
    # and a full parse of the document whose result would be thrown away.
    if os.path.exists(name):
        return QADataset.load(name)

    reader = SimpleDirectoryReader(input_files=download_test_files(url))
    docs = reader.load_data()

    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(docs)

    llm = Openai()

    # convert Llama-index TextNode to TextChunk
    chunks = [TextChunk.from_llama_index_node(node) for node in nodes]

    ds = QADataset.from_llm(chunks, llm)
    ds.save(name)
    return ds
|
||||
|
||||
|
||||
|
||||
# NOTE(review): the training set is built from test_url and the validation set
# from train_url -- confirm the naming is intentional.
trainset = get_dataset(test_url, "qa_dataset_1")
valset = get_dataset(train_url, "valset")

# Fine-tune the default sentence-transformers model and store it on disk.
model = get_registry().get("sentence-transformers").create()
model.finetune(trainset=trainset, valset=valset, path="model_finetuned_1", epochs=4)

# Models to compare: untuned baseline, the fine-tuned checkpoint, and OpenAI.
base = get_registry().get("sentence-transformers").create()
tuned = get_registry().get("sentence-transformers").create(name="./model_finetuned_1")
openai = get_registry().get("openai").create(name="text-embedding-3-large")

rs1 = base.evaluate(valset, path="val_res")
rs2 = tuned.evaluate(valset, path="val_res")
rs3 = openai.evaluate(valset)

print("openai-embedding-v3 hit-rate  - ", pd.DataFrame(rs3)["is_hit"].mean())
print("fine-tuned hit-rate  - ", pd.DataFrame(rs2)["is_hit"].mean())
print("Base model hit-rate - ", pd.DataFrame(rs1)["is_hit"].mean())
|
||||
|
||||
119
docs/src/eval/test_fine_tune_from_responses.py
Normal file
119
docs/src/eval/test_fine_tune_from_responses.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import uuid
|
||||
import lancedb
|
||||
import pandas as pd
|
||||
|
||||
from tqdm import tqdm
|
||||
from lancedb.embeddings.fine_tuner.llm import Openai
|
||||
from lancedb.embeddings.fine_tuner.dataset import QADataset, TextChunk, DEFAULT_PROMPT_TMPL
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.core.schema import MetadataMode
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
|
||||
|
||||
# 10-K PDFs from the llama_index example data. NOTE(review): only train_url appears to be used in this script -- verify whether test_url is still needed.
test_url = 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/lyft_2021.pdf'
|
||||
train_url = 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf'
|
||||
def download_test_files(url):
    """Download ``url`` into the current directory unless it is already there.

    Args:
        url: Address of the file; its base name becomes the local file name.

    Returns:
        A one-element list holding the local file name.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os
    import requests

    # download to cwd
    filename = os.path.basename(url)
    if not os.path.exists(filename):
        print(f"Downloading {url} to {filename}")
        r = requests.get(url)
        # Without this check an HTTP error page would be saved under the PDF's
        # name and, because of the exists() test above, never be replaced.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
    return [filename]
|
||||
|
||||
|
||||
def get_node(url):
    """Fetch the document at ``url`` and split it into llama-index nodes.

    Args:
        url: Document address, handed to ``download_test_files``.

    Returns:
        The nodes produced by a default ``SentenceSplitter``.
    """
    local_files = download_test_files(url)
    documents = SimpleDirectoryReader(input_files=local_files).load_data()
    return SentenceSplitter().get_nodes_from_documents(documents)
|
||||
def get_dataset(url, name):
    """Return the QA dataset cached at ``name``, generating it from ``url`` if missing.

    Args:
        url: Document to download and split into chunks when no cache exists.
        name: Path the dataset is loaded from, or saved to after generation.

    Returns:
        The loaded or newly generated ``QADataset``.
    """
    # Check the cache first so a cached dataset does not trigger a download
    # and a full parse of the document whose result would be thrown away.
    if os.path.exists(name):
        return QADataset.load(name)

    # Reuse get_node() instead of repeating its reader/splitter steps.
    nodes = get_node(url)
    llm = Openai()

    # convert Llama-index TextNode to TextChunk
    chunks = [TextChunk.from_llama_index_node(node) for node in nodes]

    ds = QADataset.from_llm(chunks, llm)
    ds.save(name)
    return ds
|
||||
|
||||
# Split the training document into nodes that will be indexed for retrieval.
nodes = get_node(train_url)

db = lancedb.connect("~/lancedb/fine-tuning")
model = get_registry().get("openai").create()


class Schema(LanceModel):
    # Node identifier; later used as the "relevant document" id of a question.
    id: str
    # Source text; declared as the embedding function's source field.
    text: str = model.SourceField()
    # Embedding column sized to the model's dimensionality.
    vector: Vector(model.ndims()) = model.VectorField()


# (Re)create the table and insert one row per node.
retriever = db.create_table("fine-tuning", schema=Schema, mode="overwrite")
pylist = [dict(id=str(node.node_id), text=node.text) for node in nodes]
retriever.add(pylist)
|
||||
|
||||
|
||||
|
||||
# Build (or load from cache) a QA dataset whose "relevant document" for every
# generated question is the top hit returned by the retriever table.
ds_name = "response_data"
if os.path.exists(ds_name):
    ds = QADataset.load(ds_name)
else:
    # Generate questions
    llm = Openai()
    text_chunks = [TextChunk.from_llama_index_node(node) for node in nodes]

    queries = {}
    relevant_docs = {}
    for chunk in tqdm(text_chunks):
        text = chunk.text
        questions = llm.get_questions(DEFAULT_PROMPT_TMPL.format(context_str=text, num_questions_per_chunk=2))

        for question in questions:
            # Only the best match is kept, so request a single row instead of
            # a default-sized result page.
            hit_ids = retriever.search(question).limit(1).to_pandas()["id"].tolist()
            if not hit_ids:
                # No match: skip the question rather than fail with IndexError
                # and lose all the questions generated so far.
                continue
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [hit_ids[0]]
    ds = QADataset.from_responses(text_chunks, queries, relevant_docs)
    ds.save(ds_name)
|
||||
|
||||
|
||||
# Fine-tune a sentence-transformers model on the dataset built above and
# compare its hit rate with the untuned baseline on the validation set.
|
||||
# NOTE(review): valset is generated from train_url, the same document the
# training questions come from -- confirm this overlap is intended.
valset = get_dataset(train_url, "valset")
|
||||
|
||||
model = get_registry().get("sentence-transformers").create()
|
||||
# Baseline: evaluate the model before it is fine-tuned.
res_base = model.evaluate(valset)
|
||||
|
||||
model.finetune(trainset=ds, path="model_finetuned", epochs=4, log_wandb=True)
|
||||
# Load the checkpoint written by finetune() and evaluate it on the same set.
tuned = get_registry().get("sentence-transformers").create(name="./model_finetuned")
|
||||
res_tuned = tuned.evaluate(valset)
|
||||
|
||||
# Only referenced by the disabled OpenAI comparison below.
openai_model = get_registry().get("openai").create()
|
||||
#res_openai = openai_model.evaluate(valset)
|
||||
|
||||
#print(f"openai model results: {pd.DataFrame(res_openai)['is_hit'].mean()}")
|
||||
print(f"base model results: {pd.DataFrame(res_base)['is_hit'].mean()}")
|
||||
print(f"tuned model results: {pd.DataFrame(res_tuned)['is_hit'].mean()}")
|
||||
|
||||
|
||||
@@ -1,11 +1,79 @@
|
||||
// Once the DOM is parsed, inject the kapa.ai widget bundle configured for LanceDB.
document.addEventListener("DOMContentLoaded", function () {
  var widgetScript = document.createElement("script");
  widgetScript.src = "https://widget.kapa.ai/kapa-widget.bundle.js";

  // Widget configuration, applied as data-* attributes in this order.
  [
    ["data-website-id", "c5881fae-cec0-490b-b45e-d83d131d4f25"],
    ["data-project-name", "LanceDB"],
    ["data-project-color", "#000000"],
    ["data-project-logo", "https://avatars.githubusercontent.com/u/108903835?s=200&v=4"],
    ["data-modal-example-questions", "Help me create an IVF_PQ index,How do I do an exhaustive search?,How do I create a LanceDB table?,Can I use my own embedding function?"]
  ].forEach(function (attr) {
    widgetScript.setAttribute(attr[0], attr[1]);
  });

  widgetScript.async = true;
  document.head.appendChild(widgetScript);
});
|
||||
// Builds the robot icon (Lucide "bot-message-square") as a detached <svg> element.
function robotSVG() {
  var SVG_NS = "http://www.w3.org/2000/svg";
  var icon = document.createElementNS(SVG_NS, "svg");

  // Presentation attributes of the root element, set in this order.
  [
    ["width", "24"],
    ["height", "24"],
    ["viewBox", "0 0 24 24"],
    ["fill", "none"],
    ["stroke", "currentColor"],
    ["stroke-width", "2"],
    ["stroke-linecap", "round"],
    ["stroke-linejoin", "round"],
    ["class", "lucide lucide-bot-message-square"]
  ].forEach(function (attr) {
    icon.setAttribute(attr[0], attr[1]);
  });

  // One <path> child per outline segment, appended in drawing order.
  [
    "M12 6V2H8",
    "m8 18-4 4V8a2 2 0 0 1 2-2h12a2 2 0 0 1 2 2v8a2 2 0 0 1-2 2Z",
    "M2 12h2",
    "M9 11v2",
    "M15 11v2",
    "M20 12h2"
  ].forEach(function (pathData) {
    var segment = document.createElementNS(SVG_NS, "path");
    segment.setAttribute("d", pathData);
    icon.appendChild(segment);
  });

  return icon;
}
|
||||
|
||||
// Creates the Fluidic chatbot button: a fixed-position link in the bottom-right
// corner that opens https://asklancedb.com in a new tab. The element is
// returned detached; the caller adds its content and inserts it into the page.
function fluidicButton() {
  var btn = document.createElement("a");
  btn.href = "https://asklancedb.com";
  btn.target = "_blank";
  // A link opened in a new tab should not hand the destination page a
  // window.opener reference back to this site.
  btn.rel = "noopener";

  Object.assign(btn.style, {
    position: "fixed",
    fontWeight: "bold",
    fontSize: ".8rem",
    right: "10px",
    bottom: "10px",
    width: "80px",
    height: "80px",
    background: "linear-gradient(135deg, #7C5EFF 0%, #625eff 100%)",
    color: "white",
    borderRadius: "5px",
    display: "flex",
    flexDirection: "column",
    justifyContent: "center",
    alignItems: "center",
    zIndex: "1000",
    // Start transparent and without a shadow; the values set in the timeout
    // below are what the transition animates towards.
    opacity: "0",
    boxShadow: "0 0 0 rgba(0, 0, 0, 0)",
    transition: "opacity 0.2s ease-in, box-shadow 0.2s ease-in"
  });

  // Deferred to a later task, presumably so the initial styles are in place
  // before the final values are applied and the fade-in can run.
  setTimeout(function () {
    btn.style.opacity = "1";
    btn.style.boxShadow = "0 0 .2rem #0000001a,0 .2rem .4rem #0003";
  }, 0);

  return btn;
}
|
||||
|
||||
// When the DOM is ready, assembles the "Ask AI" button (robot icon followed by
// its text label) and appends it to <body>.
document.addEventListener("DOMContentLoaded", function () {
  var askAi = fluidicButton();
  askAi.appendChild(robotSVG());
  askAi.appendChild(document.createTextNode("Ask AI"));
  document.body.appendChild(askAi);
});
|
||||
|
||||
76
docs/src/migration.md
Normal file
76
docs/src/migration.md
Normal file
@@ -0,0 +1,76 @@
|
||||
# Rust-backed Client Migration Guide
|
||||
|
||||
In an effort to ensure all clients have the same set of capabilities we have begun migrating the
|
||||
Python and Node clients onto a common Rust base library. In Python, this new client is part of
|
||||
the same lancedb package, exposed as an asynchronous client. Once the asynchronous client has
|
||||
reached full functionality we will begin migrating the synchronous library to be a thin wrapper
|
||||
around the asynchronous client.
|
||||
|
||||
This guide describes the differences between the two APIs and will hopefully assist users
|
||||
that would like to migrate to the new API.
|
||||
|
||||
## Closeable Connections
|
||||
|
||||
The Connection now has a `close` method. You can call this when
|
||||
you are done with the connection to eagerly free resources. Currently
|
||||
this is limited to freeing/closing the HTTP connection for remote
|
||||
connections. In the future we may add caching or other resources to
|
||||
native connections so this is probably a good practice even if you
|
||||
aren't using remote connections.
|
||||
|
||||
In addition, the connection can be used as a context manager which may
|
||||
be a more convenient way to ensure the connection is closed.
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
|
||||
async def my_async_fn():
|
||||
with await lancedb.connect_async("my_uri") as db:
|
||||
print(await db.table_names())
|
||||
```
|
||||
|
||||
It is not mandatory to call the `close` method. If you do not call it
|
||||
then the connection will be closed when the object is garbage collected.
|
||||
|
||||
## Closeable Table
|
||||
|
||||
The Table now also has a `close` method, similar to the connection. This
|
||||
can be used to eagerly free the cache used by a Table object. Similar to
|
||||
the connection, it can be used as a context manager and it is not mandatory
|
||||
to call the `close` method.
|
||||
|
||||
### Changes to Table APIs
|
||||
|
||||
- Previously `Table.schema` was a property. Now it is an async method.
|
||||
- The method `Table.__len__` was removed and `len(table)` will no longer
|
||||
work. Use `Table.count_rows` instead.
|
||||
|
||||
### Creating Indices
|
||||
|
||||
The `Table.create_index` method is now used for creating both vector indices
|
||||
and scalar indices. It currently requires a column name to be specified (the
|
||||
column to index). Vector index defaults are now smarter and scale better with
|
||||
the size of the data.
|
||||
|
||||
To specify index configuration details you will need to specify which kind of
|
||||
index you are using.
|
||||
|
||||
### Querying
|
||||
|
||||
The `Table.search` method has been renamed to `AsyncTable.vector_search` for
|
||||
clarity.
|
||||
|
||||
## Features not yet supported
|
||||
|
||||
The following features are not yet supported by the asynchronous API. However,
|
||||
we plan to support them soon.
|
||||
|
||||
- You cannot specify an embedding function when creating or opening a table.
|
||||
You must calculate embeddings yourself if using the asynchronous API.
|
||||
- The merge insert operation is not supported in the asynchronous API
|
||||
- Cleanup / compact / optimize indices are not supported in the asynchronous API
|
||||
- Adding / altering columns is not supported in the asynchronous API
|
||||
- The asynchronous API does not yet support any full text search or reranking
|
||||
search
|
||||
- Remote connections to LanceDB Cloud are not yet supported.
|
||||
- The method Table.head is not yet supported.
|
||||
@@ -8,17 +8,20 @@ This section contains the API reference for the OSS Python API.
|
||||
pip install lancedb
|
||||
```
|
||||
|
||||
## Connection
|
||||
The following methods describe the synchronous API client. There
|
||||
is also an [asynchronous API client](#connections-asynchronous).
|
||||
|
||||
## Connections (Synchronous)
|
||||
|
||||
::: lancedb.connect
|
||||
|
||||
::: lancedb.db.DBConnection
|
||||
|
||||
## Table
|
||||
## Tables (Synchronous)
|
||||
|
||||
::: lancedb.table.Table
|
||||
|
||||
## Querying
|
||||
## Querying (Synchronous)
|
||||
|
||||
::: lancedb.query.Query
|
||||
|
||||
@@ -86,4 +89,42 @@ pip install lancedb
|
||||
|
||||
::: lancedb.rerankers.cross_encoder.CrossEncoderReranker
|
||||
|
||||
::: lancedb.rerankers.openai.OpenaiReranker
|
||||
::: lancedb.rerankers.openai.OpenaiReranker
|
||||
|
||||
## Connections (Asynchronous)
|
||||
|
||||
Connections represent a connection to a LanceDB database and
|
||||
can be used to create, list, or open tables.
|
||||
|
||||
::: lancedb.connect_async
|
||||
|
||||
::: lancedb.db.AsyncConnection
|
||||
|
||||
## Tables (Asynchronous)
|
||||
|
||||
Tables hold your actual data as a collection of records / rows.
|
||||
|
||||
::: lancedb.table.AsyncTable
|
||||
|
||||
## Indices (Asynchronous)
|
||||
|
||||
Indices can be created on a table to speed up queries. This section
|
||||
lists the indices that LanceDB supports.
|
||||
|
||||
::: lancedb.index.BTree
|
||||
|
||||
::: lancedb.index.IvfPq
|
||||
|
||||
## Querying (Asynchronous)
|
||||
|
||||
Queries allow you to return data from your database. Basic queries can be
|
||||
created with the [AsyncTable.query][lancedb.table.AsyncTable.query] method
|
||||
to return the entire (typically filtered) table. Vector searches return the
|
||||
rows nearest to a query vector and can be created with the
|
||||
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search] method.
|
||||
|
||||
::: lancedb.query.AsyncQueryBase
|
||||
|
||||
::: lancedb.query.AsyncQuery
|
||||
|
||||
::: lancedb.query.AsyncVectorQuery
|
||||
|
||||
56
node/package-lock.json
generated
56
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.4.13",
|
||||
"version": "0.4.14",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.4.13",
|
||||
"version": "0.4.14",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,11 +52,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.13",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.13",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.13",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.13",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.13"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.14",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.14",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.14",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.14",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.14"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -334,9 +334,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.4.13",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.13.tgz",
|
||||
"integrity": "sha512-JfroNCG8yKIU931Y+x8d0Fp8C9DHUSC5j+CjI+e5err7rTWtie4j3JbsXlWAnPFaFEOg0Xk3BWkSikCvhPGJGg==",
|
||||
"version": "0.4.14",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.14.tgz",
|
||||
"integrity": "sha512-fw6mf6UhFf4j2kKdFcw0P+SOiIqmRbt+YQSgDbF4BFU3OUSW0XyfETIj9cUMQbSwPFsofhlGp5BRpCd7W9noew==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -345,22 +345,10 @@
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.4.13",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.13.tgz",
|
||||
"integrity": "sha512-dG6IMvfpHpnHdbJ0UffzJ7cZfMiC02MjIi6YJzgx+hKz2UNXWNBIfTvvhqli85mZsGRXL1OYDdYv0K1YzNjXlA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.4.13",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.13.tgz",
|
||||
"integrity": "sha512-BRR1VzaMviXby7qmLm0axNZM8eUZF3ZqfvnDKdVRpC3LaRueD6pMXHuC2IUKaFkn7xktf+8BlDZb6foFNEj8bQ==",
|
||||
"version": "0.4.14",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.14.tgz",
|
||||
"integrity": "sha512-1+LFI8vU+f/lnGy1s3XCySuV4oj3ZUW03xtmedGBW8nv/Y/jWXP0OYJCRI72eu+dLIdu0tCPsEiu8Hl+o02t9g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -369,22 +357,10 @@
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.4.13",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.13.tgz",
|
||||
"integrity": "sha512-WnekZ7ZMlria+NODZ6aBCljCFQSe2bBNUS9ZpyFl/Y1vHduSQPuBxM6V7vp2QubC0daq/rifgjDob89DF+x3xw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.4.13",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.13.tgz",
|
||||
"integrity": "sha512-3NDpMWBL2ksDHXAraXhowiLqQcNWM5bdbeHwze4+InYMD54hyQ2ODNc+4usxp63Nya9biVnFS27yXULqkzIEqQ==",
|
||||
"version": "0.4.14",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.14.tgz",
|
||||
"integrity": "sha512-fpuNMZ4aHSpZC3ztp5a0Wh18N6DpCx5EPWhS7bGA5XulGc0l+sZAJHfHwalx76ys//0Ns1z7cuKJhZpSa4SrdQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.4.13",
|
||||
"version": "0.4.14",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
@@ -88,10 +88,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.13",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.13",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.13",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.13",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.13"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.14",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.14",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.14",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.14",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.14"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ import { RemoteConnection } from './remote'
|
||||
import { Query } from './query'
|
||||
import { isEmbeddingFunction } from './embedding/embedding_function'
|
||||
import { type Literal, toSQL } from './util'
|
||||
import { type HttpMiddleware } from './middleware'
|
||||
|
||||
const {
|
||||
databaseNew,
|
||||
@@ -302,6 +303,18 @@ export interface Connection {
|
||||
* @param name The name of the table to drop.
|
||||
*/
|
||||
dropTable(name: string): Promise<void>
|
||||
|
||||
/**
|
||||
* Instrument the behavior of this Connection with middleware.
|
||||
*
|
||||
* The middleware will be called in the order they are added.
|
||||
*
|
||||
* Currently this functionality is only supported for remote Connections.
|
||||
*
|
||||
* @param {HttpMiddleware} middleware - Middleware which will instrument the Connection.
|
||||
* @returns - this Connection instrumented by the passed middleware
|
||||
*/
|
||||
withMiddleware(middleware: HttpMiddleware): Connection
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -541,6 +554,18 @@ export interface Table<T = number[]> {
|
||||
* names (e.g. "a").
|
||||
*/
|
||||
dropColumns(columnNames: string[]): Promise<void>
|
||||
|
||||
/**
|
||||
* Instrument the behavior of this Table with middleware.
|
||||
*
|
||||
* The middleware will be called in the order they are added.
|
||||
*
|
||||
* Currently this functionality is only supported for remote tables.
|
||||
*
|
||||
* @param {HttpMiddleware} middleware - Middleware which will instrument the Table.
|
||||
* @returns - this Table instrumented by the passed middleware
|
||||
*/
|
||||
withMiddleware(middleware: HttpMiddleware): Table<T>
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -795,6 +820,10 @@ export class LocalConnection implements Connection {
|
||||
async dropTable (name: string): Promise<void> {
|
||||
await databaseDropTable.call(this._db, name)
|
||||
}
|
||||
|
||||
// Middleware is currently only supported for remote connections, so the local
// implementation ignores `middleware` and returns this same instance.
withMiddleware (middleware: HttpMiddleware): Connection {
|
||||
return this
|
||||
}
|
||||
}
|
||||
|
||||
export class LocalTable<T = number[]> implements Table<T> {
|
||||
@@ -1105,6 +1134,10 @@ export class LocalTable<T = number[]> implements Table<T> {
|
||||
async dropColumns (columnNames: string[]): Promise<void> {
|
||||
return tableDropColumns.call(this._tbl, columnNames)
|
||||
}
|
||||
|
||||
// Middleware is currently only supported for remote tables, so the local
// implementation ignores `middleware` and returns this same instance.
withMiddleware (middleware: HttpMiddleware): Table<T> {
|
||||
return this
|
||||
}
|
||||
}
|
||||
|
||||
export interface CleanupStats {
|
||||
|
||||
58
node/src/middleware.ts
Normal file
58
node/src/middleware.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
// Copyright 2024 LanceDB Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
/**
|
||||
* Middleware for Remote LanceDB Connection or Table
|
||||
*/
|
||||
export interface HttpMiddleware {
|
||||
/**
|
||||
* A callback that can be used to instrument the behavior of http requests to remote
|
||||
* tables. It can be used to add headers, modify the request, or even short-circuit
|
||||
* the request and return a response without making the request to the remote endpoint.
|
||||
* It can also be used to modify the response from the remote endpoint.
|
||||
*
|
||||
* @param {RemoteRequest} req - Request to the remote endpoint
|
||||
* @param {onRemoteRequestNext} next - Callback to advance the middleware chain
|
||||
*/
|
||||
onRemoteRequest(
|
||||
req: RemoteRequest,
|
||||
next: (req: RemoteRequest) => Promise<RemoteResponse>,
|
||||
): Promise<RemoteResponse>
|
||||
};
|
||||
|
||||
/**
 * HTTP methods a remote request can be sent with.
 */
export enum Method {
  GET = 0,
  POST = 1
}
|
||||
|
||||
/**
 * A LanceDB Remote HTTP Request, as seen (and possibly rewritten) by middleware
 * before it is sent.
 */
export interface RemoteRequest {
  /** Full URL the request is sent to. */
  uri: string
  /** HTTP method used for the request. */
  method: Method
  /** Request headers, keyed by header name. */
  headers: Map<string, string>
  /** Optional query parameters, keyed by parameter name. */
  params?: Map<string, string>
  /** Optional request payload. */
  body?: any
}
|
||||
|
||||
/**
 * A LanceDB Remote HTTP Response, as returned to (and possibly rewritten by)
 * middleware.
 */
export interface RemoteResponse {
  /** HTTP status code. */
  status: number
  /** HTTP status text accompanying the status code. */
  statusText: string
  /** Response headers, keyed by header name. */
  headers: Map<string, string>
  /** Asynchronous accessor for the response payload. */
  body: () => Promise<any>
}
|
||||
@@ -12,13 +12,101 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import axios, { type AxiosResponse } from 'axios'
|
||||
import axios, { type AxiosResponse, type ResponseType } from 'axios'
|
||||
|
||||
import { tableFromIPC, type Table as ArrowTable } from 'apache-arrow'
|
||||
|
||||
import { type RemoteResponse, type RemoteRequest, Method } from '../middleware'
|
||||
|
||||
/**
 * Shape of a middleware accepted by the HTTP client.
 *
 * `onRemoteRequest` receives the outgoing request together with `next`, a
 * callback that passes a request on to the rest of the chain and resolves
 * with the resulting response.
 */
interface HttpLancedbClientMiddleware {
  onRemoteRequest(
    req: RemoteRequest,
    next: (req: RemoteRequest) => Promise<RemoteResponse>,
  ): Promise<RemoteResponse>
}
|
||||
|
||||
/**
 * Run a request through the middleware chain and, at the end of the chain,
 * send it to the remote endpoint.
 *
 * Middlewares are invoked in array order. Each one receives the current
 * request and a `next` callback; calling `next` hands a request to the
 * following middleware, or performs the HTTP call once the chain is
 * exhausted. A middleware that does not call `next` short-circuits the chain
 * and its own return value becomes the result.
 *
 * @param req - The request to send
 * @param middlewares - Middlewares to invoke, first to last
 * @param opts - Optional settings for the final HTTP call
 * @returns The response produced by the chain
 */
async function callWithMiddlewares (
  req: RemoteRequest,
  middlewares: HttpLancedbClientMiddleware[],
  opts?: MiddlewareInvocationOptions
): Promise<RemoteResponse> {
  // End of the chain: perform the actual HTTP call.
  async function send (finalReq: RemoteRequest): Promise<RemoteResponse> {
    const config = {
      headers: Object.fromEntries(finalReq.headers.entries()),
      params: Object.fromEntries(finalReq.params?.entries() ?? []),
      timeout: 10000,
      // Shared by both methods so a requested response type is honoured for
      // GET as well as POST.
      responseType: opts?.responseType
    }

    const res = finalReq.method === Method.POST
      ? await axios.post(finalReq.uri, finalReq.body, config)
      : await axios.get(finalReq.uri, config)

    return toLanceRes(res)
  }

  // Invoke the middleware at `index`, or send the request once none remain.
  async function call (
    index: number,
    current: RemoteRequest
  ): Promise<RemoteResponse> {
    if (index >= middlewares.length) {
      return await send(current)
    }

    return await middlewares[index].onRemoteRequest(
      current,
      async (nextReq) => {
        return await call(index + 1, nextReq)
      }
    )
  }

  return await call(0, req)
}
|
||||
|
||||
/**
 * Options that affect the HTTP call made at the end of the middleware chain.
 */
interface MiddlewareInvocationOptions {
  /** axios response type requested for the call, if any. */
  responseType?: ResponseType
}
|
||||
|
||||
/**
 * Convert an axios response into a LanceDB `RemoteResponse`.
 *
 * Status and status text are copied, headers are copied into a `Map`, and
 * `body()` resolves with the response's `data`.
 */
function toLanceRes (res: AxiosResponse): RemoteResponse {
  const headerMap = new Map()
  for (const name in res.headers) {
    headerMap.set(name, res.headers[name])
  }

  const { status, statusText } = res
  return {
    status,
    statusText,
    headers: headerMap,
    body: async () => res.data
  }
}
|
||||
|
||||
export class HttpLancedbClient {
|
||||
private readonly _url: string
|
||||
private readonly _apiKey: () => string
|
||||
private readonly _middlewares: HttpLancedbClientMiddleware[]
|
||||
|
||||
public constructor (
|
||||
url: string,
|
||||
@@ -27,6 +115,7 @@ export class HttpLancedbClient {
|
||||
) {
|
||||
this._url = url
|
||||
this._apiKey = () => apiKey
|
||||
this._middlewares = []
|
||||
}
|
||||
|
||||
get uri (): string {
|
||||
@@ -43,74 +132,61 @@ export class HttpLancedbClient {
|
||||
columns?: string[],
|
||||
filter?: string
|
||||
): Promise<ArrowTable<any>> {
|
||||
const response = await axios.post(
|
||||
`${this._url}/v1/table/${tableName}/query/`,
|
||||
{
|
||||
vector,
|
||||
k,
|
||||
nprobes,
|
||||
refineFactor,
|
||||
columns,
|
||||
filter,
|
||||
prefilter
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'x-api-key': this._apiKey(),
|
||||
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
|
||||
},
|
||||
responseType: 'arraybuffer',
|
||||
timeout: 10000
|
||||
}
|
||||
).catch((err) => {
|
||||
console.error('error: ', err)
|
||||
if (err.response === undefined) {
|
||||
throw new Error(`Network Error: ${err.message as string}`)
|
||||
}
|
||||
return err.response
|
||||
})
|
||||
if (response.status !== 200) {
|
||||
const errorData = new TextDecoder().decode(response.data)
|
||||
throw new Error(
|
||||
`Server Error, status: ${response.status as number}, ` +
|
||||
`message: ${response.statusText as string}: ${errorData}`
|
||||
)
|
||||
}
|
||||
|
||||
const table = tableFromIPC(response.data)
|
||||
const result = await this.post(
|
||||
`/v1/table/${tableName}/query/`,
|
||||
{
|
||||
vector,
|
||||
k,
|
||||
nprobes,
|
||||
refineFactor,
|
||||
columns,
|
||||
filter,
|
||||
prefilter
|
||||
},
|
||||
undefined,
|
||||
undefined,
|
||||
'arraybuffer'
|
||||
)
|
||||
const table = tableFromIPC(await result.body())
|
||||
return table
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a GET request.
|
||||
*/
|
||||
public async get (path: string, params?: Record<string, string | number>): Promise<AxiosResponse> {
|
||||
const response = await axios.get(
|
||||
`${this._url}${path}`,
|
||||
{
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'x-api-key': this._apiKey(),
|
||||
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
|
||||
},
|
||||
params,
|
||||
timeout: 10000
|
||||
}
|
||||
).catch((err) => {
|
||||
public async get (path: string, params?: Record<string, string>): Promise<RemoteResponse> {
|
||||
const req = {
|
||||
uri: `${this._url}${path}`,
|
||||
method: Method.GET,
|
||||
headers: new Map(Object.entries({
|
||||
'Content-Type': 'application/json',
|
||||
'x-api-key': this._apiKey(),
|
||||
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
|
||||
})),
|
||||
params: new Map(Object.entries(params ?? {}))
|
||||
}
|
||||
|
||||
let response
|
||||
try {
|
||||
response = await callWithMiddlewares(req, this._middlewares)
|
||||
return response
|
||||
} catch (err: any) {
|
||||
console.error('error: ', err)
|
||||
if (err.response === undefined) {
|
||||
throw new Error(`Network Error: ${err.message as string}`)
|
||||
}
|
||||
return err.response
|
||||
})
|
||||
|
||||
response = toLanceRes(err.response)
|
||||
}
|
||||
|
||||
if (response.status !== 200) {
|
||||
const errorData = new TextDecoder().decode(response.data)
|
||||
const errorData = new TextDecoder().decode(await response.body())
|
||||
throw new Error(
|
||||
`Server Error, status: ${response.status as number}, ` +
|
||||
`message: ${response.statusText as string}: ${errorData}`
|
||||
`Server Error, status: ${response.status}, ` +
|
||||
`message: ${response.statusText}: ${errorData}`
|
||||
)
|
||||
}
|
||||
|
||||
return response
|
||||
}
|
||||
|
||||
@@ -120,35 +196,65 @@ export class HttpLancedbClient {
|
||||
public async post (
|
||||
path: string,
|
||||
data?: any,
|
||||
params?: Record<string, string | number>,
|
||||
content?: string | undefined
|
||||
): Promise<AxiosResponse> {
|
||||
const response = await axios.post(
|
||||
`${this._url}${path}`,
|
||||
data,
|
||||
{
|
||||
headers: {
|
||||
'Content-Type': content ?? 'application/json',
|
||||
'x-api-key': this._apiKey(),
|
||||
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
|
||||
},
|
||||
params,
|
||||
timeout: 30000
|
||||
}
|
||||
).catch((err) => {
|
||||
params?: Record<string, string>,
|
||||
content?: string | undefined,
|
||||
responseType?: ResponseType | undefined
|
||||
): Promise<RemoteResponse> {
|
||||
const req = {
|
||||
uri: `${this._url}${path}`,
|
||||
method: Method.POST,
|
||||
headers: new Map(Object.entries({
|
||||
'Content-Type': content ?? 'application/json',
|
||||
'x-api-key': this._apiKey(),
|
||||
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
|
||||
})),
|
||||
params: new Map(Object.entries(params ?? {})),
|
||||
body: data
|
||||
}
|
||||
|
||||
let response
|
||||
try {
|
||||
response = await callWithMiddlewares(req, this._middlewares, { responseType })
|
||||
|
||||
// return response
|
||||
} catch (err: any) {
|
||||
console.error('error: ', err)
|
||||
if (err.response === undefined) {
|
||||
throw new Error(`Network Error: ${err.message as string}`)
|
||||
}
|
||||
return err.response
|
||||
})
|
||||
response = toLanceRes(err.response)
|
||||
}
|
||||
|
||||
if (response.status !== 200) {
|
||||
const errorData = new TextDecoder().decode(response.data)
|
||||
const errorData = new TextDecoder().decode(await response.body())
|
||||
throw new Error(
|
||||
`Server Error, status: ${response.status as number}, ` +
|
||||
`message: ${response.statusText as string}: ${errorData}`
|
||||
`Server Error, status: ${response.status}, ` +
|
||||
`message: ${response.statusText}: ${errorData}`
|
||||
)
|
||||
}
|
||||
|
||||
return response
|
||||
}
|
||||
|
||||
/**
 * Return a copy of this client with `mw` appended to its middleware chain.
 * This client itself is not modified.
 * @param mw - The middleware that instruments the client
 * @returns - a new client instance instrumented with the middleware
 */
public withMiddleware (mw: HttpLancedbClientMiddleware): HttpLancedbClient {
  const instrumented = this.clone()
  instrumented._middlewares.push(mw)
  return instrumented
}
|
||||
|
||||
/**
 * Build a new client with the same URL, API key and database name, and with
 * its own copy of this client's middleware list.
 */
private clone (): HttpLancedbClient {
  const copy = new HttpLancedbClient(this._url, this._apiKey(), this._dbName)
  copy._middlewares.push(...this._middlewares)
  return copy
}
|
||||
}
|
||||
|
||||
@@ -39,12 +39,13 @@ import {
|
||||
fromTableToStreamBuffer
|
||||
} from '../arrow'
|
||||
import { toSQL } from '../util'
|
||||
import { type HttpMiddleware } from '../middleware'
|
||||
|
||||
/**
|
||||
* Remote connection.
|
||||
*/
|
||||
export class RemoteConnection implements Connection {
|
||||
private readonly _client: HttpLancedbClient
|
||||
private _client: HttpLancedbClient
|
||||
private readonly _dbName: string
|
||||
|
||||
constructor (opts: ConnectionOptions) {
|
||||
@@ -84,10 +85,11 @@ export class RemoteConnection implements Connection {
|
||||
limit: number = 10
|
||||
): Promise<string[]> {
|
||||
const response = await this._client.get('/v1/table/', {
|
||||
limit,
|
||||
limit: `${limit}`,
|
||||
page_token: pageToken
|
||||
})
|
||||
return response.data.tables
|
||||
const body = await response.body()
|
||||
return body.tables
|
||||
}
|
||||
|
||||
async openTable (name: string): Promise<Table>
|
||||
@@ -163,7 +165,7 @@ export class RemoteConnection implements Connection {
|
||||
throw new Error(
|
||||
`Server Error, status: ${res.status}, ` +
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
`message: ${res.statusText}: ${res.data}`
|
||||
`message: ${res.statusText}: ${await res.body()}`
|
||||
)
|
||||
}
|
||||
|
||||
@@ -177,6 +179,17 @@ export class RemoteConnection implements Connection {
|
||||
async dropTable (name: string): Promise<void> {
|
||||
await this._client.post(`/v1/table/${name}/drop/`)
|
||||
}
|
||||
|
||||
// Returns a copy of this connection whose HTTP client also runs `middleware`;
// this connection keeps its original client.
withMiddleware (middleware: HttpMiddleware): Connection {
  const instrumented = this.clone()
  // The clone starts out sharing this connection's client, so instrumenting
  // from either side is equivalent.
  instrumented._client = this._client.withMiddleware(middleware)
  return instrumented
}
|
||||
|
||||
// Shallow copy: creates an object with the RemoteConnection prototype (without
// running the constructor) and copies this instance's own properties onto it.
private clone (): RemoteConnection {
  return Object.assign(
    Object.create(RemoteConnection.prototype) as RemoteConnection,
    this
  )
}
|
||||
}
|
||||
|
||||
export class RemoteQuery<T = number[]> extends Query<T> {
|
||||
@@ -229,7 +242,7 @@ export class RemoteQuery<T = number[]> extends Query<T> {
|
||||
// we are using extend until we have next next version release
|
||||
// Table and Connection has both been refactored to interfaces
|
||||
export class RemoteTable<T = number[]> implements Table<T> {
|
||||
private readonly _client: HttpLancedbClient
|
||||
private _client: HttpLancedbClient
|
||||
private readonly _embeddings?: EmbeddingFunction<T>
|
||||
private readonly _name: string
|
||||
|
||||
@@ -256,15 +269,15 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
get schema (): Promise<any> {
|
||||
return this._client
|
||||
.post(`/v1/table/${this._name}/describe/`)
|
||||
.then((res) => {
|
||||
.then(async (res) => {
|
||||
if (res.status !== 200) {
|
||||
throw new Error(
|
||||
`Server Error, status: ${res.status}, ` +
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
`message: ${res.statusText}: ${res.data}`
|
||||
`message: ${res.statusText}: ${await res.body()}`
|
||||
)
|
||||
}
|
||||
return res.data?.schema
|
||||
return (await res.body())?.schema
|
||||
})
|
||||
}
|
||||
|
||||
@@ -320,7 +333,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
throw new Error(
|
||||
`Server Error, status: ${res.status}, ` +
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
`message: ${res.statusText}: ${res.data}`
|
||||
`message: ${res.statusText}: ${await res.body()}`
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -346,7 +359,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
throw new Error(
|
||||
`Server Error, status: ${res.status}, ` +
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
`message: ${res.statusText}: ${res.data}`
|
||||
`message: ${res.statusText}: ${await res.body()}`
|
||||
)
|
||||
}
|
||||
return tbl.numRows
|
||||
@@ -372,7 +385,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
throw new Error(
|
||||
`Server Error, status: ${res.status}, ` +
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
`message: ${res.statusText}: ${res.data}`
|
||||
`message: ${res.statusText}: ${await res.body()}`
|
||||
)
|
||||
}
|
||||
return tbl.numRows
|
||||
@@ -415,7 +428,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
throw new Error(
|
||||
`Server Error, status: ${res.status}, ` +
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
`message: ${res.statusText}: ${res.data}`
|
||||
`message: ${res.statusText}: ${await res.body()}`
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -436,14 +449,14 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
throw new Error(
|
||||
`Server Error, status: ${res.status}, ` +
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
`message: ${res.statusText}: ${res.data}`
|
||||
`message: ${res.statusText}: ${await res.body()}`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async countRows (): Promise<number> {
|
||||
const result = await this._client.post(`/v1/table/${this._name}/describe/`)
|
||||
return result.data?.stats?.num_rows
|
||||
return (await result.body())?.stats?.num_rows
|
||||
}
|
||||
|
||||
async delete (filter: string): Promise<void> {
|
||||
@@ -476,7 +489,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
const results = await this._client.post(
|
||||
`/v1/table/${this._name}/index/list/`
|
||||
)
|
||||
return results.data.indexes?.map((index: any) => ({
|
||||
return (await results.body()).indexes?.map((index: any) => ({
|
||||
columns: index.columns,
|
||||
name: index.index_name,
|
||||
uuid: index.index_uuid
|
||||
@@ -487,9 +500,10 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
const results = await this._client.post(
|
||||
`/v1/table/${this._name}/index/${indexUuid}/stats/`
|
||||
)
|
||||
const body = await results.body()
|
||||
return {
|
||||
numIndexedRows: results.data.num_indexed_rows,
|
||||
numUnindexedRows: results.data.num_unindexed_rows
|
||||
numIndexedRows: body?.num_indexed_rows,
|
||||
numUnindexedRows: body?.num_unindexed_rows
|
||||
}
|
||||
}
|
||||
|
||||
@@ -504,4 +518,15 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
async dropColumns (columnNames: string[]): Promise<void> {
|
||||
throw new Error('Drop columns is not yet supported in LanceDB Cloud.')
|
||||
}
|
||||
|
||||
withMiddleware(middleware: HttpMiddleware): Table<T> {
|
||||
const wrapped = this.clone()
|
||||
wrapped._client = wrapped._client.withMiddleware(middleware)
|
||||
return wrapped
|
||||
}
|
||||
|
||||
private clone (): RemoteTable<T> {
|
||||
const clone: RemoteTable<T> = Object.create(RemoteTable.prototype)
|
||||
return Object.assign(clone, this)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,6 +106,9 @@ export class MakeArrowTableOptions {
|
||||
* An enhanced version of the {@link makeTable} function from Apache Arrow
|
||||
* that supports nested fields and embeddings columns.
|
||||
*
|
||||
* (typically you do not need to call this function. It will be called automatically
|
||||
* when creating a table or adding data to it)
|
||||
*
|
||||
* This function converts an array of Record<String, any> (row-major JS objects)
|
||||
* to an Arrow Table (a columnar structure)
|
||||
*
|
||||
|
||||
2
nodejs/lancedb/embedding/index.ts
Normal file
2
nodejs/lancedb/embedding/index.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
export { EmbeddingFunction, isEmbeddingFunction } from "./embedding_function";
|
||||
export { OpenAIEmbeddingFunction } from "./openai";
|
||||
@@ -18,9 +18,34 @@ import {
|
||||
ConnectionOptions,
|
||||
} from "./native.js";
|
||||
|
||||
export { ConnectionOptions, WriteOptions, Query } from "./native.js";
|
||||
export { Connection, CreateTableOptions } from "./connection";
|
||||
export { Table, AddDataOptions } from "./table";
|
||||
export {
|
||||
WriteOptions,
|
||||
WriteMode,
|
||||
AddColumnsSql,
|
||||
ColumnAlteration,
|
||||
ConnectionOptions,
|
||||
} from "./native.js";
|
||||
export {
|
||||
makeArrowTable,
|
||||
MakeArrowTableOptions,
|
||||
Data,
|
||||
VectorColumnOptions,
|
||||
} from "./arrow";
|
||||
export {
|
||||
Connection,
|
||||
CreateTableOptions,
|
||||
TableNamesOptions,
|
||||
} from "./connection";
|
||||
export {
|
||||
ExecutableQuery,
|
||||
Query,
|
||||
QueryBase,
|
||||
VectorQuery,
|
||||
RecordBatchIterator,
|
||||
} from "./query";
|
||||
export { Index, IndexOptions, IvfPqOptions } from "./indices";
|
||||
export { Table, AddDataOptions, IndexConfig, UpdateOptions } from "./table";
|
||||
export * as embedding from "./embedding";
|
||||
|
||||
/**
|
||||
* Connect to a LanceDB instance at the given URI.
|
||||
|
||||
147
nodejs/lancedb/native.d.ts
vendored
147
nodejs/lancedb/native.d.ts
vendored
@@ -1,147 +0,0 @@
|
||||
/* tslint:disable */
|
||||
/* eslint-disable */
|
||||
|
||||
/* auto-generated by NAPI-RS */
|
||||
|
||||
/** A description of an index currently configured on a column */
|
||||
export interface IndexConfig {
|
||||
/** The type of the index */
|
||||
indexType: string
|
||||
/**
|
||||
* The columns in the index
|
||||
*
|
||||
* Currently this is always an array of size 1. In the future there may
|
||||
* be more columns to represent composite indices.
|
||||
*/
|
||||
columns: Array<string>
|
||||
}
|
||||
/**
|
||||
* A definition of a column alteration. The alteration changes the column at
|
||||
* `path` to have the new name `name`, to be nullable if `nullable` is true,
|
||||
* and to have the data type `data_type`. At least one of `rename` or `nullable`
|
||||
* must be provided.
|
||||
*/
|
||||
export interface ColumnAlteration {
|
||||
/**
|
||||
* The path to the column to alter. This is a dot-separated path to the column.
|
||||
* If it is a top-level column then it is just the name of the column. If it is
|
||||
* a nested column then it is the path to the column, e.g. "a.b.c" for a column
|
||||
* `c` nested inside a column `b` nested inside a column `a`.
|
||||
*/
|
||||
path: string
|
||||
/**
|
||||
* The new name of the column. If not provided then the name will not be changed.
|
||||
* This must be distinct from the names of all other columns in the table.
|
||||
*/
|
||||
rename?: string
|
||||
/** Set the new nullability. Note that a nullable column cannot be made non-nullable. */
|
||||
nullable?: boolean
|
||||
}
|
||||
/** A definition of a new column to add to a table. */
|
||||
export interface AddColumnsSql {
|
||||
/** The name of the new column. */
|
||||
name: string
|
||||
/**
|
||||
* The values to populate the new column with, as a SQL expression.
|
||||
* The expression can reference other columns in the table.
|
||||
*/
|
||||
valueSql: string
|
||||
}
|
||||
export interface ConnectionOptions {
|
||||
apiKey?: string
|
||||
hostOverride?: string
|
||||
/**
|
||||
* (For LanceDB OSS only): The interval, in seconds, at which to check for
|
||||
* updates to the table from other processes. If None, then consistency is not
|
||||
* checked. For performance reasons, this is the default. For strong
|
||||
* consistency, set this to zero seconds. Then every read will check for
|
||||
* updates from other processes. As a compromise, you can set this to a
|
||||
* non-zero value for eventual consistency. If more than that interval
|
||||
* has passed since the last check, then the table will be checked for updates.
|
||||
* Note: this consistency only applies to read operations. Write operations are
|
||||
* always consistent.
|
||||
*/
|
||||
readConsistencyInterval?: number
|
||||
}
|
||||
/** Write mode for writing a table. */
|
||||
export const enum WriteMode {
|
||||
Create = 'Create',
|
||||
Append = 'Append',
|
||||
Overwrite = 'Overwrite'
|
||||
}
|
||||
/** Write options when creating a Table. */
|
||||
export interface WriteOptions {
|
||||
mode?: WriteMode
|
||||
}
|
||||
export function connect(uri: string, options: ConnectionOptions): Promise<Connection>
|
||||
export class Connection {
|
||||
/** Create a new Connection instance from the given URI. */
|
||||
static new(uri: string, options: ConnectionOptions): Promise<Connection>
|
||||
display(): string
|
||||
isOpen(): boolean
|
||||
close(): void
|
||||
/** List all tables in the dataset. */
|
||||
tableNames(startAfter?: string | undefined | null, limit?: number | undefined | null): Promise<Array<string>>
|
||||
/**
|
||||
* Create table from a Apache Arrow IPC (file) buffer.
|
||||
*
|
||||
* Parameters:
|
||||
* - name: The name of the table.
|
||||
* - buf: The buffer containing the IPC file.
|
||||
*
|
||||
*/
|
||||
createTable(name: string, buf: Buffer, mode: string): Promise<Table>
|
||||
createEmptyTable(name: string, schemaBuf: Buffer, mode: string): Promise<Table>
|
||||
openTable(name: string): Promise<Table>
|
||||
/** Drop table with the name. Or raise an error if the table does not exist. */
|
||||
dropTable(name: string): Promise<void>
|
||||
}
|
||||
export class Index {
|
||||
static ivfPq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
|
||||
static btree(): Index
|
||||
}
|
||||
/** Typescript-style Async Iterator over RecordBatches */
|
||||
export class RecordBatchIterator {
|
||||
next(): Promise<Buffer | null>
|
||||
}
|
||||
export class Query {
|
||||
onlyIf(predicate: string): void
|
||||
select(columns: Array<[string, string]>): void
|
||||
limit(limit: number): void
|
||||
nearestTo(vector: Float32Array): VectorQuery
|
||||
execute(): Promise<RecordBatchIterator>
|
||||
}
|
||||
export class VectorQuery {
|
||||
column(column: string): void
|
||||
distanceType(distanceType: string): void
|
||||
postfilter(): void
|
||||
refineFactor(refineFactor: number): void
|
||||
nprobes(nprobe: number): void
|
||||
bypassVectorIndex(): void
|
||||
onlyIf(predicate: string): void
|
||||
select(columns: Array<[string, string]>): void
|
||||
limit(limit: number): void
|
||||
execute(): Promise<RecordBatchIterator>
|
||||
}
|
||||
export class Table {
|
||||
display(): string
|
||||
isOpen(): boolean
|
||||
close(): void
|
||||
/** Return Schema as empty Arrow IPC file. */
|
||||
schema(): Promise<Buffer>
|
||||
add(buf: Buffer, mode: string): Promise<void>
|
||||
countRows(filter?: string | undefined | null): Promise<number>
|
||||
delete(predicate: string): Promise<void>
|
||||
createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise<void>
|
||||
update(onlyIf: string | undefined | null, columns: Array<[string, string]>): Promise<void>
|
||||
query(): Query
|
||||
vectorSearch(vector: Float32Array): VectorQuery
|
||||
addColumns(transforms: Array<AddColumnsSql>): Promise<void>
|
||||
alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
|
||||
dropColumns(columns: Array<string>): Promise<void>
|
||||
version(): Promise<number>
|
||||
checkout(version: number): Promise<void>
|
||||
checkoutLatest(): Promise<void>
|
||||
restore(): Promise<void>
|
||||
listIndices(): Promise<Array<IndexConfig>>
|
||||
}
|
||||
@@ -1,329 +0,0 @@
|
||||
/* tslint:disable */
|
||||
/* eslint-disable */
|
||||
/* prettier-ignore */
|
||||
|
||||
/* auto-generated by NAPI-RS */
|
||||
|
||||
const { existsSync, readFileSync } = require('fs')
|
||||
const { join } = require("path");
|
||||
|
||||
const { platform, arch } = process;
|
||||
|
||||
let nativeBinding = null;
|
||||
let localFileExisted = false;
|
||||
let loadError = null;
|
||||
|
||||
function isMusl() {
|
||||
// For Node 10
|
||||
if (!process.report || typeof process.report.getReport !== "function") {
|
||||
try {
|
||||
const lddPath = require("child_process")
|
||||
.execSync("which ldd")
|
||||
.toString()
|
||||
.trim();
|
||||
return readFileSync(lddPath, "utf8").includes("musl");
|
||||
} catch (e) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
const { glibcVersionRuntime } = process.report.getReport().header;
|
||||
return !glibcVersionRuntime;
|
||||
}
|
||||
}
|
||||
|
||||
switch (platform) {
|
||||
case "android":
|
||||
switch (arch) {
|
||||
case "arm64":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.android-arm64.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.android-arm64.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-android-arm64");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
case "arm":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.android-arm-eabi.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.android-arm-eabi.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-android-arm-eabi");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unsupported architecture on Android ${arch}`);
|
||||
}
|
||||
break;
|
||||
case "win32":
|
||||
switch (arch) {
|
||||
case "x64":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.win32-x64-msvc.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.win32-x64-msvc.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-win32-x64-msvc");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
case "ia32":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.win32-ia32-msvc.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.win32-ia32-msvc.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-win32-ia32-msvc");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
case "arm64":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.win32-arm64-msvc.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.win32-arm64-msvc.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-win32-arm64-msvc");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unsupported architecture on Windows: ${arch}`);
|
||||
}
|
||||
break;
|
||||
case "darwin":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.darwin-universal.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.darwin-universal.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-darwin-universal");
|
||||
}
|
||||
break;
|
||||
} catch {}
|
||||
switch (arch) {
|
||||
case "x64":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.darwin-x64.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.darwin-x64.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-darwin-x64");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
case "arm64":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.darwin-arm64.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.darwin-arm64.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-darwin-arm64");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unsupported architecture on macOS: ${arch}`);
|
||||
}
|
||||
break;
|
||||
case "freebsd":
|
||||
if (arch !== "x64") {
|
||||
throw new Error(`Unsupported architecture on FreeBSD: ${arch}`);
|
||||
}
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.freebsd-x64.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.freebsd-x64.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-freebsd-x64");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
case "linux":
|
||||
switch (arch) {
|
||||
case "x64":
|
||||
if (isMusl()) {
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-x64-musl.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-x64-musl.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-x64-musl");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
} else {
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-x64-gnu.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-x64-gnu.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-x64-gnu");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "arm64":
|
||||
if (isMusl()) {
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-arm64-musl.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-arm64-musl.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-arm64-musl");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
} else {
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-arm64-gnu.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-arm64-gnu.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-arm64-gnu");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "arm":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-arm-gnueabihf.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-arm-gnueabihf.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-arm-gnueabihf");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
case "riscv64":
|
||||
if (isMusl()) {
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-riscv64-musl.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-riscv64-musl.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-riscv64-musl");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
} else {
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-riscv64-gnu.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-riscv64-gnu.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-riscv64-gnu");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "s390x":
|
||||
localFileExisted = existsSync(
|
||||
join(__dirname, "lancedb-nodejs.linux-s390x-gnu.node"),
|
||||
);
|
||||
try {
|
||||
if (localFileExisted) {
|
||||
nativeBinding = require("./lancedb-nodejs.linux-s390x-gnu.node");
|
||||
} else {
|
||||
nativeBinding = require("lancedb-linux-s390x-gnu");
|
||||
}
|
||||
} catch (e) {
|
||||
loadError = e;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unsupported architecture on Linux: ${arch}`);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unsupported OS: ${platform}, architecture: ${arch}`);
|
||||
}
|
||||
|
||||
if (!nativeBinding) {
|
||||
if (loadError) {
|
||||
throw loadError;
|
||||
}
|
||||
throw new Error(`Failed to load native binding`);
|
||||
}
|
||||
|
||||
const {
|
||||
Connection,
|
||||
Index,
|
||||
RecordBatchIterator,
|
||||
Query,
|
||||
VectorQuery,
|
||||
Table,
|
||||
WriteMode,
|
||||
connect,
|
||||
} = nativeBinding;
|
||||
|
||||
module.exports.Connection = Connection;
|
||||
module.exports.Index = Index;
|
||||
module.exports.RecordBatchIterator = RecordBatchIterator;
|
||||
module.exports.Query = Query;
|
||||
module.exports.VectorQuery = VectorQuery;
|
||||
module.exports.Table = Table;
|
||||
module.exports.WriteMode = WriteMode;
|
||||
module.exports.connect = connect;
|
||||
@@ -20,7 +20,7 @@ import {
|
||||
VectorQuery as NativeVectorQuery,
|
||||
} from "./native";
|
||||
import { type IvfPqOptions } from "./indices";
|
||||
class RecordBatchIterator implements AsyncIterator<RecordBatch> {
|
||||
export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
|
||||
private promisedInner?: Promise<NativeBatchIterator>;
|
||||
private inner?: NativeBatchIterator;
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# `lancedb-darwin-arm64`
|
||||
# `@lancedb/lancedb-darwin-arm64`
|
||||
|
||||
This is the **aarch64-apple-darwin** binary for `lancedb`
|
||||
This is the **aarch64-apple-darwin** binary for `@lancedb/lancedb`
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "lancedb-darwin-arm64",
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.4.3",
|
||||
"os": [
|
||||
"darwin"
|
||||
@@ -11,7 +11,7 @@
|
||||
"files": [
|
||||
"lancedb.darwin-arm64.node"
|
||||
],
|
||||
"license": "MIT",
|
||||
"license": "Apache 2.0",
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
}
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# `lancedb-darwin-x64`
|
||||
# `@lancedb/lancedb-darwin-x64`
|
||||
|
||||
This is the **x86_64-apple-darwin** binary for `lancedb`
|
||||
This is the **x86_64-apple-darwin** binary for `@lancedb/lancedb`
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "lancedb-darwin-x64",
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.4.3",
|
||||
"os": [
|
||||
"darwin"
|
||||
@@ -11,7 +11,7 @@
|
||||
"files": [
|
||||
"lancedb.darwin-x64.node"
|
||||
],
|
||||
"license": "MIT",
|
||||
"license": "Apache 2.0",
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
}
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# `lancedb-linux-arm64-gnu`
|
||||
# `@lancedb/lancedb-linux-arm64-gnu`
|
||||
|
||||
This is the **aarch64-unknown-linux-gnu** binary for `lancedb`
|
||||
This is the **aarch64-unknown-linux-gnu** binary for `@lancedb/lancedb`
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "lancedb-linux-arm64-gnu",
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.4.3",
|
||||
"os": [
|
||||
"linux"
|
||||
@@ -11,9 +11,9 @@
|
||||
"files": [
|
||||
"lancedb.linux-arm64-gnu.node"
|
||||
],
|
||||
"license": "MIT",
|
||||
"license": "Apache 2.0",
|
||||
"engines": {
|
||||
"node": ">= 10"
|
||||
"node": ">= 18"
|
||||
},
|
||||
"libc": [
|
||||
"glibc"
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# `lancedb-linux-x64-gnu`
|
||||
# `@lancedb/lancedb-linux-x64-gnu`
|
||||
|
||||
This is the **x86_64-unknown-linux-gnu** binary for `lancedb`
|
||||
This is the **x86_64-unknown-linux-gnu** binary for `@lancedb/lancedb`
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "lancedb-linux-x64-gnu",
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.4.3",
|
||||
"os": [
|
||||
"linux"
|
||||
@@ -11,9 +11,9 @@
|
||||
"files": [
|
||||
"lancedb.linux-x64-gnu.node"
|
||||
],
|
||||
"license": "MIT",
|
||||
"license": "Apache 2.0",
|
||||
"engines": {
|
||||
"node": ">= 10"
|
||||
"node": ">= 18"
|
||||
},
|
||||
"libc": [
|
||||
"glibc"
|
||||
|
||||
3
nodejs/npm/win32-x64-msvc/README.md
Normal file
3
nodejs/npm/win32-x64-msvc/README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# `@lancedb/lancedb-win32-x64-msvc`
|
||||
|
||||
This is the **x86_64-pc-windows-msvc** binary for `@lancedb/lancedb`
|
||||
18
nodejs/npm/win32-x64-msvc/package.json
Normal file
18
nodejs/npm/win32-x64-msvc/package.json
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.4.3",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
"files": [
|
||||
"lancedb.win32-x64-msvc.node"
|
||||
],
|
||||
"license": "Apache 2.0",
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
}
|
||||
}
|
||||
198
nodejs/package-lock.json
generated
198
nodejs/package-lock.json
generated
@@ -1,11 +1,11 @@
|
||||
{
|
||||
"name": "lancedb",
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.4.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "lancedb",
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.4.3",
|
||||
"cpu": [
|
||||
"x64",
|
||||
@@ -15,8 +15,12 @@
|
||||
"os": [
|
||||
"darwin",
|
||||
"linux",
|
||||
"windows"
|
||||
"win32"
|
||||
],
|
||||
"dependencies": {
|
||||
"apache-arrow": "^15.0.0",
|
||||
"openai": "^4.29.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@napi-rs/cli": "^2.18.0",
|
||||
"@types/jest": "^29.1.2",
|
||||
@@ -29,6 +33,7 @@
|
||||
"eslint-plugin-jsdoc": "^48.2.1",
|
||||
"jest": "^29.7.0",
|
||||
"prettier": "^3.1.0",
|
||||
"shx": "^0.3.4",
|
||||
"tmp": "^0.2.3",
|
||||
"ts-jest": "^29.1.2",
|
||||
"typedoc": "^0.25.7",
|
||||
@@ -40,14 +45,11 @@
|
||||
"node": ">= 18"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"lancedb-darwin-arm64": "0.4.3",
|
||||
"lancedb-darwin-x64": "0.4.3",
|
||||
"lancedb-linux-arm64-gnu": "0.4.3",
|
||||
"lancedb-linux-x64-gnu": "0.4.3",
|
||||
"openai": "^4.28.4"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"apache-arrow": "^15.0.0"
|
||||
"@lancedb/lancedb-darwin-arm64": "0.4.3",
|
||||
"@lancedb/lancedb-darwin-x64": "0.4.3",
|
||||
"@lancedb/lancedb-linux-arm64-gnu": "0.4.3",
|
||||
"@lancedb/lancedb-linux-x64-gnu": "0.4.3",
|
||||
"@lancedb/lancedb-win32-x64-msvc": "0.4.3"
|
||||
}
|
||||
},
|
||||
"node_modules/@75lb/deep-merge": {
|
||||
@@ -1317,6 +1319,66 @@
|
||||
"@jridgewell/sourcemap-codec": "^1.4.14"
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/lancedb-darwin-arm64": {
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-arm64/-/lancedb-darwin-arm64-0.4.3.tgz",
|
||||
"integrity": "sha512-+kxuWUK9vtLBbjFMkIKeQ32kxK2tgvZRCQaU1I3RJ3+dLmDIVeIj+KJSlMelkKa2QC4JoyHQi9Ty1PdS2DojmQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/lancedb-darwin-x64": {
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-x64/-/lancedb-darwin-x64-0.4.3.tgz",
|
||||
"integrity": "sha512-JYvsSYxTOa/7OMojulz9h0gN2FwvypG/6l6dpLkViZ5LDvRcfVyDTzOLcOJkFn+db4TKeBOVyMWnnpDKaB+jLA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/lancedb-linux-x64-gnu": {
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-gnu/-/lancedb-linux-x64-gnu-0.4.3.tgz",
|
||||
"integrity": "sha512-jDANHchWNGmu1wfAyBk0apoFlLxtJ7FRc31pAQ3tKE4fwlgG7bUcaTX6s5C3vMNWXnyQLQtVuWZNXi2nVj879g==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/lancedb-win32-x64-msvc": {
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-win32-x64-msvc/-/lancedb-win32-x64-msvc-0.4.3.tgz",
|
||||
"integrity": "sha512-qADveXyv4YzllIbOOq8soqFfL7p7I35uhrD3PcTvj4Qxuo6q7pgQWQz2Mt3kGBpyPkH2yE4wWAGJhayShLRbiQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/cli": {
|
||||
"version": "2.18.0",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-2.18.0.tgz",
|
||||
@@ -1396,7 +1458,6 @@
|
||||
"version": "0.5.6",
|
||||
"resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.6.tgz",
|
||||
"integrity": "sha512-aYX01Ke9hunpoCexYAgQucEpARGQ5w/cqHFrIR+e9gdKb1QWTsVJuTJ2ozQzIAxLyRQe/m+2RqzkyOOGiMKRQA==",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"tslib": "^2.4.0"
|
||||
}
|
||||
@@ -1445,8 +1506,7 @@
|
||||
"node_modules/@types/command-line-args": {
|
||||
"version": "5.2.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/command-line-args/-/command-line-args-5.2.3.tgz",
|
||||
"integrity": "sha512-uv0aG6R0Y8WHZLTamZwtfsDLVRnOa+n+n5rEvFWL5Na5gZ8V2Teab/duDPFzIIIhs9qizDpcavCusCLJZu62Kw==",
|
||||
"peer": true
|
||||
"integrity": "sha512-uv0aG6R0Y8WHZLTamZwtfsDLVRnOa+n+n5rEvFWL5Na5gZ8V2Teab/duDPFzIIIhs9qizDpcavCusCLJZu62Kw=="
|
||||
},
|
||||
"node_modules/@types/command-line-usage": {
|
||||
"version": "5.0.2",
|
||||
@@ -1514,7 +1574,6 @@
|
||||
"version": "2.6.11",
|
||||
"resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.11.tgz",
|
||||
"integrity": "sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"@types/node": "*",
|
||||
"form-data": "^4.0.0"
|
||||
@@ -1783,7 +1842,6 @@
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
|
||||
"integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"event-target-shim": "^5.0.0"
|
||||
},
|
||||
@@ -1816,7 +1874,6 @@
|
||||
"version": "4.5.0",
|
||||
"resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz",
|
||||
"integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"humanize-ms": "^1.2.1"
|
||||
},
|
||||
@@ -1913,7 +1970,6 @@
|
||||
"version": "15.0.0",
|
||||
"resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-15.0.0.tgz",
|
||||
"integrity": "sha512-e6aunxNKM+woQf137ny3tp/xbLjFJS2oGQxQhYGqW6dGeIwNV1jOeEAeR6sS2jwAI2qLO83gYIP2MBz02Gw5Xw==",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@swc/helpers": "^0.5.2",
|
||||
"@types/command-line-args": "^5.2.1",
|
||||
@@ -2001,8 +2057,7 @@
|
||||
"node_modules/asynckit": {
|
||||
"version": "0.4.0",
|
||||
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
|
||||
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
|
||||
"optional": true
|
||||
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="
|
||||
},
|
||||
"node_modules/babel-jest": {
|
||||
"version": "29.7.0",
|
||||
@@ -2129,8 +2184,7 @@
|
||||
"node_modules/base-64": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/base-64/-/base-64-0.1.0.tgz",
|
||||
"integrity": "sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA==",
|
||||
"optional": true
|
||||
"integrity": "sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA=="
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
@@ -2296,7 +2350,6 @@
|
||||
"version": "0.0.2",
|
||||
"resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz",
|
||||
"integrity": "sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
@@ -2357,7 +2410,6 @@
|
||||
"version": "1.0.8",
|
||||
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
|
||||
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"delayed-stream": "~1.0.0"
|
||||
},
|
||||
@@ -2469,7 +2521,6 @@
|
||||
"version": "0.0.2",
|
||||
"resolved": "https://registry.npmjs.org/crypt/-/crypt-0.0.2.tgz",
|
||||
"integrity": "sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
@@ -2530,7 +2581,6 @@
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
|
||||
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
@@ -2557,7 +2607,6 @@
|
||||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/digest-fetch/-/digest-fetch-1.3.0.tgz",
|
||||
"integrity": "sha512-CGJuv6iKNM7QyZlM2T3sPAdZWd/p9zQiRNS9G+9COUCwzWFTs0Xp8NF5iePx7wtvhDykReiRRrSeNb4oMmB8lA==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"base-64": "^0.1.0",
|
||||
"md5": "^2.3.0"
|
||||
@@ -2862,7 +2911,6 @@
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
|
||||
"integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
@@ -3024,7 +3072,6 @@
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
|
||||
"integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"asynckit": "^0.4.0",
|
||||
"combined-stream": "^1.0.8",
|
||||
@@ -3037,14 +3084,12 @@
|
||||
"node_modules/form-data-encoder": {
|
||||
"version": "1.7.2",
|
||||
"resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz",
|
||||
"integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==",
|
||||
"optional": true
|
||||
"integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="
|
||||
},
|
||||
"node_modules/formdata-node": {
|
||||
"version": "4.4.1",
|
||||
"resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz",
|
||||
"integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"node-domexception": "1.0.0",
|
||||
"web-streams-polyfill": "4.0.0-beta.3"
|
||||
@@ -3057,7 +3102,6 @@
|
||||
"version": "4.0.0-beta.3",
|
||||
"resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz",
|
||||
"integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
@@ -3272,7 +3316,6 @@
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz",
|
||||
"integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"ms": "^2.0.0"
|
||||
}
|
||||
@@ -3355,6 +3398,15 @@
|
||||
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/interpret": {
|
||||
"version": "1.4.0",
|
||||
"resolved": "https://registry.npmjs.org/interpret/-/interpret-1.4.0.tgz",
|
||||
"integrity": "sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">= 0.10"
|
||||
}
|
||||
},
|
||||
"node_modules/is-arrayish": {
|
||||
"version": "0.2.1",
|
||||
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz",
|
||||
@@ -3364,8 +3416,7 @@
|
||||
"node_modules/is-buffer": {
|
||||
"version": "1.1.6",
|
||||
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
|
||||
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==",
|
||||
"optional": true
|
||||
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
|
||||
},
|
||||
"node_modules/is-builtin-module": {
|
||||
"version": "3.2.1",
|
||||
@@ -4458,7 +4509,6 @@
|
||||
"version": "2.3.0",
|
||||
"resolved": "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz",
|
||||
"integrity": "sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"charenc": "0.0.2",
|
||||
"crypt": "0.0.2",
|
||||
@@ -4497,7 +4547,6 @@
|
||||
"version": "1.52.0",
|
||||
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
|
||||
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
@@ -4506,7 +4555,6 @@
|
||||
"version": "2.1.35",
|
||||
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
|
||||
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"mime-db": "1.52.0"
|
||||
},
|
||||
@@ -4538,8 +4586,7 @@
|
||||
"node_modules/ms": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
||||
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
||||
"optional": true
|
||||
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
|
||||
},
|
||||
"node_modules/natural-compare": {
|
||||
"version": "1.4.0",
|
||||
@@ -4567,7 +4614,6 @@
|
||||
"url": "https://paypal.me/jimmywarting"
|
||||
}
|
||||
],
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">=10.5.0"
|
||||
}
|
||||
@@ -4576,7 +4622,6 @@
|
||||
"version": "2.7.0",
|
||||
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
|
||||
"integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"whatwg-url": "^5.0.0"
|
||||
},
|
||||
@@ -4623,10 +4668,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/openai": {
|
||||
"version": "4.28.4",
|
||||
"resolved": "https://registry.npmjs.org/openai/-/openai-4.28.4.tgz",
|
||||
"integrity": "sha512-RNIwx4MT/F0zyizGcwS+bXKLzJ8QE9IOyigDG/ttnwB220d58bYjYFp0qjvGwEFBO6+pvFVIDABZPGDl46RFsg==",
|
||||
"optional": true,
|
||||
"version": "4.29.2",
|
||||
"resolved": "https://registry.npmjs.org/openai/-/openai-4.29.2.tgz",
|
||||
"integrity": "sha512-cPkT6zjEcE4qU5OW/SoDDuXEsdOLrXlAORhzmaguj5xZSPlgKvLhi27sFWhLKj07Y6WKNWxcwIbzm512FzTBNQ==",
|
||||
"dependencies": {
|
||||
"@types/node": "^18.11.18",
|
||||
"@types/node-fetch": "^2.6.4",
|
||||
@@ -4643,10 +4687,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/openai/node_modules/@types/node": {
|
||||
"version": "18.19.20",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.20.tgz",
|
||||
"integrity": "sha512-SKXZvI375jkpvAj8o+5U2518XQv76mAsixqfXiVyWyXZbVWQK25RurFovYpVIxVzul0rZoH58V/3SkEnm7s3qA==",
|
||||
"optional": true,
|
||||
"version": "18.19.26",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.26.tgz",
|
||||
"integrity": "sha512-+wiMJsIwLOYCvUqSdKTrfkS8mpTp+MPINe6+Np4TAGFWWRWiBQ5kSq9nZGCSPkzx9mvT+uEukzpX4MOSCydcvw==",
|
||||
"dependencies": {
|
||||
"undici-types": "~5.26.4"
|
||||
}
|
||||
@@ -4996,6 +5039,18 @@
|
||||
"integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/rechoir": {
|
||||
"version": "0.6.2",
|
||||
"resolved": "https://registry.npmjs.org/rechoir/-/rechoir-0.6.2.tgz",
|
||||
"integrity": "sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"resolve": "^1.1.6"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.10"
|
||||
}
|
||||
},
|
||||
"node_modules/repeat-string": {
|
||||
"version": "1.6.1",
|
||||
"resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz",
|
||||
@@ -5145,6 +5200,23 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/shelljs": {
|
||||
"version": "0.8.5",
|
||||
"resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.5.tgz",
|
||||
"integrity": "sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"glob": "^7.0.0",
|
||||
"interpret": "^1.0.0",
|
||||
"rechoir": "^0.6.2"
|
||||
},
|
||||
"bin": {
|
||||
"shjs": "bin/shjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/shiki": {
|
||||
"version": "0.14.7",
|
||||
"resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz",
|
||||
@@ -5157,6 +5229,22 @@
|
||||
"vscode-textmate": "^8.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/shx": {
|
||||
"version": "0.3.4",
|
||||
"resolved": "https://registry.npmjs.org/shx/-/shx-0.3.4.tgz",
|
||||
"integrity": "sha512-N6A9MLVqjxZYcVn8hLmtneQWIJtp8IKzMP4eMnx+nqkvXoqinUPCbUFLp2UcWTEIUONhlk0ewxr/jaVGlc+J+g==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"minimist": "^1.2.3",
|
||||
"shelljs": "^0.8.5"
|
||||
},
|
||||
"bin": {
|
||||
"shx": "lib/cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/signal-exit": {
|
||||
"version": "3.0.7",
|
||||
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
|
||||
@@ -5432,8 +5520,7 @@
|
||||
"node_modules/tr46": {
|
||||
"version": "0.0.3",
|
||||
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
|
||||
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
|
||||
"optional": true
|
||||
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="
|
||||
},
|
||||
"node_modules/ts-api-utils": {
|
||||
"version": "1.0.3",
|
||||
@@ -5929,7 +6016,6 @@
|
||||
"version": "3.3.3",
|
||||
"resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz",
|
||||
"integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 8"
|
||||
}
|
||||
@@ -5937,14 +6023,12 @@
|
||||
"node_modules/webidl-conversions": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
|
||||
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
|
||||
"optional": true
|
||||
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="
|
||||
},
|
||||
"node_modules/whatwg-url": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
|
||||
"integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"tr46": "~0.0.3",
|
||||
"webidl-conversions": "^3.0.0"
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
{
|
||||
"name": "lancedb",
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.4.3",
|
||||
"main": "./dist/index.js",
|
||||
"types": "./dist/index.d.ts",
|
||||
"napi": {
|
||||
"name": "lancedb-nodejs",
|
||||
"name": "lancedb",
|
||||
"triples": {
|
||||
"defaults": false,
|
||||
"additional": [
|
||||
"aarch64-apple-darwin",
|
||||
"aarch64-unknown-linux-gnu",
|
||||
"x86_64-apple-darwin",
|
||||
"x86_64-unknown-linux-gnu"
|
||||
"x86_64-unknown-linux-gnu",
|
||||
"x86_64-pc-windows-msvc"
|
||||
]
|
||||
}
|
||||
},
|
||||
@@ -28,6 +29,7 @@
|
||||
"eslint-plugin-jsdoc": "^48.2.1",
|
||||
"jest": "^29.7.0",
|
||||
"prettier": "^3.1.0",
|
||||
"shx": "^0.3.4",
|
||||
"tmp": "^0.2.3",
|
||||
"ts-jest": "^29.1.2",
|
||||
"typedoc": "^0.25.7",
|
||||
@@ -48,13 +50,14 @@
|
||||
"os": [
|
||||
"darwin",
|
||||
"linux",
|
||||
"windows"
|
||||
"win32"
|
||||
],
|
||||
"scripts": {
|
||||
"artifacts": "napi artifacts",
|
||||
"build:native": "napi build --platform --release --js lancedb/native.js --dts lancedb/native.d.ts dist/",
|
||||
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js dist/",
|
||||
"build": "npm run build:debug && tsc -b",
|
||||
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js dist/",
|
||||
"build": "npm run build:debug && tsc -b && shx cp lancedb/native.d.ts dist/native.d.ts",
|
||||
"build-release": "npm run build:release && tsc -b && shx cp lancedb/native.d.ts dist/native.d.ts",
|
||||
"chkformat": "prettier . --check",
|
||||
"docs": "typedoc --plugin typedoc-plugin-markdown lancedb/index.ts",
|
||||
"lint": "eslint lancedb && eslint __test__",
|
||||
@@ -64,13 +67,14 @@
|
||||
"version": "napi version"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"lancedb-darwin-arm64": "0.4.3",
|
||||
"lancedb-darwin-x64": "0.4.3",
|
||||
"lancedb-linux-arm64-gnu": "0.4.3",
|
||||
"lancedb-linux-x64-gnu": "0.4.3",
|
||||
"openai": "^4.28.4"
|
||||
"@lancedb/lancedb-darwin-arm64": "0.4.3",
|
||||
"@lancedb/lancedb-darwin-x64": "0.4.3",
|
||||
"@lancedb/lancedb-linux-arm64-gnu": "0.4.3",
|
||||
"@lancedb/lancedb-linux-x64-gnu": "0.4.3",
|
||||
"@lancedb/lancedb-win32-x64-msvc": "0.4.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"dependencies": {
|
||||
"openai": "^4.29.2",
|
||||
"apache-arrow": "^15.0.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,34 +145,20 @@ async def connect_async(
|
||||
the last check, then the table will be checked for updates. Note: this
|
||||
consistency only applies to read operations. Write operations are
|
||||
always consistent.
|
||||
request_thread_pool: int or ThreadPoolExecutor, optional
|
||||
The thread pool to use for making batch requests to the LanceDB Cloud API.
|
||||
If an integer, then a ThreadPoolExecutor will be created with that
|
||||
number of threads. If None, then a ThreadPoolExecutor will be created
|
||||
with the default number of threads. If a ThreadPoolExecutor, then that
|
||||
executor will be used for making requests. This is for LanceDB Cloud
|
||||
only and is only used when making batch requests (i.e., passing in
|
||||
multiple queries to the search method at once).
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
For a local directory, provide a path for the database:
|
||||
|
||||
>>> import lancedb
|
||||
>>> db = lancedb.connect("~/.lancedb")
|
||||
|
||||
For object storage, use a URI prefix:
|
||||
|
||||
>>> db = lancedb.connect("s3://my-bucket/lancedb")
|
||||
|
||||
Connect to LancdDB cloud:
|
||||
|
||||
>>> db = lancedb.connect("db://my_database", api_key="ldb_...")
|
||||
>>> async def doctest_example():
|
||||
... # For a local directory, provide a path to the database
|
||||
... db = await lancedb.connect_async("~/.lancedb")
|
||||
... # For object storage, use a URI prefix
|
||||
... db = await lancedb.connect_async("s3://my-bucket/lancedb")
|
||||
|
||||
Returns
|
||||
-------
|
||||
conn : DBConnection
|
||||
conn : AsyncConnection
|
||||
A connection to a LanceDB database.
|
||||
"""
|
||||
if read_consistency_interval is not None:
|
||||
|
||||
@@ -25,13 +25,18 @@ from overrides import EnforceOverrides, override
|
||||
from pyarrow import fs
|
||||
|
||||
from lancedb.common import data_to_reader, validate_schema
|
||||
from lancedb.embeddings.registry import EmbeddingFunctionRegistry
|
||||
from lancedb.utils.events import register_event
|
||||
|
||||
from ._lancedb import connect as lancedb_connect
|
||||
from .pydantic import LanceModel
|
||||
from .table import AsyncTable, LanceTable, Table, _sanitize_data
|
||||
from .util import fs_from_uri, get_uri_location, get_uri_scheme, join_uri
|
||||
from .util import (
|
||||
fs_from_uri,
|
||||
get_uri_location,
|
||||
get_uri_scheme,
|
||||
join_uri,
|
||||
validate_table_name,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from datetime import timedelta
|
||||
@@ -387,6 +392,7 @@ class LanceDBConnection(DBConnection):
|
||||
"""
|
||||
if mode.lower() not in ["create", "overwrite"]:
|
||||
raise ValueError("mode must be either 'create' or 'overwrite'")
|
||||
validate_table_name(name)
|
||||
|
||||
tbl = LanceTable.create(
|
||||
self,
|
||||
@@ -444,16 +450,17 @@ class LanceDBConnection(DBConnection):
|
||||
class AsyncConnection(object):
|
||||
"""An active LanceDB connection
|
||||
|
||||
To obtain a connection you can use the [connect] function.
|
||||
To obtain a connection you can use the [connect_async][lancedb.connect_async]
|
||||
function.
|
||||
|
||||
This could be a native connection (using lance) or a remote connection (e.g. for
|
||||
connecting to LanceDB Cloud)
|
||||
|
||||
Local connections do not currently hold any open resources but they may do so in the
|
||||
future (for example, for shared cache or connections to catalog services). Remote
|
||||
connections represent an open connection to the remote server. The [close] method
|
||||
can be used to release any underlying resources eagerly. The connection can also
|
||||
be used as a context manager:
|
||||
connections represent an open connection to the remote server. The
|
||||
[close][lancedb.db.AsyncConnection.close] method can be used to release any
|
||||
underlying resources eagerly. The connection can also be used as a context manager.
|
||||
|
||||
Connections can be shared on multiple threads and are expected to be long lived.
|
||||
Connections can also be used as a context manager, however, in many cases a single
|
||||
@@ -464,10 +471,9 @@ class AsyncConnection(object):
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> import asyncio
|
||||
>>> import lancedb
|
||||
>>> async def my_connect():
|
||||
... with await lancedb.connect("/tmp/my_dataset") as conn:
|
||||
>>> async def doctest_example():
|
||||
... with await lancedb.connect_async("/tmp/my_dataset") as conn:
|
||||
... # do something with the connection
|
||||
... pass
|
||||
... # conn is closed here
|
||||
@@ -528,9 +534,8 @@ class AsyncConnection(object):
|
||||
exist_ok: Optional[bool] = None,
|
||||
on_bad_vectors: Optional[str] = None,
|
||||
fill_value: Optional[float] = None,
|
||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||
) -> AsyncTable:
|
||||
"""Create a [Table][lancedb.table.Table] in the database.
|
||||
"""Create an [AsyncTable][lancedb.table.AsyncTable] in the database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -569,7 +574,7 @@ class AsyncConnection(object):
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceTable
|
||||
AsyncTable
|
||||
A reference to the newly created table.
|
||||
|
||||
!!! note
|
||||
@@ -583,12 +588,14 @@ class AsyncConnection(object):
|
||||
Can create with list of tuples or dictionaries:
|
||||
|
||||
>>> import lancedb
|
||||
>>> db = lancedb.connect("./.lancedb")
|
||||
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
|
||||
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
|
||||
>>> db.create_table("my_table", data)
|
||||
LanceTable(connection=..., name="my_table")
|
||||
>>> db["my_table"].head()
|
||||
>>> async def doctest_example():
|
||||
... db = await lancedb.connect_async("./.lancedb")
|
||||
... data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
|
||||
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
|
||||
... my_table = await db.create_table("my_table", data)
|
||||
... print(await my_table.query().limit(5).to_arrow())
|
||||
>>> import asyncio
|
||||
>>> asyncio.run(doctest_example())
|
||||
pyarrow.Table
|
||||
vector: fixed_size_list<item: float>[2]
|
||||
child 0, item: float
|
||||
@@ -607,9 +614,11 @@ class AsyncConnection(object):
|
||||
... "lat": [45.5, 40.1],
|
||||
... "long": [-122.7, -74.1]
|
||||
... })
|
||||
>>> db.create_table("table2", data)
|
||||
LanceTable(connection=..., name="table2")
|
||||
>>> db["table2"].head()
|
||||
>>> async def pandas_example():
|
||||
... db = await lancedb.connect_async("./.lancedb")
|
||||
... my_table = await db.create_table("table2", data)
|
||||
... print(await my_table.query().limit(5).to_arrow())
|
||||
>>> asyncio.run(pandas_example())
|
||||
pyarrow.Table
|
||||
vector: fixed_size_list<item: float>[2]
|
||||
child 0, item: float
|
||||
@@ -629,9 +638,11 @@ class AsyncConnection(object):
|
||||
... pa.field("lat", pa.float32()),
|
||||
... pa.field("long", pa.float32())
|
||||
... ])
|
||||
>>> db.create_table("table3", data, schema = custom_schema)
|
||||
LanceTable(connection=..., name="table3")
|
||||
>>> db["table3"].head()
|
||||
>>> async def with_schema():
|
||||
... db = await lancedb.connect_async("./.lancedb")
|
||||
... my_table = await db.create_table("table3", data, schema = custom_schema)
|
||||
... print(await my_table.query().limit(5).to_arrow())
|
||||
>>> asyncio.run(with_schema())
|
||||
pyarrow.Table
|
||||
vector: fixed_size_list<item: float>[2]
|
||||
child 0, item: float
|
||||
@@ -663,9 +674,10 @@ class AsyncConnection(object):
|
||||
... pa.field("item", pa.utf8()),
|
||||
... pa.field("price", pa.float32()),
|
||||
... ])
|
||||
>>> db.create_table("table4", make_batches(), schema=schema)
|
||||
LanceTable(connection=..., name="table4")
|
||||
|
||||
>>> async def iterable_example():
|
||||
... db = await lancedb.connect_async("./.lancedb")
|
||||
... await db.create_table("table4", make_batches(), schema=schema)
|
||||
>>> asyncio.run(iterable_example())
|
||||
"""
|
||||
if inspect.isclass(schema) and issubclass(schema, LanceModel):
|
||||
# convert LanceModel to pyarrow schema
|
||||
@@ -674,12 +686,6 @@ class AsyncConnection(object):
|
||||
schema = schema.to_arrow_schema()
|
||||
|
||||
metadata = None
|
||||
if embedding_functions is not None:
|
||||
# If we passed in embedding functions explicitly
|
||||
# then we'll override any schema metadata that
|
||||
# may was implicitly specified by the LanceModel schema
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
metadata = registry.get_table_metadata(embedding_functions)
|
||||
|
||||
# Defining defaults here and not in function prototype. In the future
|
||||
# these defaults will move into rust so better to keep them as None.
|
||||
@@ -760,11 +766,11 @@ class AsyncConnection(object):
|
||||
name: str
|
||||
The name of the table.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
await self._inner.drop_table(name)
|
||||
|
||||
async def drop_database(self):
|
||||
"""
|
||||
Drop database
|
||||
This is the same thing as dropping all the tables
|
||||
"""
|
||||
raise NotImplementedError
|
||||
await self._inner.drop_db()
|
||||
|
||||
@@ -10,13 +10,18 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
from tqdm import tqdm
|
||||
|
||||
import lancedb
|
||||
|
||||
from .fine_tuner import QADataset
|
||||
from .utils import TEXT, retry_with_exponential_backoff
|
||||
|
||||
|
||||
@@ -126,6 +131,22 @@ class EmbeddingFunction(BaseModel, ABC):
|
||||
def __hash__(self) -> int:
|
||||
return hash(frozenset(vars(self).items()))
|
||||
|
||||
def finetune(self, dataset: QADataset, *args, **kwargs):
|
||||
"""
|
||||
Finetune the embedding function on a dataset
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Finetuning is not supported for this embedding function"
|
||||
)
|
||||
|
||||
def evaluate(self, dataset: QADataset, top_k=5, path=None, *args, **kwargs):
|
||||
"""
|
||||
Evaluate the embedding function on a dataset
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Evaluation is not supported for this embedding function"
|
||||
)
|
||||
|
||||
|
||||
class EmbeddingFunctionConfig(BaseModel):
|
||||
"""
|
||||
@@ -159,3 +180,52 @@ class TextEmbeddingFunction(EmbeddingFunction):
|
||||
Generate the embeddings for the given texts
|
||||
"""
|
||||
pass
|
||||
|
||||
def evaluate(self, dataset: QADataset, top_k=5, path=None, *args, **kwargs):
|
||||
"""
|
||||
Evaluate the embedding function on a dataset. This calculates the hit-rate for
|
||||
the top-k retrieved documents for each query in the dataset. Assumes that the
|
||||
first relevant document is the expected document.
|
||||
Pro - Should work for any embedding model
|
||||
Con - Returns a very simple metric.
|
||||
Parameters
|
||||
----------
|
||||
dataset: QADataset
|
||||
The dataset to evaluate on
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
The evaluation results
|
||||
"""
|
||||
corpus = dataset.corpus
|
||||
queries = dataset.queries
|
||||
relevant_docs = dataset.relevant_docs
|
||||
path = path or os.path.join(os.getcwd(), "eval")
|
||||
db = lancedb.connect(path)
|
||||
|
||||
class Schema(lancedb.pydantic.LanceModel):
|
||||
id: str
|
||||
text: str = self.SourceField()
|
||||
vector: lancedb.pydantic.Vector(self.ndims()) = self.VectorField()
|
||||
|
||||
retriever = db.create_table("eval", schema=Schema, mode="overwrite")
|
||||
pylist = [{"id": str(k), "text": v} for k, v in corpus.items()]
|
||||
retriever.add(pylist)
|
||||
|
||||
eval_results = []
|
||||
for query_id, query in tqdm(queries.items()):
|
||||
retrieved_nodes = retriever.search(query).limit(top_k).to_list()
|
||||
retrieved_ids = [node["id"] for node in retrieved_nodes]
|
||||
expected_id = relevant_docs[query_id][0]
|
||||
is_hit = expected_id in retrieved_ids # assume 1 relevant doc
|
||||
|
||||
eval_result = {
|
||||
"is_hit": is_hit,
|
||||
"retrieved": retrieved_ids,
|
||||
"expected": expected_id,
|
||||
"query": query_id,
|
||||
}
|
||||
eval_results.append(eval_result)
|
||||
|
||||
return eval_results
|
||||
|
||||
133
python/python/lancedb/embeddings/fine_tuner/README.md
Normal file
133
python/python/lancedb/embeddings/fine_tuner/README.md
Normal file
@@ -0,0 +1,133 @@
|
||||
The fine-tuning workflow for embeddings consists of the following parts:
|
||||
|
||||
### QADataset
|
||||
This class is used for managing the data for fine-tuning. It contains the following builder methods:
|
||||
```
|
||||
- from_llm(
|
||||
nodes: 'List[TextChunk]' ,
|
||||
llm: BaseLLM,
|
||||
qa_generate_prompt_tmpl: str = DEFAULT_PROMPT_TMPL,
|
||||
num_questions_per_chunk: int = 2,
|
||||
) -> "QADataset"
|
||||
```
|
||||
Create synthetic data from a language model and text chunks of the original document on which the model is to be fine-tuned.
|
||||
|
||||
```python
|
||||
|
||||
from_responses(docs: List['TextChunk'], queries: Dict[str, str], relevant_docs: Dict[str, List[str]])-> "QADataset"
|
||||
```
|
||||
Create dataset from queries and responses based on a real-world scenario. Designed to be used for knowledge distillation from a larger LLM to a smaller one.
|
||||
|
||||
It also contains the following data attributes:
|
||||
```
|
||||
queries (Dict[str, str]): Dict id -> query.
|
||||
corpus (Dict[str, str]): Dict id -> string.
|
||||
relevant_docs (Dict[str, List[str]]): Dict query id -> list of doc ids.
|
||||
```
|
||||
|
||||
### TextChunk
|
||||
This class is used for managing the text chunks used for fine-tuning. It is designed to standardize, and allow working with, the output of various text splitting/pre-processing tools like llama-index and langchain. It contains the following attributes:
|
||||
```
|
||||
text: str
|
||||
id: str
|
||||
metadata: Dict[str, Any] = {}
|
||||
```
|
||||
|
||||
Builder Methods:
|
||||
|
||||
```python
|
||||
from_llama_index_node(node) -> "TextChunk"
|
||||
```
|
||||
Create a text chunk from a llama index node.
|
||||
|
||||
```python
|
||||
from_langchain_node(node) -> "TextChunk"
|
||||
```
|
||||
Create a text chunk from a langchain node.
|
||||
|
||||
```python
|
||||
from_chunk(cls, chunk: str, metadata: dict = {}) -> "TextChunk"
|
||||
```
|
||||
Create a text chunk from a string.
|
||||
|
||||
### FineTuner
|
||||
This class is used for fine-tuning embeddings. It is exposed to the user via a high-level function in the base embedding api.
|
||||
```python
|
||||
class BaseEmbeddingTuner(ABC):
|
||||
"""Base Embedding finetuning engine."""
|
||||
|
||||
@abstractmethod
|
||||
def finetune(self) -> None:
|
||||
"""Goes off and does stuff."""
|
||||
|
||||
def helper(self) -> None:
|
||||
"""A helper method."""
|
||||
pass
|
||||
```
|
||||
|
||||
### Embedding API finetuning implementation
|
||||
Each embedding API needs to implement the `finetune` method in order to support fine-tuning. A vanilla evaluation technique that calculates the hit-rate @ `top_k` is implemented in the `TextEmbeddingFunction` base class.
|
||||
|
||||
### Fine-tuning workflow
|
||||
The fine-tuning workflow is as follows:
|
||||
1. Create a `QADataset` object.
|
||||
2. Initialize any embedding function using the LanceDB embedding API.
|
||||
3. Call the `finetune` method on the embedding object with the `QADataset` object as an argument.
|
||||
4. Evaluate the fine-tuned model using the `evaluate` method in the embedding API.
|
||||
|
||||
# End-to-End Examples
|
||||
The following is an example of how to fine-tune an embedding model using the LanceDB embedding API.
|
||||
|
||||
## Example 1: Fine-tuning from a synthetic dataset
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
from lancedb.embeddings.fine_tuner.llm import Openai
|
||||
from lancedb.embeddings.fine_tuner.dataset import QADataset, TextChunk
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.core.schema import MetadataMode
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
# 1. Create a QADataset object
|
||||
url = "uber10k.pdf"
|
||||
reader = SimpleDirectoryReader(input_files=[url])
|
||||
docs = reader.load_data()
|
||||
|
||||
parser = SentenceSplitter()
|
||||
nodes = parser.get_nodes_from_documents(docs)
|
||||
|
||||
if os.path.exists(name):
|
||||
ds = QADataset.load(name)
|
||||
else:
|
||||
llm = Openai()
|
||||
|
||||
# convert Llama-index TextNode to TextChunk
|
||||
chunks = [TextChunk.from_llama_index_node(node) for node in nodes]
|
||||
|
||||
ds = QADataset.from_llm(chunks, llm)
|
||||
ds.save(name)
|
||||
|
||||
# 2. Initialize the embedding model
|
||||
model = get_registry().get("sentence-transformers").create()
|
||||
|
||||
# 3. Fine-tune the model
|
||||
model.finetune(trainset=ds, path="model_finetuned", epochs=4)
|
||||
|
||||
# 4. Evaluate the fine-tuned model
|
||||
base = get_registry().get("sentence-transformers").create()
|
||||
tuned = get_registry().get("sentence-transformers").create(name="./model_finetuned_1")
|
||||
openai = get_registry().get("openai").create(name="text-embedding-3-large")
|
||||
|
||||
|
||||
rs1 = base.evaluate(ds, path="val_res")
|
||||
rs2 = tuned.evaluate(ds, path="val_res")
|
||||
rs3 = openai.evaluate(ds)
|
||||
|
||||
print("openai-embedding-v3 hit-rate - ", pd.DataFrame(rs3)["is_hit"].mean())
|
||||
print("fine-tuned hit-rate - ", pd.DataFrame(rs2)["is_hit"].mean())
|
||||
print("Base model hit-rate - ", pd.DataFrame(rs1)["is_hit"].mean())
|
||||
```
|
||||
|
||||
|
||||
4
python/python/lancedb/embeddings/fine_tuner/__init__.py
Normal file
4
python/python/lancedb/embeddings/fine_tuner/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .dataset import QADataset, TextChunk
|
||||
from .llm import Gemini, Openai
|
||||
|
||||
__all__ = ["QADataset", "TextChunk", "Openai", "Gemini"]
|
||||
13
python/python/lancedb/embeddings/fine_tuner/basetuner.py
Normal file
13
python/python/lancedb/embeddings/fine_tuner/basetuner.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseEmbeddingTuner(ABC):
    """Abstract interface shared by embedding fine-tuning engines.

    Subclasses must implement :meth:`finetune`. :meth:`helper` is an optional
    hook whose base implementation does nothing.
    """

    @abstractmethod
    def finetune(self) -> None:
        """Run the fine-tuning job; concrete tuners must override this."""
        ...

    def helper(self) -> None:
        """Optional hook; the base implementation is a no-op."""
        return None
|
||||
205
python/python/lancedb/embeddings/fine_tuner/dataset.py
Normal file
205
python/python/lancedb/embeddings/fine_tuner/dataset.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import re
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple, Optional
|
||||
|
||||
import lance
|
||||
import pyarrow as pa
|
||||
from pydantic import BaseModel
|
||||
from tqdm import tqdm
|
||||
from lancedb.utils.general import LOGGER
|
||||
from .llm import BaseLLM
|
||||
|
||||
# Prompt sent to the LLM for every text chunk when generating questions.
# ``{context_str}`` is replaced by the chunk text and
# ``{num_questions_per_chunk}`` by the number of questions requested.
# The trailing backslashes join the last paragraph into a single line.
DEFAULT_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and no prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.
"""
|
||||
|
||||
|
||||
class QADataset(BaseModel):
    """Embedding QA Finetuning Dataset.

    Args:
        path (Optional[str]): Directory the dataset was last saved to or
            loaded from. ``None`` until ``save()`` or ``load()`` has been used.
        queries (Dict[str, str]): Dict id -> query.
        corpus (Dict[str, str]): Dict id -> string.
        relevant_docs (Dict[str, List[str]]): Dict query id -> list of doc ids.
        mode (str): Defaults to ``"text"``; not read anywhere in this class.

    """

    path: Optional[str] = None
    queries: Dict[str, str]  # id -> query
    corpus: Dict[str, str]  # id -> text
    relevant_docs: Dict[str, List[str]]  # query id -> list of retrieved doc ids
    mode: str = "text"

    @property
    def query_docid_pairs(self) -> List[Tuple[str, List[str]]]:
        """Return ``(query text, relevant doc ids)`` for every query."""
        return [
            (query, self.relevant_docs[query_id])
            for query_id, query in self.queries.items()
        ]

    def save(self, path: str, mode: str = "overwrite") -> None:
        """Write the dataset to ``path`` as three lance datasets.

        ``queries.lance``, ``corpus.lance`` and ``relevant_docs.lance`` are
        created inside ``path`` (the directory is created if needed) and
        ``self.path`` is updated to ``path``.

        Args:
            path: Target directory.
            mode: Write mode forwarded to ``lance.write_dataset``.
        """
        self.path = path
        save_dir = Path(path)
        save_dir.mkdir(parents=True, exist_ok=True)

        # convert to pydict {"id": []}
        queries = {
            "id": list(self.queries.keys()),
            "query": list(self.queries.values()),
        }
        corpus = {
            "id": list(self.corpus.keys()),
            "text": [
                val or " " for val in self.corpus.values()
            ],  # lance saves empty strings as null
        }
        relevant_docs = {
            "query_id": list(self.relevant_docs.keys()),
            "doc_id": list(self.relevant_docs.values()),
        }

        # write to lance
        lance.write_dataset(
            pa.Table.from_pydict(queries), save_dir / "queries.lance", mode=mode
        )
        lance.write_dataset(
            pa.Table.from_pydict(corpus), save_dir / "corpus.lance", mode=mode
        )
        lance.write_dataset(
            pa.Table.from_pydict(relevant_docs),
            save_dir / "relevant_docs.lance",
            mode=mode,
        )

    @classmethod
    def load(cls, path: str, version: Optional[int] = None) -> "QADataset":
        """Load a dataset previously written by ``save()``.

        Args:
            path: Directory containing the three ``.lance`` datasets.
            version: Version forwarded to ``lance.dataset`` for all three
                datasets; ``None`` is passed through unchanged.

        Returns:
            A new dataset whose ``path`` is set to ``str(path)``.
        """
        load_dir = Path(path)
        queries = (
            lance.dataset(load_dir / "queries.lance", version=version)
            .to_table()
            .to_pydict()
        )
        corpus = (
            lance.dataset(load_dir / "corpus.lance", version=version)
            .to_table()
            .to_pydict()
        )
        relevant_docs = (
            lance.dataset(load_dir / "relevant_docs.lance", version=version)
            .to_table()
            .to_pydict()
        )
        return cls(
            path=str(path),
            queries=dict(zip(queries["id"], queries["query"])),
            corpus=dict(zip(corpus["id"], corpus["text"])),
            relevant_docs=dict(zip(relevant_docs["query_id"], relevant_docs["doc_id"])),
        )

    def switch_version(self, version: int) -> "QADataset":
        """Return this dataset reloaded at ``version``.

        This is an instance method because it needs the ``path`` of a saved or
        loaded dataset; the class itself has no path to read.

        Raises:
            ValueError: If the dataset has never been saved or loaded.
        """
        if not self.path:
            raise ValueError("Path not set. You need to call save() first.")
        return type(self).load(self.path, version=version)

    # generate queries as a convenience function
    @classmethod
    def from_llm(
        cls,
        nodes: "List[TextChunk]",
        llm: BaseLLM,
        qa_generate_prompt_tmpl: str = DEFAULT_PROMPT_TMPL,
        num_questions_per_chunk: int = 2,
    ) -> "QADataset":
        """Generate a dataset by asking ``llm`` for questions about each chunk.

        Every non-empty line of the LLM response becomes one question (a
        leading ``1.`` / ``1)`` style number is stripped) that is linked to the
        chunk it was generated from.

        Args:
            nodes: Text chunks to generate questions for.
            llm: Object whose ``chat_completion(prompt)`` returns the response.
            qa_generate_prompt_tmpl: Template formatted with ``context_str``
                and ``num_questions_per_chunk``.
            num_questions_per_chunk: Number of questions requested per chunk.
        """
        node_dict = {node.id: node.text for node in nodes}

        queries = {}
        relevant_docs = {}
        for node_id, text in tqdm(node_dict.items()):
            query = qa_generate_prompt_tmpl.format(
                context_str=text, num_questions_per_chunk=num_questions_per_chunk
            )
            response = llm.chat_completion(query)

            result = str(response).strip().split("\n")
            questions = [
                re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
            ]
            questions = [question for question in questions if len(question) > 0]
            for question in questions:
                question_id = str(uuid.uuid4())
                queries[question_id] = question
                relevant_docs[question_id] = [node_id]

        # use cls so subclasses get an instance of their own type
        return cls(queries=queries, corpus=node_dict, relevant_docs=relevant_docs)

    @classmethod
    def from_responses(
        cls,
        docs: List["TextChunk"],
        queries: Dict[str, str],
        relevant_docs: Dict[str, List[str]],
    ) -> "QADataset":
        """Create a QADataset from TextChunks plus ready-made queries.

        Args:
            docs: Chunks that form the corpus (id -> text).
            queries: Dict query id -> query text.
            relevant_docs: Dict query id -> list of relevant doc ids.
        """
        node_dict = {node.id: node.text for node in docs}
        return cls(queries=queries, corpus=node_dict, relevant_docs=relevant_docs)

    def versions(self) -> List[int]:
        """Get the versions lance reports for the saved queries dataset.

        Raises:
            ValueError: If the dataset has never been saved or loaded.
        """
        # TODO: tidy this up
        data_paths = self._get_data_file_paths()
        return lance.dataset(data_paths[0]).versions()

    def _get_data_file_paths(self) -> Tuple[Path, Path, Path]:
        """Return the queries, corpus and relevant_docs dataset paths.

        Raises:
            ValueError: If ``path`` is not set.
        """
        if not self.path:
            raise ValueError("Path not set. You need to call save() first.")
        # self.path is stored as a str; wrap it so the ``/`` joins work
        root = Path(self.path)
        queries = root / "queries.lance"
        corpus = root / "corpus.lance"
        relevant_docs = root / "relevant_docs.lance"

        return queries, corpus, relevant_docs
|
||||
|
||||
|
||||
|
||||
|
||||
class TextChunk(BaseModel):
    """Simple text chunk for generating questions."""

    text: str
    id: str
    metadata: Dict[str, Any] = {}

    @classmethod
    def from_chunk(cls, chunk: str, metadata: Optional[dict] = None) -> "TextChunk":
        """Create a TextChunk from raw text, assigning it a fresh UUID4 id.

        Args:
            chunk: The text of the chunk.
            metadata: Optional metadata; an empty dict is used when omitted.
        """
        # a None default avoids sharing one dict object between calls
        return cls(
            text=chunk,
            id=str(uuid.uuid4()),
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_llama_index_node(cls, node):
        """Convert a llama index node to a text chunk.

        Reads ``node.text``, ``node.node_id`` and ``node.metadata``.
        """
        return cls(text=node.text, id=node.node_id, metadata=node.metadata)

    @classmethod
    def from_langchain_node(cls, node):
        """Convert a langchain node to a text chunk (not implemented yet).

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Not implemented yet.")

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary."""
        return self.dict()

    def __str__(self) -> str:
        """Return the chunk text."""
        return self.text

    def __repr__(self) -> str:
        """Return a single-line debug representation."""
        # built on one line: a backslash continuation inside the literal
        # would embed the source indentation in the output
        return (
            f"TextChunk(text={self.text}, id={self.id}, "
            f"metadata={self.metadata})"
        )
|
||||
85
python/python/lancedb/embeddings/fine_tuner/llm.py
Normal file
85
python/python/lancedb/embeddings/fine_tuner/llm.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import os
|
||||
import re
|
||||
from functools import cached_property
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ...util import attempt_import_or_raise
|
||||
from ..utils import api_key_not_found_help
|
||||
|
||||
|
||||
class BaseLLM(BaseModel):
    """
    TODO:
    Base class for Language Model based Embedding Functions. This class is
    loosely designed for now, and will be updated as the usage gets clearer.
    """

    model_name: str
    model_kwargs: dict = {}

    @cached_property
    def _client(self):
        """
        Get the client for the language model

        Subclasses must override this; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def chat_completion(self, prompt: str, **kwargs):
        """
        Get the chat completion for the given prompt

        Subclasses must override this; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError
|
||||
|
||||
|
||||
class Openai(BaseLLM):
    """LLM backed by the OpenAI chat completions API."""

    model_name: str = "gpt-3.5-turbo"
    # extra keyword arguments forwarded to every chat completion request
    kwargs: dict = {}
    # explicit API key; when unset the OPENAI_API_KEY env variable is required
    api_key: Optional[str] = None

    @cached_property
    def _client(self):
        """
        Get the client for the language model

        Uses ``api_key`` when it is set. Otherwise the client is created
        without arguments after checking that ``OPENAI_API_KEY`` is present;
        ``api_key_not_found_help`` is called when it is missing (presumably
        it raises; confirm against its definition).
        """
        openai = attempt_import_or_raise("openai")

        if self.api_key:
            return openai.OpenAI(api_key=self.api_key)
        if not os.environ.get("OPENAI_API_KEY"):
            api_key_not_found_help("openai")
        return openai.OpenAI()

    def chat_completion(self, prompt: str, **kwargs) -> str:
        """
        Get the chat completion for the given prompt

        Parameters
        ----------
        prompt: str
            The user message to send.
        **kwargs
            Per-call request options; these override entries in ``self.kwargs``.

        Returns
        -------
        str
            The message content of the first returned choice.
        """

        # TODO: this is legacy openai api replace with completions
        completion = self._client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            **{**self.kwargs, **kwargs},
        )

        text = completion.choices[0].message.content

        return text

    def get_questions(self, prompt: str) -> list:
        """
        Get the questions contained in the chat completion for the prompt

        The response is split into lines; a leading ``1.`` / ``1)`` style
        number is stripped from each line and empty lines are dropped.

        Returns
        -------
        list
            The question strings, in response order.
        """
        response = self.chat_completion(prompt)
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]
        return questions
|
||||
|
||||
|
||||
class Gemini(BaseLLM):
    """Placeholder for a Gemini-backed LLM; adds nothing to ``BaseLLM`` yet."""
|
||||
@@ -103,9 +103,9 @@ class InstructorEmbeddingFunction(TextEmbeddingFunction):
|
||||
# convert_to_numpy: bool = True # Hardcoding this as numpy can be ingested directly
|
||||
|
||||
source_instruction: str = "represent the document for retrieval"
|
||||
query_instruction: str = (
|
||||
"represent the document for retrieving the most similar documents"
|
||||
)
|
||||
query_instruction: (
|
||||
str
|
||||
) = "represent the document for retrieving the most similar documents"
|
||||
|
||||
@weak_lru(maxsize=1)
|
||||
def ndims(self):
|
||||
|
||||
@@ -10,12 +10,16 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import List, Union
|
||||
from typing import Any, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from lancedb.embeddings.fine_tuner import QADataset
|
||||
from lancedb.utils.general import LOGGER
|
||||
|
||||
from ..util import attempt_import_or_raise
|
||||
from .base import TextEmbeddingFunction
|
||||
from .fine_tuner.basetuner import BaseEmbeddingTuner
|
||||
from .registry import register
|
||||
from .utils import weak_lru
|
||||
|
||||
@@ -80,3 +84,151 @@ class SentenceTransformerEmbeddings(TextEmbeddingFunction):
|
||||
"sentence_transformers", "sentence-transformers"
|
||||
)
|
||||
return sentence_transformers.SentenceTransformer(self.name, device=self.device)
|
||||
|
||||
def finetune(self, trainset: QADataset, *args, **kwargs):
    """
    Finetune the Sentence Transformers model

    Parameters
    ----------
    trainset: QADataset
        The dataset to use for finetuning
    *args, **kwargs
        Forwarded to ``SentenceTransformersTuner`` after ``model`` and
        ``trainset`` (for example ``path`` or ``epochs``).
    """
    # positional extras are forwarded too; previously they were dropped
    tuner = SentenceTransformersTuner(
        self.embedding_model,
        trainset,
        *args,
        **kwargs,
    )
    tuner.finetune()
|
||||
|
||||
|
||||
class SentenceTransformersTuner(BaseEmbeddingTuner):
    """Sentence Transformers Embedding Finetuning Engine."""

    def __init__(
        self,
        model: Any,
        trainset: QADataset,
        valset: Optional[QADataset] = None,
        path: Optional[str] = "~/.lancedb/embeddings/models",
        batch_size: int = 8,
        epochs: int = 1,
        show_progress: bool = True,
        eval_steps: int = 50,
        max_input_per_doc: int = -1,
        loss: Optional[Any] = None,
        evaluator: Optional[Any] = None,
        run_name: Optional[str] = None,
        log_wandb: bool = False,
    ) -> None:
        """
        Parameters
        ----------
        model: Any
            The model object to finetune; its ``fit`` method is called.
        trainset: QADataset
            The training dataset.
        valset: Optional[QADataset]
            The validation dataset.
        path: Optional[str]
            The path to save the model. A leading ``~`` is expanded to the
            user's home directory.
        batch_size: int, default=8
            The batch size.
        epochs: int, default=1
            The number of epochs.
        show_progress: bool, default=True
            Whether to show progress.
        eval_steps: int, default=50
            The number of steps to evaluate.
        max_input_per_doc: int, default=-1
            The maximum number of relevant documents used per query.
            if -1, use all documents.
        loss: Optional[Any]
            The loss to train with. Defaults to
            ``MultipleNegativesRankingLoss(model)``.
        evaluator: Optional[Any]
            With ``valset``: a callable invoked as
            ``evaluator(queries, corpus, relevant_docs)`` to build the
            evaluator (defaults to ``InformationRetrievalEvaluator``).
            Without ``valset``: passed to ``model.fit`` unchanged.
        run_name: Optional[str]
            The run name used when logging to wandb.
        log_wandb: bool, default=False
            Whether to log scores to wandb during training.

        Raises
        ------
        ValueError
            If ``max_input_per_doc`` is 0 or smaller than -1.
        """
        import os

        from sentence_transformers import InputExample, losses
        from sentence_transformers.evaluation import InformationRetrievalEvaluator
        from torch.utils.data import DataLoader

        # 0 would select no documents at all, so reject it with the negatives
        if max_input_per_doc == 0 or max_input_per_doc < -1:
            raise ValueError("max_input_per_doc must be -1 or greater than 0.")

        self.model = model
        self.trainset = trainset
        self.valset = valset
        # "~" is not expanded when the model is written, so resolve it here
        self.path = os.path.expanduser(path) if path is not None else None
        self.batch_size = batch_size
        self.epochs = epochs
        self.show_progress = show_progress
        self.eval_steps = eval_steps
        self.max_input_per_doc = max_input_per_doc
        self.run_name = run_name
        self.log_wandb = log_wandb

        examples: Any = []
        for query_id, query in self.trainset.queries.items():
            doc_ids = self.trainset.relevant_docs[query_id]
            if max_input_per_doc != -1:
                # slicing never raises, unlike indexing at min(max, len)
                doc_ids = doc_ids[:max_input_per_doc]
            for node_id in doc_ids:
                text = self.trainset.corpus[node_id]
                examples.append(InputExample(texts=[query, text]))

        self.examples = examples

        self.loader: DataLoader = DataLoader(examples, batch_size=batch_size)

        if self.valset is not None:
            eval_engine = evaluator or InformationRetrievalEvaluator
            self.evaluator = eval_engine(
                valset.queries, valset.corpus, valset.relevant_docs
            )
        else:
            # no validation data: hand a user-supplied evaluator to fit() as-is
            self.evaluator = evaluator

        # define loss
        self.loss = loss or losses.MultipleNegativesRankingLoss(self.model)
        # warm up over 10% of the total number of training steps
        self.warmup_steps = int(len(self.loader) * epochs * 0.1)

    def finetune(self) -> None:
        """Finetune the Sentence Transformers model and log usage hints."""
        self.model.fit(
            train_objectives=[(self.loader, self.loss)],
            epochs=self.epochs,
            warmup_steps=self.warmup_steps,
            output_path=self.path,
            show_progress_bar=self.show_progress,
            evaluator=self.evaluator,
            evaluation_steps=self.eval_steps,
            callback=self._wandb_callback if self.log_wandb else None,
        )

        self.helper()

    def helper(self) -> None:
        """Log where the model was saved and how to load it."""
        LOGGER.info("Finetuning complete.")
        LOGGER.info(f"Model saved to {self.path}.")
        LOGGER.info("You can now use the model as follows:")
        LOGGER.info(
            f"model = get_registry().get('sentence-transformers').create(name='{self.path}')"  # noqa
        )

    def _wandb_callback(self, score, epoch, steps):
        """Log one evaluation result to wandb, starting a run if needed.

        Raises
        ------
        ImportError
            If wandb is not installed.
        """
        try:
            import wandb
        except ImportError as err:
            raise ImportError(
                "wandb is not installed. Please install it using `pip install wandb`"
            ) from err
        run = wandb.run or wandb.init(
            project="sbert_lancedb_finetune", name=self.run_name
        )
        run.log({"epoch": epoch, "steps": steps, "score": score})
|
||||
|
||||
@@ -1033,7 +1033,7 @@ class AsyncQueryBase(object):
|
||||
Construct an AsyncQueryBase
|
||||
|
||||
This method is not intended to be called directly. Instead, use the
|
||||
[Table.query][] method to create a query.
|
||||
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
|
||||
"""
|
||||
self._inner = inner
|
||||
|
||||
@@ -1041,7 +1041,10 @@ class AsyncQueryBase(object):
|
||||
"""
|
||||
Only return rows matching the given predicate
|
||||
|
||||
The predicate should be supplied as an SQL query string. For example:
|
||||
The predicate should be supplied as an SQL query string.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> predicate = "x > 10"
|
||||
>>> predicate = "y > 0 AND y < 100"
|
||||
@@ -1112,7 +1115,8 @@ class AsyncQueryBase(object):
|
||||
Execute the query and collect the results into an Apache Arrow Table.
|
||||
|
||||
This method will collect all results into memory before returning. If
|
||||
you expect a large number of results, you may want to use [to_batches][]
|
||||
you expect a large number of results, you may want to use
|
||||
[to_batches][lancedb.query.AsyncQueryBase.to_batches]
|
||||
"""
|
||||
batch_iter = await self.to_batches()
|
||||
return pa.Table.from_batches(
|
||||
@@ -1123,12 +1127,13 @@ class AsyncQueryBase(object):
|
||||
"""
|
||||
Execute the query and collect the results into a pandas DataFrame.
|
||||
|
||||
This method will collect all results into memory before returning. If
|
||||
you expect a large number of results, you may want to use [to_batches][]
|
||||
and convert each batch to pandas separately.
|
||||
This method will collect all results into memory before returning. If you
|
||||
expect a large number of results, you may want to use
|
||||
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
|
||||
pandas separately.
|
||||
|
||||
Example
|
||||
-------
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> import asyncio
|
||||
>>> from lancedb import connect_async
|
||||
@@ -1148,7 +1153,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
Construct an AsyncQuery
|
||||
|
||||
This method is not intended to be called directly. Instead, use the
|
||||
[Table.query][] method to create a query.
|
||||
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
|
||||
"""
|
||||
super().__init__(inner)
|
||||
self._inner = inner
|
||||
@@ -1189,8 +1194,8 @@ class AsyncQuery(AsyncQueryBase):
|
||||
If there is only one vector column (a column whose data type is a
|
||||
fixed size list of floats) then the column does not need to be specified.
|
||||
If there is more than one vector column you must use
|
||||
[AsyncVectorQuery::column][] to specify which column you would like to
|
||||
compare with.
|
||||
[AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
|
||||
which column you would like to compare with.
|
||||
|
||||
If no index has been created on the vector column then a vector query
|
||||
will perform a distance comparison between the query vector and every
|
||||
@@ -1221,8 +1226,10 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
Construct an AsyncVectorQuery
|
||||
|
||||
This method is not intended to be called directly. Instead, create
|
||||
a query first with [Table.query][] and then use [AsyncQuery.nearest_to][]
|
||||
to convert to a vector query.
|
||||
a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
|
||||
use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to] to convert to
|
||||
a vector query. Or you can use
|
||||
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
|
||||
"""
|
||||
super().__init__(inner)
|
||||
self._inner = inner
|
||||
@@ -1232,7 +1239,7 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
Set the vector column to query
|
||||
|
||||
This controls which column is compared to the query vector supplied in
|
||||
the call to [Query.nearest_to][].
|
||||
the call to [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to].
|
||||
|
||||
This parameter must be specified if the table has more than one column
|
||||
whose data type is a fixed-size-list of floats.
|
||||
|
||||
@@ -26,6 +26,7 @@ from ..db import DBConnection
|
||||
from ..embeddings import EmbeddingFunctionConfig
|
||||
from ..pydantic import LanceModel
|
||||
from ..table import Table, _sanitize_data
|
||||
from ..util import validate_table_name
|
||||
from .arrow import to_ipc_binary
|
||||
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
|
||||
from .errors import LanceDBClientError
|
||||
@@ -223,6 +224,7 @@ class RemoteDBConnection(DBConnection):
|
||||
LanceTable(table4)
|
||||
|
||||
"""
|
||||
validate_table_name(name)
|
||||
if data is None and schema is None:
|
||||
raise ValueError("Either data or schema must be provided.")
|
||||
if embedding_functions is not None:
|
||||
|
||||
@@ -14,7 +14,7 @@ class CrossEncoderReranker(Reranker):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model : str, default "cross-encoder/ms-marco-TinyBERT-L-6"
|
||||
model_name : str, default "cross-encoder/ms-marco-TinyBERT-L-6"
|
||||
The name of the cross encoder model to use. See the sentence transformers
|
||||
documentation for a list of available models.
|
||||
column : str, default "text"
|
||||
|
||||
@@ -1893,8 +1893,8 @@ class AsyncTable:
|
||||
An AsyncTable object is expected to be long lived and reused for multiple
|
||||
operations. AsyncTable objects will cache a certain amount of index data in memory.
|
||||
This cache will be freed when the Table is garbage collected. To eagerly free the
|
||||
cache you can call the [close][AsyncTable.close] method. Once the AsyncTable is
|
||||
closed, it cannot be used for any further operations.
|
||||
cache you can call the [close][lancedb.AsyncTable.close] method. Once the
|
||||
AsyncTable is closed, it cannot be used for any further operations.
|
||||
|
||||
An AsyncTable can also be used as a context manager, and will automatically close
|
||||
when the context is exited. Closing a table is optional. If you do not close the
|
||||
@@ -1903,13 +1903,17 @@ class AsyncTable:
|
||||
Examples
|
||||
--------
|
||||
|
||||
Create using [DBConnection.create_table][lancedb.DBConnection.create_table]
|
||||
Create using [AsyncConnection.create_table][lancedb.AsyncConnection.create_table]
|
||||
(more examples in that method's documentation).
|
||||
|
||||
>>> import lancedb
|
||||
>>> db = lancedb.connect("./.lancedb")
|
||||
>>> table = db.create_table("my_table", data=[{"vector": [1.1, 1.2], "b": 2}])
|
||||
>>> table.head()
|
||||
>>> async def create_a_table():
|
||||
... db = await lancedb.connect_async("./.lancedb")
|
||||
... data = [{"vector": [1.1, 1.2], "b": 2}]
|
||||
... table = await db.create_table("my_table", data=data)
|
||||
... print(await table.query().limit(5).to_arrow())
|
||||
>>> import asyncio
|
||||
>>> asyncio.run(create_a_table())
|
||||
pyarrow.Table
|
||||
vector: fixed_size_list<item: float>[2]
|
||||
child 0, item: float
|
||||
@@ -1918,25 +1922,37 @@ class AsyncTable:
|
||||
vector: [[[1.1,1.2]]]
|
||||
b: [[2]]
|
||||
|
||||
Can append new data with [Table.add()][lancedb.table.Table.add].
|
||||
Can append new data with [AsyncTable.add()][lancedb.table.AsyncTable.add].
|
||||
|
||||
>>> table.add([{"vector": [0.5, 1.3], "b": 4}])
|
||||
>>> async def add_to_table():
|
||||
... db = await lancedb.connect_async("./.lancedb")
|
||||
... table = await db.open_table("my_table")
|
||||
... await table.add([{"vector": [0.5, 1.3], "b": 4}])
|
||||
>>> asyncio.run(add_to_table())
|
||||
|
||||
Can query the table with [Table.search][lancedb.table.Table.search].
|
||||
Can query the table with
|
||||
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search].
|
||||
|
||||
>>> table.search([0.4, 0.4]).select(["b", "vector"]).to_pandas()
|
||||
>>> async def search_table_for_vector():
|
||||
... db = await lancedb.connect_async("./.lancedb")
|
||||
... table = await db.open_table("my_table")
|
||||
... results = (
|
||||
... await table.vector_search([0.4, 0.4]).select(["b", "vector"]).to_pandas()
|
||||
... )
|
||||
... print(results)
|
||||
>>> asyncio.run(search_table_for_vector())
|
||||
b vector _distance
|
||||
0 4 [0.5, 1.3] 0.82
|
||||
1 2 [1.1, 1.2] 1.13
|
||||
|
||||
Search queries are much faster when an index is created. See
|
||||
[Table.create_index][lancedb.table.Table.create_index].
|
||||
[AsyncTable.create_index][lancedb.table.AsyncTable.create_index].
|
||||
"""
|
||||
|
||||
def __init__(self, table: LanceDBTable):
|
||||
"""Create a new Table object.
|
||||
"""Create a new AsyncTable object.
|
||||
|
||||
You should not create Table objects directly.
|
||||
You should not create AsyncTable objects directly.
|
||||
|
||||
Use [AsyncConnection.create_table][lancedb.AsyncConnection.create_table] and
|
||||
[AsyncConnection.open_table][lancedb.AsyncConnection.open_table] to obtain
|
||||
@@ -1988,6 +2004,14 @@ class AsyncTable:
|
||||
return await self._inner.count_rows(filter)
|
||||
|
||||
def query(self) -> AsyncQuery:
    """
    Start building a query against this table.

    The returned [AsyncQuery][lancedb.query.AsyncQuery] wraps the query
    object created by the underlying table. Configure it through its own
    methods, then execute it with methods like
    [to_arrow][lancedb.query.AsyncQuery.to_arrow],
    [to_pandas][lancedb.query.AsyncQuery.to_pandas] and more.
    """
    inner_query = self._inner.query()
    return AsyncQuery(inner_query)
|
||||
|
||||
async def to_pandas(self) -> "pd.DataFrame":
|
||||
@@ -2024,20 +2048,8 @@ class AsyncTable:
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index: Index
|
||||
The index to create.
|
||||
|
||||
LanceDb supports multiple types of indices. See the static methods on
|
||||
the Index class for more details.
|
||||
column: str, default None
|
||||
column: str
|
||||
The column to index.
|
||||
|
||||
When building a scalar index this must be set.
|
||||
|
||||
When building a vector index, this is optional. The default will look
|
||||
for any columns of type fixed-size-list with floating point values. If
|
||||
there is only one column of this type then it will be used. Otherwise
|
||||
an error will be returned.
|
||||
replace: bool, default True
|
||||
Whether to replace the existing index
|
||||
|
||||
@@ -2046,6 +2058,10 @@ class AsyncTable:
|
||||
that index is out of date.
|
||||
|
||||
The default is True
|
||||
config: Union[IvfPq, BTree], default None
|
||||
For advanced configuration you can specify the type of index you would
|
||||
like to create. You can also specify index-specific parameters when
|
||||
creating an index object.
|
||||
"""
|
||||
index = None
|
||||
if config is not None:
|
||||
@@ -2167,7 +2183,8 @@ class AsyncTable:
|
||||
Search the table with a given query vector.
|
||||
This is a convenience method for preparing a vector query and
|
||||
is the same thing as calling `nearestTo` on the builder returned
|
||||
by `query`. Seer [nearest_to][AsyncQuery.nearest_to] for more details.
|
||||
by `query`. See [nearest_to][lancedb.query.AsyncQuery.nearest_to] for more
|
||||
details.
|
||||
"""
|
||||
return self.query().nearest_to(query_vector)
|
||||
|
||||
@@ -2233,7 +2250,7 @@ class AsyncTable:
|
||||
x vector
|
||||
0 3 [5.0, 6.0]
|
||||
"""
|
||||
raise NotImplementedError
|
||||
return await self._inner.delete(where)
|
||||
|
||||
async def update(
|
||||
self,
|
||||
@@ -2289,102 +2306,6 @@ class AsyncTable:
|
||||
|
||||
return await self._inner.update(updates_sql, where)
|
||||
|
||||
async def cleanup_old_versions(
|
||||
self,
|
||||
older_than: Optional[timedelta] = None,
|
||||
*,
|
||||
delete_unverified: bool = False,
|
||||
) -> CleanupStats:
|
||||
"""
|
||||
Clean up old versions of the table, freeing disk space.
|
||||
|
||||
Note: This function is not available in LanceDb Cloud (since LanceDb
|
||||
Cloud manages cleanup for you automatically)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
older_than: timedelta, default None
|
||||
The minimum age of the version to delete. If None, then this defaults
|
||||
to two weeks.
|
||||
delete_unverified: bool, default False
|
||||
Because they may be part of an in-progress transaction, files newer
|
||||
than 7 days old are not deleted by default. If you are sure that
|
||||
there are no in-progress transactions, then you can set this to True
|
||||
to delete all files older than `older_than`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
CleanupStats
|
||||
The stats of the cleanup operation, including how many bytes were
|
||||
freed.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
async def compact_files(self, *args, **kwargs):
|
||||
"""
|
||||
Run the compaction process on the table.
|
||||
|
||||
Note: This function is not available in LanceDb Cloud (since LanceDb
|
||||
Cloud manages compaction for you automatically)
|
||||
|
||||
This can be run after making several small appends to optimize the table
|
||||
for faster reads.
|
||||
|
||||
Arguments are passed onto :meth:`lance.dataset.DatasetOptimizer.compact_files`.
|
||||
For most cases, the default should be fine.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
async def add_columns(self, transforms: Dict[str, str]):
|
||||
"""
|
||||
Add new columns with defined values.
|
||||
|
||||
This is not yet available in LanceDB Cloud.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
transforms: Dict[str, str]
|
||||
A map of column name to a SQL expression to use to calculate the
|
||||
value of the new column. These expressions will be evaluated for
|
||||
each row in the table, and can reference existing columns.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
async def alter_columns(self, alterations: Iterable[Dict[str, str]]):
|
||||
"""
|
||||
Alter column names and nullability.
|
||||
|
||||
This is not yet available in LanceDB Cloud.
|
||||
|
||||
alterations : Iterable[Dict[str, Any]]
|
||||
A sequence of dictionaries, each with the following keys:
|
||||
- "path": str
|
||||
The column path to alter. For a top-level column, this is the name.
|
||||
For a nested column, this is the dot-separated path, e.g. "a.b.c".
|
||||
- "name": str, optional
|
||||
The new name of the column. If not specified, the column name is
|
||||
not changed.
|
||||
- "nullable": bool, optional
|
||||
Whether the column should be nullable. If not specified, the column
|
||||
nullability is not changed. Only non-nullable columns can be changed
|
||||
to nullable. Currently, you cannot change a nullable column to
|
||||
non-nullable.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
async def drop_columns(self, columns: Iterable[str]):
|
||||
"""
|
||||
Drop columns from the table.
|
||||
|
||||
This is not yet available in LanceDB Cloud.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns : Iterable[str]
|
||||
The names of the columns to drop.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
async def version(self) -> int:
|
||||
"""
|
||||
Retrieve the version of the table
|
||||
|
||||
@@ -25,6 +25,8 @@ import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.fs as pa_fs
|
||||
|
||||
from ._lancedb import validate_table_name as native_validate_table_name
|
||||
|
||||
|
||||
def safe_import_adlfs():
|
||||
try:
|
||||
@@ -286,3 +288,8 @@ def deprecated(func):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return new_func
|
||||
|
||||
|
||||
def validate_table_name(name: str) -> None:
    """Verify the table name is valid.

    Thin wrapper that forwards ``name`` to the native ``validate_table_name``
    binding imported from ``._lancedb`` and returns nothing on success.

    Parameters
    ----------
    name : str
        The candidate table name.

    Raises
    ------
    ValueError
        Expected when the native check rejects the name (the binding maps its
        error to a Python ``ValueError``) -- confirm against the native module
        if the binding changes.
    """
    native_validate_table_name(name)
|
||||
|
||||
162
python/python/tests/docs/test_basic.py
Normal file
162
python/python/tests/docs/test_basic.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import shutil
|
||||
|
||||
# --8<-- [start:imports]
|
||||
import lancedb
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
|
||||
# --8<-- [end:imports]
|
||||
import pytest
|
||||
from numpy.random import randint, random
|
||||
|
||||
shutil.rmtree("data/sample-lancedb", ignore_errors=True)
|
||||
|
||||
|
||||
def test_quickstart():
    """Run the synchronous quickstart snippets end to end.

    The ``# --8<-- [start:...]`` / ``[end:...]`` comments delimit the regions
    that are pulled into the documentation, so they must stay in place.
    """
    # --8<-- [start:connect]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)
    # --8<-- [end:connect]

    # --8<-- [start:create_table]
    data = [
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]

    # Synchronous client
    tbl = db.create_table("my_table", data=data)
    # --8<-- [end:create_table]

    # --8<-- [start:create_table_pandas]
    df = pd.DataFrame(
        [
            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
        ]
    )
    # Synchronous client
    tbl = db.create_table("table_from_df", data=df)
    # --8<-- [end:create_table_pandas]

    # --8<-- [start:create_empty_table]
    schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
    # Synchronous client
    tbl = db.create_table("empty_table", schema=schema)
    # --8<-- [end:create_empty_table]
    # --8<-- [start:open_table]
    # Synchronous client
    tbl = db.open_table("my_table")
    # --8<-- [end:open_table]
    # --8<-- [start:table_names]
    # Synchronous client
    print(db.table_names())
    # --8<-- [end:table_names]
    # --8<-- [start:add_data]
    # Synchronous client
    # Option 1: Add a list of dicts to a table
    data = [
        {"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
        {"vector": [9.5, 56.2], "item": "buzz", "price": 200.0},
    ]
    tbl.add(data)

    # Option 2: Add a pandas DataFrame to a table
    df = pd.DataFrame(data)
    # Pass the DataFrame itself so this option really exercises the pandas path.
    tbl.add(df)
    # --8<-- [end:add_data]
    # --8<-- [start:vector_search]
    # Synchronous client
    tbl.search([100, 100]).limit(2).to_pandas()
    # --8<-- [end:vector_search]
    # Bulk-insert generated rows before the index is built below.
    tbl.add(
        [
            {"vector": random(2), "item": "autogen", "price": randint(100)}
            for _ in range(1000)
        ]
    )
    # --8<-- [start:create_index]
    # Synchronous client
    tbl.create_index(num_sub_vectors=1)
    # --8<-- [end:create_index]
    # --8<-- [start:delete_rows]
    # Synchronous client
    tbl.delete('item = "fizz"')
    # --8<-- [end:delete_rows]
    # --8<-- [start:drop_table]
    # Synchronous client
    db.drop_table("my_table")
    # --8<-- [end:drop_table]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_quickstart_async():
    """Run the asynchronous quickstart snippets end to end.

    The ``# --8<-- [start:...]`` / ``[end:...]`` comments delimit the regions
    that are pulled into the documentation, so they must stay in place.
    """
    # --8<-- [start:connect_async]
    # LanceDB offers both a synchronous and an asynchronous client. There are still a
    # few operations that are only supported by the synchronous client (e.g. embedding
    # functions, full text search) but both APIs should soon be equivalent.

    # In this guide we will give examples of both clients. In other guides we will
    # typically only provide examples with one client or the other.
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)
    # --8<-- [end:connect_async]

    data = [
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]

    # --8<-- [start:create_table_async]
    # Asynchronous client
    async_tbl = await async_db.create_table("my_table2", data=data)
    # --8<-- [end:create_table_async]

    df = pd.DataFrame(
        [
            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
        ]
    )

    # --8<-- [start:create_table_async_pandas]
    # Asynchronous client
    async_tbl = await async_db.create_table("table_from_df2", df)
    # --8<-- [end:create_table_async_pandas]

    schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
    # --8<-- [start:create_empty_table_async]
    # Asynchronous client
    async_tbl = await async_db.create_table("empty_table2", schema=schema)
    # --8<-- [end:create_empty_table_async]
    # --8<-- [start:open_table_async]
    # Asynchronous client
    async_tbl = await async_db.open_table("my_table2")
    # --8<-- [end:open_table_async]
    # --8<-- [start:table_names_async]
    # Asynchronous client
    print(await async_db.table_names())
    # --8<-- [end:table_names_async]
    # --8<-- [start:add_data_async]
    # Asynchronous client
    await async_tbl.add(data)
    # --8<-- [end:add_data_async]
    # Add sufficient data for training
    data = [{"vector": [x, x], "item": "filler", "price": x * x} for x in range(1000)]
    await async_tbl.add(data)
    # --8<-- [start:vector_search_async]
    # Asynchronous client
    await async_tbl.vector_search([100, 100]).limit(2).to_pandas()
    # --8<-- [end:vector_search_async]
    # --8<-- [start:create_index_async]
    # Asynchronous client (must specify column to index)
    await async_tbl.create_index("vector")
    # --8<-- [end:create_index_async]
    # --8<-- [start:delete_rows_async]
    # Asynchronous client
    await async_tbl.delete('item = "fizz"')
    # --8<-- [end:delete_rows_async]
    # --8<-- [start:drop_table_async]
    # Asynchronous client
    await async_db.drop_table("my_table2")
    # --8<-- [end:drop_table_async]
|
||||
@@ -521,3 +521,15 @@ def test_prefilter_with_index(tmp_path):
|
||||
.to_arrow()
|
||||
)
|
||||
assert table.num_rows == 1
|
||||
|
||||
|
||||
def test_create_table_with_invalid_names(tmp_path):
    """Table names with '/', ' ' or '$' are rejected; a name with '.' is accepted."""
    db = lancedb.connect(uri=tmp_path)
    rows = [{"vector": np.random.rand(128), "item": "foo"} for _ in range(10)]

    # Each of these must be refused with a ValueError, in this order.
    for bad_name in ("foo/bar", "foo bar", "foo$$bar"):
        with pytest.raises(ValueError):
            db.create_table(bad_name, rows)

    # A period is allowed, so this call must succeed.
    db.create_table("foo.bar", rows)
|
||||
|
||||
45
python/python/tests/test_embedding_fine_tuning.py
Normal file
45
python/python/tests/test_embedding_fine_tuning.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
from lancedb.embeddings import get_registry
|
||||
from lancedb.embeddings.fine_tuner import QADataset, TextChunk
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_finetuning_sentence_transformers(tmp_path):
    """Build a small QA dataset, fine-tune a sentence-transformers model on it,
    reload the saved model by path and check that evaluation returns a result."""
    raw_chunks = [
        "This is a chunk related to legal docs",
        "This is another chunk related financial docs",
        "This is a chunk related to sports docs",
        "This is another chunk related to fashion docs",
    ]
    # The same two questions are attached to every chunk.
    questions = [
        "What is this chunk about?",
        "What is the main topic of this chunk?",
    ]

    text_chunks = [TextChunk.from_chunk(raw) for raw in raw_chunks]
    queries = {}
    relevant_docs = {}
    for text_chunk in tqdm(text_chunks):
        for question in questions:
            qid = str(uuid.uuid4())
            queries[qid] = question
            relevant_docs[qid] = [text_chunk.id]
    ds = QADataset.from_responses(text_chunks, queries, relevant_docs)

    # 4 chunks x 2 questions each.
    assert len(ds.queries) == 8
    assert len(ds.corpus) == 4

    model_dir = str(tmp_path / "model")
    base_model = get_registry().get("sentence-transformers").create()
    base_model.finetune(trainset=ds, valset=ds, path=model_dir, epochs=1)

    # Reload from the directory that finetune() was told to write to.
    tuned_model = get_registry().get("sentence-transformers").create(name=model_dir)
    res = tuned_model.evaluate(ds)
    assert res is not None
|
||||
|
||||
|
||||
def test_text_chunk():
    """Placeholder test; it currently makes no assertions."""
    # TODO
    pass
|
||||
@@ -137,6 +137,21 @@ impl Connection {
|
||||
Ok(Table::new(table))
|
||||
})
|
||||
}
|
||||
|
||||
// Drop the table called `name`.
//
// Returns early (via `?`) if `get_inner()` fails. Otherwise the inner handle
// is cloned so it can be moved into the async block passed to
// `future_into_py`; the block awaits `drop_table(name)` and passes the result
// through `infer_error()`.
// NOTE(review): `infer_error()` presumably maps the Rust error to a Python
// exception -- confirm against its definition.
pub fn drop_table(self_: PyRef<'_, Self>, name: String) -> PyResult<&PyAny> {
    let inner = self_.get_inner()?.clone();
    future_into_py(self_.py(), async move {
        inner.drop_table(name).await.infer_error()
    })
}
|
||||
|
||||
// Drop the database behind this connection.
//
// Returns early (via `?`) if `get_inner()` fails. Otherwise the inner handle
// is cloned and moved into the async block passed to `future_into_py`, which
// awaits `drop_db()` and passes the result through `infer_error()`.
pub fn drop_db(self_: PyRef<'_, Self>) -> PyResult<&PyAny> {
    let inner = self_.get_inner()?.clone();
    future_into_py(
        self_.py(),
        async move { inner.drop_db().await.infer_error() },
    )
}
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
|
||||
@@ -42,6 +42,7 @@ pub fn _lancedb(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<VectorQuery>()?;
|
||||
m.add_class::<RecordBatchStream>()?;
|
||||
m.add_function(wrap_pyfunction!(connect, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
|
||||
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -80,6 +80,13 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
// Delete rows using `condition`, which is forwarded unchanged to the inner
// table's `delete`.
//
// Returns early (via `?`) if `inner_ref()` fails. Otherwise the inner table
// is cloned and moved into the async block passed to `future_into_py`, which
// awaits `delete(&condition)` and passes the result through `infer_error()`.
// NOTE(review): `condition` looks like a SQL filter expression (the Python
// tests pass 'item = "fizz"') -- confirm against the inner table API.
pub fn delete<'a>(self_: PyRef<'a, Self>, condition: String) -> PyResult<&'a PyAny> {
    let inner = self_.inner_ref()?.clone();
    future_into_py(self_.py(), async move {
        inner.delete(&condition).await.infer_error()
    })
}
|
||||
|
||||
pub fn update<'a>(
|
||||
self_: PyRef<'a, Self>,
|
||||
updates: &PyDict,
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::sync::Mutex;
|
||||
use lancedb::DistanceType;
|
||||
use pyo3::{
|
||||
exceptions::{PyRuntimeError, PyValueError},
|
||||
PyResult,
|
||||
pyfunction, PyResult,
|
||||
};
|
||||
|
||||
/// A wrapper around a rust builder
|
||||
@@ -49,3 +49,9 @@ pub fn parse_distance_type(distance_type: impl AsRef<str>) -> PyResult<DistanceT
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
// Python-facing wrapper around `lancedb::utils::validate_table_name`.
//
// Returns `Ok(())` when the core check accepts `table_name`. Any error from
// the core check is converted to a Python `ValueError` whose message is the
// error's `to_string()` output.
#[pyfunction]
pub(crate) fn validate_table_name(table_name: &str) -> PyResult<()> {
    lancedb::utils::validate_table_name(table_name)
        .map_err(|e| PyValueError::new_err(e.to_string()))
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-node"
|
||||
version = "0.4.13"
|
||||
version = "0.4.14"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.4.13"
|
||||
version = "0.4.14"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
@@ -22,6 +22,7 @@ chrono = { workspace = true }
|
||||
object_store = { workspace = true }
|
||||
snafu = { workspace = true }
|
||||
half = { workspace = true }
|
||||
lazy_static.workspace = true
|
||||
lance = { workspace = true }
|
||||
lance-index = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
@@ -34,11 +35,10 @@ bytes = "1"
|
||||
futures.workspace = true
|
||||
num-traits.workspace = true
|
||||
url.workspace = true
|
||||
regex.workspace = true
|
||||
serde = { version = "^1" }
|
||||
serde_json = { version = "1" }
|
||||
|
||||
# For remote feature
|
||||
|
||||
reqwest = { version = "0.11.24", features = ["gzip", "json"], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -31,6 +31,7 @@ use crate::arrow::IntoArrow;
|
||||
use crate::error::{CreateDirSnafu, Error, InvalidTableNameSnafu, Result};
|
||||
use crate::io::object_store::MirroringObjectStoreWrapper;
|
||||
use crate::table::{NativeTable, WriteOptions};
|
||||
use crate::utils::validate_table_name;
|
||||
use crate::Table;
|
||||
|
||||
pub const LANCE_FILE_EXTENSION: &str = "lance";
|
||||
@@ -675,13 +676,18 @@ impl Database {
|
||||
|
||||
/// Get the URI of a table in the database.
|
||||
fn table_uri(&self, name: &str) -> Result<String> {
|
||||
validate_table_name(name)?;
|
||||
|
||||
let path = Path::new(&self.uri);
|
||||
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
|
||||
|
||||
let mut uri = table_uri
|
||||
.as_path()
|
||||
.to_str()
|
||||
.context(InvalidTableNameSnafu { name })?
|
||||
.context(InvalidTableNameSnafu {
|
||||
name,
|
||||
reason: "Name is not valid URL",
|
||||
})?
|
||||
.to_string();
|
||||
|
||||
// If there are query string set on the connection, propagate to lance
|
||||
|
||||
@@ -20,8 +20,8 @@ use snafu::Snafu;
|
||||
#[derive(Debug, Snafu)]
|
||||
#[snafu(visibility(pub(crate)))]
|
||||
pub enum Error {
|
||||
#[snafu(display("Invalid table name: {name}"))]
|
||||
InvalidTableName { name: String },
|
||||
#[snafu(display("Invalid table name (\"{name}\"): {reason}"))]
|
||||
InvalidTableName { name: String, reason: String },
|
||||
#[snafu(display("Invalid input, {message}"))]
|
||||
InvalidInput { message: String },
|
||||
#[snafu(display("Table '{name}' was not found"))]
|
||||
|
||||
@@ -230,9 +230,9 @@ pub enum DistanceType {
|
||||
impl From<DistanceType> for LanceDistanceType {
|
||||
fn from(value: DistanceType) -> Self {
|
||||
match value {
|
||||
DistanceType::L2 => LanceDistanceType::L2,
|
||||
DistanceType::Cosine => LanceDistanceType::Cosine,
|
||||
DistanceType::Dot => LanceDistanceType::Dot,
|
||||
DistanceType::L2 => Self::L2,
|
||||
DistanceType::Cosine => Self::Cosine,
|
||||
DistanceType::Dot => Self::Dot,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -240,9 +240,9 @@ impl From<DistanceType> for LanceDistanceType {
|
||||
impl From<LanceDistanceType> for DistanceType {
|
||||
fn from(value: LanceDistanceType) -> Self {
|
||||
match value {
|
||||
LanceDistanceType::L2 => DistanceType::L2,
|
||||
LanceDistanceType::Cosine => DistanceType::Cosine,
|
||||
LanceDistanceType::Dot => DistanceType::Dot,
|
||||
LanceDistanceType::L2 => Self::L2,
|
||||
LanceDistanceType::Cosine => Self::Cosine,
|
||||
LanceDistanceType::Dot => Self::Dot,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -251,7 +251,7 @@ impl<'a> TryFrom<&'a str> for DistanceType {
|
||||
type Error = <LanceDistanceType as TryFrom<&'a str>>::Error;
|
||||
|
||||
fn try_from(value: &str) -> std::prelude::v1::Result<Self, Self::Error> {
|
||||
LanceDistanceType::try_from(value).map(DistanceType::from)
|
||||
LanceDistanceType::try_from(value).map(Self::from)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -854,6 +854,7 @@ impl NativeTable {
|
||||
.to_str()
|
||||
.ok_or(Error::InvalidTableName {
|
||||
name: uri.to_string(),
|
||||
reason: "Table name is not valid URL".to_string(),
|
||||
})?;
|
||||
Ok(name.to_string())
|
||||
}
|
||||
@@ -1197,7 +1198,7 @@ impl NativeTable {
|
||||
if dim != query_vector.len() as i32 {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"The dimension of the query vector does not match with the dimension of the vector column '{}':
|
||||
"The dimension of the query vector does not match with the dimension of the vector column '{}':
|
||||
query dim={}, expected vector dim={}",
|
||||
column,
|
||||
query_vector.len(),
|
||||
|
||||
@@ -1,12 +1,30 @@
|
||||
// Copyright 2024 LanceDB Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::Schema;
|
||||
|
||||
use lance::dataset::{ReadParams, WriteParams};
|
||||
use lance::io::{ObjectStoreParams, WrappingObjectStore};
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
|
||||
lazy_static! {
    // Pattern a table name must match in full: one or more ASCII letters,
    // digits, underscores, hyphens, or periods.
    // The `unwrap` only guards compilation of this fixed literal pattern.
    static ref TABLE_NAME_REGEX: regex::Regex = regex::Regex::new(r"^[a-zA-Z0-9_\-\.]+$").unwrap();
}
|
||||
|
||||
pub trait PatchStoreParam {
|
||||
fn patch_with_store_wrapper(
|
||||
self,
|
||||
@@ -64,6 +82,25 @@ impl PatchReadParam for ReadParams {
|
||||
}
|
||||
}
|
||||
|
||||
/// Validate table name.
|
||||
pub fn validate_table_name(name: &str) -> Result<()> {
|
||||
if name.is_empty() {
|
||||
return Err(Error::InvalidTableName {
|
||||
name: name.to_string(),
|
||||
reason: "Table names cannot be empty strings".to_string(),
|
||||
});
|
||||
}
|
||||
if !TABLE_NAME_REGEX.is_match(name) {
|
||||
return Err(Error::InvalidTableName {
|
||||
name: name.to_string(),
|
||||
reason:
|
||||
"Table names can only contain alphanumeric characters, underscores, hyphens, and periods"
|
||||
.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Find one default column to create index.
|
||||
pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result<String> {
|
||||
// Try to find one fixed size list array column.
|
||||
@@ -145,4 +182,20 @@ mod tests {
|
||||
.to_string()
|
||||
.contains("More than one"));
|
||||
}
|
||||
|
||||
#[test]
fn test_validate_table_name() {
    // Names made only of letters, digits, '_', '-' and '.' must pass.
    let accepted = [
        "my_table",
        "my_table_1",
        "123mytable",
        "_12345table",
        "table.12345",
        "table.._dot_..12345",
    ];
    for name in &accepted {
        assert!(validate_table_name(name).is_ok());
    }

    // The empty string and names with any other character must fail.
    let rejected = ["", "my_table!", "my/table", "my@table", "name with space"];
    for name in &rejected {
        assert!(validate_table_name(name).is_err());
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user