Compare commits

..

21 Commits

Author SHA1 Message Date
ayush chaurasia
aa2cba953d update 2024-03-26 21:28:15 +05:30
ayush chaurasia
53c946917e Merge branch 'main' into docs_march 2024-03-26 20:55:06 +05:30
Bert
cae0348c51 New logo on docs site (#1157) 2024-03-26 20:50:13 +05:30
Ayush Chaurasia
e9e0a37ca8 docs: Add all available HF/sentence transformers embedding models list (#1134)
Solves https://github.com/lancedb/lancedb/issues/968
2024-03-26 19:04:09 +05:30
Weston Pace
c37a28abbd docs: add the async python API to the docs (#1156) 2024-03-26 07:54:16 -05:00
Lance Release
98c1e635b3 Updating package-lock.json 2024-03-25 20:38:37 +00:00
Lance Release
9992b927fd Updating package-lock.json 2024-03-25 15:43:00 +00:00
Lance Release
80d501011c Bump version: 0.4.13 → 0.4.14 2024-03-25 15:42:49 +00:00
Weston Pace
6e3a9d08e0 feat: add publish step for nodejs (#1155)
This will start publishing `@lancedb/lancedb` with the new nodejs
package on our releases.
2024-03-25 11:23:30 -04:00
ayush chaurasia
db04453520 Merge branch 'main' of https://github.com/lancedb/lancedb into docs_march 2024-03-23 20:35:03 +05:30
Pranav Maddi
268d8e057b Adds an Ask LanceDB button to docs. (#1150)
This links out to the new [asklancedb.com](https://asklancedb.com) page.

Screenshots of the change:

![Quick start - LanceDB · 10 20am · 03-22](https://github.com/lancedb/lancedb/assets/2371511/c45ba893-fc74-4957-bdd3-3712b351aff3)
![Quick start - LanceDB](https://github.com/lancedb/lancedb/assets/2371511/d4762eb6-52af-4fd5-857e-3ed280716999)
2024-03-23 01:09:44 +05:30
Bert
dfc518b8fb Node SDK Client middleware for HTTP Requests (#1130)
Adds client-side middleware to LanceDB Node SDK to instrument HTTP
Requests

Example - adding `x-request-id` request header:
```js
// Assumed import: the Node SDK package at the time of this PR ("vectordb")
const lancedb = require('vectordb')

class HttpMiddleware {
    constructor({ requestId }) {
        this.requestId = requestId
    }

    onRemoteRequest(req, next) {
        req.headers['x-request-id'] = this.requestId
        return next(req)
    }
}

const db = await lancedb.connect({
  uri: 'db://remote-123',
  apiKey: 'sk_...',
})

let tables = await db.withMiddleware(new HttpMiddleware({ requestId: '123' })).tableNames();

```

---------

Co-authored-by: Weston Pace <weston.pace@gmail.com>
2024-03-22 11:58:05 -04:00
ayush chaurasia
16f1480d64 update 2024-03-22 14:36:45 +05:30
ayush chaurasia
9a41894cd2 update 2024-03-22 14:33:39 +05:30
ayush chaurasia
ba1266a6b9 update 2024-03-22 13:58:23 +05:30
QianZhu
98acf34ae8 remove warnings (#1147) 2024-03-21 14:49:01 -07:00
Lei Xu
25988d23cd chore: validate table name (#1146)
Closes #1129
2024-03-21 14:46:13 -07:00
ayush chaurasia
245584ed27 update 2024-03-21 20:47:33 +05:30
ayush chaurasia
438c11157a update 2024-03-21 20:22:46 +05:30
ayush chaurasia
4f74e8384f add fts 2024-03-21 20:08:58 +05:30
ayush chaurasia
926bc8c4a2 update 2024-03-21 20:06:02 +05:30
83 changed files with 2503 additions and 1352 deletions


@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.13
current_version = 0.4.14
commit = True
message = Bump version: {current_version} → {new_version}
tag = True


@@ -18,7 +18,7 @@ on:
env:
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=native -C target-feature=+f16c,+avx2,+fma"
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma"
RUST_BACKTRACE: "1"
jobs:
@@ -28,6 +28,8 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Print CPU capabilities
run: cat /proc/cpuinfo
- name: Install dependencies needed for ubuntu
run: |
sudo apt install -y protobuf-compiler libssl-dev
@@ -39,7 +41,7 @@ jobs:
cache: "pip"
cache-dependency-path: "docs/test/requirements.txt"
- name: Rust cache
uses: swatinem/rust-cache@v2
- name: Build Python
working-directory: docs/test
run:
@@ -64,6 +66,8 @@ jobs:
with:
fetch-depth: 0
lfs: true
- name: Print CPU capabilities
run: cat /proc/cpuinfo
- name: Set up Node
uses: actions/setup-node@v4
with:


@@ -20,7 +20,8 @@ env:
# "1" means line tables only, which is useful for panic tracebacks.
#
# Use native CPU to accelerate tests if possible, especially for f16
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=native -C target-feature=+f16c,+avx2,+fma"
# target-cpu=haswell fixes failing ci build
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma"
RUST_BACKTRACE: "1"
jobs:


@@ -2,7 +2,7 @@ name: NPM Publish
on:
release:
types: [ published ]
types: [published]
jobs:
node:
@@ -19,7 +19,7 @@ jobs:
- uses: actions/setup-node@v3
with:
node-version: 20
cache: 'npm'
cache: "npm"
cache-dependency-path: node/package-lock.json
- name: Install dependencies
run: |
@@ -31,7 +31,7 @@ jobs:
npm run tsc
npm pack
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: node-package
path: |
@@ -61,12 +61,41 @@ jobs:
- name: Build MacOS native node modules
run: bash ci/build_macos_artifacts.sh ${{ matrix.config.arch }}
- name: Upload Darwin Artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: native-darwin
name: node-native-darwin
path: |
node/dist/lancedb-vectordb-darwin*.tgz
nodejs-macos:
strategy:
matrix:
config:
- arch: x86_64-apple-darwin
runner: macos-13
- arch: aarch64-apple-darwin
# xlarge is implicitly arm64.
runner: macos-14
runs-on: ${{ matrix.config.runner }}
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install system dependencies
run: brew install protobuf
- name: Install npm dependencies
run: |
cd nodejs
npm ci
- name: Build MacOS native nodejs modules
run: bash ci/build_macos_artifacts_nodejs.sh ${{ matrix.config.arch }}
- name: Upload Darwin Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-darwin-${{ matrix.config.arch }}
path: |
nodejs/dist/*.node
node-linux:
name: node-linux (${{ matrix.config.arch }}-unknown-linux-gnu)
@@ -103,12 +132,63 @@ jobs:
run: |
bash ci/build_linux_artifacts.sh ${{ matrix.config.arch }}
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: native-linux
name: node-native-linux
path: |
node/dist/lancedb-vectordb-linux*.tgz
nodejs-linux:
name: nodejs-linux (${{ matrix.config.arch }}-unknown-linux-gnu)
runs-on: ${{ matrix.config.runner }}
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
config:
- arch: x86_64
runner: ubuntu-latest
- arch: aarch64
# For successful fat LTO builds, we need a large runner to avoid OOM errors.
runner: buildjet-16vcpu-ubuntu-2204-arm
steps:
- name: Checkout
uses: actions/checkout@v4
# Buildjet aarch64 runners have only 1.5 GB RAM per core, vs 3.5 GB per core for
# x86_64 runners. To avoid OOM errors on ARM, we create a swap file.
- name: Configure aarch64 build
if: ${{ matrix.config.arch == 'aarch64' }}
run: |
free -h
sudo fallocate -l 16G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
echo "/swapfile swap swap defaults 0 0" >> sudo /etc/fstab
# print info
swapon --show
free -h
- name: Build Linux Artifacts
run: |
bash ci/build_linux_artifacts_nodejs.sh ${{ matrix.config.arch }}
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-linux-${{ matrix.config.arch }}
path: |
nodejs/dist/*.node
# The generic files are the same in all distros so we just pick
# one to do the upload.
- name: Upload Generic Artifacts
if: ${{ matrix.config.arch == 'x86_64' }}
uses: actions/upload-artifact@v4
with:
name: nodejs-dist
path: |
nodejs/dist/*
!nodejs/dist/*.node
node-windows:
runs-on: windows-2022
# Only runs on tags that match the make-release action
@@ -136,25 +216,60 @@ jobs:
- name: Build Windows native node modules
run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
- name: Upload Windows Artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: native-windows
name: node-native-windows
path: |
node/dist/lancedb-vectordb-win32*.tgz
nodejs-windows:
runs-on: windows-2022
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
target: [x86_64-pc-windows-msvc]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Protoc v21.12
working-directory: C:\
run: |
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
7z x protoc.zip
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Install npm dependencies
run: |
cd nodejs
npm ci
- name: Build Windows native node modules
run: .\ci\build_windows_artifacts_nodejs.ps1 ${{ matrix.target }}
- name: Upload Windows Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-windows
path: |
nodejs/dist/*.node
release:
needs: [node, node-macos, node-linux, node-windows]
runs-on: ubuntu-latest
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v4
with:
pattern: node-*
- name: Display structure of downloaded files
run: ls -R
- uses: actions/setup-node@v3
with:
node-version: 20
registry-url: 'https://registry.npmjs.org'
registry-url: "https://registry.npmjs.org"
- name: Publish to NPM
env:
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
@@ -164,6 +279,45 @@ jobs:
npm publish $filename
done
release-nodejs:
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
runs-on: ubuntu-latest
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
defaults:
run:
shell: bash
working-directory: nodejs
steps:
- name: Checkout
uses: actions/checkout@v4
- uses: actions/download-artifact@v4
with:
name: nodejs-dist
path: nodejs/dist
- uses: actions/download-artifact@v4
name: Download arch-specific binaries
with:
pattern: nodejs-*
path: nodejs/nodejs-artifacts
merge-multiple: true
- name: Display structure of downloaded files
run: find .
- uses: actions/setup-node@v3
with:
node-version: 20
registry-url: "https://registry.npmjs.org"
- name: Install napi-rs
run: npm install -g @napi-rs/cli
- name: Prepare artifacts
run: npx napi artifacts -d nodejs-artifacts
- name: Display structure of staged files
run: find npm
- name: Publish to NPM
env:
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
run: npm publish --access public
update-package-lock:
needs: [release]
runs-on: ubuntu-latest
@@ -178,3 +332,18 @@ jobs:
- uses: ./.github/workflows/update_package_lock
with:
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
update-package-lock-nodejs:
needs: [release-nodejs]
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
ref: main
persist-credentials: false
fetch-depth: 0
lfs: true
- uses: ./.github/workflows/update_package_lock_nodejs
with:
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}


@@ -0,0 +1,33 @@
name: update_package_lock_nodejs
description: "Update the nodejs package-lock.json"
inputs:
github_token:
required: true
description: "github token for the repo"
runs:
using: "composite"
steps:
- uses: actions/setup-node@v3
with:
node-version: 20
- name: Set git configs
shell: bash
run: |
git config user.name 'Lance Release'
git config user.email 'lance-dev@lancedb.com'
- name: Update package-lock.json file
working-directory: ./nodejs
run: |
npm install
git add package-lock.json
git commit -m "Updating package-lock.json"
shell: bash
- name: Push changes
if: ${{ inputs.dry_run == 'false' }}
uses: ad-m/github-push-action@master
with:
github_token: ${{ inputs.github_token }}
branch: main
tags: true


@@ -0,0 +1,19 @@
name: Update NodeJs package-lock.json
on:
workflow_dispatch:
jobs:
publish:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
ref: main
persist-credentials: false
fetch-depth: 0
lfs: true
- uses: ./.github/workflows/update_package_lock_nodejs
with:
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}

.gitignore (vendored, 1 line changed)

@@ -34,6 +34,7 @@ python/dist
node/dist
node/examples/**/package-lock.json
node/examples/**/dist
nodejs/lancedb/native*
dist
## Rust


@@ -39,3 +39,5 @@ pin-project = "1.0.7"
snafu = "0.7.4"
url = "2"
num-traits = "0.2"
regex = "1.10"
lazy_static = "1"


@@ -0,0 +1,21 @@
#!/bin/bash
set -e
ARCH=${1:-x86_64}
# We pass down the current user so that when we later mount the local files
# into the container, the files are accessible by the current user.
pushd ci/manylinux_nodejs
docker build \
-t lancedb-nodejs-manylinux \
--build-arg="ARCH=$ARCH" \
--build-arg="DOCKER_USER=$(id -u)" \
--progress=plain \
.
popd
# We turn on memory swap to avoid OOM killer
docker run \
-v $(pwd):/io -w /io \
--memory-swap=-1 \
lancedb-nodejs-manylinux \
bash ci/manylinux_nodejs/build.sh $ARCH


@@ -0,0 +1,34 @@
# Builds the macOS artifacts (nodejs binaries).
# Usage: ./ci/build_macos_artifacts_nodejs.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
set -e
prebuild_rust() {
# Building here for the sake of easier debugging.
pushd rust/lancedb
echo "Building rust library for $1"
export RUST_BACKTRACE=1
cargo build --release --target $1
popd
}
build_node_binaries() {
pushd nodejs
echo "Building nodejs library for $1"
export RUST_TARGET=$1
npm run build-release
popd
}
if [ -n "$1" ]; then
targets=$1
else
targets="x86_64-apple-darwin aarch64-apple-darwin"
fi
echo "Building artifacts for targets: $targets"
for target in $targets
do
prebuild_rust $target
build_node_binaries $target
done


@@ -0,0 +1,41 @@
# Builds the Windows artifacts (nodejs binaries).
# Usage: .\ci\build_windows_artifacts_nodejs.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
function Prebuild-Rust {
param (
[string]$target
)
# Building here for the sake of easier debugging.
Push-Location -Path "rust/lancedb"
Write-Host "Building rust library for $target"
$env:RUST_BACKTRACE=1
cargo build --release --target $target
Pop-Location
}
function Build-NodeBinaries {
param (
[string]$target
)
Push-Location -Path "nodejs"
Write-Host "Building nodejs library for $target"
$env:RUST_TARGET=$target
npm run build-release
Pop-Location
}
$targets = $args[0]
if (-not $targets) {
$targets = "x86_64-pc-windows-msvc"
}
Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
Prebuild-Rust $target
Build-NodeBinaries $target
}


@@ -0,0 +1,31 @@
# Manylinux Dockerfile with Rust, Node, and Lance dependencies installed.
# This container allows building the node modules native libraries in an
# environment with a very old glibc, so that we are compatible with a wide
# range of linux distributions.
ARG ARCH=x86_64
FROM quay.io/pypa/manylinux2014_${ARCH}
ARG ARCH=x86_64
ARG DOCKER_USER=default_user
# Install static openssl
COPY install_openssl.sh install_openssl.sh
RUN ./install_openssl.sh ${ARCH} > /dev/null
# Protobuf is also installed as root.
COPY install_protobuf.sh install_protobuf.sh
RUN ./install_protobuf.sh ${ARCH}
ENV DOCKER_USER=${DOCKER_USER}
# Create a group and user
RUN echo ${ARCH} && adduser --user-group --create-home --uid ${DOCKER_USER} build_user
# We switch to the user to install Rust and Node, since those like to be
# installed at the user level.
USER ${DOCKER_USER}
COPY prepare_manylinux_node.sh prepare_manylinux_node.sh
RUN cp /prepare_manylinux_node.sh $HOME/ && \
cd $HOME && \
./prepare_manylinux_node.sh ${ARCH}

ci/manylinux_nodejs/build.sh (executable file, 18 lines)

@@ -0,0 +1,18 @@
#!/bin/bash
# Builds the nodejs module for manylinux. Invoked by ci/build_linux_artifacts_nodejs.sh.
set -e
ARCH=${1:-x86_64}
if [ "$ARCH" = "x86_64" ]; then
export OPENSSL_LIB_DIR=/usr/local/lib64/
else
export OPENSSL_LIB_DIR=/usr/local/lib/
fi
export OPENSSL_STATIC=1
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
source $HOME/.bashrc
cd nodejs
npm ci
npm run build-release


@@ -0,0 +1,26 @@
#!/bin/bash
# Builds openssl from source so we can statically link to it
# this is to avoid the error we get with the system installation:
# /usr/bin/ld: <library>: version node not found for symbol SSLeay@@OPENSSL_1.0.1
# /usr/bin/ld: failed to set dynamic section sizes: Bad value
set -e
git clone -b OpenSSL_1_1_1u \
--single-branch \
https://github.com/openssl/openssl.git
pushd openssl
if [[ $1 == x86_64* ]]; then
ARCH=linux-x86_64
else
# gnu target
ARCH=linux-aarch64
fi
./Configure no-shared $ARCH
make
make install


@@ -0,0 +1,15 @@
#!/bin/bash
# Installs protobuf compiler. Should be run as root.
set -e
if [[ $1 == x86_64* ]]; then
ARCH=x86_64
else
# gnu target
ARCH=aarch_64
fi
PB_REL=https://github.com/protocolbuffers/protobuf/releases
PB_VERSION=23.1
curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local


@@ -0,0 +1,21 @@
#!/bin/bash
set -e
install_node() {
echo "Installing node..."
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
source "$HOME"/.bashrc
nvm install --no-progress 16
}
install_rust() {
echo "Installing rust..."
curl https://sh.rustup.rs -sSf | bash -s -- -y
export PATH="$PATH:/root/.cargo/bin"
}
install_node
install_rust


@@ -38,178 +38,196 @@ theme:
custom_dir: overrides
plugins:
- search
- autorefs
- mkdocstrings:
handlers:
python:
paths: [../python]
options:
docstring_style: numpy
heading_level: 4
show_source: true
show_symbol_type_in_heading: true
show_signature_annotations: true
members_order: source
import:
# for cross references
- https://arrow.apache.org/docs/objects.inv
- https://pandas.pydata.org/docs/objects.inv
- mkdocs-jupyter
- ultralytics:
verbose: True
enabled: True
default_image: "assets/lancedb_and_lance.png" # Default image for all pages
add_image: True # Automatically add meta image
add_keywords: True # Add page keywords in the header tag
add_share_buttons: True # Add social share buttons
add_authors: False # Display page authors
add_desc: False
add_dates: False
- search
- autorefs
- mkdocstrings:
handlers:
python:
paths: [../python]
options:
docstring_style: numpy
heading_level: 3
show_source: true
show_symbol_type_in_heading: true
show_signature_annotations: true
show_root_heading: true
members_order: source
import:
# for cross references
- https://arrow.apache.org/docs/objects.inv
- https://pandas.pydata.org/docs/objects.inv
- mkdocs-jupyter
- ultralytics:
verbose: True
enabled: True
default_image: "assets/lancedb_and_lance.png" # Default image for all pages
add_image: True # Automatically add meta image
add_keywords: True # Add page keywords in the header tag
add_share_buttons: True # Add social share buttons
add_authors: False # Display page authors
add_desc: False
add_dates: False
markdown_extensions:
- admonition
- footnotes
- pymdownx.details
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets:
base_path: ..
dedent_subsections: true
- pymdownx.superfences
- pymdownx.tabbed:
alternate_style: true
- md_in_html
- attr_list
- admonition
- footnotes
- pymdownx.details
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets:
base_path: ..
dedent_subsections: true
- pymdownx.superfences
- pymdownx.tabbed:
alternate_style: true
- md_in_html
- attr_list
nav:
- Home:
- LanceDB: index.md
- 🏃🏼‍♂️ Quick start: basic.md
- 📚 Concepts:
- Vector search: concepts/vector_search.md
- Indexing: concepts/index_ivfpq.md
- Storage: concepts/storage.md
- Data management: concepts/data_management.md
- 🔨 Guides:
- Working with tables: guides/tables.md
- Building an ANN index: ann_indexes.md
- Vector Search: search.md
- Full-text search: fts.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
- Comparing Rerankers: hybrid_search/eval.md
- Airbnb financial data example: notebooks/hybrid_search.ipynb
- Filtering: sql.md
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
- Configuring Storage: guides/storage.md
- 🧬 Managing embeddings:
- Overview: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models: embeddings/default_embedding_functions.md
- User-defined embedding functions: embeddings/custom_embedding_function.md
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
- 🔌 Integrations:
- Tools and data formats: integrations/index.md
- Pandas and PyArrow: python/pandas_and_pyarrow.md
- Polars: python/polars_arrow.md
- DuckDB: python/duckdb.md
- LangChain 🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
- LangChain JS/TS 🔗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
- LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
- Pydantic: python/pydantic.md
- Voxel51: integrations/voxel51.md
- PromptTools: integrations/prompttools.md
- 🎯 Examples:
- Overview: examples/index.md
- 🐍 Python:
- Overview: examples/examples_python.md
- Home:
- LanceDB: index.md
- 🏃🏼‍♂️ Quick start: basic.md
- 📚 Concepts:
- Vector search: concepts/vector_search.md
- Indexing: concepts/index_ivfpq.md
- Storage: concepts/storage.md
- Data management: concepts/data_management.md
- 🔨 Guides:
- Working with tables: guides/tables.md
- Building an ANN index: ann_indexes.md
- Vector Search: search.md
- Full-text search: fts.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
- Comparing Rerankers: hybrid_search/eval.md
- Airbnb financial data example: notebooks/hybrid_search.ipynb
- Reranking:
- Quickstart: reranking/index.md
- Cohere Reranker: reranking/cohere.md
- Linear Combination Reranker: reranking/linear_combination.md
- Cross Encoder Reranker: reranking/cross_encoder.md
- ColBERT Reranker: reranking/colbert.md
- OpenAI Reranker: reranking/openai.md
- Building Custom Rerankers: reranking/custom_reranker.md
- Filtering: sql.md
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
- Configuring Storage: guides/storage.md
- Sync -> Async Migration Guide: migration.md
- 🧬 Managing embeddings:
- Overview: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models: embeddings/default_embedding_functions.md
- User-defined embedding functions: embeddings/custom_embedding_function.md
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
- 🔌 Integrations:
- Tools and data formats: integrations/index.md
- Pandas and PyArrow: python/pandas_and_pyarrow.md
- Polars: python/polars_arrow.md
- DuckDB: python/duckdb.md
- LangChain 🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
- LangChain JS/TS 🔗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
- LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
- Pydantic: python/pydantic.md
- Voxel51: integrations/voxel51.md
- PromptTools: integrations/prompttools.md
- 🎯 Examples:
- Overview: examples/index.md
- 🐍 Python:
- Overview: examples/examples_python.md
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
- Example - Calculate CLIP Embeddings with Roboflow Inference: examples/image_embeddings_roboflow.md
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- 👾 JavaScript:
- Overview: examples/examples_js.md
- Serverless Website Chatbot: examples/serverless_website_chatbot.md
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- 🦀 Rust:
- Overview: examples/examples_rust.md
- 🔧 CLI & Config: cli_config.md
- 💭 FAQs: faq.md
- ⚙️ API reference:
- 🐍 Python: python/python.md
- 👾 JavaScript: javascript/modules.md
- 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
- ☁️ LanceDB Cloud:
- Overview: cloud/index.md
- API reference:
- 🐍 Python: python/saas-python.md
- 👾 JavaScript: javascript/saas-modules.md
- Quick start: basic.md
- Concepts:
- Vector search: concepts/vector_search.md
- Indexing: concepts/index_ivfpq.md
- Storage: concepts/storage.md
- Data management: concepts/data_management.md
- Guides:
- Working with tables: guides/tables.md
- Building an ANN index: ann_indexes.md
- Vector Search: search.md
- Full-text search: fts.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
- Comparing Rerankers: hybrid_search/eval.md
- Airbnb financial data example: notebooks/hybrid_search.ipynb
- Reranking:
- Quickstart: reranking/index.md
- Cohere Reranker: reranking/cohere.md
- Linear Combination Reranker: reranking/linear_combination.md
- Cross Encoder Reranker: reranking/cross_encoder.md
- ColBERT Reranker: reranking/colbert.md
- OpenAI Reranker: reranking/openai.md
- Building Custom Rerankers: reranking/custom_reranker.md
- Filtering: sql.md
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
- Configuring Storage: guides/storage.md
- Sync -> Async Migration Guide: migration.md
- Managing Embeddings:
- Overview: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models: embeddings/default_embedding_functions.md
- User-defined embedding functions: embeddings/custom_embedding_function.md
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
- Integrations:
- Overview: integrations/index.md
- Pandas and PyArrow: python/pandas_and_pyarrow.md
- Polars: python/polars_arrow.md
- DuckDB: python/duckdb.md
- LangChain 🦜️🔗↗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
- LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
- LlamaIndex 🦙↗: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
- Pydantic: python/pydantic.md
- Voxel51: integrations/voxel51.md
- PromptTools: integrations/prompttools.md
- Examples:
- examples/index.md
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
- Example - Calculate CLIP Embeddings with Roboflow Inference: examples/image_embeddings_roboflow.md
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- 👾 JavaScript:
- Overview: examples/examples_js.md
- Serverless Website Chatbot: examples/serverless_website_chatbot.md
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
- YouTube Transcript Search (JS): examples/youtube_transcript_bot_with_nodejs.md
- Serverless Chatbot from any website: examples/serverless_website_chatbot.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- 🦀 Rust:
- Overview: examples/examples_rust.md
- 🔧 CLI & Config: cli_config.md
- 💭 FAQs: faq.md
- ⚙️ API reference:
- 🐍 Python: python/python.md
- 👾 JavaScript: javascript/modules.md
- 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
- ☁️ LanceDB Cloud:
- Overview: cloud/index.md
- API reference:
- 🐍 Python: python/saas-python.md
- 👾 JavaScript: javascript/saas-modules.md
- Quick start: basic.md
- Concepts:
- Vector search: concepts/vector_search.md
- Indexing: concepts/index_ivfpq.md
- Storage: concepts/storage.md
- Data management: concepts/data_management.md
- Guides:
- Working with tables: guides/tables.md
- Building an ANN index: ann_indexes.md
- Vector Search: search.md
- Full-text search: fts.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
- Comparing Rerankers: hybrid_search/eval.md
- Airbnb financial data example: notebooks/hybrid_search.ipynb
- Filtering: sql.md
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
- Configuring Storage: guides/storage.md
- Managing Embeddings:
- Overview: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models: embeddings/default_embedding_functions.md
- User-defined embedding functions: embeddings/custom_embedding_function.md
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
- Integrations:
- Overview: integrations/index.md
- Pandas and PyArrow: python/pandas_and_pyarrow.md
- Polars: python/polars_arrow.md
- DuckDB: python/duckdb.md
- LangChain 🦜️🔗↗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
- LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb
- LlamaIndex 🦙↗: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
- Pydantic: python/pydantic.md
- Voxel51: integrations/voxel51.md
- PromptTools: integrations/prompttools.md
- Examples:
- examples/index.md
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- YouTube Transcript Search (JS): examples/youtube_transcript_bot_with_nodejs.md
- Serverless Chatbot from any website: examples/serverless_website_chatbot.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- API reference:
- Overview: api_reference.md
- Python: python/python.md
- Javascript: javascript/modules.md
- Rust: https://docs.rs/lancedb/latest/lancedb/index.html
- LanceDB Cloud:
- Overview: cloud/index.md
- API reference:
- 🐍 Python: python/saas-python.md
- 👾 JavaScript: javascript/saas-modules.md
- API reference:
- Overview: api_reference.md
- Python: python/python.md
- Javascript: javascript/modules.md
- Rust: https://docs.rs/lancedb/latest/lancedb/index.html
- LanceDB Cloud:
- Overview: cloud/index.md
- API reference:
- 🐍 Python: python/saas-python.md
- 👾 JavaScript: javascript/saas-modules.md
extra_css:
- styles/global.css
@@ -221,4 +239,4 @@ extra_javascript:
extra:
analytics:
provider: google
property: G-B7NFM40W74

Seven binary image assets changed (not shown). Sizes before → after: 104 → 147 KiB, 83 → 98 KiB, 131 → 204 KiB, 82 → 112 KiB, 113 → 217 KiB, 97 → 256 KiB, 6.7 → 20 KiB.


@@ -48,11 +48,20 @@
=== "Python"
```python
import lancedb
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
```
```python
--8<-- "python/python/tests/docs/test_basic.py:imports"
--8<-- "python/python/tests/docs/test_basic.py:connect"
--8<-- "python/python/tests/docs/test_basic.py:connect_async"
```
!!! note "Asynchronous Python API"
The asynchronous Python API is new and has some slight differences compared
to the synchronous API. Feel free to start using the asynchronous version.
Once all features have migrated we will start to move the synchronous API to
use the same syntax as the asynchronous API. To help with this migration we
have created a [migration guide](migration.md) detailing the differences.
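For orientation, a minimal sketch of the asynchronous flavor, assuming the `connect_async` entry point that the migration guide describes:
```python
import asyncio

import lancedb

async def main():
    # The async client ships in the same lancedb package.
    db = await lancedb.connect_async("data/sample-lancedb")
    print(await db.table_names())

asyncio.run(main())
```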
=== "Typescript"
@@ -82,15 +91,14 @@ If you need a reminder of the uri, you can call `db.uri()`.
### Create a table from initial data
If you have data to insert into the table at creation time, you can simultaneously create a
table and insert the data into it. The schema of the data will be used as the schema of the
table.
=== "Python"
```python
tbl = db.create_table("my_table",
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
--8<-- "python/python/tests/docs/test_basic.py:create_table"
--8<-- "python/python/tests/docs/test_basic.py:create_table_async"
```
If the table already exists, LanceDB will raise an error by default.
@@ -100,10 +108,8 @@ table.
You can also pass in a pandas DataFrame directly:
```python
import pandas as pd
df = pd.DataFrame([{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
tbl = db.create_table("table_from_df", data=df)
--8<-- "python/python/tests/docs/test_basic.py:create_table_pandas"
--8<-- "python/python/tests/docs/test_basic.py:create_table_async_pandas"
```
=== "Typescript"
@@ -138,15 +144,14 @@ table.
Sometimes you may not have the data to insert into the table at creation time.
In this case, you can create an empty table and specify the schema, so that you can add
data to the table at a later time (as long as it conforms to the schema). This is
similar to a `CREATE TABLE` statement in SQL.
=== "Python"
```python
import pyarrow as pa
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
tbl = db.create_table("empty_table", schema=schema)
--8<-- "python/python/tests/docs/test_basic.py:create_empty_table"
--8<-- "python/python/tests/docs/test_basic.py:create_empty_table_async"
```
=== "Typescript"
@@ -168,7 +173,8 @@ Once created, you can open a table as follows:
=== "Python"
```python
tbl = db.open_table("my_table")
--8<-- "python/python/tests/docs/test_basic.py:open_table"
--8<-- "python/python/tests/docs/test_basic.py:open_table_async"
```
=== "Typescript"
@@ -188,7 +194,8 @@ If you forget the name of your table, you can always get a listing of all table
=== "Python"
```python
print(db.table_names())
--8<-- "python/python/tests/docs/test_basic.py:table_names"
--8<-- "python/python/tests/docs/test_basic.py:table_names_async"
```
=== "Javascript"
@@ -210,15 +217,8 @@ After a table has been created, you can always add more data to it as follows:
=== "Python"
```python
# Option 1: Add a list of dicts to a table
data = [{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}]
tbl.add(data)
# Option 2: Add a pandas DataFrame to a table
df = pd.DataFrame(data)
tbl.add(data)
--8<-- "python/python/tests/docs/test_basic.py:add_data"
--8<-- "python/python/tests/docs/test_basic.py:add_data_async"
```
=== "Typescript"
@@ -240,7 +240,8 @@ Once you've embedded the query, you can find its nearest neighbors as follows:
=== "Python"
```python
tbl.search([100, 100]).limit(2).to_pandas()
--8<-- "python/python/tests/docs/test_basic.py:vector_search"
--8<-- "python/python/tests/docs/test_basic.py:vector_search_async"
```
This returns a pandas DataFrame with the results.
@@ -274,7 +275,8 @@ LanceDB allows you to create an ANN index on a table as follows:
=== "Python"
```py
tbl.create_index()
--8<-- "python/python/tests/docs/test_basic.py:create_index"
--8<-- "python/python/tests/docs/test_basic.py:create_index_async"
```
=== "Typescript"
@@ -286,15 +288,15 @@ LanceDB allows you to create an ANN index on a table as follows:
=== "Rust"
```rust
--8<-- "rust/lancedb/examples/simple.rs:create_index"
--8<-- "rust/lancedb/examples/simple.rs:create_index"
```
!!! note "Why do I need to create an index manually?"
LanceDB does not automatically create the ANN index for two reasons. The first is that it's optimized
for really fast retrievals via a disk-based index, and the second is that data and query workloads can
be very diverse, so there's no one-size-fits-all index configuration. LanceDB provides many parameters
to fine-tune index size, query latency and accuracy. See the section on
[ANN indexes](ann_indexes.md) for more details.
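For example, a sketch of the kind of tuning the synchronous API exposes (the values here are illustrative, not recommendations):
```python
# Assumes `tbl` is the table from the examples above.
# IVF-PQ knobs: num_partitions trades query speed against recall per probe,
# num_sub_vectors trades accuracy against index size.
tbl.create_index(
    metric="L2",
    num_partitions=256,
    num_sub_vectors=96,
)
```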
## Delete rows from a table
@@ -305,7 +307,8 @@ This can delete any number of rows that match the filter.
=== "Python"
```python
tbl.delete('item = "fizz"')
--8<-- "python/python/tests/docs/test_basic.py:delete_rows"
--8<-- "python/python/tests/docs/test_basic.py:delete_rows_async"
```
=== "Typescript"
@@ -322,7 +325,7 @@ This can delete any number of rows that match the filter.
The deletion predicate is a SQL expression that supports the same expressions
as the `where()` clause (`only_if()` in Rust) on a search. They can be as
simple or complex as needed. To see what expressions are supported, see the
[SQL filters](sql.md) section.
=== "Python"
@@ -344,7 +347,8 @@ Use the `drop_table()` method on the database to remove a table.
=== "Python"
```python
db.drop_table("my_table")
--8<-- "python/python/tests/docs/test_basic.py:drop_table"
--8<-- "python/python/tests/docs/test_basic.py:drop_table_async"
```
This permanently removes the table and is not recoverable, unlike deleting rows.


@@ -19,27 +19,163 @@ Allows you to set parameters when registering a `sentence-transformers` object.
| `normalize` | `bool` | `True` | Whether to normalize the input text before feeding it to the model |
```python
import lancedb
from lancedb.embeddings import EmbeddingFunctionRegistry

db = lancedb.connect("/tmp/db")
registry = EmbeddingFunctionRegistry.get_instance()
func = registry.get("sentence-transformers").create(device="cpu")
```
??? "Check out available sentence-transformer models here!"
```markdown
- sentence-transformers/all-MiniLM-L12-v2
- sentence-transformers/paraphrase-mpnet-base-v2
- sentence-transformers/gtr-t5-base
- sentence-transformers/LaBSE
- sentence-transformers/all-MiniLM-L6-v2
- sentence-transformers/bert-base-nli-max-tokens
- sentence-transformers/bert-base-nli-mean-tokens
- sentence-transformers/bert-base-nli-stsb-mean-tokens
- sentence-transformers/bert-base-wikipedia-sections-mean-tokens
- sentence-transformers/bert-large-nli-cls-token
- sentence-transformers/bert-large-nli-max-tokens
- sentence-transformers/bert-large-nli-mean-tokens
- sentence-transformers/bert-large-nli-stsb-mean-tokens
- sentence-transformers/distilbert-base-nli-max-tokens
- sentence-transformers/distilbert-base-nli-mean-tokens
- sentence-transformers/distilbert-base-nli-stsb-mean-tokens
- sentence-transformers/distilroberta-base-msmarco-v1
- sentence-transformers/distilroberta-base-msmarco-v2
- sentence-transformers/nli-bert-base-cls-pooling
- sentence-transformers/nli-bert-base-max-pooling
- sentence-transformers/nli-bert-base
- sentence-transformers/nli-bert-large-cls-pooling
- sentence-transformers/nli-bert-large-max-pooling
- sentence-transformers/nli-bert-large
- sentence-transformers/nli-distilbert-base-max-pooling
- sentence-transformers/nli-distilbert-base
- sentence-transformers/nli-roberta-base
- sentence-transformers/nli-roberta-large
- sentence-transformers/roberta-base-nli-mean-tokens
- sentence-transformers/roberta-base-nli-stsb-mean-tokens
- sentence-transformers/roberta-large-nli-mean-tokens
- sentence-transformers/roberta-large-nli-stsb-mean-tokens
- sentence-transformers/stsb-bert-base
- sentence-transformers/stsb-bert-large
- sentence-transformers/stsb-distilbert-base
- sentence-transformers/stsb-roberta-base
- sentence-transformers/stsb-roberta-large
- sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens
- sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens
- sentence-transformers/xlm-r-base-en-ko-nli-ststb
- sentence-transformers/xlm-r-bert-base-nli-mean-tokens
- sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens
- sentence-transformers/xlm-r-large-en-ko-nli-ststb
- sentence-transformers/bert-base-nli-cls-token
- sentence-transformers/all-distilroberta-v1
- sentence-transformers/multi-qa-MiniLM-L6-dot-v1
- sentence-transformers/multi-qa-distilbert-cos-v1
- sentence-transformers/multi-qa-distilbert-dot-v1
- sentence-transformers/multi-qa-mpnet-base-cos-v1
- sentence-transformers/multi-qa-mpnet-base-dot-v1
- sentence-transformers/nli-distilroberta-base-v2
- sentence-transformers/all-MiniLM-L6-v1
- sentence-transformers/all-mpnet-base-v1
- sentence-transformers/all-mpnet-base-v2
- sentence-transformers/all-roberta-large-v1
- sentence-transformers/allenai-specter
- sentence-transformers/average_word_embeddings_glove.6B.300d
- sentence-transformers/average_word_embeddings_glove.840B.300d
- sentence-transformers/average_word_embeddings_komninos
- sentence-transformers/average_word_embeddings_levy_dependency
- sentence-transformers/clip-ViT-B-32-multilingual-v1
- sentence-transformers/clip-ViT-B-32
- sentence-transformers/distilbert-base-nli-stsb-quora-ranking
- sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking
- sentence-transformers/distilroberta-base-paraphrase-v1
- sentence-transformers/distiluse-base-multilingual-cased-v1
- sentence-transformers/distiluse-base-multilingual-cased-v2
- sentence-transformers/distiluse-base-multilingual-cased
- sentence-transformers/facebook-dpr-ctx_encoder-multiset-base
- sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base
- sentence-transformers/facebook-dpr-question_encoder-multiset-base
- sentence-transformers/facebook-dpr-question_encoder-single-nq-base
- sentence-transformers/gtr-t5-large
- sentence-transformers/gtr-t5-xl
- sentence-transformers/gtr-t5-xxl
- sentence-transformers/msmarco-MiniLM-L-12-v3
- sentence-transformers/msmarco-MiniLM-L-6-v3
- sentence-transformers/msmarco-MiniLM-L12-cos-v5
- sentence-transformers/msmarco-MiniLM-L6-cos-v5
- sentence-transformers/msmarco-bert-base-dot-v5
- sentence-transformers/msmarco-bert-co-condensor
- sentence-transformers/msmarco-distilbert-base-dot-prod-v3
- sentence-transformers/msmarco-distilbert-base-tas-b
- sentence-transformers/msmarco-distilbert-base-v2
- sentence-transformers/msmarco-distilbert-base-v3
- sentence-transformers/msmarco-distilbert-base-v4
- sentence-transformers/msmarco-distilbert-cos-v5
- sentence-transformers/msmarco-distilbert-dot-v5
- sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned
- sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch
- sentence-transformers/msmarco-distilroberta-base-v2
- sentence-transformers/msmarco-roberta-base-ance-firstp
- sentence-transformers/msmarco-roberta-base-v2
- sentence-transformers/msmarco-roberta-base-v3
- sentence-transformers/multi-qa-MiniLM-L6-cos-v1
- sentence-transformers/nli-mpnet-base-v2
- sentence-transformers/nli-roberta-base-v2
- sentence-transformers/nq-distilbert-base-v1
- sentence-transformers/paraphrase-MiniLM-L12-v2
- sentence-transformers/paraphrase-MiniLM-L3-v2
- sentence-transformers/paraphrase-MiniLM-L6-v2
- sentence-transformers/paraphrase-TinyBERT-L6-v2
- sentence-transformers/paraphrase-albert-base-v2
- sentence-transformers/paraphrase-albert-small-v2
- sentence-transformers/paraphrase-distilroberta-base-v1
- sentence-transformers/paraphrase-distilroberta-base-v2
- sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
- sentence-transformers/paraphrase-multilingual-mpnet-base-v2
- sentence-transformers/paraphrase-xlm-r-multilingual-v1
- sentence-transformers/quora-distilbert-base
- sentence-transformers/quora-distilbert-multilingual
- sentence-transformers/sentence-t5-base
- sentence-transformers/sentence-t5-large
- sentence-transformers/sentence-t5-xxl
- sentence-transformers/sentence-t5-xl
- sentence-transformers/stsb-distilroberta-base-v2
- sentence-transformers/stsb-mpnet-base-v2
- sentence-transformers/stsb-roberta-base-v2
- sentence-transformers/stsb-xlm-r-multilingual
- sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1
- sentence-transformers/clip-ViT-L-14
- sentence-transformers/clip-ViT-B-16
- sentence-transformers/use-cmlm-multilingual
- sentence-transformers/all-MiniLM-L12-v1
```
```python
from lancedb.pydantic import LanceModel, Vector

class Words(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()
```
!!! info
You can also load many other model architectures from the library. For example, models from sources such as BAAI, Nomic, Salesforce Research, etc.
See this HF hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers).
```python
table = db.create_table("words", schema=Words)
table.add(
    [
        {"text": "hello world"},
        {"text": "goodbye world"},
    ]
)
```
!!! note "BAAI Embeddings example"
Here is an example that uses the BAAI embedding model from the HuggingFace Hub [supported models](https://huggingface.co/models?library=sentence-transformers).
```python
db = lancedb.connect("/tmp/db")
registry = EmbeddingFunctionRegistry.get_instance()
model = registry.get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu")

class Words(LanceModel):
    text: str = model.SourceField()
    vector: Vector(model.ndims()) = model.VectorField()

table = db.create_table("words", schema=Words)
table.add(
    [
        {"text": "hello world"},
        {"text": "goodbye world"},
    ]
)

query = "greetings"
actual = table.search(query).limit(1).to_pydantic(Words)[0]
print(actual.text)
```
Visit sentence-transformers [HuggingFace HUB](https://huggingface.co/sentence-transformers) page for more information on the available models.
query = "greetings"
actual = table.search(query).limit(1).to_pydantic(Words)[0]
print(actual.text)
```
### OpenAI embeddings
LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating an instance:
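For illustration, a minimal sketch of using the registered `openai` function, following the same pattern as the sentence-transformers example above (the schema and table names are arbitrary):
```python
import lancedb
from lancedb.embeddings import EmbeddingFunctionRegistry
from lancedb.pydantic import LanceModel, Vector

registry = EmbeddingFunctionRegistry.get_instance()
func = registry.get("openai").create()  # requires OPENAI_API_KEY in the environment

class Docs(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()

db = lancedb.connect("/tmp/db")
table = db.create_table("docs", schema=Docs)
table.add([{"text": "hello world"}])
```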


@@ -1,11 +1,79 @@
document.addEventListener("DOMContentLoaded", function () {
var script = document.createElement("script");
script.src = "https://widget.kapa.ai/kapa-widget.bundle.js";
script.setAttribute("data-website-id", "c5881fae-cec0-490b-b45e-d83d131d4f25");
script.setAttribute("data-project-name", "LanceDB");
script.setAttribute("data-project-color", "#000000");
script.setAttribute("data-project-logo", "https://avatars.githubusercontent.com/u/108903835?s=200&v=4");
script.setAttribute("data-modal-example-questions","Help me create an IVF_PQ index,How do I do an exhaustive search?,How do I create a LanceDB table?,Can I use my own embedding function?");
script.async = true;
document.head.appendChild(script);
});
// Creates an SVG robot icon (from Lucide)
function robotSVG() {
var svg = document.createElementNS("http://www.w3.org/2000/svg", "svg");
svg.setAttribute("width", "24");
svg.setAttribute("height", "24");
svg.setAttribute("viewBox", "0 0 24 24");
svg.setAttribute("fill", "none");
svg.setAttribute("stroke", "currentColor");
svg.setAttribute("stroke-width", "2");
svg.setAttribute("stroke-linecap", "round");
svg.setAttribute("stroke-linejoin", "round");
svg.setAttribute("class", "lucide lucide-bot-message-square");
var path1 = document.createElementNS("http://www.w3.org/2000/svg", "path");
path1.setAttribute("d", "M12 6V2H8");
svg.appendChild(path1);
var path2 = document.createElementNS("http://www.w3.org/2000/svg", "path");
path2.setAttribute("d", "m8 18-4 4V8a2 2 0 0 1 2-2h12a2 2 0 0 1 2 2v8a2 2 0 0 1-2 2Z");
svg.appendChild(path2);
var path3 = document.createElementNS("http://www.w3.org/2000/svg", "path");
path3.setAttribute("d", "M2 12h2");
svg.appendChild(path3);
var path4 = document.createElementNS("http://www.w3.org/2000/svg", "path");
path4.setAttribute("d", "M9 11v2");
svg.appendChild(path4);
var path5 = document.createElementNS("http://www.w3.org/2000/svg", "path");
path5.setAttribute("d", "M15 11v2");
svg.appendChild(path5);
var path6 = document.createElementNS("http://www.w3.org/2000/svg", "path");
path6.setAttribute("d", "M20 12h2");
svg.appendChild(path6);
return svg
}
// Creates the Fluidic Chatbot button
function fluidicButton() {
var btn = document.createElement("a");
btn.href = "https://asklancedb.com";
btn.target = "_blank";
btn.style.position = "fixed";
btn.style.fontWeight = "bold";
btn.style.fontSize = ".8rem";
btn.style.right = "10px";
btn.style.bottom = "10px";
btn.style.width = "80px";
btn.style.height = "80px";
btn.style.background = "linear-gradient(135deg, #7C5EFF 0%, #625eff 100%)";
btn.style.color = "white";
btn.style.borderRadius = "5px";
btn.style.display = "flex";
btn.style.flexDirection = "column";
btn.style.justifyContent = "center";
btn.style.alignItems = "center";
btn.style.zIndex = "1000";
btn.style.opacity = "0";
btn.style.boxShadow = "0 0 0 rgba(0, 0, 0, 0)";
btn.style.transition = "opacity 0.2s ease-in, box-shadow 0.2s ease-in";
setTimeout(function() {
btn.style.opacity = "1";
btn.style.boxShadow = "0 0 .2rem #0000001a,0 .2rem .4rem #0003"
}, 0);
return btn
}
document.addEventListener("DOMContentLoaded", function() {
var btn = fluidicButton()
btn.appendChild(robotSVG());
var text = document.createTextNode("Ask AI");
btn.appendChild(text);
document.body.appendChild(btn);
});


@@ -5,6 +5,9 @@ LanceDB supports both semantic and keyword-based search (also termed full-text s
## Hybrid search in LanceDB
You can perform hybrid search in LanceDB by combining the results of semantic and full-text search via a reranking algorithm of your choice. LanceDB provides multiple rerankers out of the box. However, you can always write a custom reranker if your use case needs more sophisticated logic.
!!! note
You need to create a full-text search index before performing a hybrid search. You can create a full-text search index using the `create_fts_index()` method of the table object.
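As a quick sketch of that prerequisite (the table and column names are illustrative):
```python
import lancedb

db = lancedb.connect("/tmp/db")
table = db.open_table("my_table")  # assumes a table with a "text" column and embeddings

# The FTS index must exist before the first hybrid query.
table.create_fts_index("text")

results = table.search("rebel", query_type="hybrid").limit(5).to_pandas()
```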
```python
import os
@@ -55,188 +58,7 @@ By default, LanceDB uses `LinearCombinationReranker(weight=0.7)` to combine and
## Available Rerankers
LanceDB provides a number of re-rankers out of the box. You can use any of these re-rankers by passing them to the `rerank()` method. Here's a list of available re-rankers:
LanceDB provides a number of re-rankers out of the box. You can use any of these re-rankers by passing them to the `rerank()` method. Visit the [rerankers](../reranking/) page for more information on each re-ranker.
### Linear Combination Reranker
This is the default re-ranker used by LanceDB. It combines the results of semantic and full-text search using a linear combination of the scores. The weight for the linear combination can be specified; it defaults to 0.7, i.e., 70% weight for semantic search and 30% weight for full-text search.
```python
from lancedb.rerankers import LinearCombinationReranker
reranker = LinearCombinationReranker(weight=0.3) # Use 0.3 as the weight for vector search
results = table.search("rebel", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```
### Arguments
----------------
* `weight`: `float`, default `0.7`:
The weight to use for the semantic search score. The weight for the full-text search score is `1 - weight`.
* `fill`: `float`, default `1.0`:
The score to give to results that are only in one of the two result sets. This is treated as a penalty, so a higher value means a lower score.
TODO: We should just hardcode this; it's pretty confusing as we invert scores to calculate the final score
* `return_score` : str, default `"relevance"`
options are "relevance" or "all"
The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score.
### Cohere Reranker
This re-ranker uses the [Cohere](https://cohere.ai/) API to combine the results of semantic and full-text search. You can use this re-ranker by passing `CohereReranker()` to the `rerank()` method. Note that you'll need to set the `COHERE_API_KEY` environment variable to use this re-ranker.
```python
from lancedb.rerankers import CohereReranker
reranker = CohereReranker()
results = table.search("vampire weekend", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```
### Arguments
----------------
* `model_name` : str, default `"rerank-english-v2.0"`
The name of the cross encoder model to use. Available cohere models are:
- rerank-english-v2.0
- rerank-multilingual-v2.0
* `column` : str, default `"text"`
The name of the column to use as input to the cross encoder model.
* `top_n` : int, default `None`
The number of results to return. If None, will return all results.
!!! Note
Only returns `_relevance_score`. Does not support `return_score = "all"`.
### Cross Encoder Reranker
This reranker uses the [Sentence Transformers](https://www.sbert.net/) library to combine the results of semantic and full-text search. You can use it by passing `CrossEncoderReranker()` to the `rerank()` method.
```python
from lancedb.rerankers import CrossEncoderReranker
reranker = CrossEncoderReranker()
results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```
### Arguments
----------------
* `model` : str, default `"cross-encoder/ms-marco-TinyBERT-L-6"`
The name of the cross encoder model to use. Available cross encoder models can be found [here](https://www.sbert.net/docs/pretrained_cross-encoders.html)
* `column` : str, default `"text"`
The name of the column to use as input to the cross encoder model.
* `device` : str, default `None`
The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu".
!!! Note
Only returns `_relevance_score`. Does not support `return_score = "all"`.
### ColBERT Reranker
This reranker uses the ColBERT model to combine the results of semantic and full-text search. You can use it by passing `ColbertReranker()` to the `rerank()` method.
The ColBERT reranker model calculates the relevance of the given docs against the query and doesn't take existing FTS and vector search scores into account, so it currently only supports `return_score="relevance"`. By default, it looks for a `text` column to rerank the results, but you can specify the column name to use as input to the cross encoder model as described below.
```python
from lancedb.rerankers import ColbertReranker
reranker = ColbertReranker()
results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```
### Arguments
----------------
* `model_name` : `str`, default `"colbert-ir/colbertv2.0"`
The name of the cross encoder model to use.
* `column` : `str`, default `"text"`
The name of the column to use as input to the cross encoder model.
* `return_score` : `str`, default `"relevance"`
options are `"relevance"` or `"all"`. Only `"relevance"` is supported for now.
!!! Note
Only returns `_relevance_score`. Does not support `return_score = "all"`.
### OpenAI Reranker
This reranker uses the OpenAI API to combine the results of semantic and full-text search. You can use it by passing `OpenaiReranker()` to the `rerank()` method.
!!! Note
This prompts a chat model to rerank results and is not a dedicated reranker model, so it should be treated as experimental.
!!! Tip
- You might run into token limits, so set the search `limit` based on your token limit.
- It is recommended to use gpt-4-turbo-preview, the default model; older models might lead to undesired behaviour.
```python
from lancedb.rerankers import OpenaiReranker
reranker = OpenaiReranker()
results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```
### Arguments
----------------
* `model_name` : `str`, default `"gpt-4-turbo-preview"`
The name of the cross encoder model to use.
* `column` : `str`, default `"text"`
The name of the column to use as input to the cross encoder model.
* `return_score` : `str`, default `"relevance"`
    Options are `"relevance"` or `"all"`. Only `"relevance"` is supported for now.
* `api_key` : `str`, default `None`
The API key to use. If None, will use the OPENAI_API_KEY environment variable.
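As a sketch, you can pass the key and model explicitly and cap the number of results fed to the chat model; the `limit` value below is illustrative:
```python
from lancedb.rerankers import OpenaiReranker

# "sk-..." is a placeholder, not a real key
reranker = OpenaiReranker(model_name="gpt-4-turbo-preview", api_key="sk-...")
results = (
    table.search("harmony hall", query_type="hybrid")
    .limit(10)  # keep the reranking prompt within the token budget
    .rerank(reranker=reranker)
    .to_pandas()
)
```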
## Building Custom Rerankers
You can build your own custom reranker by subclassing the `Reranker` class and implementing the `rerank_hybrid()` method. Here's a skeleton of a custom reranker that combines the results of semantic and full-text search.
The `Reranker` base interface comes with a `merge_results()` method that can be used to combine the results of semantic and full-text search. This is a vanilla merging algorithm that simply concatenates the results and removes the duplicates without taking the scores into consideration. It only keeps the first copy of the row encountered. This works well in cases that don't require the scores of semantic and full-text search to combine the results. If you want to use the scores or want to support `return_score="all"`, you'll need to implement your own merging algorithm.
```python
from lancedb.rerankers import Reranker
import pyarrow as pa
class MyReranker(Reranker):
    def __init__(self, param1, param2, return_score="relevance"):
        super().__init__(return_score)
        self.param1 = param1
        self.param2 = param2

    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table):
        # Use the built-in merging function
        combined_result = self.merge_results(vector_results, fts_results)
        # Do something with the combined results
        # ...
        # Return the combined results
        return combined_result
```
### Example of a Custom Reranker
For the sake of simplicity, let's build a custom reranker that just enhances the Cohere Reranker by accepting a filter query, and accepts other `CohereReranker` params as kwargs.
```python
from typing import List, Union

import pyarrow as pa
from lancedb.rerankers import CohereReranker

class ModifiedCohereReranker(CohereReranker):
    def __init__(self, filters: Union[str, List[str]], **kwargs):
        super().__init__(**kwargs)
        filters = filters if isinstance(filters, list) else [filters]
        self.filters = filters

    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table) -> pa.Table:
        combined_result = super().rerank_hybrid(query, vector_results, fts_results)
        df = combined_result.to_pandas()
        # Drop any row whose text matches one of the filters
        for filter in self.filters:
            df = df.query("not text.str.contains(@filter)")
        return pa.Table.from_pandas(df)
```
!!! tip
    The `vector_results` and `fts_results` are pyarrow tables. You can convert them to pandas dataframes using the `to_pandas()` method and perform any operations you want. After you are done, you can convert the dataframe back to a pyarrow table using the `pa.Table.from_pandas()` method and return it.
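A short usage sketch of the reranker above; the filter string is arbitrary:
```python
# Drop every result whose text mentions "goodbye"; other kwargs go to CohereReranker
reranker = ModifiedCohereReranker(filters="goodbye", api_key="key")
results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```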
## Custom Rerankers
You can also create custom rerankers by extending the base `Reranker` class. The custom reranker should implement the `rerank_hybrid()` method, which takes the results of semantic and full-text search and returns a reranked list of search results. Visit the [custom rerankers](../reranking/custom_reranker.md) page for more information on creating custom rerankers.

76
docs/src/migration.md Normal file
View File

@@ -0,0 +1,76 @@
# Rust-backed Client Migration Guide
In an effort to ensure all clients have the same set of capabilities we have begun migrating the
python and node clients onto a common Rust base library. In python, this new client is part of
the same lancedb package, exposed as an asynchronous client. Once the asynchronous client has
reached full functionality we will begin migrating the synchronous library to be a thin wrapper
around the asynchronous client.
This guide describes the differences between the two APIs and will hopefully assist users
who would like to migrate to the new API.
## Closeable Connections
The Connection now has a `close` method. You can call this when
you are done with the connection to eagerly free resources. Currently
this is limited to freeing/closing the HTTP connection for remote
connections. In the future we may add caching or other resources to
native connections so this is probably a good practice even if you
aren't using remote connections.
In addition, the connection can be used as a context manager which may
be a more convenient way to ensure the connection is closed.
```python
import lancedb
async def my_async_fn():
with await lancedb.connect_async("my_uri") as db:
print(await db.table_names())
```
It is not mandatory to call the `close` method. If you do not call it
then the connection will be closed when the object is garbage collected.
## Closeable Table
The Table now also has a `close` method, similar to the connection. This
can be used to eagerly free the cache used by a Table object. Like the
connection, it can be used as a context manager, and it is not mandatory
to call the `close` method.
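For example, a minimal sketch, assuming a table named `my_table` already exists in the database:
```python
import lancedb

async def my_async_fn():
    with await lancedb.connect_async("my_uri") as db:
        # "my_table" is a hypothetical table name for illustration
        with await db.open_table("my_table") as table:
            print(await table.count_rows())
```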
### Changes to Table APIs
- Previously `Table.schema` was a property. Now it is an async method.
- The method `Table.__len__` was removed and `len(table)` will no longer
work. Use `Table.count_rows` instead, as sketched below.
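In code, these two changes look roughly like this (a sketch; `table` is an `AsyncTable`):
```python
# Old synchronous API
schema = table.schema  # property
n = len(table)         # __len__

# New asynchronous API
schema = await table.schema()  # schema is now an async method
n = await table.count_rows()   # use count_rows instead of len()
```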
### Creating Indices
The `Table.create_index` method is now used for creating both vector indices
and scalar indices. It currently requires a column name to be specified (the
column to index). Vector index defaults are now smarter and scale better with
the size of the data.
To specify index configuration details you will need to specify which kind of
index you are using.
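A sketch of both kinds of index, assuming the `config` keyword argument and the index types listed in the API reference (`IvfPq`, `BTree`); the parameter values are illustrative only:
```python
from lancedb.index import BTree, IvfPq

# Vector index on the "vector" column
await table.create_index("vector", config=IvfPq(num_partitions=256, num_sub_vectors=16))

# Scalar index on a hypothetical "id" column
await table.create_index("id", config=BTree())
```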
### Querying
The `Table.search` method has been renamed to `AsyncTable.vector_search` for
clarity.
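A sketch of both query styles, assuming the builder methods (`where`, `limit`, `to_arrow`) documented on the async query classes:
```python
# Return (filtered) rows from the table
results = await table.query().where("price > 10").limit(5).to_arrow()

# Nearest-neighbor search, previously table.search(...)
results = await table.vector_search([0.1, 0.2, 0.3]).limit(5).to_arrow()
```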
## Features not yet supported
The following features are not yet supported by the asynchronous API. However,
we plan to support them soon.
- You cannot specify an embedding function when creating or opening a table.
You must calculate embeddings yourself if using the asynchronous API
- The merge insert operation is not supported in the asynchronous API
- Cleanup / compact / optimize indices are not supported in the asynchronous API
- add / alter columns is not supported in the asynchronous API
- The asynchronous API does not yet support any full text search or reranking
search
- Remote connections to LanceDB Cloud are not yet supported.
- The method `Table.head` is not yet supported.

View File

@@ -8,17 +8,20 @@ This section contains the API reference for the OSS Python API.
pip install lancedb
```
## Connection
The following methods describe the synchronous API client. There
is also an [asynchronous API client](#connections-asynchronous).
## Connections (Synchronous)
::: lancedb.connect
::: lancedb.db.DBConnection
## Table
## Tables (Synchronous)
::: lancedb.table.Table
## Querying
## Querying (Synchronous)
::: lancedb.query.Query
@@ -86,4 +89,42 @@ pip install lancedb
::: lancedb.rerankers.cross_encoder.CrossEncoderReranker
::: lancedb.rerankers.openai.OpenaiReranker
## Connections (Asynchronous)
Connections represent a connection to a LanceDB database and
can be used to create, list, or open tables.
::: lancedb.connect_async
::: lancedb.db.AsyncConnection
## Tables (Asynchronous)
Tables hold your actual data as a collection of records / rows.
::: lancedb.table.AsyncTable
## Indices (Asynchronous)
Indices can be created on a table to speed up queries. This section
lists the indices that LanceDB supports.
::: lancedb.index.BTree
::: lancedb.index.IvfPq
## Querying (Asynchronous)
Queries allow you to return data from your database. Basic queries can be
created with the [AsyncTable.query][lancedb.table.AsyncTable.query] method
to return the entire (typically filtered) table. Vector searches return the
rows nearest to a query vector and can be created with the
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search] method.
::: lancedb.query.AsyncQueryBase
::: lancedb.query.AsyncQuery
::: lancedb.query.AsyncVectorQuery

View File

@@ -0,0 +1,75 @@
# Cohere Reranker
This re-ranker uses the [Cohere](https://cohere.ai/) API to rerank the search results. You can use this re-ranker by passing `CohereReranker()` to the `rerank()` method. Note that you'll either need to set the `COHERE_API_KEY` environment variable or pass the `api_key` argument to use this re-ranker.
!!! note
Supported Query Types: Hybrid, Vector, FTS
```python
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import CohereReranker
embedder = get_registry().get("sentence-transformers").create()
db = lancedb.connect("~/.lancedb")
class Schema(LanceModel):
text: str = embedder.SourceField()
vector: Vector(embedder.ndims()) = embedder.VectorField()
data = [
{"text": "hello world"},
{"text": "goodbye world"}
]
tbl = db.create_table("test", schema=Schema, mode="overwrite")
tbl.add(data)
reranker = CohereReranker(api_key="key")
# Run vector search with a reranker
result = tbl.search("hello").rerank(reranker=reranker).to_list()
# Run FTS search with a reranker
result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
# Run hybrid search with a reranker
tbl.create_fts_index("text", replace=True)
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
```
Accepted Arguments
----------------
| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `model_name` | `str` | `"rerank-english-v2.0"` | The name of the reranker model to use. Available Cohere models are: rerank-english-v2.0, rerank-multilingual-v2.0 |
| `column` | `str` | `"text"` | The name of the column to use as input to the reranker model. |
| `top_n` | `int` | `None` | The number of results to return. If None, will return all results. |
| `api_key` | `str` | `None` | The API key for the Cohere API. If not provided, the `COHERE_API_KEY` environment variable is used. |
| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
## Supported Scores for each query type
You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
### Hybrid Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ❌ Not Supported | Would return the vector (`_distance`) and FTS (`score`) scores along with the hybrid search score (`_relevance_score`) |
### Vector Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the vector search score (`_distance`) along with the relevance score (`_relevance_score`) |
### FTS Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the FTS score (`score`) along with the relevance score (`_relevance_score`) |
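For example, a sketch that requests all scores from a vector search, which is supported per the table above:
```python
reranker = CohereReranker(api_key="key", return_score="all")
# The result keeps the vector score (_distance) alongside _relevance_score
result = tbl.search("hello").rerank(reranker=reranker).to_list()
```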

View File

@@ -0,0 +1,71 @@
# ColBERT Reranker
This re-ranker uses the ColBERT model to rerank the search results. You can use it by passing `ColbertReranker()` to the `rerank()` method.
!!! note
Supported Query Types: Hybrid, Vector, FTS
```python
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import ColbertReranker
embedder = get_registry().get("sentence-transformers").create()
db = lancedb.connect("~/.lancedb")
class Schema(LanceModel):
text: str = embedder.SourceField()
vector: Vector(embedder.ndims()) = embedder.VectorField()
data = [
{"text": "hello world"},
{"text": "goodbye world"}
]
tbl = db.create_table("test", schema=Schema, mode="overwrite")
tbl.add(data)
reranker = ColbertReranker()
# Run vector search with a reranker
result = tbl.search("hello").rerank(reranker=reranker).to_list()
# Run FTS search with a reranker
result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
# Run hybrid search with a reranker
tbl.create_fts_index("text", replace=True)
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
```
Accepted Arguments
----------------
| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `model_name` | `str` | `"colbert-ir/colbertv2.0"` | The name of the reranker model to use. |
| `column` | `str` | `"text"` | The name of the column to use as input to the ColBERT model. |
| `device` | `str` | `None` | The device to use for the ColBERT model. If None, will use "cuda" if available, otherwise "cpu". |
| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
## Supported Scores for each query type
You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
### Hybrid Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ❌ Not Supported | Would return the vector (`_distance`) and FTS (`score`) scores along with the hybrid search score (`_relevance_score`) |
### Vector Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the vector search score (`_distance`) along with the relevance score (`_relevance_score`) |
### FTS Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the FTS score (`score`) along with the relevance score (`_relevance_score`) |

View File

@@ -0,0 +1,70 @@
# Cross Encoder Reranker
This re-ranker uses Cross Encoder models from sentence-transformers to rerank the search results. You can use this re-ranker by passing `CrossEncoderReranker()` to the `rerank()` method.
!!! note
Supported Query Types: Hybrid, Vector, FTS
```python
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import CrossEncoderReranker
embedder = get_registry().get("sentence-transformers").create()
db = lancedb.connect("~/.lancedb")
class Schema(LanceModel):
text: str = embedder.SourceField()
vector: Vector(embedder.ndims()) = embedder.VectorField()
data = [
{"text": "hello world"},
{"text": "goodbye world"}
]
tbl = db.create_table("test", schema=Schema, mode="overwrite")
tbl.add(data)
reranker = CrossEncoderReranker()
# Run vector search with a reranker
result = tbl.search("hello").rerank(reranker=reranker).to_list()
# Run FTS search with a reranker
result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
# Run hybrid search with a reranker
tbl.create_fts_index("text", replace=True)
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
```
Accepted Arguments
----------------
| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `model_name` | `str` | `"cross-encoder/ms-marco-TinyBERT-L-6"` | The name of the reranker model to use. |
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
| `device` | `str` | `None` | The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". |
| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
## Supported Scores for each query type
You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
### Hybrid Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ❌ Not Supported | Would return the vector (`_distance`) and FTS (`score`) scores along with the hybrid search score (`_relevance_score`) |
### Vector Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the vector search score (`_distance`) along with the relevance score (`_relevance_score`) |
### FTS Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the FTS score (`score`) along with the relevance score (`_relevance_score`) |

View File

@@ -0,0 +1,89 @@
## Building Custom Rerankers
You can build your own custom reranker by subclassing the `Reranker` class and implementing the `rerank_hybrid()` method. Optionally, you can also implement the `rerank_vector()` and `rerank_fts()` methods if you want to support reranking for vector and FTS search separately.
Here's a skeleton of a custom reranker that combines the results of semantic and full-text search.
The `Reranker` base interface comes with a `merge_results()` method that can be used to combine the results of semantic and full-text search. This is a vanilla merging algorithm that simply concatenates the results and removes the duplicates without taking the scores into consideration. It only keeps the first copy of the row encountered. This works well in cases that don't require the scores of semantic and full-text search to combine the results. If you want to use the scores or want to support `return_score="all"`, you'll need to implement your own merging algorithm.
```python
from lancedb.rerankers import Reranker
import pyarrow as pa
class MyReranker(Reranker):
    def __init__(self, param1, param2, return_score="relevance"):
        super().__init__(return_score)
        self.param1 = param1
        self.param2 = param2

    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table):
        # Use the built-in merging function
        combined_result = self.merge_results(vector_results, fts_results)
        # Do something with the combined results
        # ...
        # Return the combined results
        return combined_result

    def rerank_vector(self, query: str, vector_results: pa.Table):
        # Do something with the vector results
        # ...
        # Return the vector results
        return vector_results

    def rerank_fts(self, query: str, fts_results: pa.Table):
        # Do something with the FTS results
        # ...
        # Return the FTS results
        return fts_results
```
### Example of a Custom Reranker
For the sake of simplicity, let's build a custom reranker that just enhances the Cohere Reranker by accepting a filter query, and accepts other `CohereReranker` params as kwargs.
```python
from typing import List, Union

import pyarrow as pa
from lancedb.rerankers import CohereReranker

class ModifiedCohereReranker(CohereReranker):
    def __init__(self, filters: Union[str, List[str]], **kwargs):
        super().__init__(**kwargs)
        filters = filters if isinstance(filters, list) else [filters]
        self.filters = filters

    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table) -> pa.Table:
        combined_result = super().rerank_hybrid(query, vector_results, fts_results)
        df = combined_result.to_pandas()
        # Drop any row whose text matches one of the filters
        for filter in self.filters:
            df = df.query("not text.str.contains(@filter)")
        return pa.Table.from_pandas(df)

    def rerank_vector(self, query: str, vector_results: pa.Table) -> pa.Table:
        vector_results = super().rerank_vector(query, vector_results)
        df = vector_results.to_pandas()
        for filter in self.filters:
            df = df.query("not text.str.contains(@filter)")
        return pa.Table.from_pandas(df)

    def rerank_fts(self, query: str, fts_results: pa.Table) -> pa.Table:
        fts_results = super().rerank_fts(query, fts_results)
        df = fts_results.to_pandas()
        for filter in self.filters:
            df = df.query("not text.str.contains(@filter)")
        return pa.Table.from_pandas(df)
```
!!! tip
    The `vector_results` and `fts_results` are pyarrow tables. Learn more about pyarrow tables [here](https://arrow.apache.org/docs/python). They can be converted to other data types like pandas DataFrames, pydicts, pylists, etc.
    For example, you can convert them to pandas dataframes using the `to_pandas()` method and perform any operations you want. After you are done, you can convert the dataframe back to a pyarrow table using the `pa.Table.from_pandas()` method and return it.
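A short usage sketch of the reranker defined above, assuming a `tbl` created as in the other reranker examples; the filter string is arbitrary:
```python
reranker = ModifiedCohereReranker(filters="goodbye", api_key="key")
# Works for vector, FTS and hybrid search since all three rerank_* methods are implemented
result = tbl.search("hello").rerank(reranker=reranker).to_list()
```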

View File

@@ -0,0 +1,61 @@
Reranking is the process of reordering a list of items based on some criteria. In the context of search, it is used to reorder the search results returned by a search engine. This can be useful when the initial ranking of the search results is not satisfactory or when the user has provided additional information that can be used to improve the ranking of the results.
LanceDB comes with some built-in rerankers. Some of the rerankers that are available in LanceDB are:
| Reranker | Description | Supported Query Types |
| --- | --- | --- |
| `LinearCombinationReranker` | Reranks search results based on a linear combination of FTS and vector search scores | Hybrid |
| `CohereReranker` | Uses the Cohere Rerank API to rerank results | Vector, FTS, Hybrid |
| `CrossEncoderReranker` | Uses a cross-encoder model to rerank search results | Vector, FTS, Hybrid |
| `ColbertReranker` | Uses a ColBERT model to rerank search results | Vector, FTS, Hybrid |
| `OpenaiReranker`(Experimental) | Uses OpenAI's chat model to rerank search results | Vector, FTS, Hybrid |
## Using a Reranker
Using a reranker is optional for vector and FTS search. However, for hybrid search, a reranker is required. To use a reranker, create an instance of the reranker and pass it to the `rerank` method of the query builder.
```python
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import CohereReranker
embedder = get_registry().get("sentence-transformers").create()
db = lancedb.connect("~/.lancedb")
class Schema(LanceModel):
text: str = embedder.SourceField()
vector: Vector(embedder.ndims()) = embedder.VectorField()
data = [
{"text": "hello world"},
{"text": "goodbye world"}
]
tbl = db.create_table("test", schema=Schema, mode="overwrite")
tbl.add(data)
reranker = CohereReranker(api_key="your_api_key")
# Run vector search with a reranker
result = tbl.search("hello").rerank(reranker=reranker).to_list()
# Run FTS search with a reranker
result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
# Run hybrid search with a reranker
tbl.create_fts_index("text", replace=True)
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
```
## Available Rerankers
LanceDB comes with several built-in rerankers:
- [Cohere Reranker](./cohere.md)
- [Cross Encoder Reranker](./cross_encoder.md)
- [ColBERT Reranker](./colbert.md)
- [OpenAI Reranker](./openai.md)
- [Linear Combination Reranker](./linear_combination.md)
## Creating Custom Rerankers
LanceDB also allows you to create custom rerankers by extending the base `Reranker` class. The custom reranker should implement the `rerank_hybrid` method, which takes the results of semantic and full-text search and returns a reranked list of search results. This is covered in more detail in the [Creating Custom Rerankers](./custom_reranker.md) section.

View File

@@ -0,0 +1,52 @@
# Linear Combination Reranker
This is the default re-ranker used by LanceDB hybrid search. It combines the results of semantic and full-text search using a linear combination of the scores. The weight for the linear combination can be specified; it defaults to 0.7, i.e., 70% weight for semantic search and 30% weight for full-text search.
!!! note
Supported Query Types: Hybrid
```python
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import LinearCombinationReranker
embedder = get_registry().get("sentence-transformers").create()
db = lancedb.connect("~/.lancedb")
class Schema(LanceModel):
text: str = embedder.SourceField()
vector: Vector(embedder.ndims()) = embedder.VectorField()
data = [
{"text": "hello world"},
{"text": "goodbye world"}
]
tbl = db.create_table("test", schema=Schema, mode="overwrite")
tbl.add(data)
reranker = LinearCombinationReranker()
# Run hybrid search with a reranker
tbl.create_fts_index("text", replace=True)
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
```
Accepted Arguments
----------------
| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `weight` | `float` | `0.7` | The weight to use for the semantic search score. The weight for the full-text search score is `1 - weight`. |
| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
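As a sketch, the blend can be shifted toward full-text search; conceptually the relevance score is `weight * vector_score + (1 - weight) * fts_score` after the individual scores are normalized:
```python
from lancedb.rerankers import LinearCombinationReranker

# 30% weight for the semantic score, 70% for the FTS score; also return the raw scores
reranker = LinearCombinationReranker(weight=0.3, return_score="all")
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
```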
## Supported Scores for each query type
You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
### Hybrid Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the vector (`_distance`) and FTS (`score`) scores along with the hybrid search score (`_relevance_score`) |

View File

@@ -0,0 +1,73 @@
# OpenAI Reranker (Experimental)
This re-ranker uses an OpenAI chat model to rerank the search results. You can use it by passing `OpenaiReranker()` to the `rerank()` method.
!!! note
Supported Query Types: Hybrid, Vector, FTS
!!! warning
This re-ranker is experimental. OpenAI doesn't have a dedicated reranking model, so we are using the chat model for reranking.
```python
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import OpenaiReranker
embedder = get_registry().get("sentence-transformers").create()
db = lancedb.connect("~/.lancedb")
class Schema(LanceModel):
text: str = embedder.SourceField()
vector: Vector(embedder.ndims()) = embedder.VectorField()
data = [
{"text": "hello world"},
{"text": "goodbye world"}
]
tbl = db.create_table("test", schema=Schema, mode="overwrite")
tbl.add(data)
reranker = OpenaiReranker()
# Run vector search with a reranker
result = tbl.search("hello").rerank(reranker=reranker).to_list()
# Run FTS search with a reranker
result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
# Run hybrid search with a reranker
tbl.create_fts_index("text", replace=True)
result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
```
Accepted Arguments
----------------
| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `model_name` | `str` | `"gpt-4-turbo-preview"` | The name of the chat model to use. |
| `column` | `str` | `"text"` | The name of the column to use as input to the reranker. |
| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
| `api_key` | `str` | `None` | The API key to use. If None, will use the OPENAI_API_KEY environment variable. |
## Supported Scores for each query type
You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
### Hybrid Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ❌ Not Supported | Would return the vector (`_distance`) and FTS (`score`) scores along with the hybrid search score (`_relevance_score`) |
### Vector Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the vector search score (`_distance`) along with the relevance score (`_relevance_score`) |
### FTS Search
|`return_score`| Status | Description |
| --- | --- | --- |
| `relevance` | ✅ Supported | Returns only the `_relevance_score` column |
| `all` | ✅ Supported | Returns the FTS score (`score`) along with the relevance score (`_relevance_score`) |

View File

@@ -15,6 +15,7 @@ excluded_globs = [
"../src/ann_indexes.md",
"../src/basic.md",
"../src/hybrid_search/hybrid_search.md",
"../src/reranking/*.md",
]
python_prefix = "py"

56
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.4.13",
"version": "0.4.14",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.4.13",
"version": "0.4.14",
"cpu": [
"x64",
"arm64"
@@ -52,11 +52,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.13",
"@lancedb/vectordb-darwin-x64": "0.4.13",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.13",
"@lancedb/vectordb-linux-x64-gnu": "0.4.13",
"@lancedb/vectordb-win32-x64-msvc": "0.4.13"
"@lancedb/vectordb-darwin-arm64": "0.4.14",
"@lancedb/vectordb-darwin-x64": "0.4.14",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.14",
"@lancedb/vectordb-linux-x64-gnu": "0.4.14",
"@lancedb/vectordb-win32-x64-msvc": "0.4.14"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -334,9 +334,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.4.13",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.13.tgz",
"integrity": "sha512-JfroNCG8yKIU931Y+x8d0Fp8C9DHUSC5j+CjI+e5err7rTWtie4j3JbsXlWAnPFaFEOg0Xk3BWkSikCvhPGJGg==",
"version": "0.4.14",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.14.tgz",
"integrity": "sha512-fw6mf6UhFf4j2kKdFcw0P+SOiIqmRbt+YQSgDbF4BFU3OUSW0XyfETIj9cUMQbSwPFsofhlGp5BRpCd7W9noew==",
"cpu": [
"arm64"
],
@@ -345,22 +345,10 @@
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.4.13",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.13.tgz",
"integrity": "sha512-dG6IMvfpHpnHdbJ0UffzJ7cZfMiC02MjIi6YJzgx+hKz2UNXWNBIfTvvhqli85mZsGRXL1OYDdYv0K1YzNjXlA==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.4.13",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.13.tgz",
"integrity": "sha512-BRR1VzaMviXby7qmLm0axNZM8eUZF3ZqfvnDKdVRpC3LaRueD6pMXHuC2IUKaFkn7xktf+8BlDZb6foFNEj8bQ==",
"version": "0.4.14",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.14.tgz",
"integrity": "sha512-1+LFI8vU+f/lnGy1s3XCySuV4oj3ZUW03xtmedGBW8nv/Y/jWXP0OYJCRI72eu+dLIdu0tCPsEiu8Hl+o02t9g==",
"cpu": [
"arm64"
],
@@ -369,22 +357,10 @@
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.4.13",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.13.tgz",
"integrity": "sha512-WnekZ7ZMlria+NODZ6aBCljCFQSe2bBNUS9ZpyFl/Y1vHduSQPuBxM6V7vp2QubC0daq/rifgjDob89DF+x3xw==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.4.13",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.13.tgz",
"integrity": "sha512-3NDpMWBL2ksDHXAraXhowiLqQcNWM5bdbeHwze4+InYMD54hyQ2ODNc+4usxp63Nya9biVnFS27yXULqkzIEqQ==",
"version": "0.4.14",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.14.tgz",
"integrity": "sha512-fpuNMZ4aHSpZC3ztp5a0Wh18N6DpCx5EPWhS7bGA5XulGc0l+sZAJHfHwalx76ys//0Ns1z7cuKJhZpSa4SrdQ==",
"cpu": [
"x64"
],

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.4.13",
"version": "0.4.14",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -88,10 +88,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.13",
"@lancedb/vectordb-darwin-x64": "0.4.13",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.13",
"@lancedb/vectordb-linux-x64-gnu": "0.4.13",
"@lancedb/vectordb-win32-x64-msvc": "0.4.13"
"@lancedb/vectordb-darwin-arm64": "0.4.14",
"@lancedb/vectordb-darwin-x64": "0.4.14",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.14",
"@lancedb/vectordb-linux-x64-gnu": "0.4.14",
"@lancedb/vectordb-win32-x64-msvc": "0.4.14"
}
}

View File

@@ -24,6 +24,7 @@ import { RemoteConnection } from './remote'
import { Query } from './query'
import { isEmbeddingFunction } from './embedding/embedding_function'
import { type Literal, toSQL } from './util'
import { type HttpMiddleware } from './middleware'
const {
databaseNew,
@@ -302,6 +303,18 @@ export interface Connection {
* @param name The name of the table to drop.
*/
dropTable(name: string): Promise<void>
/**
* Instrument the behavior of this Connection with middleware.
*
* The middleware will be called in the order they are added.
*
* Currently this functionality is only supported for remote Connections.
*
* @param {HttpMiddleware} - Middleware which will instrument the Connection.
* @returns - this Connection instrumented by the passed middleware
*/
withMiddleware(middleware: HttpMiddleware): Connection
}
/**
@@ -541,6 +554,18 @@ export interface Table<T = number[]> {
* names (e.g. "a").
*/
dropColumns(columnNames: string[]): Promise<void>
/**
* Instrument the behavior of this Table with middleware.
*
* The middleware will be called in the order they are added.
*
* Currently this functionality is only supported for remote tables.
*
* @param {HttpMiddleware} - Middleware which will instrument the Table.
* @returns - this Table instrumented by the passed middleware
*/
withMiddleware(middleware: HttpMiddleware): Table<T>
}
/**
@@ -795,6 +820,10 @@ export class LocalConnection implements Connection {
async dropTable (name: string): Promise<void> {
await databaseDropTable.call(this._db, name)
}
withMiddleware (middleware: HttpMiddleware): Connection {
return this
}
}
export class LocalTable<T = number[]> implements Table<T> {
@@ -1105,6 +1134,10 @@ export class LocalTable<T = number[]> implements Table<T> {
async dropColumns (columnNames: string[]): Promise<void> {
return tableDropColumns.call(this._tbl, columnNames)
}
withMiddleware (middleware: HttpMiddleware): Table<T> {
return this
}
}
export interface CleanupStats {

58
node/src/middleware.ts Normal file
View File

@@ -0,0 +1,58 @@
// Copyright 2024 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* Middleware for Remote LanceDB Connection or Table
*/
export interface HttpMiddleware {
/**
* A callback that can be used to instrument the behavior of http requests to remote
* tables. It can be used to add headers, modify the request, or even short-circuit
* the request and return a response without making the request to the remote endpoint.
* It can also be used to modify the response from the remote endpoint.
*
 * @param {RemoteRequest} req - Request to the remote endpoint
* @param {onRemoteRequestNext} next - Callback to advance the middleware chain
*/
onRemoteRequest(
req: RemoteRequest,
next: (req: RemoteRequest) => Promise<RemoteResponse>,
): Promise<RemoteResponse>
};
export enum Method {
GET,
POST
}
/**
* A LanceDB Remote HTTP Request
*/
export interface RemoteRequest {
uri: string
method: Method
headers: Map<string, string>
params?: Map<string, string>
body?: any
}
/**
* A LanceDB Remote HTTP Response
*/
export interface RemoteResponse {
status: number
statusText: string
headers: Map<string, string>
body: () => Promise<any>
}

View File

@@ -12,13 +12,101 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import axios, { type AxiosResponse } from 'axios'
import axios, { type AxiosResponse, type ResponseType } from 'axios'
import { tableFromIPC, type Table as ArrowTable } from 'apache-arrow'
import { type RemoteResponse, type RemoteRequest, Method } from '../middleware'
interface HttpLancedbClientMiddleware {
onRemoteRequest(
req: RemoteRequest,
next: (req: RemoteRequest) => Promise<RemoteResponse>,
): Promise<RemoteResponse>
}
/**
* Invoke the middleware chain and at the end call the remote endpoint
*/
async function callWithMiddlewares (
req: RemoteRequest,
middlewares: HttpLancedbClientMiddleware[],
opts?: MiddlewareInvocationOptions
): Promise<RemoteResponse> {
async function call (
i: number,
req: RemoteRequest
): Promise<RemoteResponse> {
// if we have reached the end of the middleware chain, make the request
if (i > middlewares.length) {
const headers = Object.fromEntries(req.headers.entries())
const params = Object.fromEntries(req.params?.entries() ?? [])
const timeout = 10000
let res
if (req.method === Method.POST) {
res = await axios.post(
req.uri,
req.body,
{
headers,
params,
timeout,
responseType: opts?.responseType
}
)
} else {
res = await axios.get(
req.uri,
{
headers,
params,
timeout
}
)
}
return toLanceRes(res)
}
// call next middleware in chain
return await middlewares[i - 1].onRemoteRequest(
req,
async (req) => {
return await call(i + 1, req)
}
)
}
return await call(1, req)
}
interface MiddlewareInvocationOptions {
responseType?: ResponseType
}
/**
* Marshall the library response into a LanceDB response
*/
function toLanceRes (res: AxiosResponse): RemoteResponse {
const headers = new Map()
for (const h in res.headers) {
headers.set(h, res.headers[h])
}
return {
status: res.status,
statusText: res.statusText,
headers,
body: async () => {
return res.data
}
}
}
export class HttpLancedbClient {
private readonly _url: string
private readonly _apiKey: () => string
private readonly _middlewares: HttpLancedbClientMiddleware[]
public constructor (
url: string,
@@ -27,6 +115,7 @@ export class HttpLancedbClient {
) {
this._url = url
this._apiKey = () => apiKey
this._middlewares = []
}
get uri (): string {
@@ -43,74 +132,61 @@ export class HttpLancedbClient {
columns?: string[],
filter?: string
): Promise<ArrowTable<any>> {
const response = await axios.post(
`${this._url}/v1/table/${tableName}/query/`,
{
vector,
k,
nprobes,
refineFactor,
columns,
filter,
prefilter
},
{
headers: {
'Content-Type': 'application/json',
'x-api-key': this._apiKey(),
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
},
responseType: 'arraybuffer',
timeout: 10000
}
).catch((err) => {
console.error('error: ', err)
if (err.response === undefined) {
throw new Error(`Network Error: ${err.message as string}`)
}
return err.response
})
if (response.status !== 200) {
const errorData = new TextDecoder().decode(response.data)
throw new Error(
`Server Error, status: ${response.status as number}, ` +
`message: ${response.statusText as string}: ${errorData}`
)
}
const table = tableFromIPC(response.data)
const result = await this.post(
`/v1/table/${tableName}/query/`,
{
vector,
k,
nprobes,
refineFactor,
columns,
filter,
prefilter
},
undefined,
undefined,
'arraybuffer'
)
const table = tableFromIPC(await result.body())
return table
}
/**
 * Send GET request.
*/
public async get (path: string, params?: Record<string, string | number>): Promise<AxiosResponse> {
const response = await axios.get(
`${this._url}${path}`,
{
headers: {
'Content-Type': 'application/json',
'x-api-key': this._apiKey(),
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
},
params,
timeout: 10000
}
).catch((err) => {
public async get (path: string, params?: Record<string, string>): Promise<RemoteResponse> {
const req = {
uri: `${this._url}${path}`,
method: Method.GET,
headers: new Map(Object.entries({
'Content-Type': 'application/json',
'x-api-key': this._apiKey(),
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
})),
params: new Map(Object.entries(params ?? {}))
}
let response
try {
response = await callWithMiddlewares(req, this._middlewares)
return response
} catch (err: any) {
console.error('error: ', err)
if (err.response === undefined) {
throw new Error(`Network Error: ${err.message as string}`)
}
return err.response
})
response = toLanceRes(err.response)
}
if (response.status !== 200) {
const errorData = new TextDecoder().decode(response.data)
const errorData = new TextDecoder().decode(await response.body())
throw new Error(
`Server Error, status: ${response.status as number}, ` +
`message: ${response.statusText as string}: ${errorData}`
`Server Error, status: ${response.status}, ` +
`message: ${response.statusText}: ${errorData}`
)
}
return response
}
@@ -120,35 +196,65 @@ export class HttpLancedbClient {
public async post (
path: string,
data?: any,
params?: Record<string, string | number>,
content?: string | undefined
): Promise<AxiosResponse> {
const response = await axios.post(
`${this._url}${path}`,
data,
{
headers: {
'Content-Type': content ?? 'application/json',
'x-api-key': this._apiKey(),
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
},
params,
timeout: 30000
}
).catch((err) => {
params?: Record<string, string>,
content?: string | undefined,
responseType?: ResponseType | undefined
): Promise<RemoteResponse> {
const req = {
uri: `${this._url}${path}`,
method: Method.POST,
headers: new Map(Object.entries({
'Content-Type': content ?? 'application/json',
'x-api-key': this._apiKey(),
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
})),
params: new Map(Object.entries(params ?? {})),
body: data
}
let response
try {
response = await callWithMiddlewares(req, this._middlewares, { responseType })
// return response
} catch (err: any) {
console.error('error: ', err)
if (err.response === undefined) {
throw new Error(`Network Error: ${err.message as string}`)
}
return err.response
})
response = toLanceRes(err.response)
}
if (response.status !== 200) {
const errorData = new TextDecoder().decode(response.data)
const errorData = new TextDecoder().decode(await response.body())
throw new Error(
`Server Error, status: ${response.status as number}, ` +
`message: ${response.statusText as string}: ${errorData}`
`Server Error, status: ${response.status}, ` +
`message: ${response.statusText}: ${errorData}`
)
}
return response
}
/**
* Instrument this client with middleware
* @param mw - The middleware that instruments the client
* @returns - an instance of this client instrumented with the middleware
*/
public withMiddleware (mw: HttpLancedbClientMiddleware): HttpLancedbClient {
const wrapped = this.clone()
wrapped._middlewares.push(mw)
return wrapped
}
/**
* Make a clone of this client
*/
private clone (): HttpLancedbClient {
const clone = new HttpLancedbClient(this._url, this._apiKey(), this._dbName)
for (const mw of this._middlewares) {
clone._middlewares.push(mw)
}
return clone
}
}

View File

@@ -39,12 +39,13 @@ import {
fromTableToStreamBuffer
} from '../arrow'
import { toSQL } from '../util'
import { type HttpMiddleware } from '../middleware'
/**
* Remote connection.
*/
export class RemoteConnection implements Connection {
private readonly _client: HttpLancedbClient
private _client: HttpLancedbClient
private readonly _dbName: string
constructor (opts: ConnectionOptions) {
@@ -84,10 +85,11 @@ export class RemoteConnection implements Connection {
limit: number = 10
): Promise<string[]> {
const response = await this._client.get('/v1/table/', {
limit,
limit: `${limit}`,
page_token: pageToken
})
return response.data.tables
const body = await response.body()
return body.tables
}
async openTable (name: string): Promise<Table>
@@ -163,7 +165,7 @@ export class RemoteConnection implements Connection {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${res.data}`
`message: ${res.statusText}: ${await res.body()}`
)
}
@@ -177,6 +179,17 @@ export class RemoteConnection implements Connection {
async dropTable (name: string): Promise<void> {
await this._client.post(`/v1/table/${name}/drop/`)
}
withMiddleware (middleware: HttpMiddleware): Connection {
const wrapped = this.clone()
wrapped._client = wrapped._client.withMiddleware(middleware)
return wrapped
}
private clone (): RemoteConnection {
const clone: RemoteConnection = Object.create(RemoteConnection.prototype)
return Object.assign(clone, this)
}
}
export class RemoteQuery<T = number[]> extends Query<T> {
@@ -229,7 +242,7 @@ export class RemoteQuery<T = number[]> extends Query<T> {
// we are using extend until we have next next version release
// Table and Connection has both been refactored to interfaces
export class RemoteTable<T = number[]> implements Table<T> {
private readonly _client: HttpLancedbClient
private _client: HttpLancedbClient
private readonly _embeddings?: EmbeddingFunction<T>
private readonly _name: string
@@ -256,15 +269,15 @@ export class RemoteTable<T = number[]> implements Table<T> {
get schema (): Promise<any> {
return this._client
.post(`/v1/table/${this._name}/describe/`)
.then((res) => {
.then(async (res) => {
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${res.data}`
`message: ${res.statusText}: ${await res.body()}`
)
}
return res.data?.schema
return (await res.body())?.schema
})
}
@@ -320,7 +333,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${res.data}`
`message: ${res.statusText}: ${await res.body()}`
)
}
}
@@ -346,7 +359,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${res.data}`
`message: ${res.statusText}: ${await res.body()}`
)
}
return tbl.numRows
@@ -372,7 +385,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${res.data}`
`message: ${res.statusText}: ${await res.body()}`
)
}
return tbl.numRows
@@ -415,7 +428,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${res.data}`
`message: ${res.statusText}: ${await res.body()}`
)
}
}
@@ -436,14 +449,14 @@ export class RemoteTable<T = number[]> implements Table<T> {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${res.data}`
`message: ${res.statusText}: ${await res.body()}`
)
}
}
async countRows (): Promise<number> {
const result = await this._client.post(`/v1/table/${this._name}/describe/`)
return result.data?.stats?.num_rows
return (await result.body())?.stats?.num_rows
}
async delete (filter: string): Promise<void> {
@@ -476,7 +489,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
const results = await this._client.post(
`/v1/table/${this._name}/index/list/`
)
return results.data.indexes?.map((index: any) => ({
return (await results.body()).indexes?.map((index: any) => ({
columns: index.columns,
name: index.index_name,
uuid: index.index_uuid
@@ -487,9 +500,10 @@ export class RemoteTable<T = number[]> implements Table<T> {
const results = await this._client.post(
`/v1/table/${this._name}/index/${indexUuid}/stats/`
)
const body = await results.body()
return {
numIndexedRows: results.data.num_indexed_rows,
numUnindexedRows: results.data.num_unindexed_rows
numIndexedRows: body?.num_indexed_rows,
numUnindexedRows: body?.num_unindexed_rows
}
}
@@ -504,4 +518,15 @@ export class RemoteTable<T = number[]> implements Table<T> {
async dropColumns (columnNames: string[]): Promise<void> {
throw new Error('Drop columns is not yet supported in LanceDB Cloud.')
}
withMiddleware(middleware: HttpMiddleware): Table<T> {
const wrapped = this.clone()
wrapped._client = wrapped._client.withMiddleware(middleware)
return wrapped
}
private clone (): RemoteTable<T> {
const clone: RemoteTable<T> = Object.create(RemoteTable.prototype)
return Object.assign(clone, this)
}
}

View File

@@ -106,6 +106,9 @@ export class MakeArrowTableOptions {
* An enhanced version of the {@link makeTable} function from Apache Arrow
* that supports nested fields and embeddings columns.
*
 * (Typically you do not need to call this function; it will be called automatically
 * when creating a table or adding data to it.)
*
* This function converts an array of Record<String, any> (row-major JS objects)
* to an Arrow Table (a columnar structure)
*

View File

@@ -0,0 +1,2 @@
export { EmbeddingFunction, isEmbeddingFunction } from "./embedding_function";
export { OpenAIEmbeddingFunction } from "./openai";

View File

@@ -18,9 +18,34 @@ import {
ConnectionOptions,
} from "./native.js";
export { ConnectionOptions, WriteOptions, Query } from "./native.js";
export { Connection, CreateTableOptions } from "./connection";
export { Table, AddDataOptions } from "./table";
export {
WriteOptions,
WriteMode,
AddColumnsSql,
ColumnAlteration,
ConnectionOptions,
} from "./native.js";
export {
makeArrowTable,
MakeArrowTableOptions,
Data,
VectorColumnOptions,
} from "./arrow";
export {
Connection,
CreateTableOptions,
TableNamesOptions,
} from "./connection";
export {
ExecutableQuery,
Query,
QueryBase,
VectorQuery,
RecordBatchIterator,
} from "./query";
export { Index, IndexOptions, IvfPqOptions } from "./indices";
export { Table, AddDataOptions, IndexConfig, UpdateOptions } from "./table";
export * as embedding from "./embedding";
/**
* Connect to a LanceDB instance at the given URI.

View File

@@ -1,147 +0,0 @@
/* tslint:disable */
/* eslint-disable */
/* auto-generated by NAPI-RS */
/** A description of an index currently configured on a column */
export interface IndexConfig {
/** The type of the index */
indexType: string
/**
* The columns in the index
*
* Currently this is always an array of size 1. In the future there may
* be more columns to represent composite indices.
*/
columns: Array<string>
}
/**
* A definition of a column alteration. The alteration changes the column at
* `path` to have the new name `name`, to be nullable if `nullable` is true,
* and to have the data type `data_type`. At least one of `rename` or `nullable`
* must be provided.
*/
export interface ColumnAlteration {
/**
* The path to the column to alter. This is a dot-separated path to the column.
* If it is a top-level column then it is just the name of the column. If it is
* a nested column then it is the path to the column, e.g. "a.b.c" for a column
* `c` nested inside a column `b` nested inside a column `a`.
*/
path: string
/**
* The new name of the column. If not provided then the name will not be changed.
* This must be distinct from the names of all other columns in the table.
*/
rename?: string
/** Set the new nullability. Note that a nullable column cannot be made non-nullable. */
nullable?: boolean
}
/** A definition of a new column to add to a table. */
export interface AddColumnsSql {
/** The name of the new column. */
name: string
/**
* The values to populate the new column with, as a SQL expression.
* The expression can reference other columns in the table.
*/
valueSql: string
}
export interface ConnectionOptions {
apiKey?: string
hostOverride?: string
/**
* (For LanceDB OSS only): The interval, in seconds, at which to check for
* updates to the table from other processes. If None, then consistency is not
* checked. For performance reasons, this is the default. For strong
* consistency, set this to zero seconds. Then every read will check for
* updates from other processes. As a compromise, you can set this to a
* non-zero value for eventual consistency. If more than that interval
* has passed since the last check, then the table will be checked for updates.
* Note: this consistency only applies to read operations. Write operations are
* always consistent.
*/
readConsistencyInterval?: number
}
/** Write mode for writing a table. */
export const enum WriteMode {
Create = 'Create',
Append = 'Append',
Overwrite = 'Overwrite'
}
/** Write options when creating a Table. */
export interface WriteOptions {
mode?: WriteMode
}
export function connect(uri: string, options: ConnectionOptions): Promise<Connection>
export class Connection {
/** Create a new Connection instance from the given URI. */
static new(uri: string, options: ConnectionOptions): Promise<Connection>
display(): string
isOpen(): boolean
close(): void
/** List all tables in the dataset. */
tableNames(startAfter?: string | undefined | null, limit?: number | undefined | null): Promise<Array<string>>
/**
* Create table from a Apache Arrow IPC (file) buffer.
*
* Parameters:
* - name: The name of the table.
* - buf: The buffer containing the IPC file.
*
*/
createTable(name: string, buf: Buffer, mode: string): Promise<Table>
createEmptyTable(name: string, schemaBuf: Buffer, mode: string): Promise<Table>
openTable(name: string): Promise<Table>
/** Drop table with the name. Or raise an error if the table does not exist. */
dropTable(name: string): Promise<void>
}
export class Index {
static ivfPq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
static btree(): Index
}
/** Typescript-style Async Iterator over RecordBatches */
export class RecordBatchIterator {
next(): Promise<Buffer | null>
}
export class Query {
onlyIf(predicate: string): void
select(columns: Array<[string, string]>): void
limit(limit: number): void
nearestTo(vector: Float32Array): VectorQuery
execute(): Promise<RecordBatchIterator>
}
export class VectorQuery {
column(column: string): void
distanceType(distanceType: string): void
postfilter(): void
refineFactor(refineFactor: number): void
nprobes(nprobe: number): void
bypassVectorIndex(): void
onlyIf(predicate: string): void
select(columns: Array<[string, string]>): void
limit(limit: number): void
execute(): Promise<RecordBatchIterator>
}
export class Table {
display(): string
isOpen(): boolean
close(): void
/** Return Schema as empty Arrow IPC file. */
schema(): Promise<Buffer>
add(buf: Buffer, mode: string): Promise<void>
countRows(filter?: string | undefined | null): Promise<number>
delete(predicate: string): Promise<void>
createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise<void>
update(onlyIf: string | undefined | null, columns: Array<[string, string]>): Promise<void>
query(): Query
vectorSearch(vector: Float32Array): VectorQuery
addColumns(transforms: Array<AddColumnsSql>): Promise<void>
alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
dropColumns(columns: Array<string>): Promise<void>
version(): Promise<number>
checkout(version: number): Promise<void>
checkoutLatest(): Promise<void>
restore(): Promise<void>
listIndices(): Promise<Array<IndexConfig>>
}

View File

@@ -1,329 +0,0 @@
/* tslint:disable */
/* eslint-disable */
/* prettier-ignore */
/* auto-generated by NAPI-RS */
const { existsSync, readFileSync } = require('fs')
const { join } = require("path");
const { platform, arch } = process;
let nativeBinding = null;
let localFileExisted = false;
let loadError = null;
function isMusl() {
// For Node 10
if (!process.report || typeof process.report.getReport !== "function") {
try {
const lddPath = require("child_process")
.execSync("which ldd")
.toString()
.trim();
return readFileSync(lddPath, "utf8").includes("musl");
} catch (e) {
return true;
}
} else {
const { glibcVersionRuntime } = process.report.getReport().header;
return !glibcVersionRuntime;
}
}
switch (platform) {
case "android":
switch (arch) {
case "arm64":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.android-arm64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.android-arm64.node");
} else {
nativeBinding = require("lancedb-android-arm64");
}
} catch (e) {
loadError = e;
}
break;
case "arm":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.android-arm-eabi.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.android-arm-eabi.node");
} else {
nativeBinding = require("lancedb-android-arm-eabi");
}
} catch (e) {
loadError = e;
}
break;
default:
throw new Error(`Unsupported architecture on Android ${arch}`);
}
break;
case "win32":
switch (arch) {
case "x64":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.win32-x64-msvc.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.win32-x64-msvc.node");
} else {
nativeBinding = require("lancedb-win32-x64-msvc");
}
} catch (e) {
loadError = e;
}
break;
case "ia32":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.win32-ia32-msvc.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.win32-ia32-msvc.node");
} else {
nativeBinding = require("lancedb-win32-ia32-msvc");
}
} catch (e) {
loadError = e;
}
break;
case "arm64":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.win32-arm64-msvc.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.win32-arm64-msvc.node");
} else {
nativeBinding = require("lancedb-win32-arm64-msvc");
}
} catch (e) {
loadError = e;
}
break;
default:
throw new Error(`Unsupported architecture on Windows: ${arch}`);
}
break;
case "darwin":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.darwin-universal.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.darwin-universal.node");
} else {
nativeBinding = require("lancedb-darwin-universal");
}
break;
} catch {}
switch (arch) {
case "x64":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.darwin-x64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.darwin-x64.node");
} else {
nativeBinding = require("lancedb-darwin-x64");
}
} catch (e) {
loadError = e;
}
break;
case "arm64":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.darwin-arm64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.darwin-arm64.node");
} else {
nativeBinding = require("lancedb-darwin-arm64");
}
} catch (e) {
loadError = e;
}
break;
default:
throw new Error(`Unsupported architecture on macOS: ${arch}`);
}
break;
case "freebsd":
if (arch !== "x64") {
throw new Error(`Unsupported architecture on FreeBSD: ${arch}`);
}
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.freebsd-x64.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.freebsd-x64.node");
} else {
nativeBinding = require("lancedb-freebsd-x64");
}
} catch (e) {
loadError = e;
}
break;
case "linux":
switch (arch) {
case "x64":
if (isMusl()) {
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-x64-musl.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-x64-musl.node");
} else {
nativeBinding = require("lancedb-linux-x64-musl");
}
} catch (e) {
loadError = e;
}
} else {
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-x64-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-x64-gnu.node");
} else {
nativeBinding = require("lancedb-linux-x64-gnu");
}
} catch (e) {
loadError = e;
}
}
break;
case "arm64":
if (isMusl()) {
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-arm64-musl.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-arm64-musl.node");
} else {
nativeBinding = require("lancedb-linux-arm64-musl");
}
} catch (e) {
loadError = e;
}
} else {
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-arm64-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-arm64-gnu.node");
} else {
nativeBinding = require("lancedb-linux-arm64-gnu");
}
} catch (e) {
loadError = e;
}
}
break;
case "arm":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-arm-gnueabihf.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-arm-gnueabihf.node");
} else {
nativeBinding = require("lancedb-linux-arm-gnueabihf");
}
} catch (e) {
loadError = e;
}
break;
case "riscv64":
if (isMusl()) {
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-riscv64-musl.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-riscv64-musl.node");
} else {
nativeBinding = require("lancedb-linux-riscv64-musl");
}
} catch (e) {
loadError = e;
}
} else {
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-riscv64-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-riscv64-gnu.node");
} else {
nativeBinding = require("lancedb-linux-riscv64-gnu");
}
} catch (e) {
loadError = e;
}
}
break;
case "s390x":
localFileExisted = existsSync(
join(__dirname, "lancedb-nodejs.linux-s390x-gnu.node"),
);
try {
if (localFileExisted) {
nativeBinding = require("./lancedb-nodejs.linux-s390x-gnu.node");
} else {
nativeBinding = require("lancedb-linux-s390x-gnu");
}
} catch (e) {
loadError = e;
}
break;
default:
throw new Error(`Unsupported architecture on Linux: ${arch}`);
}
break;
default:
throw new Error(`Unsupported OS: ${platform}, architecture: ${arch}`);
}
if (!nativeBinding) {
if (loadError) {
throw loadError;
}
throw new Error(`Failed to load native binding`);
}
const {
Connection,
Index,
RecordBatchIterator,
Query,
VectorQuery,
Table,
WriteMode,
connect,
} = nativeBinding;
module.exports.Connection = Connection;
module.exports.Index = Index;
module.exports.RecordBatchIterator = RecordBatchIterator;
module.exports.Query = Query;
module.exports.VectorQuery = VectorQuery;
module.exports.Table = Table;
module.exports.WriteMode = WriteMode;
module.exports.connect = connect;

View File

@@ -20,7 +20,7 @@ import {
VectorQuery as NativeVectorQuery,
} from "./native";
import { type IvfPqOptions } from "./indices";
class RecordBatchIterator implements AsyncIterator<RecordBatch> {
export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
private promisedInner?: Promise<NativeBatchIterator>;
private inner?: NativeBatchIterator;
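
Exporting the wrapper lets callers drain it directly. A minimal sketch, assuming the class is re-exported from the package root (it implements `AsyncIterator<RecordBatch>`, so `next()` resolves to a standard iterator result):

```ts
import { type RecordBatch } from "apache-arrow";
import { RecordBatchIterator } from "@lancedb/lancedb"; // assumed re-export path

async function collectBatches(it: RecordBatchIterator): Promise<RecordBatch[]> {
  const batches: RecordBatch[] = [];
  for (let res = await it.next(); !res.done; res = await it.next()) {
    batches.push(res.value);
  }
  return batches;
}
```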

View File

@@ -1,3 +1,3 @@
# `lancedb-darwin-arm64`
# `@lancedb/lancedb-darwin-arm64`
This is the **aarch64-apple-darwin** binary for `lancedb`
This is the **aarch64-apple-darwin** binary for `@lancedb/lancedb`

View File

@@ -1,5 +1,5 @@
{
"name": "lancedb-darwin-arm64",
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.4.3",
"os": [
"darwin"
@@ -11,7 +11,7 @@
"files": [
"lancedb.darwin-arm64.node"
],
"license": "MIT",
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
}

View File

@@ -1,3 +1,3 @@
# `lancedb-darwin-x64`
# `@lancedb/lancedb-darwin-x64`
This is the **x86_64-apple-darwin** binary for `lancedb`
This is the **x86_64-apple-darwin** binary for `@lancedb/lancedb`

View File

@@ -1,5 +1,5 @@
{
"name": "lancedb-darwin-x64",
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.4.3",
"os": [
"darwin"
@@ -11,7 +11,7 @@
"files": [
"lancedb.darwin-x64.node"
],
"license": "MIT",
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
}

View File

@@ -1,3 +1,3 @@
# `lancedb-linux-arm64-gnu`
# `@lancedb/lancedb-linux-arm64-gnu`
This is the **aarch64-unknown-linux-gnu** binary for `lancedb`
This is the **aarch64-unknown-linux-gnu** binary for `@lancedb/lancedb`

View File

@@ -1,5 +1,5 @@
{
"name": "lancedb-linux-arm64-gnu",
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.4.3",
"os": [
"linux"
@@ -11,9 +11,9 @@
"files": [
"lancedb.linux-arm64-gnu.node"
],
"license": "MIT",
"license": "Apache 2.0",
"engines": {
"node": ">= 10"
"node": ">= 18"
},
"libc": [
"glibc"

View File

@@ -1,3 +1,3 @@
# `lancedb-linux-x64-gnu`
# `@lancedb/lancedb-linux-x64-gnu`
This is the **x86_64-unknown-linux-gnu** binary for `lancedb`
This is the **x86_64-unknown-linux-gnu** binary for `@lancedb/lancedb`

View File

@@ -1,5 +1,5 @@
{
"name": "lancedb-linux-x64-gnu",
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.4.3",
"os": [
"linux"
@@ -11,9 +11,9 @@
"files": [
"lancedb.linux-x64-gnu.node"
],
"license": "MIT",
"license": "Apache 2.0",
"engines": {
"node": ">= 10"
"node": ">= 18"
},
"libc": [
"glibc"

View File

@@ -0,0 +1,3 @@
# `@lancedb/lancedb-win32-x64-msvc`
This is the **x86_64-pc-windows-msvc** binary for `@lancedb/lancedb`

View File

@@ -0,0 +1,18 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.4.3",
"os": [
"win32"
],
"cpu": [
"x64"
],
"main": "lancedb.win32-x64-msvc.node",
"files": [
"lancedb.win32-x64-msvc.node"
],
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
}
}

198
nodejs/package-lock.json generated
View File

@@ -1,11 +1,11 @@
{
"name": "lancedb",
"name": "@lancedb/lancedb",
"version": "0.4.3",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "lancedb",
"name": "@lancedb/lancedb",
"version": "0.4.3",
"cpu": [
"x64",
@@ -15,8 +15,12 @@
"os": [
"darwin",
"linux",
"windows"
"win32"
],
"dependencies": {
"apache-arrow": "^15.0.0",
"openai": "^4.29.2"
},
"devDependencies": {
"@napi-rs/cli": "^2.18.0",
"@types/jest": "^29.1.2",
@@ -29,6 +33,7 @@
"eslint-plugin-jsdoc": "^48.2.1",
"jest": "^29.7.0",
"prettier": "^3.1.0",
"shx": "^0.3.4",
"tmp": "^0.2.3",
"ts-jest": "^29.1.2",
"typedoc": "^0.25.7",
@@ -40,14 +45,11 @@
"node": ">= 18"
},
"optionalDependencies": {
"lancedb-darwin-arm64": "0.4.3",
"lancedb-darwin-x64": "0.4.3",
"lancedb-linux-arm64-gnu": "0.4.3",
"lancedb-linux-x64-gnu": "0.4.3",
"openai": "^4.28.4"
},
"peerDependencies": {
"apache-arrow": "^15.0.0"
"@lancedb/lancedb-darwin-arm64": "0.4.3",
"@lancedb/lancedb-darwin-x64": "0.4.3",
"@lancedb/lancedb-linux-arm64-gnu": "0.4.3",
"@lancedb/lancedb-linux-x64-gnu": "0.4.3",
"@lancedb/lancedb-win32-x64-msvc": "0.4.3"
}
},
"node_modules/@75lb/deep-merge": {
@@ -1317,6 +1319,66 @@
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@lancedb/lancedb-darwin-arm64": {
"version": "0.4.3",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-arm64/-/lancedb-darwin-arm64-0.4.3.tgz",
"integrity": "sha512-+kxuWUK9vtLBbjFMkIKeQ32kxK2tgvZRCQaU1I3RJ3+dLmDIVeIj+KJSlMelkKa2QC4JoyHQi9Ty1PdS2DojmQ==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-darwin-x64": {
"version": "0.4.3",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-x64/-/lancedb-darwin-x64-0.4.3.tgz",
"integrity": "sha512-JYvsSYxTOa/7OMojulz9h0gN2FwvypG/6l6dpLkViZ5LDvRcfVyDTzOLcOJkFn+db4TKeBOVyMWnnpDKaB+jLA==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-x64-gnu": {
"version": "0.4.3",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-gnu/-/lancedb-linux-x64-gnu-0.4.3.tgz",
"integrity": "sha512-jDANHchWNGmu1wfAyBk0apoFlLxtJ7FRc31pAQ3tKE4fwlgG7bUcaTX6s5C3vMNWXnyQLQtVuWZNXi2nVj879g==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-win32-x64-msvc": {
"version": "0.4.3",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-win32-x64-msvc/-/lancedb-win32-x64-msvc-0.4.3.tgz",
"integrity": "sha512-qADveXyv4YzllIbOOq8soqFfL7p7I35uhrD3PcTvj4Qxuo6q7pgQWQz2Mt3kGBpyPkH2yE4wWAGJhayShLRbiQ==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@napi-rs/cli": {
"version": "2.18.0",
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-2.18.0.tgz",
@@ -1396,7 +1458,6 @@
"version": "0.5.6",
"resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.6.tgz",
"integrity": "sha512-aYX01Ke9hunpoCexYAgQucEpARGQ5w/cqHFrIR+e9gdKb1QWTsVJuTJ2ozQzIAxLyRQe/m+2RqzkyOOGiMKRQA==",
"peer": true,
"dependencies": {
"tslib": "^2.4.0"
}
@@ -1445,8 +1506,7 @@
"node_modules/@types/command-line-args": {
"version": "5.2.3",
"resolved": "https://registry.npmjs.org/@types/command-line-args/-/command-line-args-5.2.3.tgz",
"integrity": "sha512-uv0aG6R0Y8WHZLTamZwtfsDLVRnOa+n+n5rEvFWL5Na5gZ8V2Teab/duDPFzIIIhs9qizDpcavCusCLJZu62Kw==",
"peer": true
"integrity": "sha512-uv0aG6R0Y8WHZLTamZwtfsDLVRnOa+n+n5rEvFWL5Na5gZ8V2Teab/duDPFzIIIhs9qizDpcavCusCLJZu62Kw=="
},
"node_modules/@types/command-line-usage": {
"version": "5.0.2",
@@ -1514,7 +1574,6 @@
"version": "2.6.11",
"resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.11.tgz",
"integrity": "sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==",
"optional": true,
"dependencies": {
"@types/node": "*",
"form-data": "^4.0.0"
@@ -1783,7 +1842,6 @@
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
"integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
"optional": true,
"dependencies": {
"event-target-shim": "^5.0.0"
},
@@ -1816,7 +1874,6 @@
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz",
"integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==",
"optional": true,
"dependencies": {
"humanize-ms": "^1.2.1"
},
@@ -1913,7 +1970,6 @@
"version": "15.0.0",
"resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-15.0.0.tgz",
"integrity": "sha512-e6aunxNKM+woQf137ny3tp/xbLjFJS2oGQxQhYGqW6dGeIwNV1jOeEAeR6sS2jwAI2qLO83gYIP2MBz02Gw5Xw==",
"peer": true,
"dependencies": {
"@swc/helpers": "^0.5.2",
"@types/command-line-args": "^5.2.1",
@@ -2001,8 +2057,7 @@
"node_modules/asynckit": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
"optional": true
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="
},
"node_modules/babel-jest": {
"version": "29.7.0",
@@ -2129,8 +2184,7 @@
"node_modules/base-64": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/base-64/-/base-64-0.1.0.tgz",
"integrity": "sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA==",
"optional": true
"integrity": "sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA=="
},
"node_modules/brace-expansion": {
"version": "1.1.11",
@@ -2296,7 +2350,6 @@
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz",
"integrity": "sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==",
"optional": true,
"engines": {
"node": "*"
}
@@ -2357,7 +2410,6 @@
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"optional": true,
"dependencies": {
"delayed-stream": "~1.0.0"
},
@@ -2469,7 +2521,6 @@
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/crypt/-/crypt-0.0.2.tgz",
"integrity": "sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==",
"optional": true,
"engines": {
"node": "*"
}
@@ -2530,7 +2581,6 @@
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
"optional": true,
"engines": {
"node": ">=0.4.0"
}
@@ -2557,7 +2607,6 @@
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/digest-fetch/-/digest-fetch-1.3.0.tgz",
"integrity": "sha512-CGJuv6iKNM7QyZlM2T3sPAdZWd/p9zQiRNS9G+9COUCwzWFTs0Xp8NF5iePx7wtvhDykReiRRrSeNb4oMmB8lA==",
"optional": true,
"dependencies": {
"base-64": "^0.1.0",
"md5": "^2.3.0"
@@ -2862,7 +2911,6 @@
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
"integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==",
"optional": true,
"engines": {
"node": ">=6"
}
@@ -3024,7 +3072,6 @@
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
"integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==",
"optional": true,
"dependencies": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.8",
@@ -3037,14 +3084,12 @@
"node_modules/form-data-encoder": {
"version": "1.7.2",
"resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz",
"integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==",
"optional": true
"integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="
},
"node_modules/formdata-node": {
"version": "4.4.1",
"resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz",
"integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==",
"optional": true,
"dependencies": {
"node-domexception": "1.0.0",
"web-streams-polyfill": "4.0.0-beta.3"
@@ -3057,7 +3102,6 @@
"version": "4.0.0-beta.3",
"resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz",
"integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==",
"optional": true,
"engines": {
"node": ">= 14"
}
@@ -3272,7 +3316,6 @@
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz",
"integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==",
"optional": true,
"dependencies": {
"ms": "^2.0.0"
}
@@ -3355,6 +3398,15 @@
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
"dev": true
},
"node_modules/interpret": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/interpret/-/interpret-1.4.0.tgz",
"integrity": "sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==",
"dev": true,
"engines": {
"node": ">= 0.10"
}
},
"node_modules/is-arrayish": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz",
@@ -3364,8 +3416,7 @@
"node_modules/is-buffer": {
"version": "1.1.6",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==",
"optional": true
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
},
"node_modules/is-builtin-module": {
"version": "3.2.1",
@@ -4458,7 +4509,6 @@
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz",
"integrity": "sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==",
"optional": true,
"dependencies": {
"charenc": "0.0.2",
"crypt": "0.0.2",
@@ -4497,7 +4547,6 @@
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
"optional": true,
"engines": {
"node": ">= 0.6"
}
@@ -4506,7 +4555,6 @@
"version": "2.1.35",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
"optional": true,
"dependencies": {
"mime-db": "1.52.0"
},
@@ -4538,8 +4586,7 @@
"node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"optional": true
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
},
"node_modules/natural-compare": {
"version": "1.4.0",
@@ -4567,7 +4614,6 @@
"url": "https://paypal.me/jimmywarting"
}
],
"optional": true,
"engines": {
"node": ">=10.5.0"
}
@@ -4576,7 +4622,6 @@
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
"integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
"optional": true,
"dependencies": {
"whatwg-url": "^5.0.0"
},
@@ -4623,10 +4668,9 @@
}
},
"node_modules/openai": {
"version": "4.28.4",
"resolved": "https://registry.npmjs.org/openai/-/openai-4.28.4.tgz",
"integrity": "sha512-RNIwx4MT/F0zyizGcwS+bXKLzJ8QE9IOyigDG/ttnwB220d58bYjYFp0qjvGwEFBO6+pvFVIDABZPGDl46RFsg==",
"optional": true,
"version": "4.29.2",
"resolved": "https://registry.npmjs.org/openai/-/openai-4.29.2.tgz",
"integrity": "sha512-cPkT6zjEcE4qU5OW/SoDDuXEsdOLrXlAORhzmaguj5xZSPlgKvLhi27sFWhLKj07Y6WKNWxcwIbzm512FzTBNQ==",
"dependencies": {
"@types/node": "^18.11.18",
"@types/node-fetch": "^2.6.4",
@@ -4643,10 +4687,9 @@
}
},
"node_modules/openai/node_modules/@types/node": {
"version": "18.19.20",
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.20.tgz",
"integrity": "sha512-SKXZvI375jkpvAj8o+5U2518XQv76mAsixqfXiVyWyXZbVWQK25RurFovYpVIxVzul0rZoH58V/3SkEnm7s3qA==",
"optional": true,
"version": "18.19.26",
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.26.tgz",
"integrity": "sha512-+wiMJsIwLOYCvUqSdKTrfkS8mpTp+MPINe6+Np4TAGFWWRWiBQ5kSq9nZGCSPkzx9mvT+uEukzpX4MOSCydcvw==",
"dependencies": {
"undici-types": "~5.26.4"
}
@@ -4996,6 +5039,18 @@
"integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==",
"dev": true
},
"node_modules/rechoir": {
"version": "0.6.2",
"resolved": "https://registry.npmjs.org/rechoir/-/rechoir-0.6.2.tgz",
"integrity": "sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==",
"dev": true,
"dependencies": {
"resolve": "^1.1.6"
},
"engines": {
"node": ">= 0.10"
}
},
"node_modules/repeat-string": {
"version": "1.6.1",
"resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz",
@@ -5145,6 +5200,23 @@
"node": ">=8"
}
},
"node_modules/shelljs": {
"version": "0.8.5",
"resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.5.tgz",
"integrity": "sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==",
"dev": true,
"dependencies": {
"glob": "^7.0.0",
"interpret": "^1.0.0",
"rechoir": "^0.6.2"
},
"bin": {
"shjs": "bin/shjs"
},
"engines": {
"node": ">=4"
}
},
"node_modules/shiki": {
"version": "0.14.7",
"resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz",
@@ -5157,6 +5229,22 @@
"vscode-textmate": "^8.0.0"
}
},
"node_modules/shx": {
"version": "0.3.4",
"resolved": "https://registry.npmjs.org/shx/-/shx-0.3.4.tgz",
"integrity": "sha512-N6A9MLVqjxZYcVn8hLmtneQWIJtp8IKzMP4eMnx+nqkvXoqinUPCbUFLp2UcWTEIUONhlk0ewxr/jaVGlc+J+g==",
"dev": true,
"dependencies": {
"minimist": "^1.2.3",
"shelljs": "^0.8.5"
},
"bin": {
"shx": "lib/cli.js"
},
"engines": {
"node": ">=6"
}
},
"node_modules/signal-exit": {
"version": "3.0.7",
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
@@ -5432,8 +5520,7 @@
"node_modules/tr46": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
"optional": true
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="
},
"node_modules/ts-api-utils": {
"version": "1.0.3",
@@ -5929,7 +6016,6 @@
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz",
"integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==",
"optional": true,
"engines": {
"node": ">= 8"
}
@@ -5937,14 +6023,12 @@
"node_modules/webidl-conversions": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
"optional": true
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="
},
"node_modules/whatwg-url": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
"integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
"optional": true,
"dependencies": {
"tr46": "~0.0.3",
"webidl-conversions": "^3.0.0"

View File

@@ -1,17 +1,18 @@
{
"name": "lancedb",
"name": "@lancedb/lancedb",
"version": "0.4.3",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"napi": {
"name": "lancedb-nodejs",
"name": "lancedb",
"triples": {
"defaults": false,
"additional": [
"aarch64-apple-darwin",
"aarch64-unknown-linux-gnu",
"x86_64-apple-darwin",
"x86_64-unknown-linux-gnu"
"x86_64-unknown-linux-gnu",
"x86_64-pc-windows-msvc"
]
}
},
@@ -28,6 +29,7 @@
"eslint-plugin-jsdoc": "^48.2.1",
"jest": "^29.7.0",
"prettier": "^3.1.0",
"shx": "^0.3.4",
"tmp": "^0.2.3",
"ts-jest": "^29.1.2",
"typedoc": "^0.25.7",
@@ -48,13 +50,14 @@
"os": [
"darwin",
"linux",
"windows"
"win32"
],
"scripts": {
"artifacts": "napi artifacts",
"build:native": "napi build --platform --release --js lancedb/native.js --dts lancedb/native.d.ts dist/",
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js dist/",
"build": "npm run build:debug && tsc -b",
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js dist/",
"build": "npm run build:debug && tsc -b && shx cp lancedb/native.d.ts dist/native.d.ts",
"build-release": "npm run build:release && tsc -b && shx cp lancedb/native.d.ts dist/native.d.ts",
"chkformat": "prettier . --check",
"docs": "typedoc --plugin typedoc-plugin-markdown lancedb/index.ts",
"lint": "eslint lancedb && eslint __test__",
@@ -64,13 +67,14 @@
"version": "napi version"
},
"optionalDependencies": {
"lancedb-darwin-arm64": "0.4.3",
"lancedb-darwin-x64": "0.4.3",
"lancedb-linux-arm64-gnu": "0.4.3",
"lancedb-linux-x64-gnu": "0.4.3",
"openai": "^4.28.4"
"@lancedb/lancedb-darwin-arm64": "0.4.3",
"@lancedb/lancedb-darwin-x64": "0.4.3",
"@lancedb/lancedb-linux-arm64-gnu": "0.4.3",
"@lancedb/lancedb-linux-x64-gnu": "0.4.3",
"@lancedb/lancedb-win32-x64-msvc": "0.4.3"
},
"peerDependencies": {
"dependencies": {
"openai": "^4.29.2",
"apache-arrow": "^15.0.0"
}
}

View File

@@ -145,34 +145,20 @@ async def connect_async(
the last check, then the table will be checked for updates. Note: this
consistency only applies to read operations. Write operations are
always consistent.
request_thread_pool: int or ThreadPoolExecutor, optional
The thread pool to use for making batch requests to the LanceDB Cloud API.
If an integer, then a ThreadPoolExecutor will be created with that
number of threads. If None, then a ThreadPoolExecutor will be created
with the default number of threads. If a ThreadPoolExecutor, then that
executor will be used for making requests. This is for LanceDB Cloud
only and is only used when making batch requests (i.e., passing in
multiple queries to the search method at once).
Examples
--------
For a local directory, provide a path for the database:
>>> import lancedb
>>> db = lancedb.connect("~/.lancedb")
For object storage, use a URI prefix:
>>> db = lancedb.connect("s3://my-bucket/lancedb")
Connect to LanceDB Cloud:
>>> db = lancedb.connect("db://my_database", api_key="ldb_...")
>>> async def doctest_example():
... # For a local directory, provide a path to the database
... db = await lancedb.connect_async("~/.lancedb")
... # For object storage, use a URI prefix
... db = await lancedb.connect_async("s3://my-bucket/lancedb")
Returns
-------
conn : DBConnection
conn : AsyncConnection
A connection to a LanceDB database.
"""
if read_consistency_interval is not None:

View File

@@ -25,13 +25,18 @@ from overrides import EnforceOverrides, override
from pyarrow import fs
from lancedb.common import data_to_reader, validate_schema
from lancedb.embeddings.registry import EmbeddingFunctionRegistry
from lancedb.utils.events import register_event
from ._lancedb import connect as lancedb_connect
from .pydantic import LanceModel
from .table import AsyncTable, LanceTable, Table, _sanitize_data
from .util import fs_from_uri, get_uri_location, get_uri_scheme, join_uri
from .util import (
fs_from_uri,
get_uri_location,
get_uri_scheme,
join_uri,
validate_table_name,
)
if TYPE_CHECKING:
from datetime import timedelta
@@ -387,6 +392,7 @@ class LanceDBConnection(DBConnection):
"""
if mode.lower() not in ["create", "overwrite"]:
raise ValueError("mode must be either 'create' or 'overwrite'")
validate_table_name(name)
tbl = LanceTable.create(
self,
@@ -444,16 +450,17 @@ class LanceDBConnection(DBConnection):
class AsyncConnection(object):
"""An active LanceDB connection
To obtain a connection you can use the [connect] function.
To obtain a connection you can use the [connect_async][lancedb.connect_async]
function.
This could be a native connection (using lance) or a remote connection (e.g. for
connecting to LanceDb Cloud)
Local connections do not currently hold any open resources but they may do so in the
future (for example, for shared cache or connections to catalog services). Remote
connections represent an open connection to the remote server. The [close] method
can be used to release any underlying resources eagerly. The connection can also
be used as a context manager:
connections represent an open connection to the remote server. The
[close][lancedb.db.AsyncConnection.close] method can be used to release any
underlying resources eagerly. The connection can also be used as a context manager.
Connections can be shared on multiple threads and are expected to be long lived.
Connections can also be used as a context manager, however, in many cases a single
@@ -464,10 +471,9 @@ class AsyncConnection(object):
Examples
--------
>>> import asyncio
>>> import lancedb
>>> async def my_connect():
... with await lancedb.connect("/tmp/my_dataset") as conn:
>>> async def doctest_example():
... with await lancedb.connect_async("/tmp/my_dataset") as conn:
... # do something with the connection
... pass
... # conn is closed here
@@ -528,9 +534,8 @@ class AsyncConnection(object):
exist_ok: Optional[bool] = None,
on_bad_vectors: Optional[str] = None,
fill_value: Optional[float] = None,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
) -> AsyncTable:
"""Create a [Table][lancedb.table.Table] in the database.
"""Create an [AsyncTable][lancedb.table.AsyncTable] in the database.
Parameters
----------
@@ -569,7 +574,7 @@ class AsyncConnection(object):
Returns
-------
LanceTable
AsyncTable
A reference to the newly created table.
!!! note
@@ -583,12 +588,14 @@ class AsyncConnection(object):
Can create with list of tuples or dictionaries:
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
>>> db.create_table("my_table", data)
LanceTable(connection=..., name="my_table")
>>> db["my_table"].head()
>>> async def doctest_example():
... db = await lancedb.connect_async("./.lancedb")
... data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
... my_table = await db.create_table("my_table", data)
... print(await my_table.query().limit(5).to_arrow())
>>> import asyncio
>>> asyncio.run(doctest_example())
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
@@ -607,9 +614,11 @@ class AsyncConnection(object):
... "lat": [45.5, 40.1],
... "long": [-122.7, -74.1]
... })
>>> db.create_table("table2", data)
LanceTable(connection=..., name="table2")
>>> db["table2"].head()
>>> async def pandas_example():
... db = await lancedb.connect_async("./.lancedb")
... my_table = await db.create_table("table2", data)
... print(await my_table.query().limit(5).to_arrow())
>>> asyncio.run(pandas_example())
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
@@ -629,9 +638,11 @@ class AsyncConnection(object):
... pa.field("lat", pa.float32()),
... pa.field("long", pa.float32())
... ])
>>> db.create_table("table3", data, schema = custom_schema)
LanceTable(connection=..., name="table3")
>>> db["table3"].head()
>>> async def with_schema():
... db = await lancedb.connect_async("./.lancedb")
... my_table = await db.create_table("table3", data, schema = custom_schema)
... print(await my_table.query().limit(5).to_arrow())
>>> asyncio.run(with_schema())
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
@@ -663,9 +674,10 @@ class AsyncConnection(object):
... pa.field("item", pa.utf8()),
... pa.field("price", pa.float32()),
... ])
>>> db.create_table("table4", make_batches(), schema=schema)
LanceTable(connection=..., name="table4")
>>> async def iterable_example():
... db = await lancedb.connect_async("./.lancedb")
... await db.create_table("table4", make_batches(), schema=schema)
>>> asyncio.run(iterable_example())
"""
if inspect.isclass(schema) and issubclass(schema, LanceModel):
# convert LanceModel to pyarrow schema
@@ -674,12 +686,6 @@ class AsyncConnection(object):
schema = schema.to_arrow_schema()
metadata = None
if embedding_functions is not None:
# If we passed in embedding functions explicitly
# then we'll override any schema metadata that
# may was implicitly specified by the LanceModel schema
registry = EmbeddingFunctionRegistry.get_instance()
metadata = registry.get_table_metadata(embedding_functions)
# Defining defaults here and not in function prototype. In the future
# these defaults will move into rust so better to keep them as None.
@@ -760,11 +766,11 @@ class AsyncConnection(object):
name: str
The name of the table.
"""
raise NotImplementedError
await self._inner.drop_table(name)
async def drop_database(self):
"""
Drop database
This is the same thing as dropping all the tables
"""
raise NotImplementedError
await self._inner.drop_db()
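
With both methods now delegating to the native layer, dropping a table or the whole database is a one-liner. A minimal sketch (the path and table name are placeholders):

```python
import asyncio

import lancedb


async def cleanup():
    db = await lancedb.connect_async("data/sample-lancedb")
    await db.drop_table("my_table")  # remove a single table
    await db.drop_database()  # remove every table in the database


asyncio.run(cleanup())
```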

View File

@@ -1033,7 +1033,7 @@ class AsyncQueryBase(object):
Construct an AsyncQueryBase
This method is not intended to be called directly. Instead, use the
[Table.query][] method to create a query.
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
"""
self._inner = inner
@@ -1041,7 +1041,10 @@ class AsyncQueryBase(object):
"""
Only return rows matching the given predicate
The predicate should be supplied as an SQL query string. For example:
The predicate should be supplied as an SQL query string.
Examples
--------
>>> predicate = "x > 10"
>>> predicate = "y > 0 AND y < 100"
@@ -1112,7 +1115,8 @@ class AsyncQueryBase(object):
Execute the query and collect the results into an Apache Arrow Table.
This method will collect all results into memory before returning. If
you expect a large number of results, you may want to use [to_batches][]
you expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches]
"""
batch_iter = await self.to_batches()
return pa.Table.from_batches(
@@ -1123,12 +1127,13 @@ class AsyncQueryBase(object):
"""
Execute the query and collect the results into a pandas DataFrame.
This method will collect all results into memory before returning. If
you expect a large number of results, you may want to use [to_batches][]
and convert each batch to pandas separately.
This method will collect all results into memory before returning. If you
expect a large number of results, you may want to use
[to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
pandas separately.
Example
-------
Examples
--------
>>> import asyncio
>>> from lancedb import connect_async
@@ -1148,7 +1153,7 @@ class AsyncQuery(AsyncQueryBase):
Construct an AsyncQuery
This method is not intended to be called directly. Instead, use the
[Table.query][] method to create a query.
[AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
"""
super().__init__(inner)
self._inner = inner
@@ -1189,8 +1194,8 @@ class AsyncQuery(AsyncQueryBase):
If there is only one vector column (a column whose data type is a
fixed size list of floats) then the column does not need to be specified.
If there is more than one vector column you must use
[AsyncVectorQuery::column][] to specify which column you would like to
compare with.
[AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
which column you would like to compare with.
If no index has been created on the vector column then a vector query
will perform a distance comparison between the query vector and every
@@ -1221,8 +1226,10 @@ class AsyncVectorQuery(AsyncQueryBase):
Construct an AsyncVectorQuery
This method is not intended to be called directly. Instead, create
a query first with [Table.query][] and then use [AsyncQuery.nearest_to][]
to convert to a vector query.
a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to] to convert to
a vector query. Or you can use
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search].
"""
super().__init__(inner)
self._inner = inner
@@ -1232,7 +1239,7 @@ class AsyncVectorQuery(AsyncQueryBase):
Set the vector column to query
This controls which column is compared to the query vector supplied in
the call to [Query.nearest_to][].
the call to [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to].
This parameter must be specified if the table has more than one column
whose data type is a fixed-size-list of floats.
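
For example, a minimal sketch of disambiguating between two vector columns (the column name is hypothetical, and the builder methods are assumed to chain as in the other examples in this module):

```python
async def search_images(table):
    # "image_embedding" must be named explicitly because the table is assumed
    # to have more than one fixed-size-list-of-floats column.
    return await (
        table.query()
        .nearest_to([0.4, 0.4])
        .column("image_embedding")
        .limit(10)
        .to_arrow()
    )
```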

View File

@@ -26,6 +26,7 @@ from ..db import DBConnection
from ..embeddings import EmbeddingFunctionConfig
from ..pydantic import LanceModel
from ..table import Table, _sanitize_data
from ..util import validate_table_name
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
from .errors import LanceDBClientError
@@ -223,6 +224,7 @@ class RemoteDBConnection(DBConnection):
LanceTable(table4)
"""
validate_table_name(name)
if data is None and schema is None:
raise ValueError("Either data or schema must be provided.")
if embedding_functions is not None:

View File

@@ -118,6 +118,11 @@ class Reranker(ABC):
The results from the vector search
fts_results : pa.Table
The results from the FTS search
Returns
-------
pa.Table
The merged results
"""
combined = pa.concat_tables([vector_results, fts_results], promote=True)
row_id = combined.column("_rowid")

View File

@@ -14,7 +14,7 @@ class CrossEncoderReranker(Reranker):
Parameters
----------
model : str, default "cross-encoder/ms-marco-TinyBERT-L-6"
model_name : str, default "cross-encoder/ms-marco-TinyBERT-L-6"
The name of the cross encoder model to use. See the sentence transformers
documentation for a list of available models.
column : str, default "text"

View File

@@ -1893,8 +1893,8 @@ class AsyncTable:
An AsyncTable object is expected to be long lived and reused for multiple
operations. AsyncTable objects will cache a certain amount of index data in memory.
This cache will be freed when the Table is garbage collected. To eagerly free the
cache you can call the [close][AsyncTable.close] method. Once the AsyncTable is
closed, it cannot be used for any further operations.
cache you can call the [close][lancedb.AsyncTable.close] method. Once the
AsyncTable is closed, it cannot be used for any further operations.
An AsyncTable can also be used as a context manager, and will automatically close
when the context is exited. Closing a table is optional. If you do not close the
@@ -1903,13 +1903,17 @@ class AsyncTable:
Examples
--------
Create using [DBConnection.create_table][lancedb.DBConnection.create_table]
Create using [AsyncConnection.create_table][lancedb.AsyncConnection.create_table]
(more examples in that method's documentation).
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", data=[{"vector": [1.1, 1.2], "b": 2}])
>>> table.head()
>>> async def create_a_table():
... db = await lancedb.connect_async("./.lancedb")
... data = [{"vector": [1.1, 1.2], "b": 2}]
... table = await db.create_table("my_table", data=data)
... print(await table.query().limit(5).to_arrow())
>>> import asyncio
>>> asyncio.run(create_a_table())
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
@@ -1918,25 +1922,37 @@ class AsyncTable:
vector: [[[1.1,1.2]]]
b: [[2]]
Can append new data with [Table.add()][lancedb.table.Table.add].
Can append new data with [AsyncTable.add()][lancedb.table.AsyncTable.add].
>>> table.add([{"vector": [0.5, 1.3], "b": 4}])
>>> async def add_to_table():
... db = await lancedb.connect_async("./.lancedb")
... table = await db.open_table("my_table")
... await table.add([{"vector": [0.5, 1.3], "b": 4}])
>>> asyncio.run(add_to_table())
Can query the table with [Table.search][lancedb.table.Table.search].
Can query the table with
[AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search].
>>> table.search([0.4, 0.4]).select(["b", "vector"]).to_pandas()
>>> async def search_table_for_vector():
... db = await lancedb.connect_async("./.lancedb")
... table = await db.open_table("my_table")
... results = (
... await table.vector_search([0.4, 0.4]).select(["b", "vector"]).to_pandas()
... )
... print(results)
>>> asyncio.run(search_table_for_vector())
b vector _distance
0 4 [0.5, 1.3] 0.82
1 2 [1.1, 1.2] 1.13
Search queries are much faster when an index is created. See
[Table.create_index][lancedb.table.Table.create_index].
[AsyncTable.create_index][lancedb.table.AsyncTable.create_index].
"""
def __init__(self, table: LanceDBTable):
"""Create a new Table object.
"""Create a new AsyncTable object.
You should not create Table objects directly.
You should not create AsyncTable objects directly.
Use [AsyncConnection.create_table][lancedb.AsyncConnection.create_table] and
[AsyncConnection.open_table][lancedb.AsyncConnection.open_table] to obtain
@@ -1988,6 +2004,14 @@ class AsyncTable:
return await self._inner.count_rows(filter)
def query(self) -> AsyncQuery:
"""
Returns an [AsyncQuery][lancedb.query.AsyncQuery] that can be used
to search the table.
Use methods on the returned query to control query behavior. The query
can be executed with methods like [to_arrow][lancedb.query.AsyncQuery.to_arrow],
[to_pandas][lancedb.query.AsyncQuery.to_pandas] and more.
"""
return AsyncQuery(self._inner.query())
async def to_pandas(self) -> "pd.DataFrame":
@@ -2024,20 +2048,8 @@ class AsyncTable:
Parameters
----------
index: Index
The index to create.
LanceDb supports multiple types of indices. See the static methods on
the Index class for more details.
column: str, default None
column: str
The column to index.
When building a scalar index this must be set.
When building a vector index, this is optional. The default will look
for any columns of type fixed-size-list with floating point values. If
there is only one column of this type then it will be used. Otherwise
an error will be returned.
replace: bool, default True
Whether to replace the existing index
@@ -2046,6 +2058,10 @@ class AsyncTable:
that index is out of date.
The default is True
config: Union[IvfPq, BTree], default None
For advanced configuration you can specify the type of index you would
like to create. You can also specify index-specific parameters when
creating an index object.
"""
index = None
if config is not None:
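
A sketch of the two paths described above; the `IvfPq` import location and its keyword arguments are assumptions based on the `Union[IvfPq, BTree]` hint:

```python
async def build_index(table):
    # Default configuration: LanceDB picks reasonable parameters for the column.
    await table.create_index("vector")

    # Advanced configuration via an index object (import path and kwargs assumed).
    from lancedb.index import IvfPq

    await table.create_index(
        "vector", replace=True, config=IvfPq(num_partitions=256, num_sub_vectors=16)
    )
```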
@@ -2167,7 +2183,8 @@ class AsyncTable:
Search the table with a given query vector.
This is a convenience method for preparing a vector query and
is the same thing as calling `nearest_to` on the builder returned
by `query`. See [nearest_to][AsyncQuery.nearest_to] for more details.
by `query`. See [nearest_to][lancedb.query.AsyncQuery.nearest_to] for more
details.
"""
return self.query().nearest_to(query_vector)
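
In other words, inside an async context the following two calls produce the same query:

```python
async def equivalent_searches(table):
    hits_a = await table.vector_search([0.4, 0.4]).limit(5).to_arrow()
    hits_b = await table.query().nearest_to([0.4, 0.4]).limit(5).to_arrow()
    assert hits_a.num_rows == hits_b.num_rows
```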
@@ -2233,7 +2250,7 @@ class AsyncTable:
x vector
0 3 [5.0, 6.0]
"""
raise NotImplementedError
return await self._inner.delete(where)
async def update(
self,
@@ -2289,102 +2306,6 @@ class AsyncTable:
return await self._inner.update(updates_sql, where)
async def cleanup_old_versions(
self,
older_than: Optional[timedelta] = None,
*,
delete_unverified: bool = False,
) -> CleanupStats:
"""
Clean up old versions of the table, freeing disk space.
Note: This function is not available in LanceDb Cloud (since LanceDb
Cloud manages cleanup for you automatically)
Parameters
----------
older_than: timedelta, default None
The minimum age of the version to delete. If None, then this defaults
to two weeks.
delete_unverified: bool, default False
Because they may be part of an in-progress transaction, files newer
than 7 days old are not deleted by default. If you are sure that
there are no in-progress transactions, then you can set this to True
to delete all files older than `older_than`.
Returns
-------
CleanupStats
The stats of the cleanup operation, including how many bytes were
freed.
"""
raise NotImplementedError
async def compact_files(self, *args, **kwargs):
"""
Run the compaction process on the table.
Note: This function is not available in LanceDb Cloud (since LanceDb
Cloud manages compaction for you automatically)
This can be run after making several small appends to optimize the table
for faster reads.
Arguments are passed onto :meth:`lance.dataset.DatasetOptimizer.compact_files`.
For most cases, the default should be fine.
"""
raise NotImplementedError
async def add_columns(self, transforms: Dict[str, str]):
"""
Add new columns with defined values.
This is not yet available in LanceDB Cloud.
Parameters
----------
transforms: Dict[str, str]
A map of column name to a SQL expression to use to calculate the
value of the new column. These expressions will be evaluated for
each row in the table, and can reference existing columns.
"""
raise NotImplementedError
async def alter_columns(self, alterations: Iterable[Dict[str, str]]):
"""
Alter column names and nullability.
This is not yet available in LanceDB Cloud.
alterations : Iterable[Dict[str, Any]]
A sequence of dictionaries, each with the following keys:
- "path": str
The column path to alter. For a top-level column, this is the name.
For a nested column, this is the dot-separated path, e.g. "a.b.c".
- "name": str, optional
The new name of the column. If not specified, the column name is
not changed.
- "nullable": bool, optional
Whether the column should be nullable. If not specified, the column
nullability is not changed. Only non-nullable columns can be changed
to nullable. Currently, you cannot change a nullable column to
non-nullable.
"""
raise NotImplementedError
async def drop_columns(self, columns: Iterable[str]):
"""
Drop columns from the table.
This is not yet available in LanceDB Cloud.
Parameters
----------
columns : Iterable[str]
The names of the columns to drop.
"""
raise NotImplementedError
async def version(self) -> int:
"""
Retrieve the version of the table

View File

@@ -25,6 +25,8 @@ import numpy as np
import pyarrow as pa
import pyarrow.fs as pa_fs
from ._lancedb import validate_table_name as native_validate_table_name
def safe_import_adlfs():
try:
@@ -286,3 +288,8 @@ def deprecated(func):
return func(*args, **kwargs)
return new_func
def validate_table_name(name: str):
"""Verify the table name is valid."""
native_validate_table_name(name)
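
A quick sketch of the wrapper's behavior, matching the rules enforced natively (see the tests further below):

```python
from lancedb.util import validate_table_name

validate_table_name("foo.bar")  # ok: letters, digits, "_", "-" and "." are allowed

try:
    validate_table_name("foo/bar")
except ValueError as err:
    print(err)  # invalid table name
```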

View File

@@ -0,0 +1,162 @@
import shutil
# --8<-- [start:imports]
import lancedb
import pandas as pd
import pyarrow as pa
# --8<-- [end:imports]
import pytest
from numpy.random import randint, random
shutil.rmtree("data/sample-lancedb", ignore_errors=True)
def test_quickstart():
# --8<-- [start:connect]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
# --8<-- [end:connect]
# --8<-- [start:create_table]
data = [
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
]
# Synchronous client
tbl = db.create_table("my_table", data=data)
# --8<-- [end:create_table]
# --8<-- [start:create_table_pandas]
df = pd.DataFrame(
[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
]
)
# Synchronous client
tbl = db.create_table("table_from_df", data=df)
# --8<-- [end:create_table_pandas]
# --8<-- [start:create_empty_table]
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
# Synchronous client
tbl = db.create_table("empty_table", schema=schema)
# --8<-- [end:create_empty_table]
# --8<-- [start:open_table]
# Synchronous client
tbl = db.open_table("my_table")
# --8<-- [end:open_table]
# --8<-- [start:table_names]
# Synchronous client
print(db.table_names())
# --8<-- [end:table_names]
# Synchronous client
# --8<-- [start:add_data]
# Option 1: Add a list of dicts to a table
data = [
{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0},
]
tbl.add(data)
# Option 2: Add a pandas DataFrame to a table
df = pd.DataFrame(data)
tbl.add(df)
# --8<-- [end:add_data]
# --8<-- [start:vector_search]
# Synchronous client
tbl.search([100, 100]).limit(2).to_pandas()
# --8<-- [end:vector_search]
tbl.add(
[
{"vector": random(2), "item": "autogen", "price": randint(100)}
for _ in range(1000)
]
)
# --8<-- [start:create_index]
# Synchronous client
tbl.create_index(num_sub_vectors=1)
# --8<-- [end:create_index]
# --8<-- [start:delete_rows]
# Synchronous client
tbl.delete('item = "fizz"')
# --8<-- [end:delete_rows]
# --8<-- [start:drop_table]
# Synchronous client
db.drop_table("my_table")
# --8<-- [end:drop_table]
@pytest.mark.asyncio
async def test_quickstart_async():
# --8<-- [start:connect_async]
# LanceDb offers both a synchronous and an asynchronous client. There are still a
# few operations that are only supported by the synchronous client (e.g. embedding
# functions, full text search) but both APIs should soon be equivalent.
# In this guide we will give examples of both clients. In other guides we will
# typically only provide examples with one client or the other.
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)
# --8<-- [end:connect_async]
data = [
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
]
# --8<-- [start:create_table_async]
# Asynchronous client
async_tbl = await async_db.create_table("my_table2", data=data)
# --8<-- [end:create_table_async]
df = pd.DataFrame(
[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
]
)
# --8<-- [start:create_table_async_pandas]
# Asynchronous client
async_tbl = await async_db.create_table("table_from_df2", df)
# --8<-- [end:create_table_async_pandas]
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
# --8<-- [start:create_empty_table_async]
# Asynchronous client
async_tbl = await async_db.create_table("empty_table2", schema=schema)
# --8<-- [end:create_empty_table_async]
# --8<-- [start:open_table_async]
# Asynchronous client
async_tbl = await async_db.open_table("my_table2")
# --8<-- [end:open_table_async]
# --8<-- [start:table_names_async]
# Asynchronous client
print(await async_db.table_names())
# --8<-- [end:table_names_async]
# --8<-- [start:add_data_async]
# Asynchronous client
await async_tbl.add(data)
# --8<-- [end:add_data_async]
# Add sufficient data for training
data = [{"vector": [x, x], "item": "filler", "price": x * x} for x in range(1000)]
await async_tbl.add(data)
# --8<-- [start:vector_search_async]
# Asynchronous client
await async_tbl.vector_search([100, 100]).limit(2).to_pandas()
# --8<-- [end:vector_search_async]
# --8<-- [start:create_index_async]
# Asynchronous client (must specify column to index)
await async_tbl.create_index("vector")
# --8<-- [end:create_index_async]
# --8<-- [start:delete_rows_async]
# Asynchronous client
await async_tbl.delete('item = "fizz"')
# --8<-- [end:delete_rows_async]
# --8<-- [start:drop_table_async]
# Asynchronous client
await async_db.drop_table("my_table2")
# --8<-- [end:drop_table_async]

View File

@@ -521,3 +521,15 @@ def test_prefilter_with_index(tmp_path):
.to_arrow()
)
assert table.num_rows == 1
def test_create_table_with_invalid_names(tmp_path):
db = lancedb.connect(uri=tmp_path)
data = [{"vector": np.random.rand(128), "item": "foo"} for i in range(10)]
with pytest.raises(ValueError):
db.create_table("foo/bar", data)
with pytest.raises(ValueError):
db.create_table("foo bar", data)
with pytest.raises(ValueError):
db.create_table("foo$$bar", data)
db.create_table("foo.bar", data)

View File

@@ -137,6 +137,21 @@ impl Connection {
Ok(Table::new(table))
})
}
pub fn drop_table(self_: PyRef<'_, Self>, name: String) -> PyResult<&PyAny> {
let inner = self_.get_inner()?.clone();
future_into_py(self_.py(), async move {
inner.drop_table(name).await.infer_error()
})
}
pub fn drop_db(self_: PyRef<'_, Self>) -> PyResult<&PyAny> {
let inner = self_.get_inner()?.clone();
future_into_py(
self_.py(),
async move { inner.drop_db().await.infer_error() },
)
}
}
#[pyfunction]

View File

@@ -42,6 +42,7 @@ pub fn _lancedb(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<VectorQuery>()?;
m.add_class::<RecordBatchStream>()?;
m.add_function(wrap_pyfunction!(connect, m)?)?;
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
Ok(())
}

View File

@@ -80,6 +80,13 @@ impl Table {
})
}
pub fn delete<'a>(self_: PyRef<'a, Self>, condition: String) -> PyResult<&'a PyAny> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
inner.delete(&condition).await.infer_error()
})
}
pub fn update<'a>(
self_: PyRef<'a, Self>,
updates: &PyDict,

View File

@@ -3,7 +3,7 @@ use std::sync::Mutex;
use lancedb::DistanceType;
use pyo3::{
exceptions::{PyRuntimeError, PyValueError},
PyResult,
pyfunction, PyResult,
};
/// A wrapper around a rust builder
@@ -49,3 +49,9 @@ pub fn parse_distance_type(distance_type: impl AsRef<str>) -> PyResult<DistanceT
))),
}
}
#[pyfunction]
pub(crate) fn validate_table_name(table_name: &str) -> PyResult<()> {
lancedb::utils::validate_table_name(table_name)
.map_err(|e| PyValueError::new_err(e.to_string()))
}

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.4.13"
version = "0.4.14"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.4.13"
version = "0.4.14"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -22,6 +22,7 @@ chrono = { workspace = true }
object_store = { workspace = true }
snafu = { workspace = true }
half = { workspace = true }
lazy_static.workspace = true
lance = { workspace = true }
lance-index = { workspace = true }
lance-linalg = { workspace = true }
@@ -34,11 +35,10 @@ bytes = "1"
futures.workspace = true
num-traits.workspace = true
url.workspace = true
regex.workspace = true
serde = { version = "^1" }
serde_json = { version = "1" }
# For remote feature
reqwest = { version = "0.11.24", features = ["gzip", "json"], optional = true }
[dev-dependencies]

View File

@@ -31,6 +31,7 @@ use crate::arrow::IntoArrow;
use crate::error::{CreateDirSnafu, Error, InvalidTableNameSnafu, Result};
use crate::io::object_store::MirroringObjectStoreWrapper;
use crate::table::{NativeTable, WriteOptions};
use crate::utils::validate_table_name;
use crate::Table;
pub const LANCE_FILE_EXTENSION: &str = "lance";
@@ -675,13 +676,18 @@ impl Database {
/// Get the URI of a table in the database.
fn table_uri(&self, name: &str) -> Result<String> {
validate_table_name(name)?;
let path = Path::new(&self.uri);
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let mut uri = table_uri
.as_path()
.to_str()
.context(InvalidTableNameSnafu { name })?
.context(InvalidTableNameSnafu {
name,
reason: "Name is not valid URL",
})?
.to_string();
// If there are query string set on the connection, propagate to lance

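`table_uri` now runs `validate_table_name` before building the path, so a bad name fails fast with the new `reason` field instead of surfacing as an opaque URL error. Stripped of the error plumbing, the construction reduces to this standalone sketch (Unix-style separators assumed):

```rust
use std::path::Path;

const LANCE_FILE_EXTENSION: &str = "lance";

// Standalone sketch of the path construction above, minus the
// validation and query-string handling.
fn table_uri(base: &str, name: &str) -> Option<String> {
    let table_path = Path::new(base).join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
    table_path.to_str().map(|s| s.to_string())
}

fn main() {
    assert_eq!(
        table_uri("/data/db", "my_table").as_deref(),
        Some("/data/db/my_table.lance")
    );
}
```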
View File

@@ -20,8 +20,8 @@ use snafu::Snafu;
#[derive(Debug, Snafu)]
#[snafu(visibility(pub(crate)))]
pub enum Error {
#[snafu(display("Invalid table name: {name}"))]
InvalidTableName { name: String },
#[snafu(display("Invalid table name (\"{name}\"): {reason}"))]
InvalidTableName { name: String, reason: String },
#[snafu(display("Invalid input, {message}"))]
InvalidInput { message: String },
#[snafu(display("Table '{name}' was not found"))]

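Adding the `reason` field means the rendered message reports both which name was rejected and why. A self-contained sketch of the same snafu pattern, reusing the reason string this PR introduces in `utils.rs`:

```rust
use snafu::Snafu;

#[derive(Debug, Snafu)]
enum Error {
    // Same display attribute as the diff; snafu resolves {name} and
    // {reason} against the variant's fields.
    #[snafu(display("Invalid table name (\"{name}\"): {reason}"))]
    InvalidTableName { name: String, reason: String },
}

fn main() {
    let err = Error::InvalidTableName {
        name: "my table".to_string(),
        reason: "Table names can only contain alphanumeric characters, \
                 underscores, hyphens, and periods"
            .to_string(),
    };
    // Prints: Invalid table name ("my table"): Table names can only contain ...
    println!("{err}");
}
```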
View File

@@ -230,9 +230,9 @@ pub enum DistanceType {
impl From<DistanceType> for LanceDistanceType {
fn from(value: DistanceType) -> Self {
match value {
DistanceType::L2 => LanceDistanceType::L2,
DistanceType::Cosine => LanceDistanceType::Cosine,
DistanceType::Dot => LanceDistanceType::Dot,
DistanceType::L2 => Self::L2,
DistanceType::Cosine => Self::Cosine,
DistanceType::Dot => Self::Dot,
}
}
}
@@ -240,9 +240,9 @@ impl From<DistanceType> for LanceDistanceType {
impl From<LanceDistanceType> for DistanceType {
fn from(value: LanceDistanceType) -> Self {
match value {
LanceDistanceType::L2 => DistanceType::L2,
LanceDistanceType::Cosine => DistanceType::Cosine,
LanceDistanceType::Dot => DistanceType::Dot,
LanceDistanceType::L2 => Self::L2,
LanceDistanceType::Cosine => Self::Cosine,
LanceDistanceType::Dot => Self::Dot,
}
}
}
@@ -251,7 +251,7 @@ impl<'a> TryFrom<&'a str> for DistanceType {
type Error = <LanceDistanceType as TryFrom<&'a str>>::Error;
fn try_from(value: &str) -> std::prelude::v1::Result<Self, Self::Error> {
LanceDistanceType::try_from(value).map(DistanceType::from)
LanceDistanceType::try_from(value).map(Self::from)
}
}

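The `From`/`TryFrom` changes are pure renames to the `Self` idiom; behavior is unchanged. Parsing still delegates to lance's `TryFrom<&str>`, so the accepted spellings are whatever lance accepts — the lowercase names below are an assumption:

```rust
use lancedb::DistanceType;

fn main() {
    // Delegates to LanceDistanceType::try_from under the hood.
    let dt = DistanceType::try_from("cosine").expect("recognized metric name");
    assert!(matches!(dt, DistanceType::Cosine));

    // Unknown names propagate lance's parse error.
    assert!(DistanceType::try_from("manhattan").is_err());
}
```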
View File

@@ -854,6 +854,7 @@ impl NativeTable {
.to_str()
.ok_or(Error::InvalidTableName {
name: uri.to_string(),
reason: "Table name is not valid URL".to_string(),
})?;
Ok(name.to_string())
}
@@ -1197,7 +1198,7 @@ impl NativeTable {
if dim != query_vector.len() as i32 {
return Err(Error::InvalidInput {
message: format!(
"The dimension of the query vector does not match with the dimension of the vector column '{}':
"The dimension of the query vector does not match with the dimension of the vector column '{}':
query dim={}, expected vector dim={}",
column,
query_vector.len(),

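For context, the guard whose message is touched up here compares the vector column's declared fixed-size-list dimension against the query vector's length. A freestanding sketch (function and parameter names hypothetical):

```rust
// Hypothetical standalone version of the dimension guard above.
fn check_query_dim(column: &str, query_len: usize, expected_dim: i32) -> Result<(), String> {
    if expected_dim != query_len as i32 {
        return Err(format!(
            "The dimension of the query vector does not match with the dimension \
             of the vector column '{}': query dim={}, expected vector dim={}",
            column, query_len, expected_dim
        ));
    }
    Ok(())
}
```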
View File

@@ -1,12 +1,30 @@
// Copyright 2024 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use arrow_schema::Schema;
use lance::dataset::{ReadParams, WriteParams};
use lance::io::{ObjectStoreParams, WrappingObjectStore};
use lazy_static::lazy_static;
use crate::error::{Error, Result};
lazy_static! {
static ref TABLE_NAME_REGEX: regex::Regex = regex::Regex::new(r"^[a-zA-Z0-9_\-\.]+$").unwrap();
}
pub trait PatchStoreParam {
fn patch_with_store_wrapper(
self,
@@ -64,6 +82,25 @@ impl PatchReadParam for ReadParams {
}
}
/// Validate table name.
pub fn validate_table_name(name: &str) -> Result<()> {
if name.is_empty() {
return Err(Error::InvalidTableName {
name: name.to_string(),
reason: "Table names cannot be empty strings".to_string(),
});
}
if !TABLE_NAME_REGEX.is_match(name) {
return Err(Error::InvalidTableName {
name: name.to_string(),
reason:
"Table names can only contain alphanumeric characters, underscores, hyphens, and periods"
.to_string(),
});
}
Ok(())
}
/// Find one default column to create index.
pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result<String> {
// Try to find one fixed size list array column.
@@ -145,4 +182,20 @@ mod tests {
.to_string()
.contains("More than one"));
}
#[test]
fn test_validate_table_name() {
assert!(validate_table_name("my_table").is_ok());
assert!(validate_table_name("my_table_1").is_ok());
assert!(validate_table_name("123mytable").is_ok());
assert!(validate_table_name("_12345table").is_ok());
assert!(validate_table_name("table.12345").is_ok());
assert!(validate_table_name("table.._dot_..12345").is_ok());
assert!(validate_table_name("").is_err());
assert!(validate_table_name("my_table!").is_err());
assert!(validate_table_name("my/table").is_err());
assert!(validate_table_name("my@table").is_err());
assert!(validate_table_name("name with space").is_err());
}
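
Taken together, the rule is: a valid name is non-empty and matches `^[a-zA-Z0-9_\-\.]+$` — and the tests confirm that dots, even consecutive ones, are accepted. A self-contained sketch using only what this diff shows (the `lazy_static` cache replaced by a local compile for brevity):

```rust
use regex::Regex;

fn validate_table_name(name: &str) -> Result<(), String> {
    // Compiled once via lazy_static in the real code; inlined here.
    let table_name_regex = Regex::new(r"^[a-zA-Z0-9_\-\.]+$").unwrap();
    if name.is_empty() {
        return Err("Table names cannot be empty strings".to_string());
    }
    if !table_name_regex.is_match(name) {
        return Err(
            "Table names can only contain alphanumeric characters, \
             underscores, hyphens, and periods"
                .to_string(),
        );
    }
    Ok(())
}

fn main() {
    assert!(validate_table_name("table.._dot_..12345").is_ok());
    assert!(validate_table_name("name with space").is_err());
}
```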
}