Compare commits

..

1 Commits

Author SHA1 Message Date
Lance Release
1e055158c1 Bump version: 0.20.0-beta.1 → 0.20.0-beta.2 2025-06-04 07:14:06 +00:00
187 changed files with 15522 additions and 5400 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.21.2"
current_version = "0.20.0-beta.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
@@ -50,6 +50,11 @@ pre_commit_hooks = [
optional_value = "final"
values = ["beta", "final"]
[[tool.bumpversion.files]]
filename = "node/package.json"
replace = "\"version\": \"{new_version}\","
search = "\"version\": \"{current_version}\","
[[tool.bumpversion.files]]
filename = "nodejs/package.json"
replace = "\"version\": \"{new_version}\","
@@ -61,8 +66,39 @@ glob = "nodejs/npm/*/package.json"
replace = "\"version\": \"{new_version}\","
search = "\"version\": \"{current_version}\","
# vectodb node binary packages
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-darwin-arm64\": \"{new_version}\""
search = "\"@lancedb/vectordb-darwin-arm64\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-darwin-x64\": \"{new_version}\""
search = "\"@lancedb/vectordb-darwin-x64\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-arm64-gnu\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-arm64-gnu\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
search = "\"@lancedb/vectordb-win32-x64-msvc\": \"{current_version}\""
# Cargo files
# ------------
[[tool.bumpversion.files]]
filename = "rust/ffi/node/Cargo.toml"
replace = "\nversion = \"{new_version}\""
search = "\nversion = \"{current_version}\""
[[tool.bumpversion.files]]
filename = "rust/lancedb/Cargo.toml"
replace = "\nversion = \"{new_version}\""

View File

@@ -5,8 +5,8 @@ on:
tags-ignore:
# We don't publish pre-releases for Rust. Crates.io is just a source
# distribution, so we don't need to publish pre-releases.
- "v*-beta*"
- "*-v*" # for example, python-vX.Y.Z
- 'v*-beta*'
- '*-v*' # for example, python-vX.Y.Z
env:
# This env var is used by Swatinem/rust-cache@v2 for the cache
@@ -19,8 +19,6 @@ env:
jobs:
build:
runs-on: ubuntu-22.04
permissions:
id-token: write
timeout-minutes: 30
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')
@@ -33,8 +31,6 @@ jobs:
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- uses: rust-lang/crates-io-auth-action@v1
id: auth
- name: Publish the package
run: |
cargo publish -p lancedb --all-features --token ${{ steps.auth.outputs.token }}
cargo publish -p lancedb --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }}

View File

@@ -84,7 +84,7 @@ jobs:
run: |
pip install bump-my-version PyGithub packaging
bash ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} v $COMMIT_BEFORE_BUMP
bash ci/update_lockfiles.sh --amend
bash ci/update_lockfiles.sh
- name: Push new version tag
if: ${{ !inputs.dry_run }}
uses: ad-m/github-push-action@master
@@ -93,3 +93,11 @@ jobs:
github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
branch: ${{ github.ref }}
tags: true
- uses: ./.github/workflows/update_package_lock
if: ${{ !inputs.dry_run && inputs.other }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
- uses: ./.github/workflows/update_package_lock_nodejs
if: ${{ !inputs.dry_run && inputs.other }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}

147
.github/workflows/node.yml vendored Normal file
View File

@@ -0,0 +1,147 @@
name: Node
on:
push:
branches:
- main
pull_request:
paths:
- node/**
- rust/ffi/node/**
- .github/workflows/node.yml
- docker-compose.yml
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
env:
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
#
# Use native CPU to accelerate tests if possible, especially for f16
# target-cpu=haswell fixes failing ci build
RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma"
RUST_BACKTRACE: "1"
jobs:
linux:
name: Linux (Node ${{ matrix.node-version }})
timeout-minutes: 30
strategy:
matrix:
node-version: [ "18", "20" ]
runs-on: "ubuntu-22.04"
defaults:
run:
shell: bash
working-directory: node
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
cache-dependency-path: node/package-lock.json
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run build
npm run pack-build
npm install --no-save ./dist/lancedb-vectordb-*.tgz
# Remove index.node to test with dependency installed
rm index.node
- name: Test
run: npm run test
macos:
timeout-minutes: 30
runs-on: "macos-13"
defaults:
run:
shell: bash
working-directory: node
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
with:
node-version: 20
cache: 'npm'
cache-dependency-path: node/package-lock.json
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: brew install protobuf
- name: Build
run: |
npm ci
npm run build
npm run pack-build
npm install --no-save ./dist/lancedb-vectordb-*.tgz
# Remove index.node to test with dependency installed
rm index.node
- name: Test
run: |
npm run test
aws-integtest:
timeout-minutes: 45
runs-on: "ubuntu-22.04"
defaults:
run:
shell: bash
working-directory: node
env:
AWS_ACCESS_KEY_ID: ACCESSKEY
AWS_SECRET_ACCESS_KEY: SECRETKEY
AWS_DEFAULT_REGION: us-west-2
# this one is for s3
AWS_ENDPOINT: http://localhost:4566
# this one is for dynamodb
DYNAMODB_ENDPOINT: http://localhost:4566
ALLOW_HTTP: true
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
with:
node-version: 20
cache: 'npm'
cache-dependency-path: node/package-lock.json
- name: start local stack
run: docker compose -f ../docker-compose.yml up -d --wait
- name: create s3
run: aws s3 mb s3://lancedb-integtest --endpoint $AWS_ENDPOINT
- name: create ddb
run: |
aws dynamodb create-table \
--table-name lancedb-integtest \
--attribute-definitions '[{"AttributeName": "base_uri", "AttributeType": "S"}, {"AttributeName": "version", "AttributeType": "N"}]' \
--key-schema '[{"AttributeName": "base_uri", "KeyType": "HASH"}, {"AttributeName": "version", "KeyType": "RANGE"}]' \
--provisioned-throughput '{"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}' \
--endpoint-url $DYNAMODB_ENDPOINT
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run build
npm run pack-build
npm install --no-save ./dist/lancedb-vectordb-*.tgz
# Remove index.node to test with dependency installed
rm index.node
- name: Test
run: npm run integration-test

View File

@@ -365,3 +365,184 @@ jobs:
ARGS="$ARGS --tag preview"
fi
npm publish $ARGS
# ----------------------------------------------------------------------------
# vectordb release (legacy)
# ----------------------------------------------------------------------------
# TODO: delete this when we drop vectordb
node:
name: vectordb Typescript
runs-on: ubuntu-latest
defaults:
run:
shell: bash
working-directory: node
steps:
- name: Checkout
uses: actions/checkout@v4
- uses: actions/setup-node@v3
with:
node-version: 20
cache: "npm"
cache-dependency-path: node/package-lock.json
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run tsc
npm pack
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: node-package
path: |
node/vectordb-*.tgz
node-macos:
name: vectordb ${{ matrix.config.arch }}
strategy:
matrix:
config:
- arch: x86_64-apple-darwin
runner: macos-13
- arch: aarch64-apple-darwin
# xlarge is implicitly arm64.
runner: macos-14
runs-on: ${{ matrix.config.runner }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install system dependencies
run: brew install protobuf
- name: Install npm dependencies
run: |
cd node
npm ci
- name: Build MacOS native node modules
run: bash ci/build_macos_artifacts.sh ${{ matrix.config.arch }}
- name: Upload Darwin Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-darwin-${{ matrix.config.arch }}
path: |
node/dist/lancedb-vectordb-darwin*.tgz
node-linux-gnu:
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
runs-on: ${{ matrix.config.runner }}
strategy:
fail-fast: false
matrix:
config:
- arch: x86_64
runner: ubuntu-latest
- arch: aarch64
# For successful fat LTO builds, we need a large runner to avoid OOM errors.
runner: warp-ubuntu-latest-arm64-4x
steps:
- name: Checkout
uses: actions/checkout@v4
# To avoid OOM errors on ARM, we create a swap file.
- name: Configure aarch64 build
if: ${{ matrix.config.arch == 'aarch64' }}
run: |
free -h
sudo fallocate -l 16G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
echo "/swapfile swap swap defaults 0 0" >> sudo /etc/fstab
# print info
swapon --show
free -h
- name: Build Linux Artifacts
run: |
bash ci/build_linux_artifacts.sh ${{ matrix.config.arch }} ${{ matrix.config.arch }}-unknown-linux-gnu
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-linux-${{ matrix.config.arch }}-gnu
path: |
node/dist/lancedb-vectordb-linux*.tgz
node-windows:
name: vectordb ${{ matrix.target }}
runs-on: windows-2022
strategy:
fail-fast: false
matrix:
target: [x86_64-pc-windows-msvc]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Protoc v21.12
working-directory: C:\
run: |
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
7z x protoc.zip
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Install npm dependencies
run: |
cd node
npm ci
- name: Build Windows native node modules
run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
- name: Upload Windows Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-windows
path: |
node/dist/lancedb-vectordb-win32*.tgz
release:
name: vectordb NPM Publish
needs: [node, node-macos, node-linux-gnu, node-windows]
runs-on: ubuntu-latest
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/download-artifact@v4
with:
pattern: node-*
- name: Display structure of downloaded files
run: ls -R
- uses: actions/setup-node@v3
with:
node-version: 20
registry-url: "https://registry.npmjs.org"
- name: Publish to NPM
env:
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
run: |
# Tag beta as "preview" instead of default "latest". See lancedb
# npm publish step for more info.
if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
PUBLISH_ARGS="--tag preview"
fi
mv */*.tgz .
for filename in *.tgz; do
npm publish $PUBLISH_ARGS $filename
done
- name: Deprecate
env:
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
# We need to deprecate the old package to avoid confusion.
# Each time we publish a new version, it gets undeprecated.
run: npm deprecate vectordb "Use @lancedb/lancedb instead."
- name: Notify Slack Action
uses: ravsamhq/notify-slack-action@2.3.0
if: ${{ always() }}
with:
status: ${{ job.status }}
notify_when: "failure"
notification_title: "{workflow} is failing"
env:
SLACK_WEBHOOK_URL: ${{ secrets.ACTION_MONITORING_SLACK }}

View File

@@ -0,0 +1,33 @@
name: update_package_lock
description: "Update node's package.lock"
inputs:
github_token:
required: true
description: "github token for the repo"
runs:
using: "composite"
steps:
- uses: actions/setup-node@v3
with:
node-version: 20
- name: Set git configs
shell: bash
run: |
git config user.name 'Lance Release'
git config user.email 'lance-dev@lancedb.com'
- name: Update package-lock.json file
working-directory: ./node
run: |
npm install
git add package-lock.json
git commit -m "Updating package-lock.json"
shell: bash
- name: Push changes
if: ${{ inputs.dry_run }} == "false"
uses: ad-m/github-push-action@master
with:
github_token: ${{ inputs.github_token }}
branch: main
tags: true

View File

@@ -0,0 +1,33 @@
name: update_package_lock_nodejs
description: "Update nodejs's package.lock"
inputs:
github_token:
required: true
description: "github token for the repo"
runs:
using: "composite"
steps:
- uses: actions/setup-node@v3
with:
node-version: 20
- name: Set git configs
shell: bash
run: |
git config user.name 'Lance Release'
git config user.email 'lance-dev@lancedb.com'
- name: Update package-lock.json file
working-directory: ./nodejs
run: |
npm install
git add package-lock.json
git commit -m "Updating package-lock.json"
shell: bash
- name: Push changes
if: ${{ inputs.dry_run }} == "false"
uses: ad-m/github-push-action@master
with:
github_token: ${{ inputs.github_token }}
branch: main
tags: true

View File

@@ -1,24 +0,0 @@
LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
It is a wrapper around Lance. There are two backends: local (in-process like SQLite) and
remote (against LanceDB Cloud).
The core of LanceDB is written in Rust. There are bindings in Python, Typescript, and Java.
Project layout:
* `rust/lancedb`: The LanceDB core Rust implementation.
* `python`: The Python bindings, using PyO3.
* `nodejs`: The Typescript bindings, using napi-rs
* `java`: The Java bindings
(`rust/ffi` and `node/` are for a deprecated package. You can ignore them.)
Common commands:
* Check for compiler errors: `cargo check --features remote --tests --examples`
* Run tests: `cargo test --features remote --tests`
* Run specific test: `cargo test --features remote -p <package_name> --test <test_name>`
* Lint: `cargo clippy --features remote --tests --examples`
* Format: `cargo fmt --all`
Before committing changes, run formatting.

1220
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,6 @@
[workspace]
members = [
"rust/ffi/node",
"rust/lancedb",
"nodejs",
"python",
@@ -20,16 +21,14 @@ categories = ["database-implementations"]
rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.32.1", "features" = [
"dynamodb",
], "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance-io = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance-index = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance-linalg = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance-table = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance-testing = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance-datafusion = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance-encoding = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-index = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-linalg = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-table = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-testing = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-datafusion = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-encoding = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
# Note that this one does not include pyarrow
arrow = { version = "55.1", optional = false }
arrow-array = "55.1"
@@ -40,20 +39,20 @@ arrow-schema = "55.1"
arrow-arith = "55.1"
arrow-cast = "55.1"
async-trait = "0"
datafusion = { version = "48.0", default-features = false }
datafusion-catalog = "48.0"
datafusion-common = { version = "48.0", default-features = false }
datafusion-execution = "48.0"
datafusion-expr = "48.0"
datafusion-physical-plan = "48.0"
datafusion = { version = "47.0", default-features = false }
datafusion-catalog = "47.0"
datafusion-common = { version = "47.0", default-features = false }
datafusion-execution = "47.0"
datafusion-expr = "47.0"
datafusion-physical-plan = "47.0"
env_logger = "0.11"
half = { "version" = "2.6.0", default-features = false, features = [
half = { "version" = "=2.5.0", default-features = false, features = [
"num-traits",
] }
futures = "0"
log = "0.4"
moka = { version = "0.12", features = ["future"] }
object_store = "0.12.0"
object_store = "0.11.0"
pin-project = "1.0.7"
snafu = "0.8"
url = "2"

22
ci/build_linux_artifacts.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
set -e
ARCH=${1:-x86_64}
TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}
# We pass down the current user so that when we later mount the local files
# into the container, the files are accessible by the current user.
pushd ci/manylinux_node
docker build \
-t lancedb-node-manylinux \
--build-arg="ARCH=$ARCH" \
--build-arg="DOCKER_USER=$(id -u)" \
--progress=plain \
.
popd
# We turn on memory swap to avoid OOM killer
docker run \
-v $(pwd):/io -w /io \
--memory-swap=-1 \
lancedb-node-manylinux \
bash ci/manylinux_node/build_vectordb.sh $ARCH $TARGET_TRIPLE

View File

@@ -0,0 +1,34 @@
# Builds the macOS artifacts (node binaries).
# Usage: ./ci/build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
set -e
prebuild_rust() {
# Building here for the sake of easier debugging.
pushd rust/ffi/node
echo "Building rust library for $1"
export RUST_BACKTRACE=1
cargo build --release --target $1
popd
}
build_node_binaries() {
pushd node
echo "Building node library for $1"
npm run build-release -- --target $1
npm run pack-build -- --target $1
popd
}
if [ -n "$1" ]; then
targets=$1
else
targets="x86_64-apple-darwin aarch64-apple-darwin"
fi
echo "Building artifacts for targets: $targets"
for target in $targets
do
prebuild_rust $target
build_node_binaries $target
done

View File

@@ -0,0 +1,42 @@
# Builds the Windows artifacts (node binaries).
# Usage: .\ci\build_windows_artifacts.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
# - aarch64-pc-windows-msvc
function Prebuild-Rust {
param (
[string]$target
)
# Building here for the sake of easier debugging.
Push-Location -Path "rust/ffi/node"
Write-Host "Building rust library for $target"
$env:RUST_BACKTRACE=1
cargo build --release --target $target
Pop-Location
}
function Build-NodeBinaries {
param (
[string]$target
)
Push-Location -Path "node"
Write-Host "Building node library for $target"
npm run build-release -- --target $target
npm run pack-build -- --target $target
Pop-Location
}
$targets = $args[0]
if (-not $targets) {
$targets = "x86_64-pc-windows-msvc", "aarch64-pc-windows-msvc"
}
Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
Prebuild-Rust $target
Build-NodeBinaries $target
}

View File

@@ -0,0 +1,42 @@
# Builds the Windows artifacts (nodejs binaries).
# Usage: .\ci\build_windows_artifacts_nodejs.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
# - aarch64-pc-windows-msvc
function Prebuild-Rust {
param (
[string]$target
)
# Building here for the sake of easier debugging.
Push-Location -Path "rust/lancedb"
Write-Host "Building rust library for $target"
$env:RUST_BACKTRACE=1
cargo build --release --target $target
Pop-Location
}
function Build-NodeBinaries {
param (
[string]$target
)
Push-Location -Path "nodejs"
Write-Host "Building nodejs library for $target"
$env:RUST_TARGET=$target
npm run build-release
Pop-Location
}
$targets = $args[0]
if (-not $targets) {
$targets = "x86_64-pc-windows-msvc", "aarch64-pc-windows-msvc"
}
Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
Prebuild-Rust $target
Build-NodeBinaries $target
}

View File

@@ -0,0 +1,27 @@
# Many linux dockerfile with Rust, Node, and Lance dependencies installed.
# This container allows building the node modules native libraries in an
# environment with a very old glibc, so that we are compatible with a wide
# range of linux distributions.
ARG ARCH=x86_64
FROM quay.io/pypa/manylinux_2_28_${ARCH}
ARG ARCH=x86_64
ARG DOCKER_USER=default_user
# Protobuf is also installed as root.
COPY install_protobuf.sh install_protobuf.sh
RUN ./install_protobuf.sh ${ARCH}
ENV DOCKER_USER=${DOCKER_USER}
# Create a group and user, but only if it doesn't exist
RUN echo ${ARCH} && id -u ${DOCKER_USER} >/dev/null 2>&1 || adduser --user-group --create-home --uid ${DOCKER_USER} build_user
# We switch to the user to install Rust and Node, since those like to be
# installed at the user level.
USER ${DOCKER_USER}
COPY prepare_manylinux_node.sh prepare_manylinux_node.sh
RUN cp /prepare_manylinux_node.sh $HOME/ && \
cd $HOME && \
./prepare_manylinux_node.sh ${ARCH}

View File

@@ -0,0 +1,13 @@
#!/bin/bash
# Builds the node module for manylinux. Invoked by ci/build_linux_artifacts.sh.
set -e
ARCH=${1:-x86_64}
TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}
#Alpine doesn't have .bashrc
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
cd node
npm ci
npm run build-release
npm run pack-build -- -t $TARGET_TRIPLE

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Installs protobuf compiler. Should be run as root.
set -e
if [[ $1 == x86_64* ]]; then
ARCH=x86_64
else
# gnu target
ARCH=aarch_64
fi
PB_REL=https://github.com/protocolbuffers/protobuf/releases
PB_VERSION=23.1
curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local

View File

@@ -0,0 +1,21 @@
#!/bin/bash
set -e
install_node() {
echo "Installing node..."
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
source "$HOME"/.bashrc
nvm install --no-progress 18
}
install_rust() {
echo "Installing rust..."
curl https://sh.rustup.rs -sSf | bash -s -- -y
export PATH="$PATH:/root/.cargo/bin"
}
install_node
install_rust

View File

@@ -1,188 +0,0 @@
import argparse
import sys
import json
def run_command(command: str) -> str:
"""
Run a shell command and return stdout as a string.
If exit code is not 0, raise an exception with the stderr output.
"""
import subprocess
result = subprocess.run(command, shell=True, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"Command failed with error: {result.stderr.strip()}")
return result.stdout.strip()
def get_latest_stable_version() -> str:
version_line = run_command("cargo info lance | grep '^version:'")
version = version_line.split(" ")[1].strip()
return version
def get_latest_preview_version() -> str:
lance_tags = run_command(
"git ls-remote --tags https://github.com/lancedb/lance.git | grep 'refs/tags/v[0-9beta.-]\\+$'"
).splitlines()
lance_tags = (
tag.split("refs/tags/")[1]
for tag in lance_tags
if "refs/tags/" in tag and "beta" in tag
)
from packaging.version import Version
latest = max(
(tag[1:] for tag in lance_tags if tag.startswith("v")), key=lambda t: Version(t)
)
return str(latest)
def extract_features(line: str) -> list:
"""
Extracts the features from a line in Cargo.toml.
Example: 'lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }'
Returns: ['dynamodb']
"""
import re
match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
if match:
features_str = match.group(1)
return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
return []
def update_cargo_toml(line_updater):
"""
Updates the Cargo.toml file by applying the line_updater function to each line.
The line_updater function should take a line as input and return the updated line.
"""
with open("Cargo.toml", "r") as f:
lines = f.readlines()
new_lines = []
lance_line = ""
is_parsing_lance_line = False
for line in lines:
if line.startswith("lance"):
# Update the line using the provided function
if line.strip().endswith("}"):
new_lines.append(line_updater(line))
else:
lance_line = line
is_parsing_lance_line = True
elif is_parsing_lance_line:
lance_line += line
if line.strip().endswith("}"):
new_lines.append(line_updater(lance_line))
lance_line = ""
is_parsing_lance_line = False
else:
print("doesn't end with }:", line)
else:
# Keep the line unchanged
new_lines.append(line)
with open("Cargo.toml", "w") as f:
f.writelines(new_lines)
def set_stable_version(version: str):
"""
Sets lines to
lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }
lance-io = "=0.29.0"
...
"""
def line_updater(line: str) -> str:
package_name = line.split("=", maxsplit=1)[0].strip()
features = extract_features(line)
if features:
return f'{package_name} = {{ "version" = "={version}", "features" = {json.dumps(features)} }}\n'
else:
return f'{package_name} = "={version}"\n'
update_cargo_toml(line_updater)
def set_preview_version(version: str):
"""
Sets lines to
lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
...
"""
def line_updater(line: str) -> str:
package_name = line.split("=", maxsplit=1)[0].strip()
features = extract_features(line)
base_version = version.split("-")[0] # Get the base version without beta suffix
if features:
return f'{package_name} = {{ "version" = "={base_version}", "features" = {json.dumps(features)}, "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
else:
return f'{package_name} = {{ "version" = "={base_version}", "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
update_cargo_toml(line_updater)
def set_local_version():
"""
Sets lines to
lance = { path = "../lance/rust/lance", features = ["dynamodb"] }
lance-io = { path = "../lance/rust/lance-io" }
...
"""
def line_updater(line: str) -> str:
package_name = line.split("=", maxsplit=1)[0].strip()
features = extract_features(line)
if features:
return f'{package_name} = {{ "path" = "../lance/rust/{package_name}", "features" = {json.dumps(features)} }}\n'
else:
return f'{package_name} = {{ "path" = "../lance/rust/{package_name}" }}\n'
update_cargo_toml(line_updater)
parser = argparse.ArgumentParser(description="Set the version of the Lance package.")
parser.add_argument(
"version",
type=str,
help="The version to set for the Lance package. Use 'stable' for the latest stable version, 'preview' for latest preview version, or a specific version number (e.g., '0.1.0'). You can also specify 'local' to use a local path.",
)
args = parser.parse_args()
if args.version == "stable":
latest_stable_version = get_latest_stable_version()
print(
f"Found latest stable version: \033[1mv{latest_stable_version}\033[0m",
file=sys.stderr,
)
set_stable_version(latest_stable_version)
elif args.version == "preview":
latest_preview_version = get_latest_preview_version()
print(
f"Found latest preview version: \033[1mv{latest_preview_version}\033[0m",
file=sys.stderr,
)
set_preview_version(latest_preview_version)
elif args.version == "local":
set_local_version()
else:
# Parse the version number.
version = args.version
# Ignore initial v if present.
if version.startswith("v"):
version = version[1:]
if "beta" in version:
set_preview_version(version)
else:
set_stable_version(version)
print("Updating lockfiles...", file=sys.stderr, end="")
run_command("cargo metadata > /dev/null")
print(" done.", file=sys.stderr)

View File

@@ -1,30 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
AMEND=false
for arg in "$@"; do
if [[ "$arg" == "--amend" ]]; then
AMEND=true
fi
done
# This updates the lockfile without building
cargo metadata --quiet > /dev/null
cargo metadata > /dev/null
pushd nodejs || exit 1
npm install --package-lock-only --silent
npm install --package-lock-only
popd
pushd node || exit 1
npm install --package-lock-only --silent
npm install --package-lock-only
popd
if git diff --quiet --exit-code; then
echo "No lockfile changes to commit; skipping amend."
elif $AMEND; then
git add Cargo.lock nodejs/package-lock.json node/package-lock.json
git commit --amend --no-edit
else
git add Cargo.lock nodejs/package-lock.json node/package-lock.json
git commit -m "Update lockfiles"
git commit --amend --no-edit
fi

View File

@@ -103,6 +103,264 @@ markdown_extensions:
permalink: ""
nav:
- Home:
- LanceDB: index.md
- 🏃🏼‍♂️ Quick start: basic.md
- 📚 Concepts:
- Vector search: concepts/vector_search.md
- Indexing:
- IVFPQ: concepts/index_ivfpq.md
- HNSW: concepts/index_hnsw.md
- Storage: concepts/storage.md
- Data management: concepts/data_management.md
- 🔨 Guides:
- Working with tables: guides/tables.md
- Building a vector index: ann_indexes.md
- Vector Search: search.md
- Full-text search (native): fts.md
- Full-text search (tantivy-based): fts_tantivy.md
- Building a scalar index: guides/scalar_index.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
- Comparing Rerankers: hybrid_search/eval.md
- Airbnb financial data example: notebooks/hybrid_search.ipynb
- Late interaction with MultiVector search:
- Overview: guides/multi-vector.md
- Example: notebooks/Multivector_on_LanceDB.ipynb
- RAG:
- Vanilla RAG: rag/vanilla_rag.md
- Multi-head RAG: rag/multi_head_rag.md
- Corrective RAG: rag/corrective_rag.md
- Agentic RAG: rag/agentic_rag.md
- Graph RAG: rag/graph_rag.md
- Self RAG: rag/self_rag.md
- Adaptive RAG: rag/adaptive_rag.md
- SFR RAG: rag/sfr_rag.md
- Advanced Techniques:
- HyDE: rag/advanced_techniques/hyde.md
- FLARE: rag/advanced_techniques/flare.md
- Reranking:
- Quickstart: reranking/index.md
- Cohere Reranker: reranking/cohere.md
- Linear Combination Reranker: reranking/linear_combination.md
- Reciprocal Rank Fusion Reranker: reranking/rrf.md
- Cross Encoder Reranker: reranking/cross_encoder.md
- ColBERT Reranker: reranking/colbert.md
- Jina Reranker: reranking/jina.md
- OpenAI Reranker: reranking/openai.md
- AnswerDotAi Rerankers: reranking/answerdotai.md
- Voyage AI Rerankers: reranking/voyageai.md
- Building Custom Rerankers: reranking/custom_reranker.md
- Example: notebooks/lancedb_reranking.ipynb
- Filtering: sql.md
- Versioning & Reproducibility:
- sync API: notebooks/reproducibility.ipynb
- async API: notebooks/reproducibility_async.ipynb
- Configuring Storage: guides/storage.md
- Migration Guide: migration.md
- Tuning retrieval performance:
- Choosing right query type: guides/tuning_retrievers/1_query_types.md
- Reranking: guides/tuning_retrievers/2_reranking.md
- Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
- 🧬 Managing embeddings:
- Understand Embeddings: embeddings/understanding_embeddings.md
- Get Started: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models:
- Overview: embeddings/default_embedding_functions.md
- Text Embedding Functions:
- Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
- Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
- Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
- OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
- Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
- Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
- Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
- Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
- Multimodal Embedding Functions:
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
- Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
- User-defined embedding functions: embeddings/custom_embedding_function.md
- Variables and secrets: embeddings/variables_and_secrets.md
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
- 🔌 Integrations:
- Tools and data formats: integrations/index.md
- Pandas and PyArrow: python/pandas_and_pyarrow.md
- Polars: python/polars_arrow.md
- DuckDB: python/duckdb.md
- Datafusion: python/datafusion.md
- LangChain:
- LangChain 🔗: integrations/langchain.md
- LangChain demo: notebooks/langchain_demo.ipynb
- LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
- LlamaIndex 🦙:
- LlamaIndex docs: integrations/llamaIndex.md
- LlamaIndex demo: notebooks/llamaIndex_demo.ipynb
- Pydantic: python/pydantic.md
- Voxel51: integrations/voxel51.md
- PromptTools: integrations/prompttools.md
- dlt: integrations/dlt.md
- phidata: integrations/phidata.md
- Genkit: integrations/genkit.md
- 🎯 Examples:
- Overview: examples/index.md
- 🐍 Python:
- Overview: examples/examples_python.md
- Build From Scratch: examples/python_examples/build_from_scratch.md
- Multimodal: examples/python_examples/multimodal.md
- Rag: examples/python_examples/rag.md
- Vector Search: examples/python_examples/vector_search.md
- Chatbot: examples/python_examples/chatbot.md
- Evaluation: examples/python_examples/evaluations.md
- AI Agent: examples/python_examples/aiagent.md
- Recommender System: examples/python_examples/recommendersystem.md
- Miscellaneous:
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- 👾 JavaScript:
- Overview: examples/examples_js.md
- Serverless Website Chatbot: examples/serverless_website_chatbot.md
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- 🦀 Rust:
- Overview: examples/examples_rust.md
- 📓 Studies:
- ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
- 💭 FAQs: faq.md
- 🔍 Troubleshooting: troubleshooting.md
- ⚙️ API reference:
- 🐍 Python: python/python.md
- 👾 JavaScript (vectordb): javascript/modules.md
- 👾 JavaScript (lancedb): js/globals.md
- 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
- Quick start: basic.md
- Concepts:
- Vector search: concepts/vector_search.md
- Indexing:
- IVFPQ: concepts/index_ivfpq.md
- HNSW: concepts/index_hnsw.md
- Storage: concepts/storage.md
- Data management: concepts/data_management.md
- Guides:
- Working with tables: guides/tables.md
- Working with SQL: guides/sql_querying.md
- Building an ANN index: ann_indexes.md
- Vector Search: search.md
- Full-text search (native): fts.md
- Full-text search (tantivy-based): fts_tantivy.md
- Building a scalar index: guides/scalar_index.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
- Comparing Rerankers: hybrid_search/eval.md
- Airbnb financial data example: notebooks/hybrid_search.ipynb
- Late interaction with MultiVector search:
- Overview: guides/multi-vector.md
- Document search Example: notebooks/Multivector_on_LanceDB.ipynb
- RAG:
- Vanilla RAG: rag/vanilla_rag.md
- Multi-head RAG: rag/multi_head_rag.md
- Corrective RAG: rag/corrective_rag.md
- Agentic RAG: rag/agentic_rag.md
- Graph RAG: rag/graph_rag.md
- Self RAG: rag/self_rag.md
- Adaptive RAG: rag/adaptive_rag.md
- SFR RAG: rag/sfr_rag.md
- Advanced Techniques:
- HyDE: rag/advanced_techniques/hyde.md
- FLARE: rag/advanced_techniques/flare.md
- Reranking:
- Quickstart: reranking/index.md
- Cohere Reranker: reranking/cohere.md
- Linear Combination Reranker: reranking/linear_combination.md
- Reciprocal Rank Fusion Reranker: reranking/rrf.md
- Cross Encoder Reranker: reranking/cross_encoder.md
- ColBERT Reranker: reranking/colbert.md
- Jina Reranker: reranking/jina.md
- OpenAI Reranker: reranking/openai.md
- AnswerDotAi Rerankers: reranking/answerdotai.md
- Building Custom Rerankers: reranking/custom_reranker.md
- Example: notebooks/lancedb_reranking.ipynb
- Filtering: sql.md
- Versioning & Reproducibility:
- sync API: notebooks/reproducibility.ipynb
- async API: notebooks/reproducibility_async.ipynb
- Configuring Storage: guides/storage.md
- Migration Guide: migration.md
- Tuning retrieval performance:
- Choosing right query type: guides/tuning_retrievers/1_query_types.md
- Reranking: guides/tuning_retrievers/2_reranking.md
- Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
- Managing Embeddings:
- Understand Embeddings: embeddings/understanding_embeddings.md
- Get Started: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models:
- Overview: embeddings/default_embedding_functions.md
- Text Embedding Functions:
- Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
- Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
- Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
- OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
- Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
- Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
- Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
- Multimodal Embedding Functions:
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
- Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
- User-defined embedding functions: embeddings/custom_embedding_function.md
- Variables and secrets: embeddings/variables_and_secrets.md
- "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
- "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
- Integrations:
- Overview: integrations/index.md
- Pandas and PyArrow: python/pandas_and_pyarrow.md
- Polars: python/polars_arrow.md
- DuckDB: python/duckdb.md
- Datafusion: python/datafusion.md
- LangChain 🦜️🔗↗: integrations/langchain.md
- LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
- LlamaIndex 🦙↗: integrations/llamaIndex.md
- Pydantic: python/pydantic.md
- Voxel51: integrations/voxel51.md
- PromptTools: integrations/prompttools.md
- dlt: integrations/dlt.md
- phidata: integrations/phidata.md
- Genkit: integrations/genkit.md
- Examples:
- examples/index.md
- 🐍 Python:
- Overview: examples/examples_python.md
- Build From Scratch: examples/python_examples/build_from_scratch.md
- Multimodal: examples/python_examples/multimodal.md
- Rag: examples/python_examples/rag.md
- Vector Search: examples/python_examples/vector_search.md
- Chatbot: examples/python_examples/chatbot.md
- Evaluation: examples/python_examples/evaluations.md
- AI Agent: examples/python_examples/aiagent.md
- Recommender System: examples/python_examples/recommendersystem.md
- Miscellaneous:
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- 👾 JavaScript:
- Overview: examples/examples_js.md
- Serverless Website Chatbot: examples/serverless_website_chatbot.md
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- 🦀 Rust:
- Overview: examples/examples_rust.md
- Studies:
- studies/overview.md
- ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
- API reference:
- Overview: api_reference.md
- Python: python/python.md

12
docs/package-lock.json generated
View File

@@ -19,7 +19,7 @@
},
"../node": {
"name": "vectordb",
"version": "0.21.2-beta.0",
"version": "0.12.0",
"cpu": [
"x64",
"arm64"
@@ -65,11 +65,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.12.0",
"@lancedb/vectordb-darwin-x64": "0.12.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
"@lancedb/vectordb-linux-x64-gnu": "0.12.0",
"@lancedb/vectordb-win32-x64-msvc": "0.12.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",

View File

@@ -1,9 +1,7 @@
# SQL Querying
You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
This guide will show how to query Lance tables them using both.
We will re-use the dataset [created previously](./tables.md):
We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
```python
import lancedb
@@ -29,17 +27,21 @@ arrow_table = table.to_lance()
duckdb.query("SELECT * FROM arrow_table")
```
| vector | item | price |
| ----------- | ---- | ----- |
| [3.1, 4.1] | foo | 10.0 |
| [5.9, 26.5] | bar | 20.0 |
```
┌─────────────┬─────────┬────────┐
│ vector │ item │ price │
│ float[] │ varchar │ double │
├─────────────┼─────────┼────────┤
│ [3.1, 4.1] │ foo │ 10.0 │
│ [5.9, 26.5] │ bar │ 20.0 │
└─────────────┴─────────┴────────┘
```
## Querying a LanceDB Table with Apache Datafusion
Have the required imports before doing any querying.
=== "Python"
```python
--8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb"
--8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context"
@@ -49,12 +51,16 @@ Have the required imports before doing any querying.
Register the table created with the Datafusion session context.
=== "Python"
```python
--8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
```
| vector | item | price |
| ----------- | ---- | ----- |
| [3.1, 4.1] | foo | 10.0 |
| [5.9, 26.5] | bar | 20.0 |
```
┌─────────────┬─────────┬────────┐
│ vector │ item │ price
│ float[] │ varchar │ double │
├─────────────┼─────────┼────────┤
│ [3.1, 4.1] │ foo │ 10.0 │
│ [5.9, 26.5] │ bar │ 20.0 │
└─────────────┴─────────┴────────┘
```

View File

@@ -1,53 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / BooleanQuery
# Class: BooleanQuery
Represents a full-text query interface.
This interface defines the structure and behavior for full-text queries,
including methods to retrieve the query type and convert the query to a dictionary format.
## Implements
- [`FullTextQuery`](../interfaces/FullTextQuery.md)
## Constructors
### new BooleanQuery()
```ts
new BooleanQuery(queries): BooleanQuery
```
Creates an instance of BooleanQuery.
#### Parameters
* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][]
An array of (Occur, FullTextQuery objects) to combine.
Occur specifies whether the query must match, or should match.
#### Returns
[`BooleanQuery`](BooleanQuery.md)
## Methods
### queryType()
```ts
queryType(): FullTextQueryType
```
The type of the full-text query.
#### Returns
[`FullTextQueryType`](../enumerations/FullTextQueryType.md)
#### Implementation of
[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)

View File

@@ -40,8 +40,6 @@ Creates an instance of MatchQuery.
- `boost`: The boost factor for the query (default is 1.0).
- `fuzziness`: The fuzziness level for the query (default is 0).
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
- `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
* **options.boost?**: `number`
@@ -49,10 +47,6 @@ Creates an instance of MatchQuery.
* **options.maxExpansions?**: `number`
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
* **options.prefixLength?**: `number`
#### Returns
[`MatchQuery`](MatchQuery.md)

View File

@@ -38,12 +38,9 @@ Creates an instance of MultiMatchQuery.
* **options?**
Optional parameters for the multi-match query.
- `boosts`: An array of boost factors for each column (default is 1.0 for all).
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
* **options.boosts?**: `number`[]
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
#### Returns
[`MultiMatchQuery`](MultiMatchQuery.md)

View File

@@ -19,10 +19,7 @@ including methods to retrieve the query type and convert the query to a dictiona
### new PhraseQuery()
```ts
new PhraseQuery(
query,
column,
options?): PhraseQuery
new PhraseQuery(query, column): PhraseQuery
```
Creates an instance of `PhraseQuery`.
@@ -35,12 +32,6 @@ Creates an instance of `PhraseQuery`.
* **column**: `string`
The name of the column to search within.
* **options?**
Optional parameters for the phrase query.
- `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
* **options.slop?**: `number`
#### Returns
[`PhraseQuery`](PhraseQuery.md)

View File

@@ -1,84 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / Session
# Class: Session
A session for managing caches and object stores across LanceDB operations.
Sessions allow you to configure cache sizes for index and metadata caches,
which can significantly impact performance for large datasets.
## Constructors
### new Session()
```ts
new Session(indexCacheSizeBytes?, metadataCacheSizeBytes?): Session
```
Create a new session with custom cache sizes.
# Parameters
- `index_cache_size_bytes`: The size of the index cache in bytes.
Defaults to 6GB if not specified.
- `metadata_cache_size_bytes`: The size of the metadata cache in bytes.
Defaults to 1GB if not specified.
#### Parameters
* **indexCacheSizeBytes?**: `null` \| `bigint`
* **metadataCacheSizeBytes?**: `null` \| `bigint`
#### Returns
[`Session`](Session.md)
## Methods
### approxNumItems()
```ts
approxNumItems(): number
```
Get the approximate number of items cached in the session.
#### Returns
`number`
***
### sizeBytes()
```ts
sizeBytes(): bigint
```
Get the current size of the session caches in bytes.
#### Returns
`bigint`
***
### default()
```ts
static default(): Session
```
Create a session with default cache sizes.
This is equivalent to creating a session with 6GB index cache
and 1GB metadata cache.
#### Returns
[`Session`](Session.md)

View File

@@ -612,7 +612,7 @@ of the given query
#### Parameters
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
the query, a vector or string
* **queryType?**: `string`
@@ -799,7 +799,7 @@ by `query`.
#### Parameters
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
#### Returns

View File

@@ -386,53 +386,6 @@ called then every valid row from the table will be returned.
***
### maximumNprobes()
```ts
maximumNprobes(maximumNprobes): VectorQuery
```
Set the maximum number of probes used.
This controls the maximum number of partitions that will be searched. If this
number is greater than minimumNprobes then the excess partitions will _only_ be
searched if we have not found enough results. This can be useful when there is
a narrow filter to allow these queries to spend more time searching and avoid
potential false negatives.
#### Parameters
* **maximumNprobes**: `number`
#### Returns
[`VectorQuery`](VectorQuery.md)
***
### minimumNprobes()
```ts
minimumNprobes(minimumNprobes): VectorQuery
```
Set the minimum number of probes used.
This controls the minimum number of partitions that will be searched. This
parameter will impact every query against a vector index, regardless of the
filter. See `nprobes` for more details. Higher values will increase recall
but will also increase latency.
#### Parameters
* **minimumNprobes**: `number`
#### Returns
[`VectorQuery`](VectorQuery.md)
***
### nprobes()
```ts
@@ -460,10 +413,6 @@ For best results we recommend tuning this parameter with a benchmark against
your actual data to find the smallest possible value that will still give
you the desired recall.
For more fine grained control over behavior when you have a very narrow filter
you can use `minimumNprobes` and `maximumNprobes`. This method sets both
the minimum and maximum to the same value.
#### Parameters
* **nprobes**: `number`

View File

@@ -15,14 +15,6 @@ Enum representing the types of full-text queries supported.
## Enumeration Members
### Boolean
```ts
Boolean: "boolean";
```
***
### Boost
```ts

View File

@@ -1,37 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / Occur
# Enumeration: Occur
Enum representing the occurrence of terms in full-text queries.
- `Must`: The term must be present in the document.
- `Should`: The term should contribute to the document score, but is not required.
- `MustNot`: The term must not be present in the document.
## Enumeration Members
### Must
```ts
Must: "MUST";
```
***
### MustNot
```ts
MustNot: "MUST_NOT";
```
***
### Should
```ts
Should: "SHOULD";
```

View File

@@ -1,28 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / Operator
# Enumeration: Operator
Enum representing the logical operators used in full-text queries.
- `And`: All terms must match.
- `Or`: At least one term must match.
## Enumeration Members
### And
```ts
And: "AND";
```
***
### Or
```ts
Or: "OR";
```

View File

@@ -6,13 +6,10 @@
# Function: connect()
## connect(uri, options, session)
## connect(uri, options)
```ts
function connect(
uri,
options?,
session?): Promise<Connection>
function connect(uri, options?): Promise<Connection>
```
Connect to a LanceDB instance at the given URI.
@@ -32,8 +29,6 @@ Accepted formats:
* **options?**: `Partial`&lt;[`ConnectionOptions`](../interfaces/ConnectionOptions.md)&gt;
The options to use when connecting to the database
* **session?**: [`Session`](../classes/Session.md)
### Returns
`Promise`&lt;[`Connection`](../classes/Connection.md)&gt;
@@ -82,7 +77,7 @@ Accepted formats:
[ConnectionOptions](../interfaces/ConnectionOptions.md) for more details on the URI format.
### Examples
### Example
```ts
const conn = await connect({
@@ -90,11 +85,3 @@ const conn = await connect({
storageOptions: {timeout: "60s"}
});
```
```ts
const session = Session.default();
const conn = await connect({
uri: "/path/to/database",
session: session
});
```

View File

@@ -12,12 +12,9 @@
## Enumerations
- [FullTextQueryType](enumerations/FullTextQueryType.md)
- [Occur](enumerations/Occur.md)
- [Operator](enumerations/Operator.md)
## Classes
- [BooleanQuery](classes/BooleanQuery.md)
- [BoostQuery](classes/BoostQuery.md)
- [Connection](classes/Connection.md)
- [Index](classes/Index.md)
@@ -29,7 +26,6 @@
- [Query](classes/Query.md)
- [QueryBase](classes/QueryBase.md)
- [RecordBatchIterator](classes/RecordBatchIterator.md)
- [Session](classes/Session.md)
- [Table](classes/Table.md)
- [TagContents](classes/TagContents.md)
- [Tags](classes/Tags.md)
@@ -85,7 +81,6 @@
- [FieldLike](type-aliases/FieldLike.md)
- [IntoSql](type-aliases/IntoSql.md)
- [IntoVector](type-aliases/IntoVector.md)
- [MultiVector](type-aliases/MultiVector.md)
- [RecordBatchLike](type-aliases/RecordBatchLike.md)
- [SchemaLike](type-aliases/SchemaLike.md)
- [TableLike](type-aliases/TableLike.md)

View File

@@ -70,17 +70,6 @@ Defaults to 'us-east-1'.
***
### session?
```ts
optional session: Session;
```
(For LanceDB OSS only): the session to use for this connection. Holds
shared caches and other session-specific state.
***
### storageOptions?
```ts

View File

@@ -23,7 +23,7 @@ whether to remove punctuation
### baseTokenizer?
```ts
optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
optional baseTokenizer: "raw" | "simple" | "whitespace";
```
The tokenizer to use when building the index.
@@ -71,36 +71,6 @@ tokens longer than this length will be ignored
***
### ngramMaxLength?
```ts
optional ngramMaxLength: number;
```
ngram max length
***
### ngramMinLength?
```ts
optional ngramMinLength: number;
```
ngram min length
***
### prefixOnly?
```ts
optional prefixOnly: boolean;
```
whether to only index the prefix of the token for ngram tokenizer
***
### removeStopWords?
```ts

View File

@@ -8,7 +8,7 @@
## Properties
### ~~indexCacheSize?~~
### indexCacheSize?
```ts
optional indexCacheSize: number;
@@ -16,11 +16,6 @@ optional indexCacheSize: number;
Set the size of the index cache, specified as a number of entries
#### Deprecated
Use session-level cache configuration instead.
Create a Session with custom cache sizes and pass it to the connect() function.
The exact meaning of an "entry" will depend on the type of index:
- IVF: there is one entry for each IVF partition
- BTREE: there is one entry for the entire index

View File

@@ -24,10 +24,10 @@ The default is 7 days
// Delete all versions older than 1 day
const olderThan = new Date();
olderThan.setDate(olderThan.getDate() - 1));
tbl.optimize({cleanupOlderThan: olderThan});
tbl.cleanupOlderVersions(olderThan);
// Delete all versions except the current version
tbl.optimize({cleanupOlderThan: new Date()});
tbl.cleanupOlderVersions(new Date());
```
***

View File

@@ -1,11 +0,0 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / MultiVector
# Type Alias: MultiVector
```ts
type MultiVector: IntoVector[];
```

View File

@@ -428,7 +428,7 @@
"\n",
"**Why?** \n",
"Embedding the UFO dataset and ingesting it into LanceDB takes **~2 hours on a T4 GPU**. To save time: \n",
"- **Use the pre-prepared table with index created** (provided below) to proceed directly to **Step 7**: search. \n",
"- **Use the pre-prepared table with index created ** (provided below) to proceed directly to step7: search. \n",
"- **Step 5a** contains the full ingestion code for reference (run it only if necessary). \n",
"- **Step 6** contains the details on creating the index on the multivector column"
]

View File

@@ -30,8 +30,7 @@ excluded_globs = [
"../src/rag/advanced_techniques/*.md",
"../src/guides/scalar_index.md",
"../src/guides/storage.md",
"../src/search.md",
"../src/guides/sql_querying.md",
"../src/search.md"
]
python_prefix = "py"

View File

@@ -7,4 +7,3 @@ tantivy==0.20.1
--extra-index-url https://download.pytorch.org/whl/cpu
torch
polars>=0.19, <=1.3.0
datafusion

View File

@@ -1,19 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
wrapperVersion=3.3.2
distributionType=only-script
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.9/apache-maven-3.9.9-bin.zip

View File

@@ -1,37 +0,0 @@
# LanceDB Java SDK
## Configuration and Initialization
### LanceDB Cloud
For LanceDB Cloud, use the simplified builder API:
```java
import com.lancedb.lance.namespace.LanceRestNamespace;
// If your DB url is db://example-db, then your database here is example-db
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
.apiKey("your_lancedb_cloud_api_key")
.database("your_database_name")
.build();
```
### LanceDB Enterprise
For Enterprise deployments, use your VPC endpoint:
```java
LanceRestNamespace namespace = LanceDBRestNamespaces.builder()
.apiKey("your_lancedb_enterprise_api_key")
.database("your-top-dir") // Your top level folder under your cloud bucket, e.g. s3://your-bucket/your-top-dir/
.hostOverride("http://<vpc_endpoint_dns_name>:80")
.build();
```
## Development
Build:
```shell
./mvnw install
```

View File

@@ -19,7 +19,7 @@ lancedb = { path = "../../../rust/lancedb" }
lance = { workspace = true }
arrow = { workspace = true, features = ["ffi"] }
arrow-schema.workspace = true
tokio = "1.46"
tokio = "1.23"
jni = "0.21.1"
snafu.workspace = true
lazy_static.workspace = true

View File

@@ -8,24 +8,18 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.2-final.0</version>
<version>0.20.0-beta.2</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>lancedb-core</artifactId>
<name>${project.artifactId}</name>
<description>LanceDB Core</description>
<name>LanceDB Core</name>
<packaging>jar</packaging>
<properties>
<rust.release.build>false</rust.release.build>
</properties>
<dependencies>
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lance-namespace-core</artifactId>
<version>0.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>

View File

@@ -1,26 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.2-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>lancedb-lance-namespace</artifactId>
<name>${project.artifactId}</name>
<description>LanceDB Java Integration with Lance Namespace</description>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lance-namespace-core</artifactId>
</dependency>
</dependencies>
</project>

View File

@@ -1,146 +0,0 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.lancedb.lancedb;
import com.lancedb.lance.namespace.LanceRestNamespace;
import com.lancedb.lance.namespace.client.apache.ApiClient;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
/** Util class to help construct a {@link LanceRestNamespace} for LanceDB. */
public class LanceDbRestNamespaces {
private static final String DEFAULT_REGION = "us-east-1";
private static final String CLOUD_URL_PATTERN = "https://%s.%s.api.lancedb.com";
private String apiKey;
private String database;
private Optional<String> hostOverride = Optional.empty();
private Optional<String> region = Optional.empty();
private Map<String, String> additionalConfig = new HashMap<>();
private LanceDbRestNamespaces() {}
/**
* Create a new builder instance.
*
* @return A new LanceRestNamespaceBuilder
*/
public static LanceDbRestNamespaces builder() {
return new LanceDbRestNamespaces();
}
/**
* Set the API key (required).
*
* @param apiKey The LanceDB API key
* @return This builder
*/
public LanceDbRestNamespaces apiKey(String apiKey) {
if (apiKey == null || apiKey.trim().isEmpty()) {
throw new IllegalArgumentException("API key cannot be null or empty");
}
this.apiKey = apiKey;
return this;
}
/**
* Set the database name (required).
*
* @param database The database name
* @return This builder
*/
public LanceDbRestNamespaces database(String database) {
if (database == null || database.trim().isEmpty()) {
throw new IllegalArgumentException("Database cannot be null or empty");
}
this.database = database;
return this;
}
/**
* Set a custom host override (optional). When set, this overrides the default LanceDB Cloud URL
* construction. Use this for LanceDB Enterprise deployments.
*
* @param hostOverride The complete base URL (e.g., "http://your-vpc-endpoint:80")
* @return This builder
*/
public LanceDbRestNamespaces hostOverride(String hostOverride) {
this.hostOverride = Optional.ofNullable(hostOverride);
return this;
}
/**
* Set the region for LanceDB Cloud (optional). Defaults to "us-east-1" if not specified. This is
* ignored when hostOverride is set.
*
* @param region The AWS region (e.g., "us-east-1", "eu-west-1")
* @return This builder
*/
public LanceDbRestNamespaces region(String region) {
this.region = Optional.ofNullable(region);
return this;
}
/**
* Add additional configuration parameters.
*
* @param key The configuration key
* @param value The configuration value
* @return This builder
*/
public LanceDbRestNamespaces config(String key, String value) {
this.additionalConfig.put(key, value);
return this;
}
/**
* Build the LanceRestNamespace instance.
*
* @return A configured LanceRestNamespace
* @throws IllegalStateException if required parameters are missing
*/
public LanceRestNamespace build() {
// Validate required fields
if (apiKey == null) {
throw new IllegalStateException("API key is required");
}
if (database == null) {
throw new IllegalStateException("Database is required");
}
// Build configuration map
Map<String, String> config = new HashMap<>(additionalConfig);
config.put("headers.x-lancedb-database", database);
config.put("headers.x-api-key", apiKey);
// Determine base URL
String baseUrl;
if (hostOverride.isPresent()) {
baseUrl = hostOverride.get();
config.put("host_override", hostOverride.get());
} else {
String effectiveRegion = region.orElse(DEFAULT_REGION);
baseUrl = String.format(CLOUD_URL_PATTERN, database, effectiveRegion);
config.put("region", effectiveRegion);
}
// Create and configure ApiClient
ApiClient apiClient = new ApiClient();
apiClient.setBasePath(baseUrl);
return new LanceRestNamespace(apiClient, config);
}
}

259
java/mvnw vendored
View File

@@ -1,259 +0,0 @@
#!/bin/sh
# ----------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# Apache Maven Wrapper startup batch script, version 3.3.2
#
# Optional ENV vars
# -----------------
# JAVA_HOME - location of a JDK home dir, required when download maven via java source
# MVNW_REPOURL - repo url base for downloading maven distribution
# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output
# ----------------------------------------------------------------------------
set -euf
[ "${MVNW_VERBOSE-}" != debug ] || set -x
# OS specific support.
native_path() { printf %s\\n "$1"; }
case "$(uname)" in
CYGWIN* | MINGW*)
[ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")"
native_path() { cygpath --path --windows "$1"; }
;;
esac
# set JAVACMD and JAVACCMD
set_java_home() {
# For Cygwin and MinGW, ensure paths are in Unix format before anything is touched
if [ -n "${JAVA_HOME-}" ]; then
if [ -x "$JAVA_HOME/jre/sh/java" ]; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
JAVACCMD="$JAVA_HOME/jre/sh/javac"
else
JAVACMD="$JAVA_HOME/bin/java"
JAVACCMD="$JAVA_HOME/bin/javac"
if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then
echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2
echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2
return 1
fi
fi
else
JAVACMD="$(
'set' +e
'unset' -f command 2>/dev/null
'command' -v java
)" || :
JAVACCMD="$(
'set' +e
'unset' -f command 2>/dev/null
'command' -v javac
)" || :
if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then
echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2
return 1
fi
fi
}
# hash string like Java String::hashCode
hash_string() {
str="${1:-}" h=0
while [ -n "$str" ]; do
char="${str%"${str#?}"}"
h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296))
str="${str#?}"
done
printf %x\\n $h
}
verbose() { :; }
[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; }
die() {
printf %s\\n "$1" >&2
exit 1
}
trim() {
# MWRAPPER-139:
# Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds.
# Needed for removing poorly interpreted newline sequences when running in more
# exotic environments such as mingw bash on Windows.
printf "%s" "${1}" | tr -d '[:space:]'
}
# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties
while IFS="=" read -r key value; do
case "${key-}" in
distributionUrl) distributionUrl=$(trim "${value-}") ;;
distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;;
esac
done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties"
[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties"
case "${distributionUrl##*/}" in
maven-mvnd-*bin.*)
MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/
case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in
*AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;;
:Darwin*x86_64) distributionPlatform=darwin-amd64 ;;
:Darwin*arm64) distributionPlatform=darwin-aarch64 ;;
:Linux*x86_64*) distributionPlatform=linux-amd64 ;;
*)
echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2
distributionPlatform=linux-amd64
;;
esac
distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip"
;;
maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;;
*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;;
esac
# apply MVNW_REPOURL and calculate MAVEN_HOME
# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}"
distributionUrlName="${distributionUrl##*/}"
distributionUrlNameMain="${distributionUrlName%.*}"
distributionUrlNameMain="${distributionUrlNameMain%-bin}"
MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}"
MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")"
exec_maven() {
unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || :
exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD"
}
if [ -d "$MAVEN_HOME" ]; then
verbose "found existing MAVEN_HOME at $MAVEN_HOME"
exec_maven "$@"
fi
case "${distributionUrl-}" in
*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;;
*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;;
esac
# prepare tmp dir
if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then
clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; }
trap clean HUP INT TERM EXIT
else
die "cannot create temp dir"
fi
mkdir -p -- "${MAVEN_HOME%/*}"
# Download and Install Apache Maven
verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
verbose "Downloading from: $distributionUrl"
verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"
# select .zip or .tar.gz
if ! command -v unzip >/dev/null; then
distributionUrl="${distributionUrl%.zip}.tar.gz"
distributionUrlName="${distributionUrl##*/}"
fi
# verbose opt
__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR=''
[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v
# normalize http auth
case "${MVNW_PASSWORD:+has-password}" in
'') MVNW_USERNAME='' MVNW_PASSWORD='' ;;
has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;;
esac
if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then
verbose "Found wget ... using wget"
wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl"
elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then
verbose "Found curl ... using curl"
curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl"
elif set_java_home; then
verbose "Falling back to use Java to download"
javaSource="$TMP_DOWNLOAD_DIR/Downloader.java"
targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName"
cat >"$javaSource" <<-END
public class Downloader extends java.net.Authenticator
{
protected java.net.PasswordAuthentication getPasswordAuthentication()
{
return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() );
}
public static void main( String[] args ) throws Exception
{
setDefault( new Downloader() );
java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() );
}
}
END
# For Cygwin/MinGW, switch paths to Windows format before running javac and java
verbose " - Compiling Downloader.java ..."
"$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java"
verbose " - Running Downloader.java ..."
"$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")"
fi
# If specified, validate the SHA-256 sum of the Maven distribution zip file
if [ -n "${distributionSha256Sum-}" ]; then
distributionSha256Result=false
if [ "$MVN_CMD" = mvnd.sh ]; then
echo "Checksum validation is not supported for maven-mvnd." >&2
echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
exit 1
elif command -v sha256sum >/dev/null; then
if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then
distributionSha256Result=true
fi
elif command -v shasum >/dev/null; then
if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then
distributionSha256Result=true
fi
else
echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
exit 1
fi
if [ $distributionSha256Result = false ]; then
echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2
echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2
exit 1
fi
fi
# unzip and move
if command -v unzip >/dev/null; then
unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip"
else
tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar"
fi
printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url"
mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME"
clean || :
exec_maven "$@"

View File

@@ -6,10 +6,11 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.2-final.0</version>
<version>0.20.0-beta.2</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
<name>LanceDB Parent</name>
<description>LanceDB vector database Java API</description>
<url>http://lancedb.com/</url>
<developers>
@@ -28,7 +29,6 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-namespace.verison>0.0.1</lance-namespace.verison>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
@@ -52,7 +52,6 @@
<modules>
<module>core</module>
<module>lance-namespace</module>
</modules>
<scm>
@@ -63,11 +62,6 @@
<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lance-namespace-core</artifactId>
<version>${lance-namespace.verison}</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>

22
node/.eslintrc.js Normal file
View File

@@ -0,0 +1,22 @@
module.exports = {
env: {
browser: true,
es2021: true
},
extends: 'standard-with-typescript',
overrides: [
],
parserOptions: {
project: './tsconfig.json',
ecmaVersion: 'latest',
sourceType: 'module'
},
rules: {
"@typescript-eslint/method-signature-style": "off",
"@typescript-eslint/quotes": "off",
"@typescript-eslint/semi": "off",
"@typescript-eslint/explicit-function-return-type": "off",
"@typescript-eslint/space-before-function-paren": "off",
"@typescript-eslint/indent": "off",
}
}

4
node/.npmignore Normal file
View File

@@ -0,0 +1,4 @@
gen_test_data.py
index.node
dist/lancedb*.tgz
vectordb*.tgz

64
node/CHANGELOG.md Normal file
View File

@@ -0,0 +1,64 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.1.5] - 2023-06-00
### Added
- Support for macOS X86
## [0.1.4] - 2023-06-03
### Added
- Select / Project query API
### Changed
- Deprecated created_index in favor of createIndex
## [0.1.3] - 2023-06-01
### Added
- Support S3 and Google Cloud Storage
- Embedding functions support
- OpenAI embedding function
## [0.1.2] - 2023-05-27
### Added
- Append records API
- Extra query params to to nodejs client
- Create_index API
### Fixed
- bugfix: string columns should be converted to Utf8Array (#94)
## [0.1.1] - 2023-05-16
### Added
- create_table API
- limit parameter for queries
- Typescript / JavaScript examples
- Linux support
## [0.1.0] - 2023-05-16
### Added
- Initial JavaScript / Node.js library for LanceDB
- Read-only api to query LanceDB datasets
- Supports macOS arm only
## [pre-0.1.0]
- Various prototypes / test builds

66
node/README.md Normal file
View File

@@ -0,0 +1,66 @@
# LanceDB
A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb).
**DEPRECATED: This library is deprecated. Please use the new client,
[@lancedb/lancedb](https://www.npmjs.com/package/@lancedb/lancedb).**
## Installation
```bash
npm install vectordb
```
This will download the appropriate native library for your platform. We currently
support:
* Linux (x86_64 and aarch64)
* MacOS (Intel and ARM/M1/M2)
* Windows (x86_64 only)
We do not yet support musl-based Linux (such as Alpine Linux) or aarch64 Windows.
## Usage
### Basic Example
```javascript
const lancedb = require('vectordb');
const db = await lancedb.connect('data/sample-lancedb');
const table = await db.createTable("my_table",
[{ id: 1, vector: [0.1, 1.0], item: "foo", price: 10.0 },
{ id: 2, vector: [3.9, 0.5], item: "bar", price: 20.0 }])
const results = await table.search([0.1, 0.3]).limit(20).execute();
console.log(results);
```
The [examples](./examples) folder contains complete examples.
## Development
To build everything fresh:
```bash
npm install
npm run build
```
Then you should be able to run the tests with:
```bash
npm test
```
### Fix lints
To run the linter and have it automatically fix all errors
```bash
npm run lint -- --fix
```
To build documentation
```bash
npx typedoc --plugin typedoc-plugin-markdown --out ../docs/src/javascript src/index.ts
```

View File

@@ -0,0 +1,41 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'
async function example () {
const lancedb = require('vectordb')
// You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable
const apiKey = process.env.OPENAI_API_KEY
// The embedding function will create embeddings for the 'text' column(text in this case)
const embedding = new lancedb.OpenAIEmbeddingFunction('text', apiKey)
const db = await lancedb.connect('data/sample-lancedb')
const data = [
{ id: 1, text: 'Black T-Shirt', price: 10 },
{ id: 2, text: 'Leather Jacket', price: 50 }
]
const table = await db.createTable('vectors', data, embedding)
console.log(await db.tableNames())
const results = await table
.search('keeps me warm')
.limit(1)
.execute()
console.log(results[0].text)
}
example().then(_ => { console.log('All done!') })

View File

@@ -0,0 +1,15 @@
{
"name": "vectordb-example-js-openai",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"vectordb": "file:../..",
"openai": "^3.2.1"
}
}

View File

@@ -0,0 +1,66 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'
async function example() {
const lancedb = require('vectordb')
// Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2)
const { pipeline } = await import('@xenova/transformers')
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
// Create embedding function from pipeline which returns a list of vectors from batch
// sourceColumn is the name of the column in the data to be embedded
//
// Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector
const embed_fun = {}
embed_fun.sourceColumn = 'text'
embed_fun.embed = async function (batch) {
let result = []
for (let text of batch) {
const res = await pipe(text, { pooling: 'mean', normalize: true })
result.push(Array.from(res['data']))
}
return (result)
}
// Link a folder and create a table with data
const db = await lancedb.connect('data/sample-lancedb')
const data = [
{ id: 1, text: 'Cherry', type: 'fruit' },
{ id: 2, text: 'Carrot', type: 'vegetable' },
{ id: 3, text: 'Potato', type: 'vegetable' },
{ id: 4, text: 'Apple', type: 'fruit' },
{ id: 5, text: 'Banana', type: 'fruit' }
]
const table = await db.createTable('food_table', data, embed_fun)
// Query the table
const results = await table
.search("a sweet fruit to eat")
.metricType("cosine")
.limit(2)
.execute()
console.log(results.map(r => r.text))
}
example().then(_ => { console.log("Done!") })

View File

@@ -0,0 +1,16 @@
{
"name": "vectordb-example-js-transformers",
"version": "1.0.0",
"description": "Example for using transformers.js with lancedb",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"@xenova/transformers": "^2.4.1",
"vectordb": "file:../.."
}
}

View File

@@ -0,0 +1,122 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'
const lancedb = require('vectordb')
const fs = require('fs/promises')
const readline = require('readline/promises')
const { stdin: input, stdout: output } = require('process')
const { Configuration, OpenAIApi } = require('openai')
// Download file from XYZ
const INPUT_FILE_NAME = 'data/youtube-transcriptions_sample.jsonl';
(async () => {
// You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable
const apiKey = process.env.OPENAI_API_KEY
// The embedding function will create embeddings for the 'context' column
const embedFunction = new lancedb.OpenAIEmbeddingFunction('context', apiKey)
// Connects to LanceDB
const db = await lancedb.connect('data/youtube-lancedb')
// Open the vectors table or create one if it does not exist
let tbl
if ((await db.tableNames()).includes('vectors')) {
tbl = await db.openTable('vectors', embedFunction)
} else {
tbl = await createEmbeddingsTable(db, embedFunction)
}
// Use OpenAI Completion API to generate and answer based on the context that LanceDB provides
const configuration = new Configuration({ apiKey })
const openai = new OpenAIApi(configuration)
const rl = readline.createInterface({ input, output })
try {
while (true) {
const query = await rl.question('Prompt: ')
const results = await tbl
.search(query)
.select(['title', 'text', 'context'])
.limit(3)
.execute()
// console.table(results)
const response = await openai.createCompletion({
model: 'text-davinci-003',
prompt: createPrompt(query, results),
max_tokens: 400,
temperature: 0,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0
})
console.log(response.data.choices[0].text)
}
} catch (err) {
console.log('Error: ', err)
} finally {
rl.close()
}
process.exit(1)
})()
async function createEmbeddingsTable (db, embedFunction) {
console.log(`Creating embeddings from ${INPUT_FILE_NAME}`)
// read the input file into a JSON array, skipping empty lines
const lines = (await fs.readFile(INPUT_FILE_NAME, 'utf-8'))
.toString()
.split('\n')
.filter(line => line.length > 0)
.map(line => JSON.parse(line))
const data = contextualize(lines, 20, 'video_id')
return await db.createTable('vectors', data, embedFunction)
}
// Each transcript has a small text column, we include previous transcripts in order to
// have more context information when creating embeddings
function contextualize (rows, contextSize, groupColumn) {
const grouped = []
rows.forEach(row => {
if (!grouped[row[groupColumn]]) {
grouped[row[groupColumn]] = []
}
grouped[row[groupColumn]].push(row)
})
const data = []
Object.keys(grouped).forEach(key => {
for (let i = 0; i < grouped[key].length; i++) {
const start = i - contextSize > 0 ? i - contextSize : 0
grouped[key][i].context = grouped[key].slice(start, i + 1).map(r => r.text).join(' ')
}
data.push(...grouped[key])
})
return data
}
// Creates a prompt by aggregating all relevant contexts
function createPrompt (query, context) {
let prompt =
'Answer the question based on the context below.\n\n' +
'Context:\n'
// need to make sure our prompt is not larger than max size
prompt = prompt + context.map(c => c.context).join('\n\n---\n\n').substring(0, 3750)
prompt = prompt + `\n\nQuestion: ${query}\nAnswer:`
return prompt
}

View File

@@ -0,0 +1,15 @@
{
"name": "vectordb-example-js-openai",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"vectordb": "file:../..",
"openai": "^3.2.1"
}
}

36
node/examples/js/index.js Normal file
View File

@@ -0,0 +1,36 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'
async function example () {
const lancedb = require('vectordb')
const db = await lancedb.connect('data/sample-lancedb')
const data = [
{ id: 1, vector: [0.1, 0.2], price: 10 },
{ id: 2, vector: [1.1, 1.2], price: 50 }
]
const table = await db.createTable('vectors', data)
console.log(await db.tableNames())
const results = await table
.search([0.1, 0.3])
.limit(20)
.execute()
console.log(results)
}
example()

View File

@@ -0,0 +1,14 @@
{
"name": "vectordb-example-js",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"vectordb": "file:../.."
}
}

View File

@@ -0,0 +1,22 @@
{
"name": "vectordb-example-ts",
"version": "1.0.0",
"description": "",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "tsc"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"devDependencies": {
"@types/node": "^18.16.2",
"ts-node": "^10.9.1",
"ts-node-dev": "^2.0.0",
"typescript": "*"
},
"dependencies": {
"vectordb": "file:../.."
}
}

View File

@@ -0,0 +1,35 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import * as vectordb from 'vectordb';
async function example () {
const db = await vectordb.connect('data/sample-lancedb')
const data = [
{ id: 1, vector: [0.1, 0.2], price: 10 },
{ id: 2, vector: [1.1, 1.2], price: 50 }
]
const table = await db.createTable('vectors', data)
console.log(await db.tableNames())
const results = await table
.search([0.1, 0.3])
.limit(20)
.execute()
console.log(results)
}
example().then(_ => { console.log ("All done!") })

View File

@@ -0,0 +1,10 @@
{
"include": ["src/**/*.ts"],
"compilerOptions": {
"target": "es2016",
"module": "commonjs",
"declaration": true,
"outDir": "./dist",
"strict": true
}
}

36
node/native.js Normal file
View File

@@ -0,0 +1,36 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
const { currentTarget } = require('@neon-rs/load')
let nativeLib
try {
// When developing locally, give preference to the local built library
nativeLib = require('./index.node')
} catch {
try {
nativeLib = require(`@lancedb/vectordb-${currentTarget()}`)
} catch (e) {
throw new Error(`vectordb: failed to load native library.
You may need to run \`npm install @lancedb/vectordb-${currentTarget()}\`.
If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
Source error: ${e}`)
}
}
// Dynamic require for runtime.
module.exports = nativeLib

5239
node/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

98
node/package.json Normal file
View File

@@ -0,0 +1,98 @@
{
"name": "vectordb",
"version": "0.20.0-beta.2",
"description": " Serverless, low-latency vector database for AI applications",
"private": false,
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb_node index.node -- cargo build -p lancedb-node --message-format=json",
"build-release": "npm run build -- --release",
"test": "npm run tsc && mocha -recursive dist/test",
"integration-test": "npm run tsc && mocha -recursive dist/integration_test",
"lint": "eslint native.js src --ext .js,.ts",
"clean": "rm -rf node_modules *.node dist/",
"pack-build": "neon pack-build",
"check-npm": "printenv && which node && which npm && npm --version"
},
"repository": {
"type": "git",
"url": "https://github.com/lancedb/lancedb.git"
},
"homepage": "https://lancedb.github.io/lancedb/",
"bugs": {
"url": "https://github.com/lancedb/lancedb/issues"
},
"keywords": [
"data-format",
"data-science",
"machine-learning",
"data-analytics"
],
"author": "Lance Devs",
"license": "Apache-2.0",
"devDependencies": {
"@neon-rs/cli": "^0.0.160",
"@types/chai": "^4.3.4",
"@types/chai-as-promised": "^7.1.5",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
"@types/sinon": "^10.0.15",
"@types/temp": "^0.9.1",
"@types/uuid": "^9.0.3",
"@typescript-eslint/eslint-plugin": "^5.59.1",
"apache-arrow-old": "npm:apache-arrow@13.0.0",
"cargo-cp-artifact": "^0.1",
"chai": "^4.3.7",
"chai-as-promised": "^7.1.1",
"eslint": "^8.39.0",
"eslint-config-standard-with-typescript": "^34.0.1",
"eslint-plugin-import": "^2.26.0",
"eslint-plugin-n": "^15.7.0",
"eslint-plugin-promise": "^6.1.1",
"mocha": "^10.2.0",
"openai": "^4.24.1",
"sinon": "^15.1.0",
"temp": "^0.9.4",
"ts-node": "^10.9.1",
"ts-node-dev": "^2.0.0",
"typedoc": "^0.24.7",
"typedoc-plugin-markdown": "^3.15.3",
"typescript": "^5.1.0",
"uuid": "^9.0.0"
},
"dependencies": {
"@neon-rs/load": "^0.0.74",
"axios": "^1.4.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
"apache-arrow": "^14.0.2"
},
"os": [
"darwin",
"linux",
"win32"
],
"cpu": [
"x64",
"arm64"
],
"neon": {
"targets": {
"x86_64-apple-darwin": "@lancedb/vectordb-darwin-x64",
"aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
"x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
"aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
"x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc"
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.20.0-beta.2",
"@lancedb/vectordb-darwin-arm64": "0.20.0-beta.2",
"@lancedb/vectordb-linux-x64-gnu": "0.20.0-beta.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.20.0-beta.2",
"@lancedb/vectordb-win32-x64-msvc": "0.20.0-beta.2"
}
}

635
node/src/arrow.ts Normal file
View File

@@ -0,0 +1,635 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import {
Field,
makeBuilder,
RecordBatchFileWriter,
Utf8,
type Vector,
FixedSizeList,
vectorFromArray,
Schema,
Table as ArrowTable,
RecordBatchStreamWriter,
List,
RecordBatch,
makeData,
Struct,
type Float,
DataType,
Binary,
Float32
} from "apache-arrow";
import { type EmbeddingFunction } from "./index";
import { sanitizeSchema } from "./sanitize";
/*
* Options to control how a column should be converted to a vector array
*/
export class VectorColumnOptions {
/** Vector column type. */
type: Float = new Float32();
constructor(values?: Partial<VectorColumnOptions>) {
Object.assign(this, values);
}
}
/** Options to control the makeArrowTable call. */
export class MakeArrowTableOptions {
/*
* Schema of the data.
*
* If this is not provided then the data type will be inferred from the
* JS type. Integer numbers will become int64, floating point numbers
* will become float64 and arrays will become variable sized lists with
* the data type inferred from the first element in the array.
*
* The schema must be specified if there are no records (e.g. to make
* an empty table)
*/
schema?: Schema;
/*
* Mapping from vector column name to expected type
*
* Lance expects vector columns to be fixed size list arrays (i.e. tensors)
* However, `makeArrowTable` will not infer this by default (it creates
* variable size list arrays). This field can be used to indicate that a column
* should be treated as a vector column and converted to a fixed size list.
*
* The keys should be the names of the vector columns. The value specifies the
* expected data type of the vector columns.
*
* If `schema` is provided then this field is ignored.
*
* By default, the column named "vector" will be assumed to be a float32
* vector column.
*/
vectorColumns: Record<string, VectorColumnOptions> = {
vector: new VectorColumnOptions()
};
embeddings?: EmbeddingFunction<any>;
/**
* If true then string columns will be encoded with dictionary encoding
*
* Set this to true if your string columns tend to repeat the same values
* often. For more precise control use the `schema` property to specify the
* data type for individual columns.
*
* If `schema` is provided then this property is ignored.
*/
dictionaryEncodeStrings: boolean = false;
constructor(values?: Partial<MakeArrowTableOptions>) {
Object.assign(this, values);
}
}
/**
* An enhanced version of the {@link makeTable} function from Apache Arrow
* that supports nested fields and embeddings columns.
*
* This function converts an array of Record<String, any> (row-major JS objects)
* to an Arrow Table (a columnar structure)
*
* Note that it currently does not support nulls.
*
* If a schema is provided then it will be used to determine the resulting array
* types. Fields will also be reordered to fit the order defined by the schema.
*
* If a schema is not provided then the types will be inferred and the field order
* will be controlled by the order of properties in the first record.
*
* If the input is empty then a schema must be provided to create an empty table.
*
* When a schema is not specified then data types will be inferred. The inference
* rules are as follows:
*
* - boolean => Bool
* - number => Float64
* - String => Utf8
* - Buffer => Binary
* - Record<String, any> => Struct
* - Array<any> => List
*
* @param data input data
* @param options options to control the makeArrowTable call.
*
* @example
*
* ```ts
*
* import { fromTableToBuffer, makeArrowTable } from "../arrow";
* import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
*
* const schema = new Schema([
* new Field("a", new Int32()),
* new Field("b", new Float32()),
* new Field("c", new FixedSizeList(3, new Field("item", new Float16()))),
* ]);
* const table = makeArrowTable([
* { a: 1, b: 2, c: [1, 2, 3] },
* { a: 4, b: 5, c: [4, 5, 6] },
* { a: 7, b: 8, c: [7, 8, 9] },
* ], { schema });
* ```
*
* By default it assumes that the column named `vector` is a vector column
* and it will be converted into a fixed size list array of type float32.
* The `vectorColumns` option can be used to support other vector column
* names and data types.
*
* ```ts
*
* const schema = new Schema([
new Field("a", new Float64()),
new Field("b", new Float64()),
new Field(
"vector",
new FixedSizeList(3, new Field("item", new Float32()))
),
]);
const table = makeArrowTable([
{ a: 1, b: 2, vector: [1, 2, 3] },
{ a: 4, b: 5, vector: [4, 5, 6] },
{ a: 7, b: 8, vector: [7, 8, 9] },
]);
assert.deepEqual(table.schema, schema);
* ```
*
* You can specify the vector column types and names using the options as well
*
* ```typescript
*
* const schema = new Schema([
new Field('a', new Float64()),
new Field('b', new Float64()),
new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
]);
* const table = makeArrowTable([
{ a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
{ a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
{ a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
], {
vectorColumns: {
vec1: { type: new Float16() },
vec2: { type: new Float16() }
}
}
* assert.deepEqual(table.schema, schema)
* ```
*/
export function makeArrowTable(
data: Array<Record<string, any>>,
options?: Partial<MakeArrowTableOptions>
): ArrowTable {
if (
data.length === 0 &&
(options?.schema === undefined || options?.schema === null)
) {
throw new Error("At least one record or a schema needs to be provided");
}
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
if (opt.schema !== undefined && opt.schema !== null) {
opt.schema = sanitizeSchema(opt.schema);
opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
}
const columns: Record<string, Vector> = {};
// TODO: sample dataset to find missing columns
// Prefer the field ordering of the schema, if present
const columnNames =
opt.schema != null ? (opt.schema.names as string[]) : Object.keys(data[0]);
for (const colName of columnNames) {
if (
data.length !== 0 &&
!Object.prototype.hasOwnProperty.call(data[0], colName)
) {
// The field is present in the schema, but not in the data, skip it
continue;
}
// Extract a single column from the records (transpose from row-major to col-major)
let values = data.map((datum) => datum[colName]);
// By default (type === undefined) arrow will infer the type from the JS type
let type;
if (opt.schema !== undefined) {
// If there is a schema provided, then use that for the type instead
type = opt.schema?.fields.filter((f) => f.name === colName)[0]?.type;
if (DataType.isInt(type) && type.bitWidth === 64) {
// wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
values = values.map((v) => {
if (v === null) {
return v;
}
return BigInt(v);
});
}
} else {
// Otherwise, check to see if this column is one of the vector columns
// defined by opt.vectorColumns and, if so, use the fixed size list type
const vectorColumnOptions = opt.vectorColumns[colName];
if (vectorColumnOptions !== undefined) {
type = newVectorType(values[0].length, vectorColumnOptions.type);
}
}
try {
// Convert an Array of JS values to an arrow vector
columns[colName] = makeVector(values, type, opt.dictionaryEncodeStrings);
} catch (error: unknown) {
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
throw Error(`Could not convert column "${colName}" to Arrow: ${error}`);
}
}
if (opt.schema != null) {
// `new ArrowTable(columns)` infers a schema which may sometimes have
// incorrect nullability (it assumes nullable=true if there are 0 rows)
//
// `new ArrowTable(schema, columns)` will also fail because it will create a
// batch with an inferred schema and then complain that the batch schema
// does not match the provided schema.
//
// To work around this we first create a table with the wrong schema and
// then patch the schema of the batches so we can use
// `new ArrowTable(schema, batches)` which does not do any schema inference
const firstTable = new ArrowTable(columns);
const batchesFixed = firstTable.batches.map(
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
(batch) => new RecordBatch(opt.schema!, batch.data)
);
return new ArrowTable(opt.schema, batchesFixed);
} else {
return new ArrowTable(columns);
}
}
/**
* Create an empty Arrow table with the provided schema
*/
export function makeEmptyTable(schema: Schema): ArrowTable {
return makeArrowTable([], { schema });
}
// Helper function to convert Array<Array<any>> to a variable sized list array
function makeListVector(lists: any[][]): Vector<any> {
if (lists.length === 0 || lists[0].length === 0) {
throw Error("Cannot infer list vector from empty array or empty list");
}
const sampleList = lists[0];
let inferredType;
try {
const sampleVector = makeVector(sampleList);
inferredType = sampleVector.type;
} catch (error: unknown) {
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`);
}
const listBuilder = makeBuilder({
type: new List(new Field("item", inferredType, true))
});
for (const list of lists) {
listBuilder.append(list);
}
return listBuilder.finish().toVector();
}
// Helper function to convert an Array of JS values to an Arrow Vector
function makeVector(
values: any[],
type?: DataType,
stringAsDictionary?: boolean
): Vector<any> {
if (type !== undefined) {
// No need for inference, let Arrow create it
return vectorFromArray(values, type);
}
if (values.length === 0) {
throw Error(
"makeVector requires at least one value or the type must be specfied"
);
}
const sampleValue = values.find((val) => val !== null && val !== undefined);
if (sampleValue === undefined) {
throw Error(
"makeVector cannot infer the type if all values are null or undefined"
);
}
if (Array.isArray(sampleValue)) {
// Default Arrow inference doesn't handle list types
return makeListVector(values);
} else if (Buffer.isBuffer(sampleValue)) {
// Default Arrow inference doesn't handle Buffer
return vectorFromArray(values, new Binary());
} else if (
!(stringAsDictionary ?? false) &&
(typeof sampleValue === "string" || sampleValue instanceof String)
) {
// If the type is string then don't use Arrow's default inference unless dictionaries are requested
// because it will always use dictionary encoding for strings
return vectorFromArray(values, new Utf8());
} else {
// Convert a JS array of values to an arrow vector
return vectorFromArray(values);
}
}
async function applyEmbeddings<T>(
table: ArrowTable,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<ArrowTable> {
if (embeddings == null) {
return table;
}
if (schema !== undefined && schema !== null) {
schema = sanitizeSchema(schema);
}
// Convert from ArrowTable to Record<String, Vector>
const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
const name = table.schema.fields[idx].name;
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const vec = table.getChildAt(idx)!;
return [name, vec];
});
const newColumns = Object.fromEntries(colEntries);
const sourceColumn = newColumns[embeddings.sourceColumn];
const destColumn = embeddings.destColumn ?? "vector";
const innerDestType = embeddings.embeddingDataType ?? new Float32();
if (sourceColumn === undefined) {
throw new Error(
`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`
);
}
if (table.numRows === 0) {
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
// We have an empty table and it already has the embedding column so no work needs to be done
// Note: we don't return an error like we did below because this is a common occurrence. For example,
// if we call convertToTable with 0 records and a schema that includes the embedding
return table;
}
if (embeddings.embeddingDimension !== undefined) {
const destType = newVectorType(
embeddings.embeddingDimension,
innerDestType
);
newColumns[destColumn] = makeVector([], destType);
} else if (schema != null) {
const destField = schema.fields.find((f) => f.name === destColumn);
if (destField != null) {
newColumns[destColumn] = makeVector([], destField.type);
} else {
throw new Error(
`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`
);
}
} else {
throw new Error(
"Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`"
);
}
} else {
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
throw new Error(
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`
);
}
if (table.batches.length > 1) {
throw new Error(
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch"
);
}
const values = sourceColumn.toArray();
const vectors = await embeddings.embed(values as T[]);
if (vectors.length !== values.length) {
throw new Error(
"Embedding function did not return an embedding for each input element"
);
}
const destType = newVectorType(vectors[0].length, innerDestType);
newColumns[destColumn] = makeVector(vectors, destType);
}
const newTable = new ArrowTable(newColumns);
if (schema != null) {
if (schema.fields.find((f) => f.name === destColumn) === undefined) {
throw new Error(
`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`
);
}
return alignTable(newTable, schema);
}
return newTable;
}
/*
* Convert an Array of records into an Arrow Table, optionally applying an
* embeddings function to it.
*
* This function calls `makeArrowTable` first to create the Arrow Table.
* Any provided `makeTableOptions` (e.g. a schema) will be passed on to
* that call.
*
* The embedding function will be passed a column of values (based on the
* `sourceColumn` of the embedding function) and expects to receive back
* number[][] which will be converted into a fixed size list column. By
* default this will be a fixed size list of Float32 but that can be
* customized by the `embeddingDataType` property of the embedding function.
*
* If a schema is provided in `makeTableOptions` then it should include the
* embedding columns. If no schema is provded then embedding columns will
* be placed at the end of the table, after all of the input columns.
*/
export async function convertToTable<T>(
data: Array<Record<string, unknown>>,
embeddings?: EmbeddingFunction<T>,
makeTableOptions?: Partial<MakeArrowTableOptions>
): Promise<ArrowTable> {
const table = makeArrowTable(data, makeTableOptions);
return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
}
// Creates the Arrow Type for a Vector column with dimension `dim`
function newVectorType<T extends Float>(
dim: number,
innerType: T
): FixedSizeList<T> {
// Somewhere we always default to have the elements nullable, so we need to set it to true
// otherwise we often get schema mismatches because the stored data always has schema with nullable elements
const children = new Field<T>("item", innerType, true);
return new FixedSizeList(dim, children);
}
/**
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
*
* This function will call `convertToTable` and pass on `embeddings` and `schema`
*
* `schema` is required if data is empty
*/
export async function fromRecordsToBuffer<T>(
data: Array<Record<string, unknown>>,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
if (schema !== undefined && schema !== null) {
schema = sanitizeSchema(schema);
}
const table = await convertToTable(data, embeddings, { schema, embeddings });
const writer = RecordBatchFileWriter.writeAll(table);
return Buffer.from(await writer.toUint8Array());
}
/**
* Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
*
* This function will call `convertToTable` and pass on `embeddings` and `schema`
*
* `schema` is required if data is empty
*/
export async function fromRecordsToStreamBuffer<T>(
data: Array<Record<string, unknown>>,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
if (schema !== null && schema !== undefined) {
schema = sanitizeSchema(schema);
}
const table = await convertToTable(data, embeddings, { schema });
const writer = RecordBatchStreamWriter.writeAll(table);
return Buffer.from(await writer.toUint8Array());
}
/**
* Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
*
* This function will apply `embeddings` to the table in a manner similar to
* `convertToTable`.
*
* `schema` is required if the table is empty
*/
export async function fromTableToBuffer<T>(
table: ArrowTable,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
if (schema !== null && schema !== undefined) {
schema = sanitizeSchema(schema);
}
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings);
return Buffer.from(await writer.toUint8Array());
}
/**
* Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
*
* This function will apply `embeddings` to the table in a manner similar to
* `convertToTable`.
*
* `schema` is required if the table is empty
*/
export async function fromTableToStreamBuffer<T>(
table: ArrowTable,
embeddings?: EmbeddingFunction<T>,
schema?: Schema
): Promise<Buffer> {
if (schema !== null && schema !== undefined) {
schema = sanitizeSchema(schema);
}
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
return Buffer.from(await writer.toUint8Array());
}
function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
const alignedChildren = [];
for (const field of schema.fields) {
const indexInBatch = batch.schema.fields?.findIndex(
(f) => f.name === field.name
);
if (indexInBatch < 0) {
throw new Error(
`The column ${field.name} was not found in the Arrow Table`
);
}
alignedChildren.push(batch.data.children[indexInBatch]);
}
const newData = makeData({
type: new Struct(schema.fields),
length: batch.numRows,
nullCount: batch.nullCount,
children: alignedChildren
});
return new RecordBatch(schema, newData);
}
function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
const alignedBatches = table.batches.map((batch) =>
alignBatch(batch, schema)
);
return new ArrowTable(schema, alignedBatches);
}
// Creates an empty Arrow Table
export function createEmptyTable(schema: Schema): ArrowTable {
return new ArrowTable(sanitizeSchema(schema));
}
function validateSchemaEmbeddings(
schema: Schema<any>,
data: Array<Record<string, unknown>>,
embeddings: EmbeddingFunction<any> | undefined
) {
const fields = [];
const missingEmbeddingFields = [];
// First we check if the field is a `FixedSizeList`
// Then we check if the data contains the field
// if it does not, we add it to the list of missing embedding fields
// Finally, we check if those missing embedding fields are `this._embeddings`
// if they are not, we throw an error
for (const field of schema.fields) {
if (field.type instanceof FixedSizeList) {
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
missingEmbeddingFields.push(field);
} else {
fields.push(field);
}
} else {
fields.push(field);
}
}
if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
throw new Error(
`Table has embeddings: "${missingEmbeddingFields
.map((f) => f.name)
.join(",")}", but no embedding function was provided`
);
}
return new Schema(fields, schema.metadata);
}

View File

@@ -0,0 +1,68 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { type Float } from 'apache-arrow'
/**
* An embedding function that automatically creates vector representation for a given column.
*/
export interface EmbeddingFunction<T> {
/**
* The name of the column that will be used as input for the Embedding Function.
*/
sourceColumn: string
/**
* The data type of the embedding
*
* The embedding function should return `number`. This will be converted into
* an Arrow float array. By default this will be Float32 but this property can
* be used to control the conversion.
*/
embeddingDataType?: Float
/**
* The dimension of the embedding
*
* This is optional, normally this can be determined by looking at the results of
* `embed`. If this is not specified, and there is an attempt to apply the embedding
* to an empty table, then that process will fail.
*/
embeddingDimension?: number
/**
* The name of the column that will contain the embedding
*
* By default this is "vector"
*/
destColumn?: string
/**
* Should the source column be excluded from the resulting table
*
* By default the source column is included. Set this to true and
* only the embedding will be stored.
*/
excludeSource?: boolean
/**
* Creates a vector representation for the given values.
*/
embed: (data: T[]) => Promise<number[][]>
}
export function isEmbeddingFunction<T> (value: any): value is EmbeddingFunction<T> {
return typeof value.sourceColumn === 'string' &&
typeof value.embed === 'function'
}

View File

@@ -0,0 +1,57 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { type EmbeddingFunction } from '../index'
import type OpenAI from 'openai'
export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
private readonly _openai: OpenAI
private readonly _modelName: string
constructor (sourceColumn: string, openAIKey: string, modelName: string = 'text-embedding-ada-002') {
/**
* @type {import("openai").default}
*/
let Openai
try {
// eslint-disable-next-line @typescript-eslint/no-var-requires
Openai = require('openai')
} catch {
throw new Error('please install openai@^4.24.1 using npm install openai')
}
this.sourceColumn = sourceColumn
const configuration = {
apiKey: openAIKey
}
this._openai = new Openai(configuration)
this._modelName = modelName
}
async embed (data: string[]): Promise<number[][]> {
const response = await this._openai.embeddings.create({
model: this._modelName,
input: data
})
const embeddings: number[][] = []
for (let i = 0; i < response.data.length; i++) {
embeddings.push(response.data[i].embedding)
}
return embeddings
}
sourceColumn: string
}

1399
node/src/index.ts Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,180 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { describe } from 'mocha'
import * as chai from 'chai'
import { assert } from 'chai'
import * as chaiAsPromised from 'chai-as-promised'
import { v4 as uuidv4 } from 'uuid'
import * as lancedb from '../index'
import { tmpdir } from 'os'
import * as fs from 'fs'
import * as path from 'path'
chai.use(chaiAsPromised)
describe('LanceDB AWS Integration test', function () {
it('s3+ddb schema is processed correctly', async function () {
this.timeout(15000)
// WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
// THE API WILL CHANGE
const conn = await lancedb.connect('s3://lancedb-integtest?engine=ddb&ddbTableName=lancedb-integtest')
const data = [{ vector: Array(128).fill(1.0) }]
const tableName = uuidv4()
let table = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })
const futs = [table.add(data), table.add(data), table.add(data), table.add(data), table.add(data)]
await Promise.allSettled(futs)
table = await conn.openTable(tableName)
assert.equal(await table.countRows(), 6)
})
})
describe('LanceDB Mirrored Store Integration test', function () {
it('s3://...?mirroredStore=... param is processed correctly', async function () {
this.timeout(600000)
const dir = tmpdir()
console.log(dir)
const conn = await lancedb.connect({ uri: `s3://lancedb-integtest?mirroredStore=${dir}`, storageOptions: { allowHttp: 'true' } })
const data = Array(200).fill({ vector: Array(128).fill(1.0), id: 0 })
data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 1 }))
data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 2 }))
data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 3 }))
const tableName = uuidv4()
// try create table and check if it's mirrored
const t = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })
const mirroredPath = path.join(dir, `${tableName}.lance`)
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
if (err != null) throw err
// there should be three dirs
assert.equal(files.length, 3)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[1].isDirectory())
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.txn'))
})
fs.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.manifest'))
})
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.lance'))
})
})
// try create index and check if it's mirrored
await t.createIndex({ column: 'vector', type: 'ivf_pq' })
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
if (err != null) throw err
// there should be four dirs
assert.equal(files.length, 4)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[1].isDirectory())
assert.isTrue(files[2].isDirectory())
// Two TXs now
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 2)
assert.isTrue(files[0].name.endsWith('.txn'))
assert.isTrue(files[1].name.endsWith('.txn'))
})
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.lance'))
})
fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].isDirectory())
fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].isFile())
assert.isTrue(files[0].name.endsWith('.idx'))
})
})
})
// try delete and check if it's mirrored
await t.delete('id = 0')
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
if (err != null) throw err
// there should be five dirs
assert.equal(files.length, 5)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[1].isDirectory())
assert.isTrue(files[2].isDirectory())
assert.isTrue(files[3].isDirectory())
assert.isTrue(files[4].isDirectory())
// Three TXs now
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 3)
assert.isTrue(files[0].name.endsWith('.txn'))
assert.isTrue(files[1].name.endsWith('.txn'))
})
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.lance'))
})
fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].isDirectory())
fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].isFile())
assert.isTrue(files[0].name.endsWith('.idx'))
})
})
fs.readdir(path.join(mirroredPath, '_deletions'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.arrow'))
})
})
})
})

58
node/src/middleware.ts Normal file
View File

@@ -0,0 +1,58 @@
// Copyright 2024 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* Middleware for Remote LanceDB Connection or Table
*/
export interface HttpMiddleware {
/**
* A callback that can be used to instrument the behavior of http requests to remote
* tables. It can be used to add headers, modify the request, or even short-circuit
* the request and return a response without making the request to the remote endpoint.
* It can also be used to modify the response from the remote endpoint.
*
* @param {RemoteResponse} res - Request to the remote endpoint
* @param {onRemoteRequestNext} next - Callback to advance the middleware chain
*/
onRemoteRequest(
req: RemoteRequest,
next: (req: RemoteRequest) => Promise<RemoteResponse>,
): Promise<RemoteResponse>
};
export enum Method {
GET,
POST
}
/**
* A LanceDB Remote HTTP Request
*/
export interface RemoteRequest {
uri: string
method: Method
headers: Map<string, string>
params?: Map<string, string>
body?: any
}
/**
* A LanceDB Remote HTTP Response
*/
export interface RemoteResponse {
status: number
statusText: string
headers: Map<string, string>
body: () => Promise<any>
}

163
node/src/query.ts Normal file
View File

@@ -0,0 +1,163 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { Vector, tableFromIPC } from 'apache-arrow'
import { type EmbeddingFunction } from './embedding/embedding_function'
import { type MetricType } from '.'
// eslint-disable-next-line @typescript-eslint/no-var-requires
const { tableSearch } = require('../native.js')
/**
* A builder for nearest neighbor queries for LanceDB.
*/
export class Query<T = number[]> {
private readonly _query?: T
private readonly _tbl?: any
private _queryVector?: number[]
private _limit?: number
private _refineFactor?: number
private _nprobes: number
private _select?: string[]
private _filter?: string
private _metricType?: MetricType
private _prefilter: boolean
private _fastSearch: boolean
protected readonly _embeddings?: EmbeddingFunction<T>
constructor (query?: T, tbl?: any, embeddings?: EmbeddingFunction<T>) {
this._tbl = tbl
this._query = query
this._limit = 10
this._nprobes = 20
this._refineFactor = undefined
this._select = undefined
this._filter = undefined
this._metricType = undefined
this._embeddings = embeddings
this._prefilter = false
this._fastSearch = false
}
/***
* Sets the number of results that will be returned
* default value is 10
* @param value number of results
*/
limit (value: number): Query<T> {
this._limit = value
return this
}
/**
* Refine the results by reading extra elements and re-ranking them in memory.
* @param value refine factor to use in this query.
*/
refineFactor (value: number): Query<T> {
this._refineFactor = value
return this
}
/**
* The number of probes used. A higher number makes search more accurate but also slower.
* @param value The number of probes used.
*/
nprobes (value: number): Query<T> {
this._nprobes = value
return this
}
/**
* A filter statement to be applied to this query.
* @param value A filter in the same format used by a sql WHERE clause.
*/
filter (value: string): Query<T> {
this._filter = value
return this
}
where = this.filter
/** Return only the specified columns.
*
* @param value Only select the specified columns. If not specified, all columns will be returned.
*/
select (value: string[]): Query<T> {
this._select = value
return this
}
/**
* The MetricType used for this Query.
* @param value The metric to the. @see MetricType for the different options
*/
metricType (value: MetricType): Query<T> {
this._metricType = value
return this
}
prefilter (value: boolean): Query<T> {
this._prefilter = value
return this
}
/**
* Skip searching un-indexed data. This can make search faster, but will miss
* any data that is not yet indexed.
*/
fastSearch (value: boolean): Query<T> {
this._fastSearch = value
return this
}
/**
* Execute the query and return the results as an Array of Objects
*/
async execute<T = Record<string, unknown>> (): Promise<T[]> {
if (this._query !== undefined) {
if (this._embeddings !== undefined) {
this._queryVector = (await this._embeddings.embed([this._query]))[0]
} else {
this._queryVector = this._query as number[]
}
}
const isElectron = this.isElectron()
const buffer = await tableSearch.call(this._tbl, this, isElectron)
const data = tableFromIPC(buffer)
return data.toArray().map((entry: Record<string, unknown>) => {
const newObject: Record<string, unknown> = {}
Object.keys(entry).forEach((key: string) => {
if (entry[key] instanceof Vector) {
// toJSON() returns f16 array correctly
newObject[key] = (entry[key] as any).toJSON()
} else {
newObject[key] = entry[key] as any
}
})
return newObject as unknown as T
})
}
// See https://github.com/electron/electron/issues/2288
private isElectron (): boolean {
try {
// eslint-disable-next-line no-prototype-builtins
return (process?.versions?.hasOwnProperty('electron') || navigator?.userAgent?.toLowerCase()?.includes(' electron'))
} catch (e) {
return false
}
}
}

302
node/src/remote/client.ts Normal file
View File

@@ -0,0 +1,302 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import axios, { type AxiosError, type AxiosResponse, type ResponseType } from 'axios'
import { tableFromIPC, type Table as ArrowTable } from 'apache-arrow'
import { type RemoteResponse, type RemoteRequest, Method } from '../middleware'
import type { MetricType } from '..'
interface HttpLancedbClientMiddleware {
onRemoteRequest(
req: RemoteRequest,
next: (req: RemoteRequest) => Promise<RemoteResponse>,
): Promise<RemoteResponse>
}
/**
* Invoke the middleware chain and at the end call the remote endpoint
*/
async function callWithMiddlewares (
req: RemoteRequest,
middlewares: HttpLancedbClientMiddleware[],
opts?: MiddlewareInvocationOptions
): Promise<RemoteResponse> {
async function call (
i: number,
req: RemoteRequest
): Promise<RemoteResponse> {
// if we have reached the end of the middleware chain, make the request
if (i > middlewares.length) {
const headers = Object.fromEntries(req.headers.entries())
const params = Object.fromEntries(req.params?.entries() ?? [])
const timeout = opts?.timeout
let res
if (req.method === Method.POST) {
res = await axios.post(
req.uri,
req.body,
{
headers,
params,
timeout,
responseType: opts?.responseType
}
)
} else {
res = await axios.get(
req.uri,
{
headers,
params,
timeout
}
)
}
return toLanceRes(res)
}
// call next middleware in chain
return await middlewares[i - 1].onRemoteRequest(
req,
async (req) => {
return await call(i + 1, req)
}
)
}
return await call(1, req)
}
interface MiddlewareInvocationOptions {
responseType?: ResponseType
timeout?: number
}
/**
* Marshall the library response into a LanceDB response
*/
function toLanceRes (res: AxiosResponse): RemoteResponse {
const headers = new Map()
for (const h in res.headers) {
headers.set(h, res.headers[h])
}
return {
status: res.status,
statusText: res.statusText,
headers,
body: async () => {
return res.data
}
}
}
async function decodeErrorData(
res: RemoteResponse,
responseType?: ResponseType
): Promise<string> {
const errorData = await res.body()
if (responseType === 'arraybuffer') {
return new TextDecoder().decode(errorData)
} else {
if (typeof errorData === 'object') {
return JSON.stringify(errorData)
}
return errorData
}
}
export class HttpLancedbClient {
private readonly _url: string
private readonly _apiKey: () => string
private readonly _middlewares: HttpLancedbClientMiddleware[]
private readonly _timeout: number | undefined
public constructor (
url: string,
apiKey: string,
timeout?: number,
private readonly _dbName?: string
) {
this._url = url
this._apiKey = () => apiKey
this._middlewares = []
this._timeout = timeout
}
get uri (): string {
return this._url
}
public async search (
tableName: string,
vector: number[],
k: number,
nprobes: number,
prefilter: boolean,
refineFactor?: number,
columns?: string[],
filter?: string,
metricType?: MetricType,
fastSearch?: boolean
): Promise<ArrowTable<any>> {
const result = await this.post(
`/v1/table/${tableName}/query/`,
{
vector,
k,
nprobes,
refine_factor: refineFactor,
columns,
filter,
prefilter,
metric: metricType,
fast_search: fastSearch
},
undefined,
undefined,
'arraybuffer'
)
const table = tableFromIPC(await result.body())
return table
}
/**
* Sent GET request.
*/
public async get (path: string, params?: Record<string, string>): Promise<RemoteResponse> {
const req = {
uri: `${this._url}${path}`,
method: Method.GET,
headers: new Map(Object.entries({
'Content-Type': 'application/json',
'x-api-key': this._apiKey(),
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
})),
params: new Map(Object.entries(params ?? {}))
}
let response
try {
response = await callWithMiddlewares(req, this._middlewares)
return response
} catch (err: any) {
console.error(serializeErrorAsJson(err))
if (err.response === undefined) {
throw new Error(`Network Error: ${err.message as string}`)
}
response = toLanceRes(err.response)
}
if (response.status !== 200) {
const errorData = await decodeErrorData(response)
throw new Error(
`Server Error, status: ${response.status}, ` +
`message: ${response.statusText}: ${errorData}`
)
}
return response
}
/**
* Sent POST request.
*/
public async post (
path: string,
data?: any,
params?: Record<string, string>,
content?: string | undefined,
responseType?: ResponseType | undefined
): Promise<RemoteResponse> {
const req = {
uri: `${this._url}${path}`,
method: Method.POST,
headers: new Map(Object.entries({
'Content-Type': content ?? 'application/json',
'x-api-key': this._apiKey(),
...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
})),
params: new Map(Object.entries(params ?? {})),
body: data
}
let response
try {
response = await callWithMiddlewares(req, this._middlewares, {
responseType,
timeout: this._timeout
})
// return response
} catch (err: any) {
console.error(serializeErrorAsJson(err))
if (err.response === undefined) {
throw new Error(`Network Error: ${err.message as string}`)
}
response = toLanceRes(err.response)
}
if (response.status !== 200) {
const errorData = await decodeErrorData(response, responseType)
throw new Error(
`Server Error, status: ${response.status}, ` +
`message: ${response.statusText}: ${errorData}`
)
}
return response
}
/**
* Instrument this client with middleware
* @param mw - The middleware that instruments the client
* @returns - an instance of this client instrumented with the middleware
*/
public withMiddleware (mw: HttpLancedbClientMiddleware): HttpLancedbClient {
const wrapped = this.clone()
wrapped._middlewares.push(mw)
return wrapped
}
/**
* Make a clone of this client
*/
private clone (): HttpLancedbClient {
const clone = new HttpLancedbClient(this._url, this._apiKey(), this._timeout, this._dbName)
for (const mw of this._middlewares) {
clone._middlewares.push(mw)
}
return clone
}
}
function serializeErrorAsJson(err: AxiosError) {
const error = JSON.parse(JSON.stringify(err, Object.getOwnPropertyNames(err)))
error.response = err.response != null
? JSON.parse(JSON.stringify(
err.response,
// config contains the request data, too noisy
Object.getOwnPropertyNames(err.response).filter(prop => prop !== 'config')
))
: null
return JSON.stringify({ error })
}

567
node/src/remote/index.ts Normal file
View File

@@ -0,0 +1,567 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import {
type EmbeddingFunction,
type Table,
type VectorIndexParams,
type Connection,
type ConnectionOptions,
type CreateTableOptions,
type VectorIndex,
type WriteOptions,
type IndexStats,
type UpdateArgs,
type UpdateSqlArgs,
makeArrowTable,
type MergeInsertArgs,
type ColumnAlteration
} from '../index'
import { Query } from '../query'
import { Vector, Table as ArrowTable } from 'apache-arrow'
import { HttpLancedbClient } from './client'
import { isEmbeddingFunction } from '../embedding/embedding_function'
import {
createEmptyTable,
fromRecordsToStreamBuffer,
fromTableToStreamBuffer
} from '../arrow'
import { toSQL, TTLCache } from '../util'
import { type HttpMiddleware } from '../middleware'
/**
* Remote connection.
*/
export class RemoteConnection implements Connection {
private _client: HttpLancedbClient
private readonly _dbName: string
private readonly _tableCache = new TTLCache(300_000)
constructor (opts: ConnectionOptions) {
if (!opts.uri.startsWith('db://')) {
throw new Error(`Invalid remote DB URI: ${opts.uri}`)
}
if (opts.apiKey == null || opts.apiKey === '') {
opts = Object.assign({}, opts, { apiKey: process.env.LANCEDB_API_KEY })
}
if (opts.apiKey === undefined || opts.region === undefined) {
throw new Error(
'API key and region are must be passed for remote connections. ' +
'API key can also be set through LANCEDB_API_KEY env variable.')
}
this._dbName = opts.uri.slice('db://'.length)
let server: string
if (opts.hostOverride === undefined) {
server = `https://${this._dbName}.${opts.region}.api.lancedb.com`
} else {
server = opts.hostOverride
}
this._client = new HttpLancedbClient(
server,
opts.apiKey,
opts.timeout,
opts.hostOverride === undefined ? undefined : this._dbName
)
}
get uri (): string {
// add the lancedb+ prefix back
return 'db://' + this._client.uri
}
async tableNames (
pageToken: string = '',
limit: number = 10
): Promise<string[]> {
const response = await this._client.get('/v1/table/', {
limit: `${limit}`,
page_token: pageToken
})
const body = await response.body()
for (const table of body.tables) {
this._tableCache.set(table, true)
}
return body.tables
}
async openTable (name: string): Promise<Table>
async openTable<T>(
name: string,
embeddings: EmbeddingFunction<T>
): Promise<Table<T>>
async openTable<T>(
name: string,
embeddings?: EmbeddingFunction<T>
): Promise<Table<T>> {
// check if the table exists
if (this._tableCache.get(name) === undefined) {
await this._client.post(`/v1/table/${encodeURIComponent(name)}/describe/`)
this._tableCache.set(name, true)
}
if (embeddings !== undefined) {
return new RemoteTable(this._client, name, embeddings)
} else {
return new RemoteTable(this._client, name)
}
}
async createTable<T>(
nameOrOpts: string | CreateTableOptions<T>,
data?: Array<Record<string, unknown>> | ArrowTable,
optsOrEmbedding?: WriteOptions | EmbeddingFunction<T>,
opt?: WriteOptions
): Promise<Table<T>> {
// Logic copied from LocatlConnection, refactor these to a base class + connectionImpl pattern
let schema
let embeddings: undefined | EmbeddingFunction<T>
let tableName: string
if (typeof nameOrOpts === 'string') {
if (
optsOrEmbedding !== undefined &&
isEmbeddingFunction(optsOrEmbedding)
) {
embeddings = optsOrEmbedding
}
tableName = nameOrOpts
} else {
schema = nameOrOpts.schema
embeddings = nameOrOpts.embeddingFunction
tableName = nameOrOpts.name
if (data === undefined) {
data = nameOrOpts.data
}
}
let buffer: Buffer
function isEmpty (
data: Array<Record<string, unknown>> | ArrowTable<any>
): boolean {
if (data instanceof ArrowTable) {
return data.numRows === 0
}
return data.length === 0
}
if (data === undefined || isEmpty(data)) {
if (schema === undefined) {
throw new Error('Either data or schema needs to defined')
}
buffer = await fromTableToStreamBuffer(createEmptyTable(schema))
} else if (data instanceof ArrowTable) {
buffer = await fromTableToStreamBuffer(data, embeddings)
} else {
// data is Array<Record<...>>
buffer = await fromRecordsToStreamBuffer(data, embeddings)
}
const res = await this._client.post(
`/v1/table/${encodeURIComponent(tableName)}/create/`,
buffer,
undefined,
'application/vnd.apache.arrow.stream'
)
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
this._tableCache.set(tableName, true)
if (embeddings === undefined) {
return new RemoteTable(this._client, tableName)
} else {
return new RemoteTable(this._client, tableName, embeddings)
}
}
async dropTable (name: string): Promise<void> {
await this._client.post(`/v1/table/${encodeURIComponent(name)}/drop/`)
this._tableCache.delete(name)
}
withMiddleware (middleware: HttpMiddleware): Connection {
const wrapped = this.clone()
wrapped._client = wrapped._client.withMiddleware(middleware)
return wrapped
}
private clone (): RemoteConnection {
const clone: RemoteConnection = Object.create(RemoteConnection.prototype)
return Object.assign(clone, this)
}
}
export class RemoteQuery<T = number[]> extends Query<T> {
constructor (
query: T,
private readonly _client: HttpLancedbClient,
private readonly _name: string,
embeddings?: EmbeddingFunction<T>
) {
super(query, undefined, embeddings)
}
// TODO: refactor this to a base class + queryImpl pattern
async execute<T = Record<string, unknown>>(): Promise<T[]> {
const embeddings = this._embeddings
const query = (this as any)._query
let queryVector: number[]
if (embeddings !== undefined) {
queryVector = (await embeddings.embed([query]))[0]
} else {
queryVector = query as number[]
}
const data = await this._client.search(
this._name,
queryVector,
(this as any)._limit,
(this as any)._nprobes,
(this as any)._prefilter,
(this as any)._refineFactor,
(this as any)._select,
(this as any)._filter,
(this as any)._metricType,
(this as any)._fastSearch
)
return data.toArray().map((entry: Record<string, unknown>) => {
const newObject: Record<string, unknown> = {}
Object.keys(entry).forEach((key: string) => {
if (entry[key] instanceof Vector) {
newObject[key] = (entry[key] as any).toArray()
} else {
newObject[key] = entry[key] as any
}
})
return newObject as unknown as T
})
}
}
// we are using extend until we have next next version release
// Table and Connection has both been refactored to interfaces
export class RemoteTable<T = number[]> implements Table<T> {
private _client: HttpLancedbClient
private readonly _embeddings?: EmbeddingFunction<T>
private readonly _name: string
constructor (client: HttpLancedbClient, name: string)
constructor (
client: HttpLancedbClient,
name: string,
embeddings: EmbeddingFunction<T>
)
constructor (
client: HttpLancedbClient,
name: string,
embeddings?: EmbeddingFunction<T>
) {
this._client = client
this._name = name
this._embeddings = embeddings
}
get name (): string {
return this._name
}
get schema (): Promise<any> {
return this._client
.post(`/v1/table/${encodeURIComponent(this._name)}/describe/`)
.then(async (res) => {
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
return (await res.body())?.schema
})
}
search (query: T): Query<T> {
return new RemoteQuery(query, this._client, encodeURIComponent(this._name)) //, this._embeddings_new)
}
filter (where: string): Query<T> {
throw new Error('Not implemented')
}
async mergeInsert (on: string, data: Array<Record<string, unknown>> | ArrowTable, args: MergeInsertArgs): Promise<void> {
let tbl: ArrowTable
if (data instanceof ArrowTable) {
tbl = data
} else {
tbl = makeArrowTable(data, await this.schema)
}
const queryParams: any = {
on
}
if (args.whenMatchedUpdateAll !== false && args.whenMatchedUpdateAll !== null && args.whenMatchedUpdateAll !== undefined) {
queryParams.when_matched_update_all = 'true'
if (typeof args.whenMatchedUpdateAll === 'string') {
queryParams.when_matched_update_all_filt = args.whenMatchedUpdateAll
}
} else {
queryParams.when_matched_update_all = 'false'
}
if (args.whenNotMatchedInsertAll ?? false) {
queryParams.when_not_matched_insert_all = 'true'
} else {
queryParams.when_not_matched_insert_all = 'false'
}
if (args.whenNotMatchedBySourceDelete !== false && args.whenNotMatchedBySourceDelete !== null && args.whenNotMatchedBySourceDelete !== undefined) {
queryParams.when_not_matched_by_source_delete = 'true'
if (typeof args.whenNotMatchedBySourceDelete === 'string') {
queryParams.when_not_matched_by_source_delete_filt = args.whenNotMatchedBySourceDelete
}
} else {
queryParams.when_not_matched_by_source_delete = 'false'
}
const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
const res = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/merge_insert/`,
buffer,
queryParams,
'application/vnd.apache.arrow.stream'
)
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
}
async add (data: Array<Record<string, unknown>> | ArrowTable): Promise<number> {
let tbl: ArrowTable
if (data instanceof ArrowTable) {
tbl = data
} else {
tbl = makeArrowTable(data, await this.schema)
}
const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
const res = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/insert/`,
buffer,
{
mode: 'append'
},
'application/vnd.apache.arrow.stream'
)
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
return tbl.numRows
}
async overwrite (data: Array<Record<string, unknown>> | ArrowTable): Promise<number> {
let tbl: ArrowTable
if (data instanceof ArrowTable) {
tbl = data
} else {
tbl = makeArrowTable(data)
}
const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
const res = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/insert/`,
buffer,
{
mode: 'overwrite'
},
'application/vnd.apache.arrow.stream'
)
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
return tbl.numRows
}
async createIndex (indexParams: VectorIndexParams): Promise<void> {
const unsupportedParams = [
'index_name',
'num_partitions',
'max_iters',
'use_opq',
'num_sub_vectors',
'num_bits',
'max_opq_iters',
'replace'
]
for (const param of unsupportedParams) {
// eslint-disable-next-line @typescript-eslint/strict-boolean-expressions
if (indexParams[param as keyof VectorIndexParams]) {
throw new Error(`${param} is not supported for remote connections`)
}
}
const column = indexParams.column ?? 'vector'
const indexType = 'vector'
const metricType = indexParams.metric_type ?? 'L2'
const indexCacheSize = indexParams.index_cache_size ?? null
const data = {
column,
index_type: indexType,
metric_type: metricType,
index_cache_size: indexCacheSize
}
const res = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/create_index/`,
data
)
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
}
async createScalarIndex (column: string): Promise<void> {
const indexType = 'scalar'
const data = {
column,
index_type: indexType,
replace: true
}
const res = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/create_scalar_index/`,
data
)
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
}
async dropIndex (index_name: string): Promise<void> {
const res = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/index/${encodeURIComponent(index_name)}/drop/`
)
if (res.status !== 200) {
throw new Error(
`Server Error, status: ${res.status}, ` +
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
`message: ${res.statusText}: ${await res.body()}`
)
}
}
async countRows (filter?: string): Promise<number> {
const result = await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/count_rows/`, {
predicate: filter
})
return (await result.body())
}
async delete (filter: string): Promise<void> {
await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/delete/`, {
predicate: filter
})
}
async update (args: UpdateArgs | UpdateSqlArgs): Promise<void> {
let filter: string | null
let updates: Record<string, string>
if ('valuesSql' in args) {
filter = args.where ?? null
updates = args.valuesSql
} else {
filter = args.where ?? null
updates = {}
for (const [key, value] of Object.entries(args.values)) {
updates[key] = toSQL(value)
}
}
await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/update/`, {
predicate: filter,
updates: Object.entries(updates).map(([key, value]) => [key, value])
})
}
async listIndices (): Promise<VectorIndex[]> {
const results = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/index/list/`
)
return (await results.body()).indexes?.map((index: any) => ({
columns: index.columns,
name: index.index_name,
uuid: index.index_uuid,
status: index.status
}))
}
async indexStats (indexName: string): Promise<IndexStats> {
const results = await this._client.post(
`/v1/table/${encodeURIComponent(this._name)}/index/${indexName}/stats/`
)
const body = await results.body()
return {
numIndexedRows: body?.num_indexed_rows,
numUnindexedRows: body?.num_unindexed_rows,
indexType: body?.index_type,
distanceType: body?.distance_type
}
}
async addColumns (newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void> {
throw new Error('Add columns is not yet supported in LanceDB Cloud.')
}
async alterColumns (columnAlterations: ColumnAlteration[]): Promise<void> {
throw new Error('Alter columns is not yet supported in LanceDB Cloud.')
}
async dropColumns (columnNames: string[]): Promise<void> {
throw new Error('Drop columns is not yet supported in LanceDB Cloud.')
}
withMiddleware(middleware: HttpMiddleware): Table<T> {
const wrapped = this.clone()
wrapped._client = wrapped._client.withMiddleware(middleware)
return wrapped
}
private clone (): RemoteTable<T> {
const clone: RemoteTable<T> = Object.create(RemoteTable.prototype)
return Object.assign(clone, this)
}
}

508
node/src/sanitize.ts Normal file
View File

@@ -0,0 +1,508 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The utilities in this file help sanitize data from the user's arrow
// library into the types expected by vectordb's arrow library. Node
// generally allows for mulitple versions of the same library (and sometimes
// even multiple copies of the same version) to be installed at the same
// time. However, arrow-js uses instanceof which expected that the input
// comes from the exact same library instance. This is not always the case
// and so we must sanitize the input to ensure that it is compatible.
import {
Field,
Utf8,
FixedSizeBinary,
FixedSizeList,
Schema,
List,
Struct,
Float,
Bool,
Date_,
Decimal,
type DataType,
Dictionary,
Binary,
Float32,
Interval,
Map_,
Duration,
Union,
Time,
Timestamp,
Type,
Null,
Int,
type Precision,
type DateUnit,
Int8,
Int16,
Int32,
Int64,
Uint8,
Uint16,
Uint32,
Uint64,
Float16,
Float64,
DateDay,
DateMillisecond,
DenseUnion,
SparseUnion,
TimeNanosecond,
TimeMicrosecond,
TimeMillisecond,
TimeSecond,
TimestampNanosecond,
TimestampMicrosecond,
TimestampMillisecond,
TimestampSecond,
IntervalDayTime,
IntervalYearMonth,
DurationNanosecond,
DurationMicrosecond,
DurationMillisecond,
DurationSecond
} from "apache-arrow";
import type { IntBitWidth, TimeBitWidth } from "apache-arrow/type";
function sanitizeMetadata(
metadataLike?: unknown
): Map<string, string> | undefined {
if (metadataLike === undefined || metadataLike === null) {
return undefined;
}
if (!(metadataLike instanceof Map)) {
throw Error("Expected metadata, if present, to be a Map<string, string>");
}
for (const item of metadataLike) {
if (!(typeof item[0] === "string" || !(typeof item[1] === "string"))) {
throw Error(
"Expected metadata, if present, to be a Map<string, string> but it had non-string keys or values"
);
}
}
return metadataLike as Map<string, string>;
}
function sanitizeInt(typeLike: object) {
if (
!("bitWidth" in typeLike) ||
typeof typeLike.bitWidth !== "number" ||
!("isSigned" in typeLike) ||
typeof typeLike.isSigned !== "boolean"
) {
throw Error(
"Expected an Int Type to have a `bitWidth` and `isSigned` property"
);
}
return new Int(typeLike.isSigned, typeLike.bitWidth as IntBitWidth);
}
function sanitizeFloat(typeLike: object) {
if (!("precision" in typeLike) || typeof typeLike.precision !== "number") {
throw Error("Expected a Float Type to have a `precision` property");
}
return new Float(typeLike.precision as Precision);
}
function sanitizeDecimal(typeLike: object) {
if (
!("scale" in typeLike) ||
typeof typeLike.scale !== "number" ||
!("precision" in typeLike) ||
typeof typeLike.precision !== "number" ||
!("bitWidth" in typeLike) ||
typeof typeLike.bitWidth !== "number"
) {
throw Error(
"Expected a Decimal Type to have `scale`, `precision`, and `bitWidth` properties"
);
}
return new Decimal(typeLike.scale, typeLike.precision, typeLike.bitWidth);
}
function sanitizeDate(typeLike: object) {
if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
throw Error("Expected a Date type to have a `unit` property");
}
return new Date_(typeLike.unit as DateUnit);
}
function sanitizeTime(typeLike: object) {
if (
!("unit" in typeLike) ||
typeof typeLike.unit !== "number" ||
!("bitWidth" in typeLike) ||
typeof typeLike.bitWidth !== "number"
) {
throw Error(
"Expected a Time type to have `unit` and `bitWidth` properties"
);
}
return new Time(typeLike.unit, typeLike.bitWidth as TimeBitWidth);
}
function sanitizeTimestamp(typeLike: object) {
if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
throw Error("Expected a Timestamp type to have a `unit` property");
}
let timezone = null;
if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
timezone = typeLike.timezone;
}
return new Timestamp(typeLike.unit, timezone);
}
function sanitizeTypedTimestamp(
typeLike: object,
Datatype:
| typeof TimestampNanosecond
| typeof TimestampMicrosecond
| typeof TimestampMillisecond
| typeof TimestampSecond
) {
let timezone = null;
if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
timezone = typeLike.timezone;
}
return new Datatype(timezone);
}
function sanitizeInterval(typeLike: object) {
if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
throw Error("Expected an Interval type to have a `unit` property");
}
return new Interval(typeLike.unit);
}
function sanitizeList(typeLike: object) {
if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
throw Error(
"Expected a List type to have an array-like `children` property"
);
}
if (typeLike.children.length !== 1) {
throw Error("Expected a List type to have exactly one child");
}
return new List(sanitizeField(typeLike.children[0]));
}
function sanitizeStruct(typeLike: object) {
if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
throw Error(
"Expected a Struct type to have an array-like `children` property"
);
}
return new Struct(typeLike.children.map((child) => sanitizeField(child)));
}
function sanitizeUnion(typeLike: object) {
if (
!("typeIds" in typeLike) ||
!("mode" in typeLike) ||
typeof typeLike.mode !== "number"
) {
throw Error(
"Expected a Union type to have `typeIds` and `mode` properties"
);
}
if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
throw Error(
"Expected a Union type to have an array-like `children` property"
);
}
return new Union(
typeLike.mode,
typeLike.typeIds as any,
typeLike.children.map((child) => sanitizeField(child))
);
}
function sanitizeTypedUnion(
typeLike: object,
UnionType: typeof DenseUnion | typeof SparseUnion
) {
if (!("typeIds" in typeLike)) {
throw Error(
"Expected a DenseUnion/SparseUnion type to have a `typeIds` property"
);
}
if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
throw Error(
"Expected a DenseUnion/SparseUnion type to have an array-like `children` property"
);
}
return new UnionType(
typeLike.typeIds as any,
typeLike.children.map((child) => sanitizeField(child))
);
}
function sanitizeFixedSizeBinary(typeLike: object) {
if (!("byteWidth" in typeLike) || typeof typeLike.byteWidth !== "number") {
throw Error(
"Expected a FixedSizeBinary type to have a `byteWidth` property"
);
}
return new FixedSizeBinary(typeLike.byteWidth);
}
function sanitizeFixedSizeList(typeLike: object) {
if (!("listSize" in typeLike) || typeof typeLike.listSize !== "number") {
throw Error("Expected a FixedSizeList type to have a `listSize` property");
}
if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
throw Error(
"Expected a FixedSizeList type to have an array-like `children` property"
);
}
if (typeLike.children.length !== 1) {
throw Error("Expected a FixedSizeList type to have exactly one child");
}
return new FixedSizeList(
typeLike.listSize,
sanitizeField(typeLike.children[0])
);
}
function sanitizeMap(typeLike: object) {
if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
throw Error(
"Expected a Map type to have an array-like `children` property"
);
}
if (!("keysSorted" in typeLike) || typeof typeLike.keysSorted !== "boolean") {
throw Error("Expected a Map type to have a `keysSorted` property");
}
return new Map_(
typeLike.children.map((field) => sanitizeField(field)) as any,
typeLike.keysSorted
);
}
function sanitizeDuration(typeLike: object) {
if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
throw Error("Expected a Duration type to have a `unit` property");
}
return new Duration(typeLike.unit);
}
function sanitizeDictionary(typeLike: object) {
if (!("id" in typeLike) || typeof typeLike.id !== "number") {
throw Error("Expected a Dictionary type to have an `id` property");
}
if (!("indices" in typeLike) || typeof typeLike.indices !== "object") {
throw Error("Expected a Dictionary type to have an `indices` property");
}
if (!("dictionary" in typeLike) || typeof typeLike.dictionary !== "object") {
throw Error("Expected a Dictionary type to have an `dictionary` property");
}
if (!("isOrdered" in typeLike) || typeof typeLike.isOrdered !== "boolean") {
throw Error("Expected a Dictionary type to have an `isOrdered` property");
}
return new Dictionary(
sanitizeType(typeLike.dictionary),
sanitizeType(typeLike.indices) as any,
typeLike.id,
typeLike.isOrdered
);
}
function sanitizeType(typeLike: unknown): DataType<any> {
if (typeof typeLike !== "object" || typeLike === null) {
throw Error("Expected a Type but object was null/undefined");
}
if (!("typeId" in typeLike) || !(typeof typeLike.typeId !== "function")) {
throw Error("Expected a Type to have a typeId function");
}
let typeId: Type;
if (typeof typeLike.typeId === "function") {
typeId = (typeLike.typeId as () => unknown)() as Type;
} else if (typeof typeLike.typeId === "number") {
typeId = typeLike.typeId as Type;
} else {
throw Error("Type's typeId property was not a function or number");
}
switch (typeId) {
case Type.NONE:
throw Error("Received a Type with a typeId of NONE");
case Type.Null:
return new Null();
case Type.Int:
return sanitizeInt(typeLike);
case Type.Float:
return sanitizeFloat(typeLike);
case Type.Binary:
return new Binary();
case Type.Utf8:
return new Utf8();
case Type.Bool:
return new Bool();
case Type.Decimal:
return sanitizeDecimal(typeLike);
case Type.Date:
return sanitizeDate(typeLike);
case Type.Time:
return sanitizeTime(typeLike);
case Type.Timestamp:
return sanitizeTimestamp(typeLike);
case Type.Interval:
return sanitizeInterval(typeLike);
case Type.List:
return sanitizeList(typeLike);
case Type.Struct:
return sanitizeStruct(typeLike);
case Type.Union:
return sanitizeUnion(typeLike);
case Type.FixedSizeBinary:
return sanitizeFixedSizeBinary(typeLike);
case Type.FixedSizeList:
return sanitizeFixedSizeList(typeLike);
case Type.Map:
return sanitizeMap(typeLike);
case Type.Duration:
return sanitizeDuration(typeLike);
case Type.Dictionary:
return sanitizeDictionary(typeLike);
case Type.Int8:
return new Int8();
case Type.Int16:
return new Int16();
case Type.Int32:
return new Int32();
case Type.Int64:
return new Int64();
case Type.Uint8:
return new Uint8();
case Type.Uint16:
return new Uint16();
case Type.Uint32:
return new Uint32();
case Type.Uint64:
return new Uint64();
case Type.Float16:
return new Float16();
case Type.Float32:
return new Float32();
case Type.Float64:
return new Float64();
case Type.DateMillisecond:
return new DateMillisecond();
case Type.DateDay:
return new DateDay();
case Type.TimeNanosecond:
return new TimeNanosecond();
case Type.TimeMicrosecond:
return new TimeMicrosecond();
case Type.TimeMillisecond:
return new TimeMillisecond();
case Type.TimeSecond:
return new TimeSecond();
case Type.TimestampNanosecond:
return sanitizeTypedTimestamp(typeLike, TimestampNanosecond);
case Type.TimestampMicrosecond:
return sanitizeTypedTimestamp(typeLike, TimestampMicrosecond);
case Type.TimestampMillisecond:
return sanitizeTypedTimestamp(typeLike, TimestampMillisecond);
case Type.TimestampSecond:
return sanitizeTypedTimestamp(typeLike, TimestampSecond);
case Type.DenseUnion:
return sanitizeTypedUnion(typeLike, DenseUnion);
case Type.SparseUnion:
return sanitizeTypedUnion(typeLike, SparseUnion);
case Type.IntervalDayTime:
return new IntervalDayTime();
case Type.IntervalYearMonth:
return new IntervalYearMonth();
case Type.DurationNanosecond:
return new DurationNanosecond();
case Type.DurationMicrosecond:
return new DurationMicrosecond();
case Type.DurationMillisecond:
return new DurationMillisecond();
case Type.DurationSecond:
return new DurationSecond();
}
}
function sanitizeField(fieldLike: unknown): Field {
if (fieldLike instanceof Field) {
return fieldLike;
}
if (typeof fieldLike !== "object" || fieldLike === null) {
throw Error("Expected a Field but object was null/undefined");
}
if (
!("type" in fieldLike) ||
!("name" in fieldLike) ||
!("nullable" in fieldLike)
) {
throw Error(
"The field passed in is missing a `type`/`name`/`nullable` property"
);
}
const type = sanitizeType(fieldLike.type);
const name = fieldLike.name;
if (!(typeof name === "string")) {
throw Error("The field passed in had a non-string `name` property");
}
const nullable = fieldLike.nullable;
if (!(typeof nullable === "boolean")) {
throw Error("The field passed in had a non-boolean `nullable` property");
}
let metadata;
if ("metadata" in fieldLike) {
metadata = sanitizeMetadata(fieldLike.metadata);
}
return new Field(name, type, nullable, metadata);
}
/**
* Convert something schemaLike into a Schema instance
*
* This method is often needed even when the caller is using a Schema
* instance because they might be using a different instance of apache-arrow
* than lancedb is using.
*/
export function sanitizeSchema(schemaLike: unknown): Schema {
if (schemaLike instanceof Schema) {
return schemaLike;
}
if (typeof schemaLike !== "object" || schemaLike === null) {
throw Error("Expected a Schema but object was null/undefined");
}
if (!("fields" in schemaLike)) {
throw Error(
"The schema passed in does not appear to be a schema (no 'fields' property)"
);
}
let metadata;
if ("metadata" in schemaLike) {
metadata = sanitizeMetadata(schemaLike.metadata);
}
if (!Array.isArray(schemaLike.fields)) {
throw Error(
"The schema passed in had a 'fields' property but it was not an array"
);
}
const sanitizedFields = schemaLike.fields.map((field) =>
sanitizeField(field)
);
return new Schema(sanitizedFields, metadata);
}

360
node/src/test/arrow.test.ts Normal file
View File

@@ -0,0 +1,360 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { describe } from 'mocha'
import { assert, expect, use as chaiUse } from 'chai'
import * as chaiAsPromised from 'chai-as-promised'
import { convertToTable, fromTableToBuffer, makeArrowTable, makeEmptyTable } from '../arrow'
import {
Field,
FixedSizeList,
Float16,
Float32,
Int32,
tableFromIPC,
Schema,
Float64,
type Table,
Binary,
Bool,
Utf8,
Struct,
List,
DataType,
Dictionary,
Int64,
MetadataVersion
} from 'apache-arrow'
import {
Dictionary as OldDictionary,
Field as OldField,
FixedSizeList as OldFixedSizeList,
Float32 as OldFloat32,
Int32 as OldInt32,
Struct as OldStruct,
Schema as OldSchema,
TimestampNanosecond as OldTimestampNanosecond,
Utf8 as OldUtf8
} from 'apache-arrow-old'
import { type EmbeddingFunction } from '../embedding/embedding_function'
chaiUse(chaiAsPromised)
function sampleRecords (): Array<Record<string, any>> {
return [
{
binary: Buffer.alloc(5),
boolean: false,
number: 7,
string: 'hello',
struct: { x: 0, y: 0 },
list: ['anime', 'action', 'comedy']
}
]
}
// Helper method to verify various ways to create a table
async function checkTableCreation (tableCreationMethod: (records: any, recordsReversed: any, schema: Schema) => Promise<Table>): Promise<void> {
const records = sampleRecords()
const recordsReversed = [{
list: ['anime', 'action', 'comedy'],
struct: { x: 0, y: 0 },
string: 'hello',
number: 7,
boolean: false,
binary: Buffer.alloc(5)
}]
const schema = new Schema([
new Field('binary', new Binary(), false),
new Field('boolean', new Bool(), false),
new Field('number', new Float64(), false),
new Field('string', new Utf8(), false),
new Field('struct', new Struct([
new Field('x', new Float64(), false),
new Field('y', new Float64(), false)
])),
new Field('list', new List(new Field('item', new Utf8(), false)), false)
])
const table = await tableCreationMethod(records, recordsReversed, schema)
schema.fields.forEach((field, idx) => {
const actualField = table.schema.fields[idx]
assert.isFalse(actualField.nullable)
assert.equal(table.getChild(field.name)?.type.toString(), field.type.toString())
assert.equal(table.getChildAt(idx)?.type.toString(), field.type.toString())
})
}
describe('The function makeArrowTable', function () {
it('will use data types from a provided schema instead of inference', async function () {
const schema = new Schema([
new Field('a', new Int32()),
new Field('b', new Float32()),
new Field('c', new FixedSizeList(3, new Field('item', new Float16()))),
new Field('d', new Int64())
])
const table = makeArrowTable(
[
{ a: 1, b: 2, c: [1, 2, 3], d: 9 },
{ a: 4, b: 5, c: [4, 5, 6], d: 10 },
{ a: 7, b: 8, c: [7, 8, 9], d: null }
],
{ schema }
)
const buf = await fromTableToBuffer(table)
assert.isAbove(buf.byteLength, 0)
const actual = tableFromIPC(buf)
assert.equal(actual.numRows, 3)
const actualSchema = actual.schema
assert.deepEqual(actualSchema, schema)
})
it('will assume the column `vector` is FixedSizeList<Float32> by default', async function () {
const schema = new Schema([
new Field('a', new Float64()),
new Field('b', new Float64()),
new Field(
'vector',
new FixedSizeList(3, new Field('item', new Float32(), true))
)
])
const table = makeArrowTable([
{ a: 1, b: 2, vector: [1, 2, 3] },
{ a: 4, b: 5, vector: [4, 5, 6] },
{ a: 7, b: 8, vector: [7, 8, 9] }
])
const buf = await fromTableToBuffer(table)
assert.isAbove(buf.byteLength, 0)
const actual = tableFromIPC(buf)
assert.equal(actual.numRows, 3)
const actualSchema = actual.schema
assert.deepEqual(actualSchema, schema)
})
it('can support multiple vector columns', async function () {
const schema = new Schema([
new Field('a', new Float64()),
new Field('b', new Float64()),
new Field('vec1', new FixedSizeList(3, new Field('item', new Float16(), true))),
new Field('vec2', new FixedSizeList(3, new Field('item', new Float16(), true)))
])
const table = makeArrowTable(
[
{ a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
{ a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
{ a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
],
{
vectorColumns: {
vec1: { type: new Float16() },
vec2: { type: new Float16() }
}
}
)
const buf = await fromTableToBuffer(table)
assert.isAbove(buf.byteLength, 0)
const actual = tableFromIPC(buf)
assert.equal(actual.numRows, 3)
const actualSchema = actual.schema
assert.deepEqual(actualSchema, schema)
})
it('will allow different vector column types', async function () {
const table = makeArrowTable(
[
{ fp16: [1], fp32: [1], fp64: [1] }
],
{
vectorColumns: {
fp16: { type: new Float16() },
fp32: { type: new Float32() },
fp64: { type: new Float64() }
}
}
)
assert.equal(table.getChild('fp16')?.type.children[0].type.toString(), new Float16().toString())
assert.equal(table.getChild('fp32')?.type.children[0].type.toString(), new Float32().toString())
assert.equal(table.getChild('fp64')?.type.children[0].type.toString(), new Float64().toString())
})
it('will use dictionary encoded strings if asked', async function () {
const table = makeArrowTable([{ str: 'hello' }])
assert.isTrue(DataType.isUtf8(table.getChild('str')?.type))
const tableWithDict = makeArrowTable([{ str: 'hello' }], { dictionaryEncodeStrings: true })
assert.isTrue(DataType.isDictionary(tableWithDict.getChild('str')?.type))
const schema = new Schema([
new Field('str', new Dictionary(new Utf8(), new Int32()))
])
const tableWithDict2 = makeArrowTable([{ str: 'hello' }], { schema })
assert.isTrue(DataType.isDictionary(tableWithDict2.getChild('str')?.type))
})
it('will infer data types correctly', async function () {
await checkTableCreation(async (records) => makeArrowTable(records))
})
it('will allow a schema to be provided', async function () {
await checkTableCreation(async (records, _, schema) => makeArrowTable(records, { schema }))
})
it('will use the field order of any provided schema', async function () {
await checkTableCreation(async (_, recordsReversed, schema) => makeArrowTable(recordsReversed, { schema }))
})
it('will make an empty table', async function () {
await checkTableCreation(async (_, __, schema) => makeArrowTable([], { schema }))
})
})
class DummyEmbedding implements EmbeddingFunction<string> {
public readonly sourceColumn = 'string'
public readonly embeddingDimension = 2
public readonly embeddingDataType = new Float16()
async embed (data: string[]): Promise<number[][]> {
return data.map(
() => [0.0, 0.0]
)
}
}
class DummyEmbeddingWithNoDimension implements EmbeddingFunction<string> {
public readonly sourceColumn = 'string'
async embed (data: string[]): Promise<number[][]> {
return data.map(
() => [0.0, 0.0]
)
}
}
describe('convertToTable', function () {
it('will infer data types correctly', async function () {
await checkTableCreation(async (records) => await convertToTable(records))
})
it('will allow a schema to be provided', async function () {
await checkTableCreation(async (records, _, schema) => await convertToTable(records, undefined, { schema }))
})
it('will use the field order of any provided schema', async function () {
await checkTableCreation(async (_, recordsReversed, schema) => await convertToTable(recordsReversed, undefined, { schema }))
})
it('will make an empty table', async function () {
await checkTableCreation(async (_, __, schema) => await convertToTable([], undefined, { schema }))
})
it('will apply embeddings', async function () {
const records = sampleRecords()
const table = await convertToTable(records, new DummyEmbedding())
assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
})
it('will fail if missing the embedding source column', async function () {
return await expect(convertToTable([{ id: 1 }], new DummyEmbedding())).to.be.rejectedWith("'string' was not present")
})
it('use embeddingDimension if embedding missing from table', async function () {
const schema = new Schema([
new Field('string', new Utf8(), false)
])
// Simulate getting an empty Arrow table (minus embedding) from some other source
// In other words, we aren't starting with records
const table = makeEmptyTable(schema)
// If the embedding specifies the dimension we are fine
await fromTableToBuffer(table, new DummyEmbedding())
// We can also supply a schema and should be ok
const schemaWithEmbedding = new Schema([
new Field('string', new Utf8(), false),
new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
])
await fromTableToBuffer(table, new DummyEmbeddingWithNoDimension(), schemaWithEmbedding)
// Otherwise we will get an error
return await expect(fromTableToBuffer(table, new DummyEmbeddingWithNoDimension())).to.be.rejectedWith('does not specify `embeddingDimension`')
})
it('will apply embeddings to an empty table', async function () {
const schema = new Schema([
new Field('string', new Utf8(), false),
new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
])
const table = await convertToTable([], new DummyEmbedding(), { schema })
assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
})
it('will complain if embeddings present but schema missing embedding column', async function () {
const schema = new Schema([
new Field('string', new Utf8(), false)
])
return await expect(convertToTable([], new DummyEmbedding(), { schema })).to.be.rejectedWith('column vector was missing')
})
it('will provide a nice error if run twice', async function () {
const records = sampleRecords()
const table = await convertToTable(records, new DummyEmbedding())
// fromTableToBuffer will try and apply the embeddings again
return await expect(fromTableToBuffer(table, new DummyEmbedding())).to.be.rejectedWith('already existed')
})
})
describe('makeEmptyTable', function () {
it('will make an empty table', async function () {
await checkTableCreation(async (_, __, schema) => makeEmptyTable(schema))
})
})
describe('when using two versions of arrow', function () {
it('can still import data', async function() {
const schema = new OldSchema([
new OldField('id', new OldInt32()),
new OldField('vector', new OldFixedSizeList(1024, new OldField("item", new OldFloat32(), true))),
new OldField('struct', new OldStruct([
new OldField('nested', new OldDictionary(new OldUtf8(), new OldInt32(), 1, true)),
new OldField('ts_with_tz', new OldTimestampNanosecond("some_tz")),
new OldField('ts_no_tz', new OldTimestampNanosecond(null))
]))
]) as any
// We use arrow version 13 to emulate a "foreign arrow" and this version doesn't have metadataVersion
// In theory, this wouldn't matter. We don't rely on that property. However, it causes deepEqual to
// fail so we patch it back in
schema.metadataVersion = MetadataVersion.V5
const table = makeArrowTable(
[],
{ schema }
)
const buf = await fromTableToBuffer(table)
assert.isAbove(buf.byteLength, 0)
const actual = tableFromIPC(buf)
const actualSchema = actual.schema
assert.deepEqual(actualSchema, schema)
})
})

View File

@@ -0,0 +1,55 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { describe } from 'mocha'
import { assert } from 'chai'
import { OpenAIEmbeddingFunction } from '../../embedding/openai'
import { isEmbeddingFunction } from '../../embedding/embedding_function'
// eslint-disable-next-line @typescript-eslint/no-var-requires
const OpenAIApi = require('openai')
// eslint-disable-next-line @typescript-eslint/no-var-requires
const { stub } = require('sinon')
describe('OpenAPIEmbeddings', function () {
const stubValue = {
data: [
{
embedding: Array(1536).fill(1.0)
},
{
embedding: Array(1536).fill(2.0)
}
]
}
describe('#embed', function () {
it('should create vector embeddings', async function () {
const openAIStub = stub(OpenAIApi.Embeddings.prototype, 'create').returns(stubValue)
const f = new OpenAIEmbeddingFunction('text', 'sk-key')
const vectors = await f.embed(['abc', 'def'])
assert.isTrue(openAIStub.calledOnce)
assert.equal(vectors.length, 2)
assert.deepEqual(vectors[0], stubValue.data[0].embedding)
assert.deepEqual(vectors[1], stubValue.data[1].embedding)
})
})
describe('isEmbeddingFunction', function () {
it('should match the isEmbeddingFunction guard', function () {
assert.isTrue(isEmbeddingFunction(new OpenAIEmbeddingFunction('text', 'sk-key')))
})
})
})

76
node/src/test/io.ts Normal file
View File

@@ -0,0 +1,76 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// IO tests
import { describe } from 'mocha'
import { assert } from 'chai'
import * as lancedb from '../index'
import { type ConnectionOptions } from '../index'
describe('LanceDB S3 client', function () {
if (process.env.TEST_S3_BASE_URL != null) {
const baseUri = process.env.TEST_S3_BASE_URL
it('should have a valid url', async function () {
const opts = { uri: `${baseUri}/valid_url` }
const table = await createTestDB(opts, 2, 20)
const con = await lancedb.connect(opts)
assert.equal(con.uri, opts.uri)
const results = await table.search([0.1, 0.3]).limit(5).execute()
assert.equal(results.length, 5)
}).timeout(10_000)
} else {
describe.skip('Skip S3 test', function () {})
}
if (process.env.TEST_S3_BASE_URL != null && process.env.TEST_AWS_ACCESS_KEY_ID != null && process.env.TEST_AWS_SECRET_ACCESS_KEY != null) {
const baseUri = process.env.TEST_S3_BASE_URL
it('use custom credentials', async function () {
const opts: ConnectionOptions = {
uri: `${baseUri}/custom_credentials`,
awsCredentials: {
accessKeyId: process.env.TEST_AWS_ACCESS_KEY_ID as string,
secretKey: process.env.TEST_AWS_SECRET_ACCESS_KEY as string
}
}
const table = await createTestDB(opts, 2, 20)
console.log(table)
const con = await lancedb.connect(opts)
console.log(con)
assert.equal(con.uri, opts.uri)
const results = await table.search([0.1, 0.3]).limit(5).execute()
assert.equal(results.length, 5)
}).timeout(10_000)
} else {
describe.skip('Skip S3 test', function () {})
}
})
async function createTestDB (opts: ConnectionOptions, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
const con = await lancedb.connect(opts)
const data = []
for (let i = 0; i < numRows; i++) {
const vector = []
for (let j = 0; j < numDimensions; j++) {
vector.push(i + (j * 0.1))
}
data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
}
return await con.createTable('vectors_2', data)
}

1279
node/src/test/test.ts Normal file

File diff suppressed because it is too large Load Diff

45
node/src/test/util.ts Normal file
View File

@@ -0,0 +1,45 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { toSQL } from '../util'
import * as chai from 'chai'
const expect = chai.expect
describe('toSQL', function () {
it('should turn string to SQL expression', function () {
expect(toSQL('foo')).to.equal("'foo'")
})
it('should turn number to SQL expression', function () {
expect(toSQL(123)).to.equal('123')
})
it('should turn boolean to SQL expression', function () {
expect(toSQL(true)).to.equal('TRUE')
})
it('should turn null to SQL expression', function () {
expect(toSQL(null)).to.equal('NULL')
})
it('should turn Date to SQL expression', function () {
const date = new Date('05 October 2011 14:48 UTC')
expect(toSQL(date)).to.equal("'2011-10-05T14:48:00.000Z'")
})
it('should turn array to SQL expression', function () {
expect(toSQL(['foo', 'bar', true, 1])).to.equal("['foo', 'bar', TRUE, 1]")
})
})

77
node/src/util.ts Normal file
View File

@@ -0,0 +1,77 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
export type Literal = string | number | boolean | null | Date | Literal[]
export function toSQL (value: Literal): string {
if (typeof value === 'string') {
return `'${value}'`
}
if (typeof value === 'number') {
return value.toString()
}
if (typeof value === 'boolean') {
return value ? 'TRUE' : 'FALSE'
}
if (value === null) {
return 'NULL'
}
if (value instanceof Date) {
return `'${value.toISOString()}'`
}
if (Array.isArray(value)) {
return `[${value.map(toSQL).join(', ')}]`
}
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
throw new Error(`Unsupported value type: ${typeof value} value: (${value})`)
}
export class TTLCache {
private readonly cache: Map<string, { value: any, expires: number }>
/**
* @param ttl Time to live in milliseconds
*/
constructor (private readonly ttl: number) {
this.cache = new Map()
}
get (key: string): any | undefined {
const entry = this.cache.get(key)
if (entry === undefined) {
return undefined
}
if (entry.expires < Date.now()) {
this.cache.delete(key)
return undefined
}
return entry.value
}
set (key: string, value: any): void {
this.cache.set(key, { value, expires: Date.now() + this.ttl })
}
delete (key: string): void {
this.cache.delete(key)
}
}

14
node/tsconfig.json Normal file
View File

@@ -0,0 +1,14 @@
{
"include": [
"src/**/*.ts",
"src/*.ts"
],
"compilerOptions": {
"target": "ES2020",
"module": "commonjs",
"declaration": true,
"outDir": "./dist",
"strict": true,
"sourceMap": true,
}
}

View File

@@ -1,13 +0,0 @@
These are the typescript bindings of LanceDB.
The core Rust library is in the `../rust/lancedb` directory, the rust binding
code is in the `src/` directory and the typescript bindings are in
the `lancedb/` directory.
Whenever you change the Rust code, you will need to recompile: `npm run build`.
Common commands:
* Build: `npm run build`
* Lint: `npm run lint`
* Fix lints: `npm run lint-fix`
* Test: `npm test`
* Run single test file: `npm test __test__/arrow.test.ts`

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.21.2"
version = "0.20.0-beta.2"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -1,16 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import {
Bool,
Field,
Int32,
List,
Schema,
Struct,
Uint8,
Utf8,
} from "apache-arrow";
import { Schema } from "apache-arrow";
import * as arrow15 from "apache-arrow-15";
import * as arrow16 from "apache-arrow-16";
@@ -20,12 +11,10 @@ import * as arrow18 from "apache-arrow-18";
import {
convertToTable,
fromBufferToRecordBatch,
fromDataToBuffer,
fromRecordBatchToBuffer,
fromTableToBuffer,
makeArrowTable,
makeEmptyTable,
tableFromIPC,
} from "../lancedb/arrow";
import {
EmbeddingFunction,
@@ -264,98 +253,6 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(actualSchema).toEqual(schema);
});
it("will detect vector columns when name contains 'vector' or 'embedding'", async function () {
// Test various naming patterns that should be detected as vector columns
const floatVectorTable = makeArrowTable([
{
// Float vectors (use decimal values to ensure they're treated as floats)
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
user_vector: [1.1, 2.2],
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
text_embedding: [3.3, 4.4],
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
doc_embeddings: [5.5, 6.6],
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
my_vector_field: [7.7, 8.8],
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
embedding_model: [9.9, 10.1],
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
VECTOR_COL: [11.1, 12.2], // uppercase
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
Vector_Mixed: [13.3, 14.4], // mixed case
},
]);
// Check that columns with 'vector' or 'embedding' in name are converted to FixedSizeList
const floatVectorColumns = [
"user_vector",
"text_embedding",
"doc_embeddings",
"my_vector_field",
"embedding_model",
"VECTOR_COL",
"Vector_Mixed",
];
for (const columnName of floatVectorColumns) {
expect(
DataType.isFixedSizeList(
floatVectorTable.getChild(columnName)?.type,
),
).toBe(true);
// Check that float vectors use Float32 by default
expect(
floatVectorTable
.getChild(columnName)
?.type.children[0].type.toString(),
).toEqual(new Float32().toString());
}
// Test that regular integer arrays still get treated as float vectors
// (since JavaScript doesn't distinguish integers from floats at runtime)
const integerArrayTable = makeArrowTable([
{
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
vector_int: [1, 2], // Regular array with integers - should be Float32
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
embedding_int: [3, 4], // Regular array with integers - should be Float32
},
]);
const integerArrayColumns = ["vector_int", "embedding_int"];
for (const columnName of integerArrayColumns) {
expect(
DataType.isFixedSizeList(
integerArrayTable.getChild(columnName)?.type,
),
).toBe(true);
// Regular integer arrays should use Float32 (avoiding false positives)
expect(
integerArrayTable
.getChild(columnName)
?.type.children[0].type.toString(),
).toEqual(new Float32().toString());
}
// Test normal list should NOT be converted to FixedSizeList
const normalListTable = makeArrowTable([
{
// biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
normal_list: [15.5, 16.6], // should NOT be detected as vector
},
]);
expect(
DataType.isFixedSizeList(
normalListTable.getChild("normal_list")?.type,
),
).toBe(false);
expect(
DataType.isList(normalListTable.getChild("normal_list")?.type),
).toBe(true);
});
it("will allow different vector column types", async function () {
const table = makeArrowTable([{ fp16: [1], fp32: [1], fp64: [1] }], {
vectorColumns: {
@@ -478,221 +375,8 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(table2.schema).toEqual(schema);
});
it("will handle missing columns in schema alignment when using embeddings", async function () {
const schema = new Schema(
[
new Field("domain", new Utf8(), true),
new Field("name", new Utf8(), true),
new Field("description", new Utf8(), true),
],
new Map([["embedding_functions", JSON.stringify([])]]),
);
const data = [
{ domain: "google.com", name: "Google" },
{ domain: "facebook.com", name: "Facebook" },
];
const table = await convertToTable(data, undefined, { schema });
expect(table.numCols).toBe(3);
expect(table.numRows).toBe(2);
const descriptionColumn = table.getChild("description");
expect(descriptionColumn).toBeDefined();
expect(descriptionColumn?.nullCount).toBe(2);
expect(descriptionColumn?.toArray()).toEqual([null, null]);
expect(table.getChild("domain")?.toArray()).toEqual([
"google.com",
"facebook.com",
]);
expect(table.getChild("name")?.toArray()).toEqual([
"Google",
"Facebook",
]);
});
it("will handle completely missing nested struct columns", async function () {
const schema = new Schema(
[
new Field("id", new Utf8(), true),
new Field("name", new Utf8(), true),
new Field(
"metadata",
new Struct([
new Field("version", new Int32(), true),
new Field("author", new Utf8(), true),
new Field(
"tags",
new List(new Field("item", new Utf8(), true)),
true,
),
]),
true,
),
],
new Map([["embedding_functions", JSON.stringify([])]]),
);
const data = [
{ id: "doc1", name: "Document 1" },
{ id: "doc2", name: "Document 2" },
];
const table = await convertToTable(data, undefined, { schema });
expect(table.numCols).toBe(3);
expect(table.numRows).toBe(2);
const buf = await fromTableToBuffer(table);
const retrievedTable = tableFromIPC(buf);
const rows = [];
for (let i = 0; i < retrievedTable.numRows; i++) {
rows.push(retrievedTable.get(i));
}
expect(rows[0].metadata.version).toBe(null);
expect(rows[0].metadata.author).toBe(null);
expect(rows[0].metadata.tags).toBe(null);
expect(rows[0].id).toBe("doc1");
expect(rows[0].name).toBe("Document 1");
});
it("will handle partially missing nested struct fields", async function () {
const schema = new Schema(
[
new Field("id", new Utf8(), true),
new Field(
"metadata",
new Struct([
new Field("version", new Int32(), true),
new Field("author", new Utf8(), true),
new Field("created_at", new Utf8(), true),
]),
true,
),
],
new Map([["embedding_functions", JSON.stringify([])]]),
);
const data = [
{ id: "doc1", metadata: { version: 1, author: "Alice" } },
{ id: "doc2", metadata: { version: 2 } },
];
const table = await convertToTable(data, undefined, { schema });
expect(table.numCols).toBe(2);
expect(table.numRows).toBe(2);
const metadataColumn = table.getChild("metadata");
expect(metadataColumn).toBeDefined();
expect(metadataColumn?.type.toString()).toBe(
"Struct<{version:Int32, author:Utf8, created_at:Utf8}>",
);
});
it("will handle multiple levels of nested structures", async function () {
const schema = new Schema(
[
new Field("id", new Utf8(), true),
new Field(
"config",
new Struct([
new Field("database", new Utf8(), true),
new Field(
"connection",
new Struct([
new Field("host", new Utf8(), true),
new Field("port", new Int32(), true),
new Field(
"ssl",
new Struct([
new Field("enabled", new Bool(), true),
new Field("cert_path", new Utf8(), true),
]),
true,
),
]),
true,
),
]),
true,
),
],
new Map([["embedding_functions", JSON.stringify([])]]),
);
const data = [
{
id: "config1",
config: {
database: "postgres",
connection: { host: "localhost" },
},
},
{
id: "config2",
config: { database: "mysql" },
},
{
id: "config3",
},
];
const table = await convertToTable(data, undefined, { schema });
expect(table.numCols).toBe(2);
expect(table.numRows).toBe(3);
const configColumn = table.getChild("config");
expect(configColumn).toBeDefined();
expect(configColumn?.type.toString()).toBe(
"Struct<{database:Utf8, connection:Struct<{host:Utf8, port:Int32, ssl:Struct<{enabled:Bool, cert_path:Utf8}>}>}>",
);
});
it("will handle missing columns in Arrow table input when using embeddings", async function () {
const incompleteTable = makeArrowTable([
{ domain: "google.com", name: "Google" },
{ domain: "facebook.com", name: "Facebook" },
]);
const schema = new Schema(
[
new Field("domain", new Utf8(), true),
new Field("name", new Utf8(), true),
new Field("description", new Utf8(), true),
],
new Map([["embedding_functions", JSON.stringify([])]]),
);
const buf = await fromDataToBuffer(incompleteTable, undefined, schema);
expect(buf.byteLength).toBeGreaterThan(0);
const retrievedTable = tableFromIPC(buf);
expect(retrievedTable.numCols).toBe(3);
expect(retrievedTable.numRows).toBe(2);
const descriptionColumn = retrievedTable.getChild("description");
expect(descriptionColumn).toBeDefined();
expect(descriptionColumn?.nullCount).toBe(2);
expect(descriptionColumn?.toArray()).toEqual([null, null]);
expect(retrievedTable.getChild("domain")?.toArray()).toEqual([
"google.com",
"facebook.com",
]);
expect(retrievedTable.getChild("name")?.toArray()).toEqual([
"Google",
"Facebook",
]);
});
it("should correctly retain values in nested struct fields", async function () {
// Define test data with nested struct
const testData = [
{
id: "doc1",
@@ -716,8 +400,10 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
},
];
// Create Arrow table from the data
const table = makeArrowTable(testData);
// Verify schema has the nested struct fields
const metadataField = table.schema.fields.find(
(f) => f.name === "metadata",
);
@@ -731,17 +417,23 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
"text",
]);
// Convert to buffer and back (simulating storage and retrieval)
const buf = await fromTableToBuffer(table);
const retrievedTable = tableFromIPC(buf);
// Verify the retrieved table has the same structure
const rows = [];
for (let i = 0; i < retrievedTable.numRows; i++) {
rows.push(retrievedTable.get(i));
}
// Check values in the first row
const firstRow = rows[0];
expect(firstRow.id).toBe("doc1");
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
// Verify metadata values are preserved (this is where the bug is)
expect(firstRow.metadata).toBeDefined();
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
expect(firstRow.metadata.startLine).toBe(10);
expect(firstRow.metadata.endLine).toBe(20);
@@ -900,14 +592,14 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
).rejects.toThrow("column vector was missing");
});
it("will skip embedding application if already applied", async function () {
it("will provide a nice error if run twice", async function () {
const records = sampleRecords();
const table = await convertToTable(records, dummyEmbeddingConfig);
// fromTableToBuffer will try and apply the embeddings again
// but should skip since the column already has non-null values
const result = await fromTableToBuffer(table, dummyEmbeddingConfig);
expect(result.byteLength).toBeGreaterThan(0);
await expect(
fromTableToBuffer(table, dummyEmbeddingConfig),
).rejects.toThrow("already existed");
});
});

View File

@@ -42,28 +42,6 @@ describe("remote connection", () => {
});
});
it("should accept overall timeout configuration", async () => {
await connect("db://test", {
apiKey: "fake",
clientConfig: {
timeoutConfig: { timeout: 30 },
},
});
// Test with all timeout parameters
await connect("db://test", {
apiKey: "fake",
clientConfig: {
timeoutConfig: {
timeout: 60,
connectTimeout: 10,
readTimeout: 20,
poolIdleTimeout: 300,
},
},
});
});
it("should pass down apiKey and userAgent", async () => {
await withMockDatabase(
(req, res) => {

View File

@@ -1,46 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import * as tmp from "tmp";
import { Session, connect } from "../lancedb";
describe("Session", () => {
let tmpDir: tmp.DirResult;
beforeEach(() => {
tmpDir = tmp.dirSync({ unsafeCleanup: true });
});
afterEach(() => tmpDir.removeCallback());
it("should configure cache sizes and work with database operations", async () => {
// Create session with small cache limits for testing
const indexCacheSize = BigInt(1024 * 1024); // 1MB
const metadataCacheSize = BigInt(512 * 1024); // 512KB
const session = new Session(indexCacheSize, metadataCacheSize);
// Record initial cache state
const initialCacheSize = session.sizeBytes();
const initialCacheItems = session.approxNumItems();
// Test session works with database connection
const db = await connect({ uri: tmpDir.name, session: session });
// Create and use a table to exercise the session
const data = Array.from({ length: 100 }, (_, i) => ({
id: i,
text: `item ${i}`,
}));
const table = await db.createTable("test", data);
const results = await table.query().limit(5).toArray();
expect(results).toHaveLength(5);
// Verify cache usage increased after operations
const finalCacheSize = session.sizeBytes();
const finalCacheItems = session.approxNumItems();
expect(finalCacheSize).toBeGreaterThan(initialCacheSize); // Cache should have grown
expect(finalCacheItems).toBeGreaterThanOrEqual(initialCacheItems); // Items should not decrease
expect(initialCacheSize).toBeLessThan(indexCacheSize + metadataCacheSize); // Within limits
});
});

View File

@@ -33,12 +33,7 @@ import {
register,
} from "../lancedb/embedding";
import { Index } from "../lancedb/indices";
import {
BooleanQuery,
Occur,
Operator,
instanceOfFullTextQuery,
} from "../lancedb/query";
import { instanceOfFullTextQuery } from "../lancedb/query";
import exp = require("constants");
describe.each([arrow15, arrow16, arrow17, arrow18])(
@@ -368,9 +363,9 @@ describe("merge insert", () => {
{ a: 4, b: "z" },
];
const result = (await table.toArrow()).toArray().sort((a, b) => a.a - b.a);
expect(result.map((row) => ({ ...row }))).toEqual(expected);
expect(
JSON.parse(JSON.stringify((await table.toArrow()).toArray())),
).toEqual(expected);
});
test("conditional update", async () => {
const newData = [
@@ -559,32 +554,6 @@ describe("When creating an index", () => {
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
expect(rst.numRows).toBe(1);
// test nprobes
rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
expect(rst.numRows).toBe(2);
rst = await tbl
.query()
.nearestTo(queryVec)
.limit(2)
.minimumNprobes(15)
.toArrow();
expect(rst.numRows).toBe(2);
rst = await tbl
.query()
.nearestTo(queryVec)
.limit(2)
.minimumNprobes(10)
.maximumNprobes(20)
.toArrow();
expect(rst.numRows).toBe(2);
expect(() => tbl.query().nearestTo(queryVec).minimumNprobes(0)).toThrow(
"Invalid input, minimum_nprobes must be greater than 0",
);
expect(() => tbl.query().nearestTo(queryVec).maximumNprobes(5)).toThrow(
"Invalid input, maximum_nprobes must be greater than or equal to minimum_nprobes",
);
await tbl.dropIndex("vec_idx");
const indices2 = await tbl.listIndices();
expect(indices2.length).toBe(0);
@@ -1562,18 +1531,6 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
const results = await table.search("hello").toArray();
expect(results[0].text).toBe(data[0].text);
const results2 = await table
.search(new MatchQuery("hello world", "text"))
.toArray();
expect(results2.length).toBe(2);
const results3 = await table
.search(
new MatchQuery("hello world", "text", { operator: Operator.And }),
)
.toArray();
expect(results3.length).toBe(1);
});
test("full text search without lowercase", async () => {
@@ -1650,114 +1607,6 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(resultSet.has("fob")).toBe(true);
expect(resultSet.has("fo")).toBe(true);
expect(resultSet.has("food")).toBe(true);
const prefixResults = await table
.search(
new MatchQuery("foo", "text", { fuzziness: 3, prefixLength: 3 }),
)
.toArray();
expect(prefixResults.length).toBe(2);
const resultSet2 = new Set(prefixResults.map((r) => r.text));
expect(resultSet2.has("foo")).toBe(true);
expect(resultSet2.has("food")).toBe(true);
});
test("full text search boolean query", async () => {
const db = await connect(tmpDir.name);
const data = [
{ text: "The cat and dog are playing" },
{ text: "The cat is sleeping" },
{ text: "The dog is barking" },
{ text: "The dog chases the cat" },
];
const table = await db.createTable("test", data);
await table.createIndex("text", {
config: Index.fts({ withPosition: false }),
});
const shouldResults = await table
.search(
new BooleanQuery([
[Occur.Should, new MatchQuery("cat", "text")],
[Occur.Should, new MatchQuery("dog", "text")],
]),
)
.toArray();
expect(shouldResults.length).toBe(4);
const mustResults = await table
.search(
new BooleanQuery([
[Occur.Must, new MatchQuery("cat", "text")],
[Occur.Must, new MatchQuery("dog", "text")],
]),
)
.toArray();
expect(mustResults.length).toBe(2);
const mustNotResults = await table
.search(
new BooleanQuery([
[Occur.Must, new MatchQuery("cat", "text")],
[Occur.MustNot, new MatchQuery("dog", "text")],
]),
)
.toArray();
expect(mustNotResults.length).toBe(1);
});
test("full text search ngram", async () => {
const db = await connect(tmpDir.name);
const data = [
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
];
const table = await db.createTable("test", data);
await table.createIndex("text", {
config: Index.fts({ baseTokenizer: "ngram" }),
});
const results = await table.search("lan").toArray();
expect(results.length).toBe(2);
const resultSet = new Set(results.map((r) => r.text));
expect(resultSet.has("lance database")).toBe(true);
expect(resultSet.has("lance is cool")).toBe(true);
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results2.length).toBe(2);
const resultSet2 = new Set(results2.map((r) => r.text));
expect(resultSet2.has("lance database")).toBe(true);
expect(resultSet2.has("lance is cool")).toBe(true);
// the default min_ngram_length is 3, so "la" should not match
const results3 = await table.search("la").toArray();
expect(results3.length).toBe(0);
// test setting min_ngram_length and prefix_only
await table.createIndex("text", {
config: Index.fts({
baseTokenizer: "ngram",
ngramMinLength: 2,
prefixOnly: true,
}),
replace: true,
});
const results4 = await table.search("lan").toArray();
expect(results4.length).toBe(2);
const resultSet4 = new Set(results4.map((r) => r.text));
expect(resultSet4.has("lance database")).toBe(true);
expect(resultSet4.has("lance is cool")).toBe(true);
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results5.length).toBe(0);
const results6 = await table.search("la").toArray();
expect(results6.length).toBe(2);
const resultSet6 = new Set(results6.map((r) => r.text));
expect(resultSet6.has("lance database")).toBe(true);
expect(resultSet6.has("lance is cool")).toBe(true);
});
test.each([
@@ -1863,43 +1712,4 @@ describe("column name options", () => {
expect(results[0].query_index).toBe(0);
expect(results[1].query_index).toBe(1);
});
test("index and search multivectors", async () => {
const db = await connect(tmpDir.name);
const data = [];
// generate 512 random multivectors
for (let i = 0; i < 256; i++) {
data.push({
multivector: Array.from({ length: 10 }, () =>
Array(2).fill(Math.random()),
),
});
}
const table = await db.createTable("multivectors", data, {
schema: new Schema([
new Field(
"multivector",
new List(
new Field(
"item",
new FixedSizeList(2, new Field("item", new Float32())),
),
),
),
]),
});
const results = await table.search(data[0].multivector).limit(10).toArray();
expect(results.length).toBe(10);
await table.createIndex("multivector", {
config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
});
const results2 = await table
.search(data[0].multivector)
.limit(10)
.toArray();
expect(results2.length).toBe(10);
});
});

View File

@@ -30,7 +30,7 @@
"x64",
"arm64"
],
"license": "Apache-2.0",
"license": "Apache 2.0",
"os": [
"darwin",
"linux",

View File

@@ -34,7 +34,6 @@ import {
Struct,
Timestamp,
Type,
Uint8,
Utf8,
Vector,
makeVector as arrowMakeVector,
@@ -52,15 +51,6 @@ import {
sanitizeTable,
sanitizeType,
} from "./sanitize";
/**
* Check if a field name indicates a vector column.
*/
function nameSuggestsVectorColumn(fieldName: string): boolean {
const nameLower = fieldName.toLowerCase();
return nameLower.includes("vector") || nameLower.includes("embedding");
}
export * from "apache-arrow";
export type SchemaLike =
| Schema
@@ -117,20 +107,6 @@ export type IntoVector =
| number[]
| Promise<Float32Array | Float64Array | number[]>;
export type MultiVector = IntoVector[];
export function isMultiVector(value: unknown): value is MultiVector {
return Array.isArray(value) && isIntoVector(value[0]);
}
export function isIntoVector(value: unknown): value is IntoVector {
return (
value instanceof Float32Array ||
value instanceof Float64Array ||
(Array.isArray(value) && !Array.isArray(value[0]))
);
}
export function isArrowTable(value: object): value is TableLike {
if (value instanceof ArrowTable) return true;
return "schema" in value && "batches" in value;
@@ -441,9 +417,7 @@ function inferSchema(
} else {
const inferredType = inferType(value, path, opts);
if (inferredType === undefined) {
throw new Error(`Failed to infer data type for field ${path.join(
".",
)} at row ${rowI}. \
throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
Consider providing an explicit schema.`);
}
pathTree.set(path, inferredType);
@@ -601,17 +575,10 @@ function inferType(
return undefined;
}
// Try to automatically detect embedding columns.
if (nameSuggestsVectorColumn(path[path.length - 1])) {
// Check if value is a Uint8Array for integer vector type determination
if (value instanceof Uint8Array) {
// For integer vectors, we default to Uint8 (matching Python implementation)
const child = new Field("item", new Uint8(), true);
return new FixedSizeList(value.length, child);
} else {
// For float vectors, we default to Float32
const child = new Field("item", new Float32(), true);
return new FixedSizeList(value.length, child);
}
if (valueType instanceof Float && path[path.length - 1] === "vector") {
// We default to Float32 for vectors.
const child = new Field("item", new Float32(), true);
return new FixedSizeList(value.length, child);
} else {
const child = new Field("item", valueType, true);
return new List(child);
@@ -832,17 +799,11 @@ async function applyEmbeddingsFromMetadata(
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
);
}
// Check if destination column exists and handle accordingly
if (columns[destColumn] !== undefined) {
const existingColumn = columns[destColumn];
// If the column exists but is all null, we can fill it with embeddings
if (existingColumn.nullCount !== existingColumn.length) {
// Column has non-null values, skip embedding application
continue;
}
throw new Error(
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
);
}
if (table.batches.length > 1) {
throw new Error(
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
@@ -870,15 +831,6 @@ async function applyEmbeddingsFromMetadata(
const vector = makeVector(vectors, destType);
columns[destColumn] = vector;
}
// Add any missing columns from the schema as null vectors
for (const field of schema.fields) {
if (!(field.name in columns)) {
const nullValues = new Array(table.numRows).fill(null);
columns[field.name] = makeVector(nullValues, field.type);
}
}
const newTable = new ArrowTable(columns);
return alignTable(newTable, schema);
}
@@ -951,23 +903,11 @@ async function applyEmbeddings<T>(
);
}
} else {
// Check if destination column exists and handle accordingly
if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
const existingColumn = newColumns[destColumn];
// If the column exists but is all null, we can fill it with embeddings
if (existingColumn.nullCount !== existingColumn.length) {
// Column has non-null values, skip embedding application and return table as-is
let newTable = new ArrowTable(newColumns);
if (schema != null) {
newTable = alignTable(newTable, schema as Schema);
}
return new ArrowTable(
new Schema(newTable.schema.fields, schemaMetadata),
newTable.batches,
);
}
throw new Error(
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
);
}
if (table.batches.length > 1) {
throw new Error(
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
@@ -1027,21 +967,7 @@ export async function convertToTable(
embeddings?: EmbeddingFunctionConfig,
makeTableOptions?: Partial<MakeArrowTableOptions>,
): Promise<ArrowTable> {
let processedData = data;
// If we have a schema with embedding metadata, we need to preprocess the data
// to ensure all nested fields are present
if (
makeTableOptions?.schema &&
makeTableOptions.schema.metadata?.has("embedding_functions")
) {
processedData = ensureNestedFieldsExist(
data,
makeTableOptions.schema as Schema,
);
}
const table = makeArrowTable(processedData, makeTableOptions);
const table = makeArrowTable(data, makeTableOptions);
return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
}
@@ -1134,16 +1060,7 @@ export async function fromDataToBuffer(
schema = sanitizeSchema(schema);
}
if (isArrowTable(data)) {
const table = sanitizeTable(data);
// If we have a schema with embedding functions, we need to ensure all columns exist
// before applying embeddings, since applyEmbeddingsFromMetadata expects all columns
// to be present in the table
if (schema && schema.metadata?.has("embedding_functions")) {
const alignedTable = alignTableToSchema(table, schema);
return fromTableToBuffer(alignedTable, embeddings, schema);
} else {
return fromTableToBuffer(table, embeddings, schema);
}
return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
} else {
const table = await convertToTable(data, embeddings, { schema });
return fromTableToBuffer(table);
@@ -1212,7 +1129,7 @@ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
type: new Struct(schema.fields),
length: batch.numRows,
nullCount: batch.nullCount,
children: alignedChildren as unknown as ArrowData<DataType>[],
children: alignedChildren,
});
return new RecordBatch(schema, newData);
}
@@ -1284,79 +1201,6 @@ function validateSchemaEmbeddings(
return new Schema(fields, schema.metadata);
}
/**
* Ensures that all nested fields defined in the schema exist in the data,
* filling missing fields with null values.
*/
export function ensureNestedFieldsExist(
data: Array<Record<string, unknown>>,
schema: Schema,
): Array<Record<string, unknown>> {
return data.map((row) => {
const completeRow: Record<string, unknown> = {};
for (const field of schema.fields) {
if (field.name in row) {
if (
field.type.constructor.name === "Struct" &&
row[field.name] !== null &&
row[field.name] !== undefined
) {
// Handle nested struct
const nestedValue = row[field.name] as Record<string, unknown>;
completeRow[field.name] = ensureStructFieldsExist(
nestedValue,
field.type,
);
} else {
// Non-struct field or null struct value
completeRow[field.name] = row[field.name];
}
} else {
// Field is missing from the data - set to null
completeRow[field.name] = null;
}
}
return completeRow;
});
}
/**
* Recursively ensures that all fields in a struct type exist in the data,
* filling missing fields with null values.
*/
function ensureStructFieldsExist(
data: Record<string, unknown>,
structType: Struct,
): Record<string, unknown> {
const completeStruct: Record<string, unknown> = {};
for (const childField of structType.children) {
if (childField.name in data) {
if (
childField.type.constructor.name === "Struct" &&
data[childField.name] !== null &&
data[childField.name] !== undefined
) {
// Recursively handle nested struct
completeStruct[childField.name] = ensureStructFieldsExist(
data[childField.name] as Record<string, unknown>,
childField.type,
);
} else {
// Non-struct field or null struct value
completeStruct[childField.name] = data[childField.name];
}
} else {
// Field is missing - set to null
completeStruct[childField.name] = null;
}
}
return completeStruct;
}
interface JsonDataType {
type: string;
fields?: JsonField[];
@@ -1490,64 +1334,3 @@ function fieldToJson(field: Field): JsonField {
metadata: field.metadata,
};
}
function alignTableToSchema(
table: ArrowTable,
targetSchema: Schema,
): ArrowTable {
const existingColumns = new Map<string, Vector>();
// Map existing columns
for (const field of table.schema.fields) {
existingColumns.set(field.name, table.getChild(field.name)!);
}
// Create vectors for all fields in target schema
const alignedColumns: Record<string, Vector> = {};
for (const field of targetSchema.fields) {
if (existingColumns.has(field.name)) {
// Column exists, use it
alignedColumns[field.name] = existingColumns.get(field.name)!;
} else {
// Column missing, create null vector
alignedColumns[field.name] = createNullVector(field, table.numRows);
}
}
// Create new table with aligned schema and columns
return new ArrowTable(targetSchema, alignedColumns);
}
function createNullVector(field: Field, numRows: number): Vector {
if (field.type.constructor.name === "Struct") {
// For struct types, create a struct with null fields
const structType = field.type as Struct;
const childVectors = structType.children.map((childField) =>
createNullVector(childField, numRows),
);
// Create struct data
const structData = makeData({
type: structType,
length: numRows,
nullCount: 0,
children: childVectors.map((v) => v.data[0]),
});
return arrowMakeVector(structData);
} else {
// For other types, create a vector of nulls
const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));
// All bits are 0, meaning all values are null
const data = makeData({
type: field.type,
length: numRows,
nullCount: numRows,
nullBitmap,
});
return arrowMakeVector(data);
}
}

View File

@@ -85,9 +85,6 @@ export interface OpenTableOptions {
/**
* Set the size of the index cache, specified as a number of entries
*
* @deprecated Use session-level cache configuration instead.
* Create a Session with custom cache sizes and pass it to the connect() function.
*
* The exact meaning of an "entry" will depend on the type of index:
* - IVF: there is one entry for each IVF partition
* - BTREE: there is one entry for the entire index

View File

@@ -10,7 +10,6 @@ import {
import {
ConnectionOptions,
Connection as LanceDbConnection,
Session,
} from "./native.js";
export {
@@ -52,8 +51,6 @@ export {
OpenTableOptions,
} from "./connection";
export { Session } from "./native.js";
export {
ExecutableQuery,
Query,
@@ -67,10 +64,7 @@ export {
PhraseQuery,
BoostQuery,
MultiMatchQuery,
BooleanQuery,
FullTextQueryType,
Operator,
Occur,
} from "./query";
export {
@@ -103,7 +97,6 @@ export {
RecordBatchLike,
DataLike,
IntoVector,
MultiVector,
} from "./arrow";
export { IntoSql, packBits } from "./util";
@@ -134,7 +127,6 @@ export { IntoSql, packBits } from "./util";
export async function connect(
uri: string,
options?: Partial<ConnectionOptions>,
session?: Session,
): Promise<Connection>;
/**
* Connect to a LanceDB instance at the given URI.
@@ -153,43 +145,31 @@ export async function connect(
* storageOptions: {timeout: "60s"}
* });
* ```
*
* @example
* ```ts
* const session = Session.default();
* const conn = await connect({
* uri: "/path/to/database",
* session: session
* });
* ```
*/
export async function connect(
options: Partial<ConnectionOptions> & { uri: string },
): Promise<Connection>;
export async function connect(
uriOrOptions: string | (Partial<ConnectionOptions> & { uri: string }),
options?: Partial<ConnectionOptions>,
options: Partial<ConnectionOptions> = {},
): Promise<Connection> {
let uri: string | undefined;
let finalOptions: Partial<ConnectionOptions> = {};
if (typeof uriOrOptions !== "string") {
const { uri: uri_, ...opts } = uriOrOptions;
uri = uri_;
finalOptions = opts;
options = opts;
} else {
uri = uriOrOptions;
finalOptions = options || {};
}
if (!uri) {
throw new Error("uri is required");
}
finalOptions = (finalOptions as ConnectionOptions) ?? {};
(<ConnectionOptions>finalOptions).storageOptions = cleanseStorageOptions(
(<ConnectionOptions>finalOptions).storageOptions,
options = (options as ConnectionOptions) ?? {};
(<ConnectionOptions>options).storageOptions = cleanseStorageOptions(
(<ConnectionOptions>options).storageOptions,
);
const nativeConn = await LanceDbConnection.new(uri, finalOptions);
const nativeConn = await LanceDbConnection.new(uri, options);
return new LocalConnection(nativeConn);
}

View File

@@ -439,7 +439,7 @@ export interface FtsOptions {
*
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
*/
baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
baseTokenizer?: "simple" | "whitespace" | "raw";
/**
* language for stemming and stop words
@@ -472,21 +472,6 @@ export interface FtsOptions {
* whether to remove punctuation
*/
asciiFolding?: boolean;
/**
* ngram min length
*/
ngramMinLength?: number;
/**
* ngram max length
*/
ngramMaxLength?: number;
/**
* whether to only index the prefix of the token for ngram tokenizer
*/
prefixOnly?: boolean;
}
export class Index {
@@ -623,9 +608,6 @@ export class Index {
options?.stem,
options?.removeStopWords,
options?.asciiFolding,
options?.ngramMinLength,
options?.ngramMaxLength,
options?.prefixOnly,
),
);
}

View File

@@ -448,10 +448,6 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
* For best results we recommend tuning this parameter with a benchmark against
* your actual data to find the smallest possible value that will still give
* you the desired recall.
*
* For more fine grained control over behavior when you have a very narrow filter
* you can use `minimumNprobes` and `maximumNprobes`. This method sets both
* the minimum and maximum to the same value.
*/
nprobes(nprobes: number): VectorQuery {
super.doCall((inner) => inner.nprobes(nprobes));
@@ -459,33 +455,6 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
return this;
}
/**
* Set the minimum number of probes used.
*
* This controls the minimum number of partitions that will be searched. This
* parameter will impact every query against a vector index, regardless of the
* filter. See `nprobes` for more details. Higher values will increase recall
* but will also increase latency.
*/
minimumNprobes(minimumNprobes: number): VectorQuery {
super.doCall((inner) => inner.minimumNprobes(minimumNprobes));
return this;
}
/**
* Set the maximum number of probes used.
*
* This controls the maximum number of partitions that will be searched. If this
* number is greater than minimumNprobes then the excess partitions will _only_ be
* searched if we have not found enough results. This can be useful when there is
* a narrow filter to allow these queries to spend more time searching and avoid
* potential false negatives.
*/
maximumNprobes(maximumNprobes: number): VectorQuery {
super.doCall((inner) => inner.maximumNprobes(maximumNprobes));
return this;
}
/*
* Set the distance range to use
*
@@ -793,31 +762,6 @@ export enum FullTextQueryType {
MatchPhrase = "match_phrase",
Boost = "boost",
MultiMatch = "multi_match",
Boolean = "boolean",
}
/**
* Enum representing the logical operators used in full-text queries.
*
* - `And`: All terms must match.
* - `Or`: At least one term must match.
*/
export enum Operator {
And = "AND",
Or = "OR",
}
/**
* Enum representing the occurrence of terms in full-text queries.
*
* - `Must`: The term must be present in the document.
* - `Should`: The term should contribute to the document score, but is not required.
* - `MustNot`: The term must not be present in the document.
*/
export enum Occur {
Should = "SHOULD",
Must = "MUST",
MustNot = "MUST_NOT",
}
/**
@@ -847,7 +791,6 @@ export function instanceOfFullTextQuery(obj: any): obj is FullTextQuery {
export class MatchQuery implements FullTextQuery {
/** @ignore */
public readonly inner: JsFullTextQuery;
/**
* Creates an instance of MatchQuery.
*
@@ -857,8 +800,6 @@ export class MatchQuery implements FullTextQuery {
* - `boost`: The boost factor for the query (default is 1.0).
* - `fuzziness`: The fuzziness level for the query (default is 0).
* - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
* - `operator`: The logical operator to use for combining terms in the query (default is "OR").
* - `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
*/
constructor(
query: string,
@@ -867,8 +808,6 @@ export class MatchQuery implements FullTextQuery {
boost?: number;
fuzziness?: number;
maxExpansions?: number;
operator?: Operator;
prefixLength?: number;
},
) {
let fuzziness = options?.fuzziness;
@@ -881,8 +820,6 @@ export class MatchQuery implements FullTextQuery {
options?.boost ?? 1.0,
fuzziness,
options?.maxExpansions ?? 50,
options?.operator ?? Operator.Or,
options?.prefixLength ?? 0,
);
}
@@ -899,11 +836,9 @@ export class PhraseQuery implements FullTextQuery {
*
* @param query - The phrase to search for in the specified column.
* @param column - The name of the column to search within.
* @param options - Optional parameters for the phrase query.
* - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
*/
constructor(query: string, column: string, options?: { slop?: number }) {
this.inner = JsFullTextQuery.phraseQuery(query, column, options?.slop ?? 0);
constructor(query: string, column: string) {
this.inner = JsFullTextQuery.phraseQuery(query, column);
}
queryType(): FullTextQueryType {
@@ -954,21 +889,18 @@ export class MultiMatchQuery implements FullTextQuery {
* @param columns - An array of column names to search within.
* @param options - Optional parameters for the multi-match query.
* - `boosts`: An array of boost factors for each column (default is 1.0 for all).
* - `operator`: The logical operator to use for combining terms in the query (default is "OR").
*/
constructor(
query: string,
columns: string[],
options?: {
boosts?: number[];
operator?: Operator;
},
) {
this.inner = JsFullTextQuery.multiMatchQuery(
query,
columns,
options?.boosts,
options?.operator ?? Operator.Or,
);
}
@@ -976,23 +908,3 @@ export class MultiMatchQuery implements FullTextQuery {
return FullTextQueryType.MultiMatch;
}
}
export class BooleanQuery implements FullTextQuery {
/** @ignore */
public readonly inner: JsFullTextQuery;
/**
* Creates an instance of BooleanQuery.
*
* @param queries - An array of (Occur, FullTextQuery objects) to combine.
* Occur specifies whether the query must match, or should match.
*/
constructor(queries: [Occur, FullTextQuery][]) {
this.inner = JsFullTextQuery.booleanQuery(
queries.map(([occur, query]) => [occur, query.inner]),
);
}
queryType(): FullTextQueryType {
return FullTextQueryType.Boolean;
}
}

View File

@@ -6,11 +6,9 @@ import {
Data,
DataType,
IntoVector,
MultiVector,
Schema,
dataTypeToJson,
fromDataToBuffer,
isMultiVector,
tableFromIPC,
} from "./arrow";
@@ -77,10 +75,10 @@ export interface OptimizeOptions {
* // Delete all versions older than 1 day
* const olderThan = new Date();
* olderThan.setDate(olderThan.getDate() - 1));
* tbl.optimize({cleanupOlderThan: olderThan});
* tbl.cleanupOlderVersions(olderThan);
*
* // Delete all versions except the current version
* tbl.optimize({cleanupOlderThan: new Date()});
* tbl.cleanupOlderVersions(new Date());
*/
cleanupOlderThan: Date;
deleteUnverified: boolean;
@@ -348,7 +346,7 @@ export abstract class Table {
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
*/
abstract search(
query: string | IntoVector | MultiVector | FullTextQuery,
query: string | IntoVector | FullTextQuery,
queryType?: string,
ftsColumns?: string | string[],
): VectorQuery | Query;
@@ -359,7 +357,7 @@ export abstract class Table {
* is the same thing as calling `nearestTo` on the builder returned
* by `query`. @see {@link Query#nearestTo} for more details.
*/
abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
abstract vectorSearch(vector: IntoVector): VectorQuery;
/**
* Add new columns with defined values.
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
@@ -670,7 +668,7 @@ export class LocalTable extends Table {
}
search(
query: string | IntoVector | MultiVector | FullTextQuery,
query: string | IntoVector | FullTextQuery,
queryType: string = "auto",
ftsColumns?: string | string[],
): VectorQuery | Query {
@@ -717,15 +715,7 @@ export class LocalTable extends Table {
return this.query().nearestTo(queryPromise);
}
vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
if (isMultiVector(vector)) {
const query = this.query().nearestTo(vector[0]);
for (const v of vector.slice(1)) {
query.addQueryVector(v);
}
return query;
}
vectorSearch(vector: IntoVector): VectorQuery {
return this.query().nearestTo(vector);
}

View File

@@ -1,11 +1,11 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.21.2",
"version": "0.20.0-beta.2",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",
"files": ["lancedb.darwin-arm64.node"],
"license": "Apache-2.0",
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
}

Some files were not shown because too many files have changed in this diff Show More