mirror of https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00

Compare commits: python-v0. ... python-v0. (25 commits)
| SHA1 |
|---|
| bb809abd4b |
| c87530f7a3 |
| 1eb1beecd6 |
| ce550e6c45 |
| d3bae1f3a3 |
| dcf53c4506 |
| 941eada703 |
| ed640a76d9 |
| 296205ef96 |
| 16beaaa656 |
| 4ff87b1f4a |
| 0532ef2358 |
| dcf7334c1f |
| 8ffe992a6f |
| 9d683e4f0b |
| 0a1ea1858d |
| 7d0127b376 |
| 02595dc475 |
| f23327af79 |
| c7afa724dd |
| c359cec504 |
| fe76496a59 |
| 67ec1fe75c |
| 70d9b04ba5 |
| b0d4a79c35 |
```diff
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.21.2-beta.1"
+current_version = "0.21.2"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
@@ -50,11 +50,6 @@ pre_commit_hooks = [
 optional_value = "final"
 values = ["beta", "final"]

-[[tool.bumpversion.files]]
-filename = "node/package.json"
-replace = "\"version\": \"{new_version}\","
-search = "\"version\": \"{current_version}\","
-
 [[tool.bumpversion.files]]
 filename = "nodejs/package.json"
 replace = "\"version\": \"{new_version}\","
@@ -66,39 +61,8 @@ glob = "nodejs/npm/*/package.json"
 replace = "\"version\": \"{new_version}\","
 search = "\"version\": \"{current_version}\","

-# vectodb node binary packages
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-darwin-arm64\": \"{new_version}\""
-search = "\"@lancedb/vectordb-darwin-arm64\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-darwin-x64\": \"{new_version}\""
-search = "\"@lancedb/vectordb-darwin-x64\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-linux-arm64-gnu\": \"{new_version}\""
-search = "\"@lancedb/vectordb-linux-arm64-gnu\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
-search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
-search = "\"@lancedb/vectordb-win32-x64-msvc\": \"{current_version}\""
-
 # Cargo files
 # ------------
-[[tool.bumpversion.files]]
-filename = "rust/ffi/node/Cargo.toml"
-replace = "\nversion = \"{new_version}\""
-search = "\nversion = \"{current_version}\""
-
 [[tool.bumpversion.files]]
 filename = "rust/lancedb/Cargo.toml"
 replace = "\nversion = \"{new_version}\""
```
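The `parse` value above is a Python verbose-mode regex. A quick sketch of how it tokenizes the two versions in this diff; only the major/minor lines appear in the hunk, so the patch and pre-release parts below are assumed for illustration:

```python
import re

# Sketch of the bumpversion "parse" pattern. The patch and optional
# pre-release groups are assumptions; the diff shows only major/minor.
PARSE = re.compile(r"""(?x)
    (?P<major>0|[1-9]\d*)\.
    (?P<minor>0|[1-9]\d*)\.
    (?P<patch>0|[1-9]\d*)
    (?:-(?P<pre_l>beta)\.(?P<pre_n>\d+))?   # e.g. "-beta.1"
""")

for version in ("0.21.2-beta.1", "0.21.2"):
    print(version, PARSE.match(version).groupdict())
# 0.21.2-beta.1 {'major': '0', 'minor': '21', 'patch': '2', 'pre_l': 'beta', 'pre_n': '1'}
# 0.21.2       {'major': '0', 'minor': '21', 'patch': '2', 'pre_l': None, 'pre_n': None}
```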
.github/workflows/docs.yml (vendored): 11 changes

```diff
@@ -56,22 +56,11 @@ jobs:
         with:
           node-version: 20
           cache: 'npm'
-          cache-dependency-path: node/package-lock.json
       - name: Install node dependencies
         working-directory: node
         run: |
           sudo apt update
           sudo apt install -y protobuf-compiler libssl-dev
-      - name: Build node
-        working-directory: node
-        run: |
-          npm ci
-          npm run build
-          npm run tsc
-      - name: Create markdown files
-        working-directory: node
-        run: |
-          npx typedoc --plugin typedoc-plugin-markdown --out ../docs/src/javascript src/index.ts
       - name: Build docs
         working-directory: docs
         run: |
```
.github/workflows/docs_test.yml (vendored): 48 changes

```diff
@@ -58,51 +58,3 @@ jobs:
         run: |
           cd docs/test/python
           for d in *; do cd "$d"; echo "$d".py; python "$d".py; cd ..; done
-  test-node:
-    name: Test doc nodejs code
-    runs-on: ubuntu-24.04
-    timeout-minutes: 60
-    strategy:
-      fail-fast: false
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - name: Print CPU capabilities
-        run: cat /proc/cpuinfo
-      - name: Set up Node
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-      - name: Install protobuf
-        run: |
-          sudo apt update
-          sudo apt install -y protobuf-compiler
-      - name: Install dependecies needed for ubuntu
-        run: |
-          sudo apt install -y libssl-dev
-          rustup update && rustup default
-      - name: Rust cache
-        uses: swatinem/rust-cache@v2
-      - name: Install node dependencies
-        run: |
-          sudo swapoff -a
-          sudo fallocate -l 8G /swapfile
-          sudo chmod 600 /swapfile
-          sudo mkswap /swapfile
-          sudo swapon /swapfile
-          sudo swapon --show
-          cd node
-          npm ci
-          npm run build-release
-          cd ../docs
-          npm install
-      - name: Test
-        env:
-          LANCEDB_URI: ${{ secrets.LANCEDB_URI }}
-          LANCEDB_DEV_API_KEY: ${{ secrets.LANCEDB_DEV_API_KEY }}
-        run: |
-          cd docs
-          npm t
```
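The surviving `test-python` step visits one directory per example and runs the matching script. A rough Python equivalent of that bash loop, with the directory layout assumed from the step above:

```python
import subprocess
from pathlib import Path

# Mirrors the workflow's bash loop: for each directory under
# docs/test/python, run the <dir>.py script it contains.
root = Path("docs/test/python")
for d in sorted(p for p in root.iterdir() if p.is_dir()):
    script = d / f"{d.name}.py"
    print(script.name)
    subprocess.run(["python", script.name], cwd=d, check=True)
```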
.github/workflows/node.yml (vendored): 147 lines deleted

```diff
@@ -1,147 +0,0 @@
-name: Node
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    paths:
-      - node/**
-      - rust/ffi/node/**
-      - .github/workflows/node.yml
-      - docker-compose.yml
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
-
-env:
-  # Disable full debug symbol generation to speed up CI build and keep memory down
-  # "1" means line tables only, which is useful for panic tracebacks.
-  #
-  # Use native CPU to accelerate tests if possible, especially for f16
-  # target-cpu=haswell fixes failing ci build
-  RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma"
-  RUST_BACKTRACE: "1"
-
-jobs:
-  linux:
-    name: Linux (Node ${{ matrix.node-version }})
-    timeout-minutes: 30
-    strategy:
-      matrix:
-        node-version: [ "18", "20" ]
-    runs-on: "ubuntu-22.04"
-    defaults:
-      run:
-        shell: bash
-        working-directory: node
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - uses: actions/setup-node@v3
-        with:
-          node-version: ${{ matrix.node-version }}
-          cache: 'npm'
-          cache-dependency-path: node/package-lock.json
-      - uses: Swatinem/rust-cache@v2
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y protobuf-compiler libssl-dev
-      - name: Build
-        run: |
-          npm ci
-          npm run build
-          npm run pack-build
-          npm install --no-save ./dist/lancedb-vectordb-*.tgz
-          # Remove index.node to test with dependency installed
-          rm index.node
-      - name: Test
-        run: npm run test
-  macos:
-    timeout-minutes: 30
-    runs-on: "macos-13"
-    defaults:
-      run:
-        shell: bash
-        working-directory: node
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - uses: actions/setup-node@v3
-        with:
-          node-version: 20
-          cache: 'npm'
-          cache-dependency-path: node/package-lock.json
-      - uses: Swatinem/rust-cache@v2
-      - name: Install dependencies
-        run: brew install protobuf
-      - name: Build
-        run: |
-          npm ci
-          npm run build
-          npm run pack-build
-          npm install --no-save ./dist/lancedb-vectordb-*.tgz
-          # Remove index.node to test with dependency installed
-          rm index.node
-      - name: Test
-        run: |
-          npm run test
-  aws-integtest:
-    timeout-minutes: 45
-    runs-on: "ubuntu-22.04"
-    defaults:
-      run:
-        shell: bash
-        working-directory: node
-    env:
-      AWS_ACCESS_KEY_ID: ACCESSKEY
-      AWS_SECRET_ACCESS_KEY: SECRETKEY
-      AWS_DEFAULT_REGION: us-west-2
-      # this one is for s3
-      AWS_ENDPOINT: http://localhost:4566
-      # this one is for dynamodb
-      DYNAMODB_ENDPOINT: http://localhost:4566
-      ALLOW_HTTP: true
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - uses: actions/setup-node@v3
-        with:
-          node-version: 20
-          cache: 'npm'
-          cache-dependency-path: node/package-lock.json
-      - name: start local stack
-        run: docker compose -f ../docker-compose.yml up -d --wait
-      - name: create s3
-        run: aws s3 mb s3://lancedb-integtest --endpoint $AWS_ENDPOINT
-      - name: create ddb
-        run: |
-          aws dynamodb create-table \
-            --table-name lancedb-integtest \
-            --attribute-definitions '[{"AttributeName": "base_uri", "AttributeType": "S"}, {"AttributeName": "version", "AttributeType": "N"}]' \
-            --key-schema '[{"AttributeName": "base_uri", "KeyType": "HASH"}, {"AttributeName": "version", "KeyType": "RANGE"}]' \
-            --provisioned-throughput '{"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}' \
-            --endpoint-url $DYNAMODB_ENDPOINT
-      - uses: Swatinem/rust-cache@v2
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y protobuf-compiler libssl-dev
-      - name: Build
-        run: |
-          npm ci
-          npm run build
-          npm run pack-build
-          npm install --no-save ./dist/lancedb-vectordb-*.tgz
-          # Remove index.node to test with dependency installed
-          rm index.node
-      - name: Test
-        run: npm run integration-test
```
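The deleted `aws-integtest` job provisioned a DynamoDB table keyed by `base_uri` (hash) and `version` (range) as a commit store against LocalStack. A sketch of the same provisioning with boto3 rather than the AWS CLI; the endpoint and dummy credentials are taken from the job's env above:

```python
import boto3

# Same LocalStack table the deleted workflow created with the AWS CLI.
ddb = boto3.client(
    "dynamodb",
    endpoint_url="http://localhost:4566",
    region_name="us-west-2",
    aws_access_key_id="ACCESSKEY",
    aws_secret_access_key="SECRETKEY",
)
ddb.create_table(
    TableName="lancedb-integtest",
    AttributeDefinitions=[
        {"AttributeName": "base_uri", "AttributeType": "S"},
        {"AttributeName": "version", "AttributeType": "N"},
    ],
    KeySchema=[
        {"AttributeName": "base_uri", "KeyType": "HASH"},  # partition key
        {"AttributeName": "version", "KeyType": "RANGE"},  # sort key
    ],
    ProvisionedThroughput={"ReadCapacityUnits": 10, "WriteCapacityUnits": 10},
)
```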
.github/workflows/nodejs.yml (vendored): 4 changes

```diff
@@ -79,7 +79,7 @@ jobs:
         with:
           node-version: ${{ matrix.node-version }}
           cache: 'npm'
-          cache-dependency-path: node/package-lock.json
+          cache-dependency-path: nodejs/package-lock.json
      - uses: Swatinem/rust-cache@v2
      - name: Install dependencies
        run: |
@@ -137,7 +137,7 @@ jobs:
         with:
           node-version: 20
           cache: 'npm'
-          cache-dependency-path: node/package-lock.json
+          cache-dependency-path: nodejs/package-lock.json
      - uses: Swatinem/rust-cache@v2
      - name: Install dependencies
        run: |
```
.github/workflows/npm-publish.yml (vendored): 197 changes

```diff
@@ -365,200 +365,3 @@ jobs:
             ARGS="$ARGS --tag preview"
           fi
           npm publish $ARGS
-
-  # ----------------------------------------------------------------------------
-  # vectordb release (legacy)
-  # ----------------------------------------------------------------------------
-  # TODO: delete this when we drop vectordb
-  node:
-    name: vectordb Typescript
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: bash
-        working-directory: node
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - uses: actions/setup-node@v3
-        with:
-          node-version: 20
-          cache: "npm"
-          cache-dependency-path: node/package-lock.json
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y protobuf-compiler libssl-dev
-      - name: Build
-        run: |
-          npm ci
-          npm run tsc
-          npm pack
-      - name: Upload Linux Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-package
-          path: |
-            node/vectordb-*.tgz
-
-  node-macos:
-    name: vectordb ${{ matrix.config.arch }}
-    strategy:
-      matrix:
-        config:
-          - arch: x86_64-apple-darwin
-            runner: macos-13
-          - arch: aarch64-apple-darwin
-            # xlarge is implicitly arm64.
-            runner: macos-14
-    runs-on: ${{ matrix.config.runner }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install system dependencies
-        run: brew install protobuf
-      - name: Install npm dependencies
-        run: |
-          cd node
-          npm ci
-      - name: Build MacOS native node modules
-        run: bash ci/build_macos_artifacts.sh ${{ matrix.config.arch }}
-      - name: Upload Darwin Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-native-darwin-${{ matrix.config.arch }}
-          path: |
-            node/dist/lancedb-vectordb-darwin*.tgz
-
-  node-linux-gnu:
-    name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
-    runs-on: ${{ matrix.config.runner }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - arch: x86_64
-            runner: ubuntu-latest
-          - arch: aarch64
-            # For successful fat LTO builds, we need a large runner to avoid OOM errors.
-            runner: warp-ubuntu-latest-arm64-4x
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      # To avoid OOM errors on ARM, we create a swap file.
-      - name: Configure aarch64 build
-        if: ${{ matrix.config.arch == 'aarch64' }}
-        run: |
-          free -h
-          sudo fallocate -l 16G /swapfile
-          sudo chmod 600 /swapfile
-          sudo mkswap /swapfile
-          sudo swapon /swapfile
-          echo "/swapfile swap swap defaults 0 0" >> sudo /etc/fstab
-          # print info
-          swapon --show
-          free -h
-      - name: Build Linux Artifacts
-        run: |
-          bash ci/build_linux_artifacts.sh ${{ matrix.config.arch }} ${{ matrix.config.arch }}-unknown-linux-gnu
-      - name: Upload Linux Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-native-linux-${{ matrix.config.arch }}-gnu
-          path: |
-            node/dist/lancedb-vectordb-linux*.tgz
-
-  node-windows:
-    name: vectordb ${{ matrix.target }}
-    runs-on: windows-2022
-    strategy:
-      fail-fast: false
-      matrix:
-        target: [x86_64-pc-windows-msvc]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Protoc v21.12
-        working-directory: C:\
-        run: |
-          New-Item -Path 'C:\protoc' -ItemType Directory
-          Set-Location C:\protoc
-          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
-          7z x protoc.zip
-          Add-Content $env:GITHUB_PATH "C:\protoc\bin"
-        shell: powershell
-      - name: Install npm dependencies
-        run: |
-          cd node
-          npm ci
-      - name: Build Windows native node modules
-        run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
-      - name: Upload Windows Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-native-windows
-          path: |
-            node/dist/lancedb-vectordb-win32*.tgz
-
-  release:
-    name: vectordb NPM Publish
-    needs: [node, node-macos, node-linux-gnu, node-windows]
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    # Only runs on tags that matches the make-release action
-    if: startsWith(github.ref, 'refs/tags/v')
-    steps:
-      - uses: actions/download-artifact@v4
-        with:
-          pattern: node-*
-      - name: Display structure of downloaded files
-        run: ls -R
-      - uses: actions/setup-node@v3
-        with:
-          node-version: 20
-          registry-url: "https://registry.npmjs.org"
-      - name: Publish to NPM
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
-        run: |
-          # Tag beta as "preview" instead of default "latest". See lancedb
-          # npm publish step for more info.
-          if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
-            PUBLISH_ARGS="--tag preview"
-          fi
-
-          mv */*.tgz .
-          for filename in *.tgz; do
-            npm publish $PUBLISH_ARGS $filename
-          done
-      - name: Deprecate
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
-        # We need to deprecate the old package to avoid confusion.
-        # Each time we publish a new version, it gets undeprecated.
-        run: npm deprecate vectordb "Use @lancedb/lancedb instead."
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: main
-      - name: Update package-lock.json
-        run: |
-          git config user.name 'Lance Release'
-          git config user.email 'lance-dev@lancedb.com'
-          bash ci/update_lockfiles.sh
-      - name: Push new commit
-        uses: ad-m/github-push-action@master
-        with:
-          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
-          branch: main
-      - name: Notify Slack Action
-        uses: ravsamhq/notify-slack-action@2.3.0
-        if: ${{ always() }}
-        with:
-          status: ${{ job.status }}
-          notify_when: "failure"
-          notification_title: "{workflow} is failing"
-        env:
-          SLACK_WEBHOOK_URL: ${{ secrets.ACTION_MONITORING_SLACK }}
```
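Both the surviving publish step and the deleted legacy job route beta tags to the `preview` npm dist-tag so they never overwrite `latest`. The decision reduces to a one-line check, sketched here in Python:

```python
import re

def npm_dist_tag(github_ref: str) -> str | None:
    # Mirrors the workflow's bash test `[[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]`:
    # beta tags publish under "preview"; anything else uses npm's default tag.
    return "preview" if re.search(r"refs/tags/v.*-beta", github_ref) else None

assert npm_dist_tag("refs/tags/v0.21.2-beta.1") == "preview"
assert npm_dist_tag("refs/tags/v0.21.2") is None
```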
.gitignore (vendored): 3 changes

```diff
@@ -31,9 +31,6 @@ python/dist
 *.node
 **/node_modules
 **/.DS_Store
-node/dist
-node/examples/**/package-lock.json
-node/examples/**/dist
 nodejs/lancedb/native*
 dist

```
```diff
@@ -11,8 +11,6 @@ Project layout:
 * `nodejs`: The Typescript bindings, using napi-rs
 * `java`: The Java bindings

-(`rust/ffi` and `node/` are for a deprecated package. You can ignore them.)
-
 Common commands:

 * Check for compiler errors: `cargo check --features remote --tests --examples`
```
Cargo.lock (generated): 219 changes

```diff
@@ -1480,7 +1480,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
 dependencies = [
  "glob",
  "libc",
- "libloading 0.8.8",
+ "libloading",
 ]

 [[package]]
@@ -1573,15 +1573,6 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"

-[[package]]
-name = "conv"
-version = "0.3.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
-dependencies = [
- "custom_derive",
-]
-
 [[package]]
 name = "convert_case"
 version = "0.6.0"
@@ -1730,9 +1721,9 @@ dependencies = [

 [[package]]
 name = "crunchy"
-version = "0.2.2"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"

 [[package]]
 name = "crypto-bigint"
@@ -1797,12 +1788,6 @@ dependencies = [
  "syn 2.0.103",
 ]

-[[package]]
-name = "custom_derive"
-version = "0.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
-
 [[package]]
 name = "darling"
 version = "0.20.11"
@@ -2852,12 +2837,12 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"

 [[package]]
 name = "fsst"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99b0ce83d91fe637d97c127ac8df19f57e6012a5472c339154e5100cb107df4c"
+checksum = "548190a42654ce848835b410ae33f43b4d55cb24548fd0a885a289a1d5a95019"
 dependencies = [
  "arrow-array",
- "rand 0.8.5",
+ "rand 0.9.1",
 ]

 [[package]]
@@ -3967,9 +3952,9 @@ dependencies = [

 [[package]]
 name = "lance"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7484555bbe6f7898d6a283f89ecd3e2ba85a0f28d9a9e6f15f3018d8adaebdd9"
+checksum = "94bafd9d9a9301c1eac48892ec8016d4d28204d4fc55f2ebebee9a7af465e152"
 dependencies = [
  "arrow",
  "arrow-arith",
@@ -4016,7 +4001,7 @@ dependencies = [
  "pin-project",
  "prost",
  "prost-types",
- "rand 0.8.5",
+ "rand 0.9.1",
  "roaring",
  "serde",
  "serde_json",
@@ -4031,9 +4016,9 @@ dependencies = [

 [[package]]
 name = "lance-arrow"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8674ce4b27d131ac98692dbc0b28f43690defa6ca63303b3cab21e6beaf43868"
+checksum = "b97ebcd8edc2b534e8ded20c97c8928e275160794af91ed803a3d48d8d2a88d8"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4045,14 +4030,14 @@ dependencies = [
  "getrandom 0.2.16",
  "half",
  "num-traits",
- "rand 0.8.5",
+ "rand 0.9.1",
 ]

 [[package]]
 name = "lance-core"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1dd99bf06d5e322e81ff84cc2ce12b463836b4fba2bc1e0223085e1c8d7b71a"
+checksum = "ce5c1849d07985d6a5011aca9de43c7a42ec4c996d66ef3f2d9896c227cc934c"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4074,7 +4059,7 @@ dependencies = [
  "object_store",
  "pin-project",
  "prost",
- "rand 0.8.5",
+ "rand 0.9.1",
  "roaring",
  "serde_json",
  "snafu",
@@ -4087,9 +4072,9 @@ dependencies = [

 [[package]]
 name = "lance-datafusion"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29e78724715c1cb255ea3ac749b617406d91db6565ea77d531c1aba46716efc4"
+checksum = "d355c087bc66d85e36cfb428465f585b13971e1e13585dd2b6886a54d8a7d9a4"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4117,9 +4102,9 @@ dependencies = [

 [[package]]
 name = "lance-datagen"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0cc5fa5f59bf65d02118fcc05615b511c03222f5240c4a18218f1297f97bcdf7"
+checksum = "110d4dedfe02e9cff8f11cfb64a261755da7ee9131845197efeec8b659cc5513"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4128,16 +4113,16 @@ dependencies = [
  "chrono",
  "futures",
  "hex",
- "rand 0.8.5",
+ "rand 0.9.1",
  "rand_xoshiro",
  "random_word 0.5.0",
 ]

 [[package]]
 name = "lance-encoding"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a550fe9d4d931c48177691b9c085baf158bfde4ed7b6055eb27fed54174e5767"
+checksum = "66750006299a2fb003091bc290eb1fe2a5933e35236d921934131f3e4629cd33"
 dependencies = [
  "arrayref",
  "arrow",
@@ -4165,7 +4150,7 @@ dependencies = [
  "prost",
  "prost-build",
  "prost-types",
- "rand 0.8.5",
+ "rand 0.9.1",
  "seq-macro",
  "snafu",
  "tokio",
@@ -4176,9 +4161,9 @@ dependencies = [

 [[package]]
 name = "lance-file"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2d338a50e09bc5af5773cdc5d269680288847d1d34a4622063cce8ad4b5375b"
+checksum = "7c639062100610a075e01fd455173348b2fccea10cb0e89f70e38a3183c56022"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4212,9 +4197,9 @@ dependencies = [

 [[package]]
 name = "lance-index"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14cbcb44403ee477ab4e53194e4c322295959785a7056b33043a2f9f01fa0f8a"
+checksum = "7ae67a048a51fb525d1bfde86d1b39118462277e7e7a7cd0e7ba866312873532"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4252,7 +4237,7 @@ dependencies = [
  "object_store",
  "prost",
  "prost-build",
- "rand 0.8.5",
+ "rand 0.9.1",
  "rayon",
  "roaring",
  "serde",
@@ -4267,9 +4252,9 @@ dependencies = [

 [[package]]
 name = "lance-io"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "933c8dad56aa3048c421f336b20f23f507cc47271fcc18bea8b4052c247a170e"
+checksum = "cc86c7307e2d3d895cfefa503f986edcbdd208eb0aa89ba2c75724ba04bce843"
 dependencies = [
  "arrow",
  "arrow-arith",
@@ -4298,7 +4283,7 @@ dependencies = [
  "path_abs",
  "pin-project",
  "prost",
- "rand 0.8.5",
+ "rand 0.9.1",
  "serde",
  "shellexpand",
  "snafu",
@@ -4309,9 +4294,9 @@ dependencies = [

 [[package]]
 name = "lance-linalg"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2540ae40b7c35901be13541437c947aadb5a6afb2110f7275e90884aeee4cc07"
+checksum = "769f910b6f2ad5eb4d1b3071c533b619351e61e0dfca74f13c98680a8e6476e9"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4326,7 +4311,7 @@ dependencies = [
  "lance-core",
  "log",
  "num-traits",
- "rand 0.8.5",
+ "rand 0.9.1",
  "rayon",
  "tokio",
  "tracing",
@@ -4334,9 +4319,9 @@ dependencies = [

 [[package]]
 name = "lance-table"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31e1cfa3e031b5795330eec7808baa1c2e105a067adf0790e5bb9a51aa7256ff"
+checksum = "ffbeafa8a3e97b5b3a06f06d69b0cefe56e65c64a33f674c40c113b797328bd2"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4360,7 +4345,7 @@ dependencies = [
  "prost",
  "prost-build",
  "prost-types",
- "rand 0.8.5",
+ "rand 0.9.1",
  "rangemap",
  "roaring",
  "serde",
@@ -4374,20 +4359,20 @@ dependencies = [

 [[package]]
 name = "lance-testing"
-version = "0.32.0"
+version = "0.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2847faaa98fdb2facc75ae515e553ea67e68d0b05de41ac577b8038e1bbafac8"
+checksum = "535a3bba37625cd515a7172a8d0d138f86822acef9fa9425ad1e050ef88bf92f"
 dependencies = [
  "arrow-array",
  "arrow-schema",
  "lance-arrow",
  "num-traits",
- "rand 0.8.5",
+ "rand 0.9.1",
 ]

 [[package]]
 name = "lancedb"
-version = "0.21.2-beta.1"
+version = "0.21.2"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4443,7 +4428,7 @@ dependencies = [
  "regex",
  "reqwest",
  "rstest",
- "semver 1.0.26",
+ "semver",
  "serde",
  "serde_json",
  "serde_with",
@@ -4472,34 +4457,9 @@ dependencies = [
  "tokio",
 ]

-[[package]]
-name = "lancedb-node"
-version = "0.21.2-beta.1"
-dependencies = [
- "arrow-array",
- "arrow-ipc",
- "arrow-schema",
- "async-trait",
- "chrono",
- "conv",
- "env_logger",
- "futures",
- "half",
- "lance",
- "lance-index",
- "lance-linalg",
- "lancedb",
- "lzma-sys",
- "neon",
- "object_store",
- "once_cell",
- "snafu",
- "tokio",
-]
-
 [[package]]
 name = "lancedb-nodejs"
-version = "0.21.2-beta.1"
+version = "0.21.2"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
@@ -4519,7 +4479,7 @@ dependencies = [

 [[package]]
 name = "lancedb-python"
-version = "0.24.2-beta.1"
+version = "0.24.2"
 dependencies = [
  "arrow",
  "env_logger",
@@ -4620,16 +4580,6 @@ version = "0.2.174"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"

-[[package]]
-name = "libloading"
-version = "0.6.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "351a32417a12d5f7e82c368a66781e307834dae04c6ce0cd4456d52989229883"
-dependencies = [
- "cfg-if",
- "winapi",
-]
-
 [[package]]
 name = "libloading"
 version = "0.8.8"
@@ -5008,7 +4958,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "regex",
- "semver 1.0.26",
+ "semver",
  "syn 2.0.103",
 ]

@@ -5018,48 +4968,7 @@ version = "2.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "427802e8ec3a734331fec1035594a210ce1ff4dc5bc1950530920ab717964ea3"
 dependencies = [
- "libloading 0.8.8",
+ "libloading",
-]
-
-[[package]]
-name = "neon"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28e15415261d880aed48122e917a45e87bb82cf0260bb6db48bbab44b7464373"
-dependencies = [
- "neon-build",
- "neon-macros",
- "neon-runtime",
- "semver 0.9.0",
- "smallvec",
-]
-
-[[package]]
-name = "neon-build"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bac98a702e71804af3dacfde41edde4a16076a7bbe889ae61e56e18c5b1c811"
-
-[[package]]
-name = "neon-macros"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7288eac8b54af7913c60e0eb0e2a7683020dffa342ab3fd15e28f035ba897cf"
-dependencies = [
- "quote",
- "syn 1.0.109",
- "syn-mid",
-]
-
-[[package]]
-name = "neon-runtime"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4676720fa8bb32c64c3d9f49c47a47289239ec46b4bdb66d0913cc512cb0daca"
-dependencies = [
- "cfg-if",
- "libloading 0.6.7",
- "smallvec",
 ]

 [[package]]
@@ -6340,11 +6249,11 @@ dependencies = [

 [[package]]
 name = "rand_xoshiro"
-version = "0.6.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
+checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41"
 dependencies = [
- "rand_core 0.6.4",
+ "rand_core 0.9.3",
 ]

 [[package]]
@@ -6741,7 +6650,7 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
 dependencies = [
- "semver 1.0.26",
+ "semver",
 ]

 [[package]]
@@ -7006,27 +6915,12 @@ dependencies = [
  "libc",
 ]

-[[package]]
-name = "semver"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
-dependencies = [
- "semver-parser",
-]
-
 [[package]]
 name = "semver"
 version = "1.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"

-[[package]]
-name = "semver-parser"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
-
 [[package]]
 name = "seq-macro"
 version = "0.3.6"
@@ -7426,17 +7320,6 @@ dependencies = [
  "unicode-ident",
 ]

-[[package]]
-name = "syn-mid"
-version = "0.5.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fea305d57546cc8cd04feb14b62ec84bf17f50e3f7b12560d7bfa9265f39d9ed"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 1.0.109",
-]
-
 [[package]]
 name = "sync_wrapper"
 version = "1.0.2"
@@ -8072,7 +7955,7 @@ checksum = "90b70b37e9074642bc5f60bb23247fd072a84314ca9e71cdf8527593406a0dd3"
 dependencies = [
  "gemm 0.18.2",
  "half",
- "libloading 0.8.8",
+ "libloading",
  "memmap2 0.9.5",
  "num",
  "num-traits",
```
Cargo.toml: 27 changes

```diff
@@ -1,11 +1,5 @@
 [workspace]
-members = [
-    "rust/ffi/node",
-    "rust/lancedb",
-    "nodejs",
-    "python",
-    "java/core/lancedb-jni",
-]
+members = ["rust/lancedb", "nodejs", "python", "java/core/lancedb-jni"]
 # Python package needs to be built by maturin.
 exclude = ["python"]
 resolver = "2"
@@ -21,14 +15,14 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"

 [workspace.dependencies]
-lance = { "version" = "=0.32.0", "features" = ["dynamodb"] }
+lance = { "version" = "=0.33.0", "features" = ["dynamodb"] }
-lance-io = "=0.32.0"
+lance-io = "=0.33.0"
-lance-index = "=0.32.0"
+lance-index = "=0.33.0"
-lance-linalg = "=0.32.0"
+lance-linalg = "=0.33.0"
-lance-table = "=0.32.0"
+lance-table = "=0.33.0"
-lance-testing = "=0.32.0"
+lance-testing = "=0.33.0"
-lance-datafusion = "=0.32.0"
+lance-datafusion = "=0.33.0"
-lance-encoding = "=0.32.0"
+lance-encoding = "=0.33.0"
 # Note that this one does not include pyarrow
 arrow = { version = "55.1", optional = false }
 arrow-array = "55.1"
@@ -61,12 +55,11 @@ rand = "0.9"
 regex = "1.10"
 lazy_static = "1"
 semver = "1.0.25"
+crunchy = "0.2.4"
 # Temporary pins to work around downstream issues
 # https://github.com/apache/arrow-rs/commit/2fddf85afcd20110ce783ed5b4cdeb82293da30b
 chrono = "=0.4.41"
 # https://github.com/RustCrypto/formats/issues/1684
 base64ct = "=1.6.0"
-# Workaround for: https://github.com/eira-fransham/crunchy/issues/13
-crunchy = "=0.2.2"
 # Workaround for: https://github.com/Lokathor/bytemuck/issues/306
 bytemuck_derive = ">=1.8.1, <1.9.0"
```
```diff
@@ -1,22 +0,0 @@
-#!/bin/bash
-set -e
-ARCH=${1:-x86_64}
-TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}
-
-# We pass down the current user so that when we later mount the local files
-# into the container, the files are accessible by the current user.
-pushd ci/manylinux_node
-docker build \
-    -t lancedb-node-manylinux \
-    --build-arg="ARCH=$ARCH" \
-    --build-arg="DOCKER_USER=$(id -u)" \
-    --progress=plain \
-    .
-popd
-
-# We turn on memory swap to avoid OOM killer
-docker run \
-    -v $(pwd):/io -w /io \
-    --memory-swap=-1 \
-    lancedb-node-manylinux \
-    bash ci/manylinux_node/build_vectordb.sh $ARCH $TARGET_TRIPLE
```
```diff
@@ -1,34 +0,0 @@
-# Builds the macOS artifacts (node binaries).
-# Usage: ./ci/build_macos_artifacts.sh [target]
-# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
-set -e
-
-prebuild_rust() {
-  # Building here for the sake of easier debugging.
-  pushd rust/ffi/node
-  echo "Building rust library for $1"
-  export RUST_BACKTRACE=1
-  cargo build --release --target $1
-  popd
-}
-
-build_node_binaries() {
-  pushd node
-  echo "Building node library for $1"
-  npm run build-release -- --target $1
-  npm run pack-build -- --target $1
-  popd
-}
-
-if [ -n "$1" ]; then
-  targets=$1
-else
-  targets="x86_64-apple-darwin aarch64-apple-darwin"
-fi
-
-echo "Building artifacts for targets: $targets"
-for target in $targets
-do
-  prebuild_rust $target
-  build_node_binaries $target
-done
```
```diff
@@ -1,42 +0,0 @@
-# Builds the Windows artifacts (node binaries).
-# Usage: .\ci\build_windows_artifacts.ps1 [target]
-# Targets supported:
-# - x86_64-pc-windows-msvc
-# - i686-pc-windows-msvc
-# - aarch64-pc-windows-msvc
-
-function Prebuild-Rust {
-    param (
-        [string]$target
-    )
-
-    # Building here for the sake of easier debugging.
-    Push-Location -Path "rust/ffi/node"
-    Write-Host "Building rust library for $target"
-    $env:RUST_BACKTRACE=1
-    cargo build --release --target $target
-    Pop-Location
-}
-
-function Build-NodeBinaries {
-    param (
-        [string]$target
-    )
-
-    Push-Location -Path "node"
-    Write-Host "Building node library for $target"
-    npm run build-release -- --target $target
-    npm run pack-build -- --target $target
-    Pop-Location
-}
-
-$targets = $args[0]
-if (-not $targets) {
-    $targets = "x86_64-pc-windows-msvc", "aarch64-pc-windows-msvc"
-}
-
-Write-Host "Building artifacts for targets: $targets"
-foreach ($target in $targets) {
-    Prebuild-Rust $target
-    Build-NodeBinaries $target
-}
```
```diff
@@ -1,42 +0,0 @@
-# Builds the Windows artifacts (nodejs binaries).
-# Usage: .\ci\build_windows_artifacts_nodejs.ps1 [target]
-# Targets supported:
-# - x86_64-pc-windows-msvc
-# - i686-pc-windows-msvc
-# - aarch64-pc-windows-msvc
-
-function Prebuild-Rust {
-    param (
-        [string]$target
-    )
-
-    # Building here for the sake of easier debugging.
-    Push-Location -Path "rust/lancedb"
-    Write-Host "Building rust library for $target"
-    $env:RUST_BACKTRACE=1
-    cargo build --release --target $target
-    Pop-Location
-}
-
-function Build-NodeBinaries {
-    param (
-        [string]$target
-    )
-
-    Push-Location -Path "nodejs"
-    Write-Host "Building nodejs library for $target"
-    $env:RUST_TARGET=$target
-    npm run build-release
-    Pop-Location
-}
-
-$targets = $args[0]
-if (-not $targets) {
-    $targets = "x86_64-pc-windows-msvc", "aarch64-pc-windows-msvc"
-}
-
-Write-Host "Building artifacts for targets: $targets"
-foreach ($target in $targets) {
-    Prebuild-Rust $target
-    Build-NodeBinaries $target
-}
```
```diff
@@ -1,27 +0,0 @@
-# Many linux dockerfile with Rust, Node, and Lance dependencies installed.
-# This container allows building the node modules native libraries in an
-# environment with a very old glibc, so that we are compatible with a wide
-# range of linux distributions.
-ARG ARCH=x86_64
-
-FROM quay.io/pypa/manylinux_2_28_${ARCH}
-
-ARG ARCH=x86_64
-ARG DOCKER_USER=default_user
-
-# Protobuf is also installed as root.
-COPY install_protobuf.sh install_protobuf.sh
-RUN ./install_protobuf.sh ${ARCH}
-
-ENV DOCKER_USER=${DOCKER_USER}
-# Create a group and user, but only if it doesn't exist
-RUN echo ${ARCH} && id -u ${DOCKER_USER} >/dev/null 2>&1 || adduser --user-group --create-home --uid ${DOCKER_USER} build_user
-
-# We switch to the user to install Rust and Node, since those like to be
-# installed at the user level.
-USER ${DOCKER_USER}
-
-COPY prepare_manylinux_node.sh prepare_manylinux_node.sh
-RUN cp /prepare_manylinux_node.sh $HOME/ && \
-    cd $HOME && \
-    ./prepare_manylinux_node.sh ${ARCH}
```
```diff
@@ -1,13 +0,0 @@
-#!/bin/bash
-# Builds the node module for manylinux. Invoked by ci/build_linux_artifacts.sh.
-set -e
-ARCH=${1:-x86_64}
-TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}
-
-#Alpine doesn't have .bashrc
-FILE=$HOME/.bashrc && test -f $FILE && source $FILE
-
-cd node
-npm ci
-npm run build-release
-npm run pack-build -- -t $TARGET_TRIPLE
```
```diff
@@ -1,15 +0,0 @@
-#!/bin/bash
-# Installs protobuf compiler. Should be run as root.
-set -e
-
-if [[ $1 == x86_64* ]]; then
-  ARCH=x86_64
-else
-  # gnu target
-  ARCH=aarch_64
-fi
-
-PB_REL=https://github.com/protocolbuffers/protobuf/releases
-PB_VERSION=23.1
-curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
-unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
```
```diff
@@ -1,21 +0,0 @@
-#!/bin/bash
-set -e
-
-install_node() {
-  echo "Installing node..."
-
-  curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
-
-  source "$HOME"/.bashrc
-
-  nvm install --no-progress 18
-}
-
-install_rust() {
-  echo "Installing rust..."
-  curl https://sh.rustup.rs -sSf | bash -s -- -y
-  export PATH="$PATH:/root/.cargo/bin"
-}
-
-install_node
-install_rust
```
```diff
@@ -15,16 +15,13 @@ cargo metadata --quiet > /dev/null
 pushd nodejs || exit 1
 npm install --package-lock-only --silent
 popd
-pushd node || exit 1
-npm install --package-lock-only --silent
-popd

 if git diff --quiet --exit-code; then
   echo "No lockfile changes to commit; skipping amend."
 elif $AMEND; then
-  git add Cargo.lock nodejs/package-lock.json node/package-lock.json
+  git add Cargo.lock nodejs/package-lock.json
   git commit --amend --no-edit
 else
-  git add Cargo.lock nodejs/package-lock.json node/package-lock.json
+  git add Cargo.lock nodejs/package-lock.json
   git commit -m "Update lockfiles"
 fi
```
@@ -13,7 +13,7 @@ The following concepts are important to keep in mind:
|
|||||||
- Data is versioned, with each insert operation creating a new version of the dataset and an update to the manifest that tracks versions via metadata
|
- Data is versioned, with each insert operation creating a new version of the dataset and an update to the manifest that tracks versions via metadata
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
1. First, each version contains metadata and just the new/updated data in your transaction. So if you have 100 versions, they aren't 100 duplicates of the same data. However, they do have 100x the metadata overhead of a single version, which can result in slower queries.
|
1. First, each version contains metadata and just the new/updated data in your transaction. So if you have 100 versions, they aren't 100 duplicates of the same data. However, they do have 100x the metadata overhead of a single version, which can result in slower queries.
|
||||||
2. Second, these versions exist to keep LanceDB scalable and consistent. We do not immediately blow away old versions when creating new ones because other clients might be in the middle of querying the old version. It's important to retain older versions for as long as they might be queried.
|
2. Second, these versions exist to keep LanceDB scalable and consistent. We do not immediately blow away old versions when creating new ones because other clients might be in the middle of querying the old version. It's important to retain older versions for as long as they might be queried.
|
||||||
|
|
||||||
## What are fragments?
|
## What are fragments?
|
||||||
@@ -37,6 +37,10 @@ Depending on the use case and dataset, optimal compaction will have different re
|
|||||||
- It’s always better to use *batch* inserts rather than adding 1 row at a time (to avoid too small fragments). If single-row inserts are unavoidable, run compaction on a regular basis to merge them into larger fragments.
|
- It’s always better to use *batch* inserts rather than adding 1 row at a time (to avoid too small fragments). If single-row inserts are unavoidable, run compaction on a regular basis to merge them into larger fragments.
|
||||||
- Keep the number of fragments under 100, which is suitable for most use cases (for *really* large datasets of >500M rows, more fragments might be needed)
|
- Keep the number of fragments under 100, which is suitable for most use cases (for *really* large datasets of >500M rows, more fragments might be needed)
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
|
||||||
|
LanceDB Cloud/Enterprise supports [auto-compaction](https://docs.lancedb.com/enterprise/architecture/architecture#write-path) which automatically optimizes fragments in the background as data changes.
|
||||||
|
|
||||||

## Deletion

Although Lance allows you to delete rows from a dataset, it does not actually delete the data immediately. It simply marks the row as deleted in the `DataFile` that represents a fragment. For a given version of the dataset, each fragment can have up to one deletion file (if no rows were ever deleted from that fragment, it will not have a deletion file). This is important to keep in mind because it means that the data is still there, and can be recovered if needed, as long as that version still exists based on your backup policy.
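
For example, a delete only writes a deletion file for the affected fragments; a short hedged sketch (the predicate and column are illustrative):

```ts
// Rows matching the SQL predicate are marked deleted in a deletion file;
// the underlying data files are only rewritten at the next compaction.
await table.delete("price > 100");
```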

@@ -50,13 +54,9 @@ Reindexing is the process of updating the index to account for new data, keeping

Both LanceDB OSS and Cloud support reindexing, but the process (at least for now) is different for each, depending on the type of index.

-When a reindex job is triggered in the background, the entire dataset is reindexed; in the interim, as new queries come in, LanceDB combines results from the existing index with an exhaustive kNN search over the new data. This ensures you're still searching all of your data, but it comes at a performance cost: the more data you add without reindexing, the more noticeable the latency impact of the exhaustive search becomes.
+In LanceDB OSS, re-indexing happens synchronously when you call either `create_index` or `optimize` on a table. In LanceDB Cloud, re-indexing happens asynchronously as you add and update data in your table.

-### Vector reindex
+By default, queries will search new data even if it has yet to be indexed. This is done using brute-force methods, such as kNN for vector search, combined with the fast index search results. This ensures that you're always searching over all your data, but it does come at a performance cost. Without reindexing, adding more data to a table will make queries slower and more expensive. This behavior can be disabled by setting the [fast_search](https://lancedb.github.io/lancedb/python/python/#lancedb.query.AsyncQuery.fast_search) parameter, which instructs the query to ignore un-indexed data.
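
The TypeScript client exposes the same trade-off through the `fastSearch()` builder method documented later in this changeset; a minimal sketch:

```ts
// Search only indexed data: un-indexed rows added since the last
// reindex are skipped, trading completeness for lower latency.
const results = await table
  .query()
  .nearestTo([0.5, 0.2])
  .fastSearch()
  .limit(10)
  .toArray();
```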
-* LanceDB Cloud supports incremental reindexing, where a background process will trigger a new index build for you automatically when new data is added to a dataset
+* LanceDB Cloud/Enterprise supports [automatic incremental reindexing](https://docs.lancedb.com/core#vector-index) for vector, scalar, and FTS indices, where a background process will trigger a new index build for you automatically when new data is added or modified in a dataset
* LanceDB OSS requires you to manually trigger a reindex operation -- we are working on adding incremental reindexing to LanceDB OSS as well

-### FTS reindex
-
-FTS reindexing is supported in both LanceDB OSS and Cloud, but the index must be rebuilt manually once a significant amount of new data has been added. We [updated](https://github.com/lancedb/lancedb/pull/762) Tantivy's default heap size from 128MB to 1GB in LanceDB, which makes reindexing up to 10x faster than the default settings.

@@ -14,7 +14,7 @@ A builder for LanceDB queries.

## Extends

-- [`QueryBase`](QueryBase.md)<`NativeQuery`>
+- `StandardQueryBase`<`NativeQuery`>

## Properties

@@ -26,7 +26,7 @@ protected inner: Query | Promise<Query>;

#### Inherited from

-[`QueryBase`](QueryBase.md).[`inner`](QueryBase.md#inner)
+`StandardQueryBase.inner`

## Methods

@@ -73,7 +73,7 @@ AnalyzeExec verbose=true, metrics=[]

#### Inherited from

-[`QueryBase`](QueryBase.md).[`analyzePlan`](QueryBase.md#analyzeplan)
+`StandardQueryBase.analyzePlan`

***

@@ -107,7 +107,7 @@ single query)

#### Inherited from

-[`QueryBase`](QueryBase.md).[`execute`](QueryBase.md#execute)
+`StandardQueryBase.execute`

***

@@ -143,7 +143,7 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();

#### Inherited from

-[`QueryBase`](QueryBase.md).[`explainPlan`](QueryBase.md#explainplan)
+`StandardQueryBase.explainPlan`

***

@@ -164,7 +164,7 @@ Use [Table#optimize](Table.md#optimize) to index all un-indexed data.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`fastSearch`](QueryBase.md#fastsearch)
+`StandardQueryBase.fastSearch`

***

@@ -194,7 +194,7 @@ Use `where` instead

#### Inherited from

-[`QueryBase`](QueryBase.md).[`filter`](QueryBase.md#filter)
+`StandardQueryBase.filter`

***

@@ -216,7 +216,7 @@ fullTextSearch(query, options?): this

#### Inherited from

-[`QueryBase`](QueryBase.md).[`fullTextSearch`](QueryBase.md#fulltextsearch)
+`StandardQueryBase.fullTextSearch`

***

@@ -241,7 +241,7 @@ called then every valid row from the table will be returned.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`limit`](QueryBase.md#limit)
+`StandardQueryBase.limit`

***

@@ -325,6 +325,10 @@ nearestToText(query, columns?): Query

offset(offset): this
```

+Set the number of rows to skip before returning results.
+
+This is useful for pagination.

#### Parameters

* **offset**: `number`
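
Combined with `limit()`, the new `offset()` documentation amounts to simple pagination; a brief sketch:

```ts
// Fetch the third page of 20 results (rows 40-59).
const page = await table
  .query()
  .limit(20)
  .offset(40)
  .toArray();
```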

@@ -335,7 +339,7 @@ offset(offset): this

#### Inherited from

-[`QueryBase`](QueryBase.md).[`offset`](QueryBase.md#offset)
+`StandardQueryBase.offset`

***

@@ -388,7 +392,7 @@ object insertion order is easy to get wrong and `Map` is more foolproof.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`select`](QueryBase.md#select)
+`StandardQueryBase.select`

***

@@ -410,7 +414,7 @@ Collect the results as an array of objects.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`toArray`](QueryBase.md#toarray)
+`StandardQueryBase.toArray`

***

@@ -436,7 +440,7 @@ ArrowTable.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`toArrow`](QueryBase.md#toarrow)
+`StandardQueryBase.toArrow`

***

@@ -471,7 +475,7 @@ on the filter column(s).

#### Inherited from

-[`QueryBase`](QueryBase.md).[`where`](QueryBase.md#where)
+`StandardQueryBase.where`

***

@@ -493,4 +497,4 @@ order to perform hybrid search.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`withRowId`](QueryBase.md#withrowid)
+`StandardQueryBase.withRowId`

@@ -15,12 +15,11 @@ Common methods supported by all query types

## Extended by

-- [`Query`](Query.md)
+- [`TakeQuery`](TakeQuery.md)
-- [`VectorQuery`](VectorQuery.md)

## Type Parameters

-• **NativeQueryType** *extends* `NativeQuery` \| `NativeVectorQuery`
+• **NativeQueryType** *extends* `NativeQuery` \| `NativeVectorQuery` \| `NativeTakeQuery`

## Implements

@@ -141,104 +140,6 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();

***

-### fastSearch()
-
-```ts
-fastSearch(): this
-```
-
-Skip searching un-indexed data. This can make search faster, but will miss
-any data that is not yet indexed.
-
-Use [Table#optimize](Table.md#optimize) to index all un-indexed data.
-
-#### Returns
-
-`this`
-
-***
-
-### ~~filter()~~
-
-```ts
-filter(predicate): this
-```
-
-A filter statement to be applied to this query.
-
-#### Parameters
-
-* **predicate**: `string`
-
-#### Returns
-
-`this`
-
-#### See
-
-where
-
-#### Deprecated
-
-Use `where` instead
-
-***
-
-### fullTextSearch()
-
-```ts
-fullTextSearch(query, options?): this
-```
-
-#### Parameters
-
-* **query**: `string` \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
-
-* **options?**: `Partial`<[`FullTextSearchOptions`](../interfaces/FullTextSearchOptions.md)>
-
-#### Returns
-
-`this`
-
-***
-
-### limit()
-
-```ts
-limit(limit): this
-```
-
-Set the maximum number of results to return.
-
-By default, a plain search has no limit. If this method is not
-called then every valid row from the table will be returned.
-
-#### Parameters
-
-* **limit**: `number`
-
-#### Returns
-
-`this`
-
-***
-
-### offset()
-
-```ts
-offset(offset): this
-```
-
-#### Parameters
-
-* **offset**: `number`
-
-#### Returns
-
-`this`
-
-***

### select()

```ts

@@ -328,37 +229,6 @@ ArrowTable.

***

-### where()
-
-```ts
-where(predicate): this
-```
-
-A filter statement to be applied to this query.
-
-The filter should be supplied as an SQL query string. For example:
-
-#### Parameters
-
-* **predicate**: `string`
-
-#### Returns
-
-`this`
-
-#### Example
-
-```ts
-x > 10
-y > 0 AND y < 100
-x > 5 OR y = 'test'
-
-Filtering performance can often be improved by creating a scalar index
-on the filter column(s).
-```
-
-***

### withRowId()

```ts

@@ -9,7 +9,8 @@

A session for managing caches and object stores across LanceDB operations.

Sessions allow you to configure cache sizes for index and metadata caches,
-which can significantly impact performance for large datasets.
+which can significantly impact memory use and performance. They can
+also be re-used across multiple connections to share the same cache state.

## Constructors

@@ -24,8 +25,11 @@ Create a new session with custom cache sizes.

# Parameters

- `index_cache_size_bytes`: The size of the index cache in bytes.
+  Index data is stored in memory in this cache to speed up queries.
  Defaults to 6GB if not specified.
- `metadata_cache_size_bytes`: The size of the metadata cache in bytes.
+  The metadata cache stores file metadata and schema information in memory.
+  This cache improves scan and write performance.
  Defaults to 1GB if not specified.

#### Parameters
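
A hedged sketch of how a shared session might be constructed. The positional constructor arguments follow the parameter order documented above, and passing the session to `connect` via a `session` option is an assumption about the client API:

```ts
import * as lancedb from "@lancedb/lancedb";

// 6 GiB index cache and 1 GiB metadata cache (the documented defaults,
// spelled out explicitly here).
const session = new lancedb.Session(
  6 * 1024 * 1024 * 1024, // index_cache_size_bytes (assumed positional)
  1 * 1024 * 1024 * 1024, // metadata_cache_size_bytes (assumed positional)
);

// Assumed: connections accept a session option so they share cache state.
const db1 = await lancedb.connect("data/db-one", { session });
const db2 = await lancedb.connect("data/db-two", { session });
```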
@@ -674,6 +674,48 @@ console.log(tags); // { "v1": { version: 1, manifestSize: ... } }

***

+### takeOffsets()
+
+```ts
+abstract takeOffsets(offsets): TakeQuery
+```
+
+Create a query that returns a subset of the rows in the table.
+
+#### Parameters
+
+* **offsets**: `number`[]
+  The offsets of the rows to return.
+
+#### Returns
+
+[`TakeQuery`](TakeQuery.md)
+
+A builder that can be used to parameterize the query.
+
+***
+
+### takeRowIds()
+
+```ts
+abstract takeRowIds(rowIds): TakeQuery
+```
+
+Create a query that returns a subset of the rows in the table.
+
+#### Parameters
+
+* **rowIds**: `number`[]
+  The row ids of the rows to return.
+
+#### Returns
+
+[`TakeQuery`](TakeQuery.md)
+
+A builder that can be used to parameterize the query.
+
+***
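
A hedged usage sketch for these new builders, e.g. re-fetching rows found by an earlier search (the `_rowid` column name follows the plan output shown elsewhere in these docs):

```ts
// Capture row ids from a vector search...
const hits = await table
  .query()
  .nearestTo([0.5, 0.2])
  .withRowId()
  .limit(5)
  .toArray();

// ...then take exactly those rows, projecting only the columns we need.
const rowIds = hits.map((hit) => hit._rowid);
const rows = await table
  .takeRowIds(rowIds)
  .select(["id", "vector"])
  .toArray();
```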

### toArrow()

```ts
docs/src/js/classes/TakeQuery.md (new file, 265 lines)
@@ -0,0 +1,265 @@

[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / TakeQuery

# Class: TakeQuery

A query that returns a subset of the rows in the table.

## Extends

- [`QueryBase`](QueryBase.md)<`NativeTakeQuery`>

## Properties

### inner

```ts
protected inner: TakeQuery | Promise<TakeQuery>;
```

#### Inherited from

[`QueryBase`](QueryBase.md).[`inner`](QueryBase.md#inner)

## Methods

### analyzePlan()

```ts
analyzePlan(): Promise<string>
```

Executes the query and returns the physical query plan annotated with runtime metrics.

This is useful for debugging and performance analysis, as it shows how the query was executed
and includes metrics such as elapsed time, rows processed, and I/O statistics.

#### Returns

`Promise`<`string`>

A query execution plan with runtime metrics for each step.

#### Example

```ts
import * as lancedb from "@lancedb/lancedb"

const db = await lancedb.connect("./.lancedb");
const table = await db.createTable("my_table", [
  { vector: [1.1, 0.9], id: "1" },
]);

const plan = await table.query().nearestTo([0.5, 0.2]).analyzePlan();

Example output (with runtime metrics inlined):

AnalyzeExec verbose=true, metrics=[]
ProjectionExec: expr=[id@3 as id, vector@0 as vector, _distance@2 as _distance], metrics=[output_rows=1, elapsed_compute=3.292µs]
Take: columns="vector, _rowid, _distance, (id)", metrics=[output_rows=1, elapsed_compute=66.001µs, batches_processed=1, bytes_read=8, iops=1, requests=1]
CoalesceBatchesExec: target_batch_size=1024, metrics=[output_rows=1, elapsed_compute=3.333µs]
GlobalLimitExec: skip=0, fetch=10, metrics=[output_rows=1, elapsed_compute=167ns]
FilterExec: _distance@2 IS NOT NULL, metrics=[output_rows=1, elapsed_compute=8.542µs]
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], metrics=[output_rows=1, elapsed_compute=63.25µs, row_replacements=1]
KNNVectorDistance: metric=l2, metrics=[output_rows=1, elapsed_compute=114.333µs, output_batches=1]
LanceScan: uri=/path/to/data, projection=[vector], row_id=true, row_addr=false, ordered=false, metrics=[output_rows=1, elapsed_compute=103.626µs, bytes_read=549, iops=2, requests=2]
```

#### Inherited from

[`QueryBase`](QueryBase.md).[`analyzePlan`](QueryBase.md#analyzeplan)

***

### execute()

```ts
protected execute(options?): RecordBatchIterator
```

Execute the query and return the results as an

#### Parameters

* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>

#### Returns

[`RecordBatchIterator`](RecordBatchIterator.md)

#### See

- AsyncIterator of
- RecordBatch.

By default, LanceDb will use many threads to calculate results and, when
the result set is large, multiple batches will be processed at one time.
This readahead is limited however and backpressure will be applied if this
stream is consumed slowly (this constrains the maximum memory used by a
single query)

#### Inherited from

[`QueryBase`](QueryBase.md).[`execute`](QueryBase.md#execute)

***

### explainPlan()

```ts
explainPlan(verbose): Promise<string>
```

Generates an explanation of the query execution plan.

#### Parameters

* **verbose**: `boolean` = `false`
  If true, provides a more detailed explanation. Defaults to false.

#### Returns

`Promise`<`string`>

A Promise that resolves to a string containing the query execution plan explanation.

#### Example

```ts
import * as lancedb from "@lancedb/lancedb"
const db = await lancedb.connect("./.lancedb");
const table = await db.createTable("my_table", [
  { vector: [1.1, 0.9], id: "1" },
]);
const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();
```

#### Inherited from

[`QueryBase`](QueryBase.md).[`explainPlan`](QueryBase.md#explainplan)

***

### select()

```ts
select(columns): this
```

Return only the specified columns.

By default a query will return all columns from the table. However, this can have
a very significant impact on latency. LanceDb stores data in a columnar fashion. This
means we can finely tune our I/O to select exactly the columns we need.

As a best practice you should always limit queries to the columns that you need. If you
pass in an array of column names then only those columns will be returned.

You can also use this method to create new "dynamic" columns based on your existing columns.
For example, you may not care about "a" or "b" but instead simply want "a + b". This is often
seen in the SELECT clause of an SQL query (e.g. `SELECT a+b FROM my_table`).

To create dynamic columns you can pass in a Map<string, string>. A column will be returned
for each entry in the map. The key provides the name of the column. The value is
an SQL string used to specify how the column is calculated.

For example, an SQL query might state `SELECT a + b AS combined, c`. The equivalent
input to this method would be:

#### Parameters

* **columns**: `string` \| `string`[] \| `Record`<`string`, `string`> \| `Map`<`string`, `string`>

#### Returns

`this`

#### Example

```ts
new Map([["combined", "a + b"], ["c", "c"]])

Columns will always be returned in the order given, even if that order is different than
the order used when adding the data.

Note that you can pass in a `Record<string, string>` (e.g. an object literal). This method
uses `Object.entries` which should preserve the insertion order of the object. However,
object insertion order is easy to get wrong and `Map` is more foolproof.
```

#### Inherited from

[`QueryBase`](QueryBase.md).[`select`](QueryBase.md#select)

***

### toArray()

```ts
toArray(options?): Promise<any[]>
```

Collect the results as an array of objects.

#### Parameters

* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>

#### Returns

`Promise`<`any`[]>

#### Inherited from

[`QueryBase`](QueryBase.md).[`toArray`](QueryBase.md#toarray)

***

### toArrow()

```ts
toArrow(options?): Promise<Table<any>>
```

Collect the results as an Arrow

#### Parameters

* **options?**: `Partial`<[`QueryExecutionOptions`](../interfaces/QueryExecutionOptions.md)>

#### Returns

`Promise`<`Table`<`any`>>

#### See

ArrowTable.

#### Inherited from

[`QueryBase`](QueryBase.md).[`toArrow`](QueryBase.md#toarrow)

***

### withRowId()

```ts
withRowId(): this
```

Whether to return the row id in the results.

This column can be used to match results between different queries. For
example, to match results from a full text search and a vector search in
order to perform hybrid search.

#### Returns

`this`

#### Inherited from

[`QueryBase`](QueryBase.md).[`withRowId`](QueryBase.md#withrowid)
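
Tying the new class together, a brief hedged sketch: take rows by offset and project a dynamic column (column names are illustrative):

```ts
// Take the first three rows and compute a derived column via SQL.
const rows = await table
  .takeOffsets([0, 1, 2])
  .select(new Map([["id", "id"], ["doubled", "price * 2"]]))
  .toArray();
console.log(rows);
```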
@@ -16,7 +16,7 @@ This builder can be reused to execute the query many times.

## Extends

-- [`QueryBase`](QueryBase.md)<`NativeVectorQuery`>
+- `StandardQueryBase`<`NativeVectorQuery`>

## Properties

@@ -28,7 +28,7 @@ protected inner: VectorQuery | Promise<VectorQuery>;

#### Inherited from

-[`QueryBase`](QueryBase.md).[`inner`](QueryBase.md#inner)
+`StandardQueryBase.inner`

## Methods

@@ -91,7 +91,7 @@ AnalyzeExec verbose=true, metrics=[]

#### Inherited from

-[`QueryBase`](QueryBase.md).[`analyzePlan`](QueryBase.md#analyzeplan)
+`StandardQueryBase.analyzePlan`

***

@@ -248,7 +248,7 @@ single query)

#### Inherited from

-[`QueryBase`](QueryBase.md).[`execute`](QueryBase.md#execute)
+`StandardQueryBase.execute`

***

@@ -284,7 +284,7 @@ const plan = await table.query().nearestTo([0.5, 0.2]).explainPlan();

#### Inherited from

-[`QueryBase`](QueryBase.md).[`explainPlan`](QueryBase.md#explainplan)
+`StandardQueryBase.explainPlan`

***

@@ -305,7 +305,7 @@ Use [Table#optimize](Table.md#optimize) to index all un-indexed data.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`fastSearch`](QueryBase.md#fastsearch)
+`StandardQueryBase.fastSearch`

***

@@ -335,7 +335,7 @@ Use `where` instead

#### Inherited from

-[`QueryBase`](QueryBase.md).[`filter`](QueryBase.md#filter)
+`StandardQueryBase.filter`

***

@@ -357,7 +357,7 @@ fullTextSearch(query, options?): this

#### Inherited from

-[`QueryBase`](QueryBase.md).[`fullTextSearch`](QueryBase.md#fulltextsearch)
+`StandardQueryBase.fullTextSearch`

***

@@ -382,7 +382,7 @@ called then every valid row from the table will be returned.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`limit`](QueryBase.md#limit)
+`StandardQueryBase.limit`

***

@@ -480,6 +480,10 @@ the minimum and maximum to the same value.

offset(offset): this
```

+Set the number of rows to skip before returning results.
+
+This is useful for pagination.

#### Parameters

* **offset**: `number`

@@ -490,7 +494,7 @@ offset(offset): this

#### Inherited from

-[`QueryBase`](QueryBase.md).[`offset`](QueryBase.md#offset)
+`StandardQueryBase.offset`

***

@@ -637,7 +641,7 @@ object insertion order is easy to get wrong and `Map` is more foolproof.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`select`](QueryBase.md#select)
+`StandardQueryBase.select`

***

@@ -659,7 +663,7 @@ Collect the results as an array of objects.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`toArray`](QueryBase.md#toarray)
+`StandardQueryBase.toArray`

***

@@ -685,7 +689,7 @@ ArrowTable.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`toArrow`](QueryBase.md#toarrow)
+`StandardQueryBase.toArrow`

***

@@ -720,7 +724,7 @@ on the filter column(s).

#### Inherited from

-[`QueryBase`](QueryBase.md).[`where`](QueryBase.md#where)
+`StandardQueryBase.where`

***

@@ -742,4 +746,4 @@ order to perform hybrid search.

#### Inherited from

-[`QueryBase`](QueryBase.md).[`withRowId`](QueryBase.md#withrowid)
+`StandardQueryBase.withRowId`

@@ -33,6 +33,7 @@

- [Table](classes/Table.md)
- [TagContents](classes/TagContents.md)
- [Tags](classes/Tags.md)
+- [TakeQuery](classes/TakeQuery.md)
- [VectorColumnOptions](classes/VectorColumnOptions.md)
- [VectorQuery](classes/VectorQuery.md)

@@ -44,3 +44,17 @@ optional readTimeout: number;

The timeout for reading data from the server in seconds. Default is 300
seconds (5 minutes). This can also be set via the environment variable
`LANCE_CLIENT_READ_TIMEOUT`, as an integer number of seconds.

+***
+
+### timeout?
+
+```ts
+optional timeout: number;
+```
+
+The overall timeout for the entire request in seconds. This includes
+connection, send, and read time. If the entire request doesn't complete
+within this time, it will fail. Default is None (no overall timeout).
+This can also be set via the environment variable `LANCE_CLIENT_TIMEOUT`,
+as an integer number of seconds.
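
Since both timeouts can come from the environment, a deployment can tune them without code changes; a minimal sketch (values are illustrative, and the variables are assumed to be read when the client is constructed):

```ts
// Fail any whole request after 10 minutes, and any single read after 5.
process.env.LANCE_CLIENT_TIMEOUT = "600";
process.env.LANCE_CLIENT_READ_TIMEOUT = "300";
```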
@@ -8,7 +8,7 @@

<parent>
  <groupId>com.lancedb</groupId>
  <artifactId>lancedb-parent</artifactId>
- <version>0.21.2-beta.1</version>
+ <version>0.21.2-final.0</version>
  <relativePath>../pom.xml</relativePath>
</parent>

@@ -8,7 +8,7 @@

<parent>
  <groupId>com.lancedb</groupId>
  <artifactId>lancedb-parent</artifactId>
- <version>0.21.2-beta.1</version>
+ <version>0.21.2-final.0</version>
  <relativePath>../pom.xml</relativePath>
</parent>

@@ -6,7 +6,7 @@

<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
-<version>0.21.2-beta.1</version>
+<version>0.21.2-final.0</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>

@@ -1,22 +0,0 @@
module.exports = {
  env: {
    browser: true,
    es2021: true
  },
  extends: 'standard-with-typescript',
  overrides: [
  ],
  parserOptions: {
    project: './tsconfig.json',
    ecmaVersion: 'latest',
    sourceType: 'module'
  },
  rules: {
    "@typescript-eslint/method-signature-style": "off",
    "@typescript-eslint/quotes": "off",
    "@typescript-eslint/semi": "off",
    "@typescript-eslint/explicit-function-return-type": "off",
    "@typescript-eslint/space-before-function-paren": "off",
    "@typescript-eslint/indent": "off",
  }
}
@@ -1,4 +0,0 @@
gen_test_data.py
index.node
dist/lancedb*.tgz
vectordb*.tgz
@@ -1,64 +0,0 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.5] - 2023-06-00

### Added

- Support for macOS X86

## [0.1.4] - 2023-06-03

### Added

- Select / Project query API

### Changed

- Deprecated created_index in favor of createIndex

## [0.1.3] - 2023-06-01

### Added

- Support S3 and Google Cloud Storage
- Embedding functions support
- OpenAI embedding function

## [0.1.2] - 2023-05-27

### Added

- Append records API
- Extra query params to the nodejs client
- Create_index API

### Fixed

- bugfix: string columns should be converted to Utf8Array (#94)

## [0.1.1] - 2023-05-16

### Added

- create_table API
- limit parameter for queries
- Typescript / JavaScript examples
- Linux support

## [0.1.0] - 2023-05-16

### Added

- Initial JavaScript / Node.js library for LanceDB
- Read-only api to query LanceDB datasets
- Supports macOS arm only

## [pre-0.1.0]

- Various prototypes / test builds
@@ -1,66 +0,0 @@
# LanceDB

A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb).

**DEPRECATED: This library is deprecated. Please use the new client,
[@lancedb/lancedb](https://www.npmjs.com/package/@lancedb/lancedb).**

## Installation

```bash
npm install vectordb
```

This will download the appropriate native library for your platform. We currently
support:

* Linux (x86_64 and aarch64)
* MacOS (Intel and ARM/M1/M2)
* Windows (x86_64 only)

We do not yet support musl-based Linux (such as Alpine Linux) or aarch64 Windows.

## Usage

### Basic Example

```javascript
const lancedb = require('vectordb');
const db = await lancedb.connect('data/sample-lancedb');
const table = await db.createTable("my_table",
  [{ id: 1, vector: [0.1, 1.0], item: "foo", price: 10.0 },
   { id: 2, vector: [3.9, 0.5], item: "bar", price: 20.0 }])
const results = await table.search([0.1, 0.3]).limit(20).execute();
console.log(results);
```

The [examples](./examples) folder contains complete examples.

## Development

To build everything fresh:

```bash
npm install
npm run build
```

Then you should be able to run the tests with:

```bash
npm test
```

### Fix lints

To run the linter and have it automatically fix all errors

```bash
npm run lint -- --fix
```

To build documentation

```bash
npx typedoc --plugin typedoc-plugin-markdown --out ../docs/src/javascript src/index.ts
```
@@ -1,41 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

async function example () {
  const lancedb = require('vectordb')
  // You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable
  const apiKey = process.env.OPENAI_API_KEY
  // The embedding function will create embeddings for the 'text' column
  const embedding = new lancedb.OpenAIEmbeddingFunction('text', apiKey)

  const db = await lancedb.connect('data/sample-lancedb')

  const data = [
    { id: 1, text: 'Black T-Shirt', price: 10 },
    { id: 2, text: 'Leather Jacket', price: 50 }
  ]

  const table = await db.createTable('vectors', data, embedding)
  console.log(await db.tableNames())

  const results = await table
    .search('keeps me warm')
    .limit(1)
    .execute()
  console.log(results[0].text)
}

example().then(_ => { console.log('All done!') })
@@ -1,15 +0,0 @@
{
  "name": "vectordb-example-js-openai",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "vectordb": "file:../..",
    "openai": "^3.2.1"
  }
}
@@ -1,66 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

async function example() {
  const lancedb = require('vectordb')

  // Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2)
  const { pipeline } = await import('@xenova/transformers')
  const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');

  // Create embedding function from pipeline which returns a list of vectors from batch
  // sourceColumn is the name of the column in the data to be embedded
  //
  // Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector
  const embed_fun = {}
  embed_fun.sourceColumn = 'text'
  embed_fun.embed = async function (batch) {
    let result = []
    for (let text of batch) {
      const res = await pipe(text, { pooling: 'mean', normalize: true })
      result.push(Array.from(res['data']))
    }
    return (result)
  }

  // Link a folder and create a table with data
  const db = await lancedb.connect('data/sample-lancedb')

  const data = [
    { id: 1, text: 'Cherry', type: 'fruit' },
    { id: 2, text: 'Carrot', type: 'vegetable' },
    { id: 3, text: 'Potato', type: 'vegetable' },
    { id: 4, text: 'Apple', type: 'fruit' },
    { id: 5, text: 'Banana', type: 'fruit' }
  ]

  const table = await db.createTable('food_table', data, embed_fun)

  // Query the table
  const results = await table
    .search("a sweet fruit to eat")
    .metricType("cosine")
    .limit(2)
    .execute()
  console.log(results.map(r => r.text))
}

example().then(_ => { console.log("Done!") })
@@ -1,16 +0,0 @@
{
  "name": "vectordb-example-js-transformers",
  "version": "1.0.0",
  "description": "Example for using transformers.js with lancedb",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "@xenova/transformers": "^2.4.1",
    "vectordb": "file:../.."
  }
}
@@ -1,122 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

const lancedb = require('vectordb')
const fs = require('fs/promises')
const readline = require('readline/promises')
const { stdin: input, stdout: output } = require('process')
const { Configuration, OpenAIApi } = require('openai')

// Download file from XYZ
const INPUT_FILE_NAME = 'data/youtube-transcriptions_sample.jsonl';

(async () => {
  // You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable
  const apiKey = process.env.OPENAI_API_KEY
  // The embedding function will create embeddings for the 'context' column
  const embedFunction = new lancedb.OpenAIEmbeddingFunction('context', apiKey)

  // Connects to LanceDB
  const db = await lancedb.connect('data/youtube-lancedb')

  // Open the vectors table or create one if it does not exist
  let tbl
  if ((await db.tableNames()).includes('vectors')) {
    tbl = await db.openTable('vectors', embedFunction)
  } else {
    tbl = await createEmbeddingsTable(db, embedFunction)
  }

  // Use the OpenAI Completion API to generate an answer based on the context that LanceDB provides
  const configuration = new Configuration({ apiKey })
  const openai = new OpenAIApi(configuration)
  const rl = readline.createInterface({ input, output })
  try {
    while (true) {
      const query = await rl.question('Prompt: ')
      const results = await tbl
        .search(query)
        .select(['title', 'text', 'context'])
        .limit(3)
        .execute()

      // console.table(results)

      const response = await openai.createCompletion({
        model: 'text-davinci-003',
        prompt: createPrompt(query, results),
        max_tokens: 400,
        temperature: 0,
        top_p: 1,
        frequency_penalty: 0,
        presence_penalty: 0
      })
      console.log(response.data.choices[0].text)
    }
  } catch (err) {
    console.log('Error: ', err)
  } finally {
    rl.close()
  }
  process.exit(1)
})()

async function createEmbeddingsTable (db, embedFunction) {
  console.log(`Creating embeddings from ${INPUT_FILE_NAME}`)
  // read the input file into a JSON array, skipping empty lines
  const lines = (await fs.readFile(INPUT_FILE_NAME, 'utf-8'))
    .toString()
    .split('\n')
    .filter(line => line.length > 0)
    .map(line => JSON.parse(line))

  const data = contextualize(lines, 20, 'video_id')
  return await db.createTable('vectors', data, embedFunction)
}

// Each transcript has a small text column; we include previous transcripts in order to
// have more context information when creating embeddings
function contextualize (rows, contextSize, groupColumn) {
  const grouped = []
  rows.forEach(row => {
    if (!grouped[row[groupColumn]]) {
      grouped[row[groupColumn]] = []
    }
    grouped[row[groupColumn]].push(row)
  })

  const data = []
  Object.keys(grouped).forEach(key => {
    for (let i = 0; i < grouped[key].length; i++) {
      const start = i - contextSize > 0 ? i - contextSize : 0
      grouped[key][i].context = grouped[key].slice(start, i + 1).map(r => r.text).join(' ')
    }
    data.push(...grouped[key])
  })
  return data
}

// Creates a prompt by aggregating all relevant contexts
function createPrompt (query, context) {
  let prompt =
    'Answer the question based on the context below.\n\n' +
    'Context:\n'

  // need to make sure our prompt is not larger than max size
  prompt = prompt + context.map(c => c.context).join('\n\n---\n\n').substring(0, 3750)
  prompt = prompt + `\n\nQuestion: ${query}\nAnswer:`
  return prompt
}
@@ -1,15 +0,0 @@
{
  "name": "vectordb-example-js-openai",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "vectordb": "file:../..",
    "openai": "^3.2.1"
  }
}
@@ -1,36 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

async function example () {
  const lancedb = require('vectordb')
  const db = await lancedb.connect('data/sample-lancedb')

  const data = [
    { id: 1, vector: [0.1, 0.2], price: 10 },
    { id: 2, vector: [1.1, 1.2], price: 50 }
  ]

  const table = await db.createTable('vectors', data)
  console.log(await db.tableNames())

  const results = await table
    .search([0.1, 0.3])
    .limit(20)
    .execute()
  console.log(results)
}

example()
@@ -1,14 +0,0 @@
{
  "name": "vectordb-example-js",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "vectordb": "file:../.."
  }
}
@@ -1,22 +0,0 @@
{
  "name": "vectordb-example-ts",
  "version": "1.0.0",
  "description": "",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "tsc": "tsc -b",
    "build": "tsc"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "devDependencies": {
    "@types/node": "^18.16.2",
    "ts-node": "^10.9.1",
    "ts-node-dev": "^2.0.0",
    "typescript": "*"
  },
  "dependencies": {
    "vectordb": "file:../.."
  }
}
@@ -1,35 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import * as vectordb from 'vectordb';

async function example () {
  const db = await vectordb.connect('data/sample-lancedb')

  const data = [
    { id: 1, vector: [0.1, 0.2], price: 10 },
    { id: 2, vector: [1.1, 1.2], price: 50 }
  ]

  const table = await db.createTable('vectors', data)
  console.log(await db.tableNames())

  const results = await table
    .search([0.1, 0.3])
    .limit(20)
    .execute()
  console.log(results)
}

example().then(_ => { console.log("All done!") })
@@ -1,10 +0,0 @@
{
  "include": ["src/**/*.ts"],
  "compilerOptions": {
    "target": "es2016",
    "module": "commonjs",
    "declaration": true,
    "outDir": "./dist",
    "strict": true
  }
}
@@ -1,36 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

const { currentTarget } = require('@neon-rs/load')

let nativeLib

try {
  // When developing locally, give preference to the locally built library
  nativeLib = require('./index.node')
} catch {
  try {
    nativeLib = require(`@lancedb/vectordb-${currentTarget()}`)
  } catch (e) {
    throw new Error(`vectordb: failed to load native library.
  You may need to run \`npm install @lancedb/vectordb-${currentTarget()}\`.

  If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues

  Source error: ${e}`)
  }
}

// Dynamic require for runtime.
module.exports = nativeLib
node/package-lock.json (generated, 5234 lines): file diff suppressed because it is too large.
@@ -1,98 +0,0 @@
{
  "name": "vectordb",
  "version": "0.21.2-beta.1",
  "description": " Serverless, low-latency vector database for AI applications",
  "private": false,
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "tsc": "tsc -b",
    "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb_node index.node -- cargo build -p lancedb-node --message-format=json",
    "build-release": "npm run build -- --release",
    "test": "npm run tsc && mocha -recursive dist/test",
    "integration-test": "npm run tsc && mocha -recursive dist/integration_test",
    "lint": "eslint native.js src --ext .js,.ts",
    "clean": "rm -rf node_modules *.node dist/",
    "pack-build": "neon pack-build",
    "check-npm": "printenv && which node && which npm && npm --version"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/lancedb/lancedb.git"
  },
  "homepage": "https://lancedb.github.io/lancedb/",
  "bugs": {
    "url": "https://github.com/lancedb/lancedb/issues"
  },
  "keywords": [
    "data-format",
    "data-science",
    "machine-learning",
    "data-analytics"
  ],
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "devDependencies": {
    "@neon-rs/cli": "^0.0.160",
    "@types/chai": "^4.3.4",
    "@types/chai-as-promised": "^7.1.5",
    "@types/mocha": "^10.0.1",
    "@types/node": "^18.16.2",
    "@types/sinon": "^10.0.15",
    "@types/temp": "^0.9.1",
    "@types/uuid": "^9.0.3",
    "@typescript-eslint/eslint-plugin": "^5.59.1",
    "apache-arrow-old": "npm:apache-arrow@13.0.0",
    "cargo-cp-artifact": "^0.1",
    "chai": "^4.3.7",
    "chai-as-promised": "^7.1.1",
    "eslint": "^8.39.0",
    "eslint-config-standard-with-typescript": "^34.0.1",
    "eslint-plugin-import": "^2.26.0",
    "eslint-plugin-n": "^15.7.0",
    "eslint-plugin-promise": "^6.1.1",
    "mocha": "^10.2.0",
    "openai": "^4.24.1",
    "sinon": "^15.1.0",
    "temp": "^0.9.4",
    "ts-node": "^10.9.1",
    "ts-node-dev": "^2.0.0",
    "typedoc": "^0.24.7",
    "typedoc-plugin-markdown": "^3.15.3",
    "typescript": "^5.1.0",
    "uuid": "^9.0.0"
  },
  "dependencies": {
    "@neon-rs/load": "^0.0.74",
    "axios": "^1.4.0"
  },
  "peerDependencies": {
    "@apache-arrow/ts": "^14.0.2",
    "apache-arrow": "^14.0.2"
  },
  "os": [
    "darwin",
    "linux",
    "win32"
  ],
  "cpu": [
    "x64",
    "arm64"
  ],
  "neon": {
    "targets": {
      "x86_64-apple-darwin": "@lancedb/vectordb-darwin-x64",
      "aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
      "x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
      "aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
      "x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc"
    }
  },
  "optionalDependencies": {
    "@lancedb/vectordb-darwin-x64": "0.21.2-beta.1",
    "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.1",
    "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.1",
    "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.1",
    "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.1"
  }
}
@@ -1,635 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import {
  Field,
  makeBuilder,
  RecordBatchFileWriter,
  Utf8,
  type Vector,
  FixedSizeList,
  vectorFromArray,
  Schema,
  Table as ArrowTable,
  RecordBatchStreamWriter,
  List,
  RecordBatch,
  makeData,
  Struct,
  type Float,
  DataType,
  Binary,
  Float32
} from "apache-arrow";
import { type EmbeddingFunction } from "./index";
import { sanitizeSchema } from "./sanitize";
/*
 * Options to control how a column should be converted to a vector array
 */
export class VectorColumnOptions {
  /** Vector column type. */
  type: Float = new Float32();

  constructor(values?: Partial<VectorColumnOptions>) {
    Object.assign(this, values);
  }
}

/** Options to control the makeArrowTable call. */
export class MakeArrowTableOptions {
  /*
   * Schema of the data.
   *
   * If this is not provided then the data type will be inferred from the
   * JS type. Integer numbers will become int64, floating point numbers
   * will become float64 and arrays will become variable sized lists with
   * the data type inferred from the first element in the array.
   *
   * The schema must be specified if there are no records (e.g. to make
   * an empty table)
   */
  schema?: Schema;

  /*
   * Mapping from vector column name to expected type
   *
   * Lance expects vector columns to be fixed size list arrays (i.e. tensors)
   * However, `makeArrowTable` will not infer this by default (it creates
   * variable size list arrays). This field can be used to indicate that a column
   * should be treated as a vector column and converted to a fixed size list.
   *
   * The keys should be the names of the vector columns. The value specifies the
   * expected data type of the vector columns.
   *
   * If `schema` is provided then this field is ignored.
   *
   * By default, the column named "vector" will be assumed to be a float32
   * vector column.
   */
  vectorColumns: Record<string, VectorColumnOptions> = {
    vector: new VectorColumnOptions()
  };

  embeddings?: EmbeddingFunction<any>;

  /**
   * If true then string columns will be encoded with dictionary encoding
   *
   * Set this to true if your string columns tend to repeat the same values
   * often. For more precise control use the `schema` property to specify the
   * data type for individual columns.
   *
   * If `schema` is provided then this property is ignored.
   */
  dictionaryEncodeStrings: boolean = false;

  constructor(values?: Partial<MakeArrowTableOptions>) {
    Object.assign(this, values);
  }
}
/**
 * An enhanced version of the {@link makeTable} function from Apache Arrow
 * that supports nested fields and embeddings columns.
 *
 * This function converts an array of Record<String, any> (row-major JS objects)
 * to an Arrow Table (a columnar structure)
 *
 * Note that it currently does not support nulls.
 *
 * If a schema is provided then it will be used to determine the resulting array
 * types. Fields will also be reordered to fit the order defined by the schema.
 *
 * If a schema is not provided then the types will be inferred and the field order
 * will be controlled by the order of properties in the first record.
 *
 * If the input is empty then a schema must be provided to create an empty table.
 *
 * When a schema is not specified then data types will be inferred. The inference
 * rules are as follows:
 *
 * - boolean => Bool
 * - number => Float64
 * - String => Utf8
 * - Buffer => Binary
 * - Record<String, any> => Struct
 * - Array<any> => List
 *
 * @param data input data
 * @param options options to control the makeArrowTable call.
 *
 * @example
 *
 * ```ts
 * import { fromTableToBuffer, makeArrowTable } from "../arrow";
 * import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
 *
 * const schema = new Schema([
 *   new Field("a", new Int32()),
 *   new Field("b", new Float32()),
 *   new Field("c", new FixedSizeList(3, new Field("item", new Float16()))),
 * ]);
 * const table = makeArrowTable([
 *   { a: 1, b: 2, c: [1, 2, 3] },
 *   { a: 4, b: 5, c: [4, 5, 6] },
 *   { a: 7, b: 8, c: [7, 8, 9] },
 * ], { schema });
 * ```
 *
 * By default it assumes that the column named `vector` is a vector column
 * and it will be converted into a fixed size list array of type float32.
 * The `vectorColumns` option can be used to support other vector column
 * names and data types.
 *
 * ```ts
 * const schema = new Schema([
 *   new Field("a", new Float64()),
 *   new Field("b", new Float64()),
 *   new Field(
 *     "vector",
 *     new FixedSizeList(3, new Field("item", new Float32()))
 *   ),
 * ]);
 * const table = makeArrowTable([
 *   { a: 1, b: 2, vector: [1, 2, 3] },
 *   { a: 4, b: 5, vector: [4, 5, 6] },
 *   { a: 7, b: 8, vector: [7, 8, 9] },
 * ]);
 * assert.deepEqual(table.schema, schema);
 * ```
 *
 * You can specify the vector column types and names using the options as well
 *
 * ```typescript
 * const schema = new Schema([
 *   new Field('a', new Float64()),
 *   new Field('b', new Float64()),
 *   new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
 *   new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
 * ]);
 * const table = makeArrowTable([
 *   { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
 *   { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
 *   { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
 * ], {
 *   vectorColumns: {
 *     vec1: { type: new Float16() },
 *     vec2: { type: new Float16() }
 *   }
 * })
 * assert.deepEqual(table.schema, schema)
 * ```
 */
export function makeArrowTable(
  data: Array<Record<string, any>>,
  options?: Partial<MakeArrowTableOptions>
): ArrowTable {
  if (
    data.length === 0 &&
    (options?.schema === undefined || options?.schema === null)
  ) {
    throw new Error("At least one record or a schema needs to be provided");
  }

  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
  if (opt.schema !== undefined && opt.schema !== null) {
    opt.schema = sanitizeSchema(opt.schema);
    opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
  }

  const columns: Record<string, Vector> = {};
  // TODO: sample dataset to find missing columns
  // Prefer the field ordering of the schema, if present
  const columnNames =
    opt.schema != null ? (opt.schema.names as string[]) : Object.keys(data[0]);
  for (const colName of columnNames) {
    if (
      data.length !== 0 &&
      !Object.prototype.hasOwnProperty.call(data[0], colName)
    ) {
      // The field is present in the schema, but not in the data, skip it
      continue;
    }
    // Extract a single column from the records (transpose from row-major to col-major)
    let values = data.map((datum) => datum[colName]);

    // By default (type === undefined) arrow will infer the type from the JS type
    let type;
    if (opt.schema !== undefined) {
      // If there is a schema provided, then use that for the type instead
      type = opt.schema?.fields.filter((f) => f.name === colName)[0]?.type;
      if (DataType.isInt(type) && type.bitWidth === 64) {
        // wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
        values = values.map((v) => {
          if (v === null) {
            return v;
          }
          return BigInt(v);
        });
      }
    } else {
      // Otherwise, check to see if this column is one of the vector columns
      // defined by opt.vectorColumns and, if so, use the fixed size list type
      const vectorColumnOptions = opt.vectorColumns[colName];
      if (vectorColumnOptions !== undefined) {
        type = newVectorType(values[0].length, vectorColumnOptions.type);
      }
    }

    try {
      // Convert an Array of JS values to an arrow vector
      columns[colName] = makeVector(values, type, opt.dictionaryEncodeStrings);
    } catch (error: unknown) {
      // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
      throw Error(`Could not convert column "${colName}" to Arrow: ${error}`);
    }
  }

  if (opt.schema != null) {
    // `new ArrowTable(columns)` infers a schema which may sometimes have
    // incorrect nullability (it assumes nullable=true if there are 0 rows)
    //
    // `new ArrowTable(schema, columns)` will also fail because it will create a
    // batch with an inferred schema and then complain that the batch schema
    // does not match the provided schema.
    //
    // To work around this we first create a table with the wrong schema and
    // then patch the schema of the batches so we can use
    // `new ArrowTable(schema, batches)` which does not do any schema inference
    const firstTable = new ArrowTable(columns);
    const batchesFixed = firstTable.batches.map(
      // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
      (batch) => new RecordBatch(opt.schema!, batch.data)
    );
    return new ArrowTable(opt.schema, batchesFixed);
  } else {
    return new ArrowTable(columns);
  }
}
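To make the inference rules above concrete, here is a small sketch (the local import path is an assumption, matching the docstring's convention) of what `makeArrowTable` infers when no schema is given:

import { makeArrowTable } from "./arrow";

// No schema: number => Float64, string => Utf8, Buffer => Binary,
// nested array => List, and "vector" => FixedSizeList<Float32> by default.
const inferred = makeArrowTable([
  { id: 1, name: "a", payload: Buffer.from([0x01]), tags: ["x", "y"], vector: [0.1, 0.2] },
  { id: 2, name: "b", payload: Buffer.from([0x02]), tags: ["z"], vector: [0.3, 0.4] }
]);
console.log(inferred.schema.fields.map((f) => `${f.name}: ${f.type}`));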
/**
 * Create an empty Arrow table with the provided schema
 */
export function makeEmptyTable(schema: Schema): ArrowTable {
  return makeArrowTable([], { schema });
}

// Helper function to convert Array<Array<any>> to a variable sized list array
function makeListVector(lists: any[][]): Vector<any> {
  if (lists.length === 0 || lists[0].length === 0) {
    throw Error("Cannot infer list vector from empty array or empty list");
  }
  const sampleList = lists[0];
  let inferredType;
  try {
    const sampleVector = makeVector(sampleList);
    inferredType = sampleVector.type;
  } catch (error: unknown) {
    // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
    throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`);
  }

  const listBuilder = makeBuilder({
    type: new List(new Field("item", inferredType, true))
  });
  for (const list of lists) {
    listBuilder.append(list);
  }
  return listBuilder.finish().toVector();
}
// Helper function to convert an Array of JS values to an Arrow Vector
function makeVector(
  values: any[],
  type?: DataType,
  stringAsDictionary?: boolean
): Vector<any> {
  if (type !== undefined) {
    // No need for inference, let Arrow create it
    return vectorFromArray(values, type);
  }
  if (values.length === 0) {
    throw Error(
      "makeVector requires at least one value or the type must be specified"
    );
  }
  const sampleValue = values.find((val) => val !== null && val !== undefined);
  if (sampleValue === undefined) {
    throw Error(
      "makeVector cannot infer the type if all values are null or undefined"
    );
  }
  if (Array.isArray(sampleValue)) {
    // Default Arrow inference doesn't handle list types
    return makeListVector(values);
  } else if (Buffer.isBuffer(sampleValue)) {
    // Default Arrow inference doesn't handle Buffer
    return vectorFromArray(values, new Binary());
  } else if (
    !(stringAsDictionary ?? false) &&
    (typeof sampleValue === "string" || sampleValue instanceof String)
  ) {
    // If the type is string then don't use Arrow's default inference unless dictionaries are requested
    // because it will always use dictionary encoding for strings
    return vectorFromArray(values, new Utf8());
  } else {
    // Convert a JS array of values to an arrow vector
    return vectorFromArray(values);
  }
}
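The string branch above exists because Arrow's own inference dictionary-encodes strings. A quick sketch of the difference, using only public apache-arrow calls:

import { vectorFromArray, Utf8 } from "apache-arrow";

const names = ["alpha", "alpha", "beta"];
// Arrow's default inference dictionary-encodes string input...
const dictEncoded = vectorFromArray(names);
// ...while forcing Utf8 keeps a plain variable-length string column,
// which is what makeVector does unless stringAsDictionary is set.
const plain = vectorFromArray(names, new Utf8());
console.log(dictEncoded.type.toString(), "vs", plain.type.toString());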
async function applyEmbeddings<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<ArrowTable> {
  if (embeddings == null) {
    return table;
  }
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }

  // Convert from ArrowTable to Record<String, Vector>
  const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
    const name = table.schema.fields[idx].name;
    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
    const vec = table.getChildAt(idx)!;
    return [name, vec];
  });
  const newColumns = Object.fromEntries(colEntries);

  const sourceColumn = newColumns[embeddings.sourceColumn];
  const destColumn = embeddings.destColumn ?? "vector";
  const innerDestType = embeddings.embeddingDataType ?? new Float32();
  if (sourceColumn === undefined) {
    throw new Error(
      `Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`
    );
  }

  if (table.numRows === 0) {
    if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
      // We have an empty table and it already has the embedding column so no work needs to be done
      // Note: we don't raise an error like we do below because this is a common occurrence. For example,
      // if we call convertToTable with 0 records and a schema that includes the embedding column
      return table;
    }
    if (embeddings.embeddingDimension !== undefined) {
      const destType = newVectorType(
        embeddings.embeddingDimension,
        innerDestType
      );
      newColumns[destColumn] = makeVector([], destType);
    } else if (schema != null) {
      const destField = schema.fields.find((f) => f.name === destColumn);
      if (destField != null) {
        newColumns[destColumn] = makeVector([], destField.type);
      } else {
        throw new Error(
          `Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`
        );
      }
    } else {
      throw new Error(
        "Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`"
      );
    }
  } else {
    if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
      throw new Error(
        `Attempt to apply embeddings to table failed because column ${destColumn} already existed`
      );
    }
    if (table.batches.length > 1) {
      throw new Error(
        "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch"
      );
    }
    const values = sourceColumn.toArray();
    const vectors = await embeddings.embed(values as T[]);
    if (vectors.length !== values.length) {
      throw new Error(
        "Embedding function did not return an embedding for each input element"
      );
    }
    const destType = newVectorType(vectors[0].length, innerDestType);
    newColumns[destColumn] = makeVector(vectors, destType);
  }

  const newTable = new ArrowTable(newColumns);
  if (schema != null) {
    if (schema.fields.find((f) => f.name === destColumn) === undefined) {
      throw new Error(
        `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`
      );
    }
    return alignTable(newTable, schema);
  }
  return newTable;
}
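The empty-table branch above matters when a table is created from a schema alone. A sketch of the case it enables, going through the exported helpers from this file with a toy embedding function (the two-dimensional embedder and local import path are assumptions for illustration):

import { Schema, Field, Float32, Utf8, FixedSizeList } from "apache-arrow";
import { makeEmptyTable, fromTableToBuffer } from "./arrow";

// Toy embedding function; embeddingDimension lets the empty-table path succeed.
const toyEmbedding = {
  sourceColumn: "text",
  embeddingDimension: 2,
  embed: async (data: string[]) => data.map(() => [0.0, 0.0])
};

const schema = new Schema([
  new Field("text", new Utf8()),
  new Field("vector", new FixedSizeList(2, new Field("item", new Float32(), true)))
]);

// Zero rows, but the destination column type can still be built from
// embeddingDimension (or recovered from the schema). Run inside an async context.
const empty = makeEmptyTable(schema);
const buf = await fromTableToBuffer(empty, toyEmbedding, schema);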
/*
 * Convert an Array of records into an Arrow Table, optionally applying an
 * embeddings function to it.
 *
 * This function calls `makeArrowTable` first to create the Arrow Table.
 * Any provided `makeTableOptions` (e.g. a schema) will be passed on to
 * that call.
 *
 * The embedding function will be passed a column of values (based on the
 * `sourceColumn` of the embedding function) and expects to receive back
 * number[][] which will be converted into a fixed size list column. By
 * default this will be a fixed size list of Float32 but that can be
 * customized by the `embeddingDataType` property of the embedding function.
 *
 * If a schema is provided in `makeTableOptions` then it should include the
 * embedding columns. If no schema is provided then embedding columns will
 * be placed at the end of the table, after all of the input columns.
 */
export async function convertToTable<T>(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunction<T>,
  makeTableOptions?: Partial<MakeArrowTableOptions>
): Promise<ArrowTable> {
  const table = makeArrowTable(data, makeTableOptions);
  return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
}
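A short usage sketch for `convertToTable`, assuming a caller supplies its own `EmbeddingFunction` (the length-based embedder here is a stand-in for a real model):

const embedder = {
  sourceColumn: "text",
  // A stand-in embedder: real implementations call a model here.
  embed: async (data: string[]) => data.map((s) => [s.length, 0.5])
};

// "text" is read, embedded, and a new "vector" FixedSizeList<Float32>
// column is appended after the input columns.
const table = await convertToTable(
  [{ text: "hello" }, { text: "world" }],
  embedder
);
console.log(table.schema.names); // [ 'text', 'vector' ]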
// Creates the Arrow Type for a Vector column with dimension `dim`
function newVectorType<T extends Float>(
  dim: number,
  innerType: T
): FixedSizeList<T> {
  // Somewhere we always default to have the elements nullable, so we need to set it to true
  // otherwise we often get schema mismatches because the stored data always has schema with nullable elements
  const children = new Field<T>("item", innerType, true);
  return new FixedSizeList(dim, children);
}
/**
 * Serialize an Array of records into a buffer using the Arrow IPC File serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
export async function fromRecordsToBuffer<T>(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }
  const table = await convertToTable(data, embeddings, { schema, embeddings });
  const writer = RecordBatchFileWriter.writeAll(table);
  return Buffer.from(await writer.toUint8Array());
}
/**
 * Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
export async function fromRecordsToStreamBuffer<T>(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== null && schema !== undefined) {
    schema = sanitizeSchema(schema);
  }
  const table = await convertToTable(data, embeddings, { schema });
  const writer = RecordBatchStreamWriter.writeAll(table);
  return Buffer.from(await writer.toUint8Array());
}
/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
export async function fromTableToBuffer<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== null && schema !== undefined) {
    schema = sanitizeSchema(schema);
  }
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
  const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings);
  return Buffer.from(await writer.toUint8Array());
}
/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
export async function fromTableToStreamBuffer<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== null && schema !== undefined) {
    schema = sanitizeSchema(schema);
  }
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
  const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
  return Buffer.from(await writer.toUint8Array());
}
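The file and stream writers above produce standard Arrow IPC bytes, so the result can be read straight back with `tableFromIPC`. A minimal roundtrip sketch:

import { tableFromIPC } from "apache-arrow";

const table = makeArrowTable([{ vector: [1, 2], id: 1 }]);
// Serialize with the IPC stream format, then parse it straight back.
const buffer = await fromTableToStreamBuffer(table);
const roundtripped = tableFromIPC(buffer);
console.log(roundtripped.numRows); // 1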
function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
  const alignedChildren = [];
  for (const field of schema.fields) {
    const indexInBatch = batch.schema.fields?.findIndex(
      (f) => f.name === field.name
    );
    if (indexInBatch < 0) {
      throw new Error(
        `The column ${field.name} was not found in the Arrow Table`
      );
    }
    alignedChildren.push(batch.data.children[indexInBatch]);
  }
  const newData = makeData({
    type: new Struct(schema.fields),
    length: batch.numRows,
    nullCount: batch.nullCount,
    children: alignedChildren
  });
  return new RecordBatch(schema, newData);
}

function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
  const alignedBatches = table.batches.map((batch) =>
    alignBatch(batch, schema)
  );
  return new ArrowTable(schema, alignedBatches);
}

// Creates an empty Arrow Table
export function createEmptyTable(schema: Schema): ArrowTable {
  return new ArrowTable(sanitizeSchema(schema));
}
function validateSchemaEmbeddings(
  schema: Schema<any>,
  data: Array<Record<string, unknown>>,
  embeddings: EmbeddingFunction<any> | undefined
) {
  const fields = [];
  const missingEmbeddingFields = [];

  // First we check if the field is a `FixedSizeList`
  // Then we check if the data contains the field
  // if it does not, we add it to the list of missing embedding fields
  // Finally, we check whether an embedding function was provided for those
  // missing embedding fields; if it was not, we throw an error
  for (const field of schema.fields) {
    if (field.type instanceof FixedSizeList) {
      if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
        missingEmbeddingFields.push(field);
      } else {
        fields.push(field);
      }
    } else {
      fields.push(field);
    }
  }

  if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
    throw new Error(
      `Table has embeddings: "${missingEmbeddingFields
        .map((f) => f.name)
        .join(",")}", but no embedding function was provided`
    );
  }

  return new Schema(fields, schema.metadata);
}
@@ -1,68 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { type Float } from 'apache-arrow'

/**
 * An embedding function that automatically creates vector representation for a given column.
 */
export interface EmbeddingFunction<T> {
  /**
   * The name of the column that will be used as input for the Embedding Function.
   */
  sourceColumn: string

  /**
   * The data type of the embedding
   *
   * The embedding function should return numbers. These will be converted into
   * an Arrow float array. By default this will be Float32 but this property can
   * be used to control the conversion.
   */
  embeddingDataType?: Float

  /**
   * The dimension of the embedding
   *
   * This is optional, normally this can be determined by looking at the results of
   * `embed`. If this is not specified, and there is an attempt to apply the embedding
   * to an empty table, then that process will fail.
   */
  embeddingDimension?: number

  /**
   * The name of the column that will contain the embedding
   *
   * By default this is "vector"
   */
  destColumn?: string

  /**
   * Should the source column be excluded from the resulting table
   *
   * By default the source column is included. Set this to true and
   * only the embedding will be stored.
   */
  excludeSource?: boolean

  /**
   * Creates a vector representation for the given values.
   */
  embed: (data: T[]) => Promise<number[][]>
}

export function isEmbeddingFunction<T> (value: any): value is EmbeddingFunction<T> {
  return typeof value.sourceColumn === 'string' &&
    typeof value.embed === 'function'
}
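`isEmbeddingFunction` is a structural type guard, so any object with a `sourceColumn` string and an `embed` function passes. A small usage sketch:

const candidate: any = {
  sourceColumn: 'text',
  embed: async (data: string[]) => data.map(() => [0])
}

if (isEmbeddingFunction<string>(candidate)) {
  // Inside this branch the value is typed as EmbeddingFunction<string>.
  void candidate.embed(['hello'])
}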
@@ -1,57 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { type EmbeddingFunction } from '../index'
import type OpenAI from 'openai'

export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
  private readonly _openai: OpenAI
  private readonly _modelName: string

  constructor (sourceColumn: string, openAIKey: string, modelName: string = 'text-embedding-ada-002') {
    /**
     * @type {import("openai").default}
     */
    let Openai
    try {
      // eslint-disable-next-line @typescript-eslint/no-var-requires
      Openai = require('openai')
    } catch {
      throw new Error('please install openai@^4.24.1 using npm install openai')
    }

    this.sourceColumn = sourceColumn
    const configuration = {
      apiKey: openAIKey
    }

    this._openai = new Openai(configuration)
    this._modelName = modelName
  }

  async embed (data: string[]): Promise<number[][]> {
    const response = await this._openai.embeddings.create({
      model: this._modelName,
      input: data
    })

    const embeddings: number[][] = []
    for (let i = 0; i < response.data.length; i++) {
      embeddings.push(response.data[i].embedding)
    }
    return embeddings
  }

  sourceColumn: string
}
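Usage is a two-step affair: construct the function with the source column and API key, then let `embed` batch the inputs through the OpenAI embeddings endpoint. A sketch (the environment variable name is an assumption):

// Assumes OPENAI_API_KEY is set in the environment; illustration only.
const fn = new OpenAIEmbeddingFunction('text', process.env.OPENAI_API_KEY ?? '')
const vectors = await fn.embed(['hello world'])
console.log(vectors[0].length) // 1536 for text-embedding-ada-002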
node/src/index.ts (1399 lines): diff suppressed because it is too large.
@@ -1,155 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { describe } from 'mocha'
import * as chai from 'chai'
import { assert } from 'chai'
import * as chaiAsPromised from 'chai-as-promised'
import { v4 as uuidv4 } from 'uuid'

import * as lancedb from '../index'
import { tmpdir } from 'os'
import * as fs from 'fs'
import * as path from 'path'

chai.use(chaiAsPromised)

describe('LanceDB AWS Integration test', function () {
  it('s3+ddb schema is processed correctly', async function () {
    this.timeout(15000)

    // WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
    // THE API WILL CHANGE
    const conn = await lancedb.connect('s3://lancedb-integtest?engine=ddb&ddbTableName=lancedb-integtest')
    const data = [{ vector: Array(128).fill(1.0) }]

    const tableName = uuidv4()
    let table = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })

    const futs = [table.add(data), table.add(data), table.add(data), table.add(data), table.add(data)]
    await Promise.allSettled(futs)

    table = await conn.openTable(tableName)
    assert.equal(await table.countRows(), 6)
  })
})
describe('LanceDB Mirrored Store Integration test', function () {
  it('s3://...?mirroredStore=... param is processed correctly', async function () {
    this.timeout(600000)

    const dir = await fs.promises.mkdtemp(path.join(tmpdir(), 'lancedb-mirror-'))
    console.log(dir)
    const conn = await lancedb.connect({ uri: `s3://lancedb-integtest?mirroredStore=${dir}`, storageOptions: { allowHttp: 'true' } })
    const data = Array(200).fill({ vector: Array(128).fill(1.0), id: 0 })
    data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 1 }))
    data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 2 }))
    data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 3 }))

    const tableName = uuidv4()

    // try create table and check if it's mirrored
    const t = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })

    const mirroredPath = path.join(dir, `${tableName}.lance`)

    const files = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
    // there should be three dirs
    assert.equal(files.length, 3, 'files after table creation')
    assert.isTrue(files[0].isDirectory())
    assert.isTrue(files[1].isDirectory())

    const transactionFiles = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
    assert.equal(transactionFiles.length, 1, 'transactionFiles after table creation')
    assert.isTrue(transactionFiles[0].name.endsWith('.txn'))

    const versionFiles = await fs.promises.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true })
    assert.equal(versionFiles.length, 1, 'versionFiles after table creation')
    assert.isTrue(versionFiles[0].name.endsWith('.manifest'))

    const dataFiles = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
    assert.equal(dataFiles.length, 1, 'dataFiles after table creation')
    assert.isTrue(dataFiles[0].name.endsWith('.lance'))

    // try create index and check if it's mirrored
    await t.createIndex({ column: 'vector', type: 'ivf_pq' })

    const filesAfterIndex = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
    // there should be four dirs
    assert.equal(filesAfterIndex.length, 4, 'filesAfterIndex')
    assert.isTrue(filesAfterIndex[0].isDirectory())
    assert.isTrue(filesAfterIndex[1].isDirectory())
    assert.isTrue(filesAfterIndex[2].isDirectory())

    // Two TXs now
    const transactionFilesAfterIndex = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
    assert.equal(transactionFilesAfterIndex.length, 2, 'transactionFilesAfterIndex')
    assert.isTrue(transactionFilesAfterIndex[0].name.endsWith('.txn'))
    assert.isTrue(transactionFilesAfterIndex[1].name.endsWith('.txn'))

    const dataFilesAfterIndex = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
    assert.equal(dataFilesAfterIndex.length, 1, 'dataFilesAfterIndex')
    assert.isTrue(dataFilesAfterIndex[0].name.endsWith('.lance'))

    const indicesFiles = await fs.promises.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true })
    assert.equal(indicesFiles.length, 1, 'indicesFiles')
    assert.isTrue(indicesFiles[0].isDirectory())

    const indexFiles = await fs.promises.readdir(path.join(mirroredPath, '_indices', indicesFiles[0].name), { withFileTypes: true })
    console.log(`DEBUG indexFiles in ${indicesFiles[0].name}:`, indexFiles.map(f => `${f.name} (${f.isFile() ? 'file' : 'dir'})`))
    assert.equal(indexFiles.length, 2, 'indexFiles')
    const fileNames = indexFiles.map(f => f.name).sort()
    assert.isTrue(fileNames.includes('auxiliary.idx'), 'auxiliary.idx should be present')
    assert.isTrue(fileNames.includes('index.idx'), 'index.idx should be present')
    assert.isTrue(indexFiles.every(f => f.isFile()), 'all index files should be files')

    // try delete and check if it's mirrored
    await t.delete('id = 0')

    const filesAfterDelete = await fs.promises.readdir(mirroredPath, { withFileTypes: true })
    // there should be five dirs
    assert.equal(filesAfterDelete.length, 5, 'filesAfterDelete')
    assert.isTrue(filesAfterDelete[0].isDirectory())
    assert.isTrue(filesAfterDelete[1].isDirectory())
    assert.isTrue(filesAfterDelete[2].isDirectory())
    assert.isTrue(filesAfterDelete[3].isDirectory())
    assert.isTrue(filesAfterDelete[4].isDirectory())

    // Three TXs now
    const transactionFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true })
    assert.equal(transactionFilesAfterDelete.length, 3, 'transactionFilesAfterDelete')
    assert.isTrue(transactionFilesAfterDelete[0].name.endsWith('.txn'))
    assert.isTrue(transactionFilesAfterDelete[1].name.endsWith('.txn'))

    const dataFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true })
    assert.equal(dataFilesAfterDelete.length, 1, 'dataFilesAfterDelete')
    assert.isTrue(dataFilesAfterDelete[0].name.endsWith('.lance'))

    const indicesFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true })
    assert.equal(indicesFilesAfterDelete.length, 1, 'indicesFilesAfterDelete')
    assert.isTrue(indicesFilesAfterDelete[0].isDirectory())

    const indexFilesAfterDelete = await fs.promises.readdir(path.join(mirroredPath, '_indices', indicesFilesAfterDelete[0].name), { withFileTypes: true })
    console.log(`DEBUG indexFilesAfterDelete in ${indicesFilesAfterDelete[0].name}:`, indexFilesAfterDelete.map(f => `${f.name} (${f.isFile() ? 'file' : 'dir'})`))
    assert.equal(indexFilesAfterDelete.length, 2, 'indexFilesAfterDelete')
    const fileNamesAfterDelete = indexFilesAfterDelete.map(f => f.name).sort()
    assert.isTrue(fileNamesAfterDelete.includes('auxiliary.idx'), 'auxiliary.idx should be present after delete')
    assert.isTrue(fileNamesAfterDelete.includes('index.idx'), 'index.idx should be present after delete')
    assert.isTrue(indexFilesAfterDelete.every(f => f.isFile()), 'all index files should be files after delete')

    const deletionFiles = await fs.promises.readdir(path.join(mirroredPath, '_deletions'), { withFileTypes: true })
    assert.equal(deletionFiles.length, 1, 'deletionFiles')
    assert.isTrue(deletionFiles[0].name.endsWith('.arrow'))
  })
})
@@ -1,58 +0,0 @@
// Copyright 2024 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * Middleware for Remote LanceDB Connection or Table
 */
export interface HttpMiddleware {
  /**
   * A callback that can be used to instrument the behavior of http requests to remote
   * tables. It can be used to add headers, modify the request, or even short-circuit
   * the request and return a response without making the request to the remote endpoint.
   * It can also be used to modify the response from the remote endpoint.
   *
   * @param {RemoteRequest} req - Request to the remote endpoint
   * @param {onRemoteRequestNext} next - Callback to advance the middleware chain
   */
  onRemoteRequest(
    req: RemoteRequest,
    next: (req: RemoteRequest) => Promise<RemoteResponse>,
  ): Promise<RemoteResponse>
};

export enum Method {
  GET,
  POST
}

/**
 * A LanceDB Remote HTTP Request
 */
export interface RemoteRequest {
  uri: string
  method: Method
  headers: Map<string, string>
  params?: Map<string, string>
  body?: any
}

/**
 * A LanceDB Remote HTTP Response
 */
export interface RemoteResponse {
  status: number
  statusText: string
  headers: Map<string, string>
  body: () => Promise<any>
}
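A middleware implementing this interface simply transforms the request and delegates to `next`. A sketch that attaches a tracing header (the header name is illustrative):

const tracingMiddleware: HttpMiddleware = {
  async onRemoteRequest (req, next) {
    // Add a header, then advance the middleware chain.
    req.headers.set('x-trace-id', `${Date.now()}`)
    return await next(req)
  }
}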
@@ -1,163 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { Vector, tableFromIPC } from 'apache-arrow'
import { type EmbeddingFunction } from './embedding/embedding_function'
import { type MetricType } from '.'

// eslint-disable-next-line @typescript-eslint/no-var-requires
const { tableSearch } = require('../native.js')

/**
 * A builder for nearest neighbor queries for LanceDB.
 */
export class Query<T = number[]> {
  private readonly _query?: T
  private readonly _tbl?: any
  private _queryVector?: number[]
  private _limit?: number
  private _refineFactor?: number
  private _nprobes: number
  private _select?: string[]
  private _filter?: string
  private _metricType?: MetricType
  private _prefilter: boolean
  private _fastSearch: boolean
  protected readonly _embeddings?: EmbeddingFunction<T>

  constructor (query?: T, tbl?: any, embeddings?: EmbeddingFunction<T>) {
    this._tbl = tbl
    this._query = query
    this._limit = 10
    this._nprobes = 20
    this._refineFactor = undefined
    this._select = undefined
    this._filter = undefined
    this._metricType = undefined
    this._embeddings = embeddings
    this._prefilter = false
    this._fastSearch = false
  }

  /***
   * Sets the number of results that will be returned
   * default value is 10
   * @param value number of results
   */
  limit (value: number): Query<T> {
    this._limit = value
    return this
  }

  /**
   * Refine the results by reading extra elements and re-ranking them in memory.
   * @param value refine factor to use in this query.
   */
  refineFactor (value: number): Query<T> {
    this._refineFactor = value
    return this
  }

  /**
   * The number of probes used. A higher number makes search more accurate but also slower.
   * @param value The number of probes used.
   */
  nprobes (value: number): Query<T> {
    this._nprobes = value
    return this
  }

  /**
   * A filter statement to be applied to this query.
   * @param value A filter in the same format used by a sql WHERE clause.
   */
  filter (value: string): Query<T> {
    this._filter = value
    return this
  }

  where = this.filter

  /** Return only the specified columns.
   *
   * @param value Only select the specified columns. If not specified, all columns will be returned.
   */
  select (value: string[]): Query<T> {
    this._select = value
    return this
  }

  /**
   * The MetricType used for this Query.
   * @param value The metric to use. @see MetricType for the different options
   */
  metricType (value: MetricType): Query<T> {
    this._metricType = value
    return this
  }

  prefilter (value: boolean): Query<T> {
    this._prefilter = value
    return this
  }

  /**
   * Skip searching un-indexed data. This can make search faster, but will miss
   * any data that is not yet indexed.
   */
  fastSearch (value: boolean): Query<T> {
    this._fastSearch = value
    return this
  }

  /**
   * Execute the query and return the results as an Array of Objects
   */
  async execute<T = Record<string, unknown>> (): Promise<T[]> {
    if (this._query !== undefined) {
      if (this._embeddings !== undefined) {
        this._queryVector = (await this._embeddings.embed([this._query]))[0]
      } else {
        this._queryVector = this._query as number[]
      }
    }

    const isElectron = this.isElectron()
    const buffer = await tableSearch.call(this._tbl, this, isElectron)
    const data = tableFromIPC(buffer)

    return data.toArray().map((entry: Record<string, unknown>) => {
      const newObject: Record<string, unknown> = {}
      Object.keys(entry).forEach((key: string) => {
        if (entry[key] instanceof Vector) {
          // toJSON() returns f16 array correctly
          newObject[key] = (entry[key] as any).toJSON()
        } else {
          newObject[key] = entry[key] as any
        }
      })
      return newObject as unknown as T
    })
  }

  // See https://github.com/electron/electron/issues/2288
  private isElectron (): boolean {
    try {
      // eslint-disable-next-line no-prototype-builtins
      return (process?.versions?.hasOwnProperty('electron') || navigator?.userAgent?.toLowerCase()?.includes(' electron'))
    } catch (e) {
      return false
    }
  }
}
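In practice a `Query` is obtained from a table's search entry point rather than constructed directly; the builder methods then chain until `execute`. A sketch, assuming `tbl` is an open table whose `search(...)` returns a `Query`:

// Hypothetical usage; `tbl.search(...)` is assumed from the wider API.
const results = await tbl.search([0.1, 0.2, 0.3])
  .limit(5)
  .nprobes(32)
  .filter('id > 0')
  .select(['id'])
  .execute()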
@@ -1,302 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import axios, { type AxiosError, type AxiosResponse, type ResponseType } from 'axios'

import { tableFromIPC, type Table as ArrowTable } from 'apache-arrow'

import { type RemoteResponse, type RemoteRequest, Method } from '../middleware'
import type { MetricType } from '..'

interface HttpLancedbClientMiddleware {
  onRemoteRequest(
    req: RemoteRequest,
    next: (req: RemoteRequest) => Promise<RemoteResponse>,
  ): Promise<RemoteResponse>
}

/**
 * Invoke the middleware chain and at the end call the remote endpoint
 */
async function callWithMiddlewares (
  req: RemoteRequest,
  middlewares: HttpLancedbClientMiddleware[],
  opts?: MiddlewareInvocationOptions
): Promise<RemoteResponse> {
  async function call (
    i: number,
    req: RemoteRequest
  ): Promise<RemoteResponse> {
    // if we have reached the end of the middleware chain, make the request
    if (i > middlewares.length) {
      const headers = Object.fromEntries(req.headers.entries())
      const params = Object.fromEntries(req.params?.entries() ?? [])
      const timeout = opts?.timeout
      let res
      if (req.method === Method.POST) {
        res = await axios.post(
          req.uri,
          req.body,
          {
            headers,
            params,
            timeout,
            responseType: opts?.responseType
          }
        )
      } else {
        res = await axios.get(
          req.uri,
          {
            headers,
            params,
            timeout
          }
        )
      }

      return toLanceRes(res)
    }

    // call next middleware in chain
    return await middlewares[i - 1].onRemoteRequest(
      req,
      async (req) => {
        return await call(i + 1, req)
      }
    )
  }

  return await call(1, req)
}
interface MiddlewareInvocationOptions {
|
|
||||||
responseType?: ResponseType
|
|
||||||
timeout?: number
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Marshall the library response into a LanceDB response
|
|
||||||
*/
|
|
||||||
function toLanceRes (res: AxiosResponse): RemoteResponse {
|
|
||||||
const headers = new Map()
|
|
||||||
for (const h in res.headers) {
|
|
||||||
headers.set(h, res.headers[h])
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
status: res.status,
|
|
||||||
statusText: res.statusText,
|
|
||||||
headers,
|
|
||||||
body: async () => {
|
|
||||||
return res.data
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function decodeErrorData(
|
|
||||||
res: RemoteResponse,
|
|
||||||
responseType?: ResponseType
|
|
||||||
): Promise<string> {
|
|
||||||
const errorData = await res.body()
|
|
||||||
if (responseType === 'arraybuffer') {
|
|
||||||
return new TextDecoder().decode(errorData)
|
|
||||||
} else {
|
|
||||||
if (typeof errorData === 'object') {
|
|
||||||
return JSON.stringify(errorData)
|
|
||||||
}
|
|
||||||
|
|
||||||
return errorData
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export class HttpLancedbClient {
  private readonly _url: string
  private readonly _apiKey: () => string
  private readonly _middlewares: HttpLancedbClientMiddleware[]
  private readonly _timeout: number | undefined

  public constructor (
    url: string,
    apiKey: string,
    timeout?: number,
    private readonly _dbName?: string
  ) {
    this._url = url
    this._apiKey = () => apiKey
    this._middlewares = []
    this._timeout = timeout
  }

  get uri (): string {
    return this._url
  }

  public async search (
    tableName: string,
    vector: number[],
    k: number,
    nprobes: number,
    prefilter: boolean,
    refineFactor?: number,
    columns?: string[],
    filter?: string,
    metricType?: MetricType,
    fastSearch?: boolean
  ): Promise<ArrowTable<any>> {
    const result = await this.post(
      `/v1/table/${tableName}/query/`,
      {
        vector,
        k,
        nprobes,
        refine_factor: refineFactor,
        columns,
        filter,
        prefilter,
        metric: metricType,
        fast_search: fastSearch
      },
      undefined,
      undefined,
      'arraybuffer'
    )
    const table = tableFromIPC(await result.body())
    return table
  }

  /**
   * Send a GET request.
   */
  public async get (path: string, params?: Record<string, string>): Promise<RemoteResponse> {
    const req = {
      uri: `${this._url}${path}`,
      method: Method.GET,
      headers: new Map(Object.entries({
        'Content-Type': 'application/json',
        'x-api-key': this._apiKey(),
        ...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
      })),
      params: new Map(Object.entries(params ?? {}))
    }

    let response
    try {
      response = await callWithMiddlewares(req, this._middlewares)
      return response
    } catch (err: any) {
      console.error(serializeErrorAsJson(err))
      if (err.response === undefined) {
        throw new Error(`Network Error: ${err.message as string}`)
      }

      response = toLanceRes(err.response)
    }

    if (response.status !== 200) {
      const errorData = await decodeErrorData(response)
      throw new Error(
        `Server Error, status: ${response.status}, ` +
        `message: ${response.statusText}: ${errorData}`
      )
    }

    return response
  }

  /**
   * Send a POST request.
   */
  public async post (
    path: string,
    data?: any,
    params?: Record<string, string>,
    content?: string | undefined,
    responseType?: ResponseType | undefined
  ): Promise<RemoteResponse> {
    const req = {
      uri: `${this._url}${path}`,
      method: Method.POST,
      headers: new Map(Object.entries({
        'Content-Type': content ?? 'application/json',
        'x-api-key': this._apiKey(),
        ...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
      })),
      params: new Map(Object.entries(params ?? {})),
      body: data
    }

    let response
    try {
      response = await callWithMiddlewares(req, this._middlewares, {
        responseType,
        timeout: this._timeout
      })
    } catch (err: any) {
      console.error(serializeErrorAsJson(err))

      if (err.response === undefined) {
        throw new Error(`Network Error: ${err.message as string}`)
      }
      response = toLanceRes(err.response)
    }

    if (response.status !== 200) {
      const errorData = await decodeErrorData(response, responseType)
      throw new Error(
        `Server Error, status: ${response.status}, ` +
        `message: ${response.statusText}: ${errorData}`
      )
    }

    return response
  }

  /**
   * Instrument this client with middleware
   * @param mw - The middleware that instruments the client
   * @returns - an instance of this client instrumented with the middleware
   */
  public withMiddleware (mw: HttpLancedbClientMiddleware): HttpLancedbClient {
    const wrapped = this.clone()
    wrapped._middlewares.push(mw)
    return wrapped
  }

  /**
   * Make a clone of this client
   */
  private clone (): HttpLancedbClient {
    const clone = new HttpLancedbClient(this._url, this._apiKey(), this._timeout, this._dbName)
    for (const mw of this._middlewares) {
      clone._middlewares.push(mw)
    }
    return clone
  }
}

function serializeErrorAsJson (err: AxiosError) {
  const error = JSON.parse(JSON.stringify(err, Object.getOwnPropertyNames(err)))
  error.response = err.response != null
    ? JSON.parse(JSON.stringify(
      err.response,
      // config contains the request data, too noisy
      Object.getOwnPropertyNames(err.response).filter(prop => prop !== 'config')
    ))
    : null
  return JSON.stringify({ error })
}
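As a brief illustration of the chain above (the middleware object and the log lines are hypothetical; only the `onRemoteRequest(req, next)` shape and the cloning behavior of `withMiddleware` come from the code): each middleware receives the request plus a `next` continuation, and must call `next` to let the rest of the chain, and ultimately axios, run.

// Hypothetical example middleware; names and output are illustrative.
const loggingMiddleware: HttpLancedbClientMiddleware = {
  async onRemoteRequest (req, next) {
    console.log(`request: ${req.uri}`)
    const res = await next(req) // continue down the chain
    console.log(`response: ${res.status} ${res.statusText}`)
    return res
  }
}

// withMiddleware returns a clone, so the original client's chain is unchanged:
// const instrumented = client.withMiddleware(loggingMiddleware)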
@@ -1,567 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import {
  type EmbeddingFunction,
  type Table,
  type VectorIndexParams,
  type Connection,
  type ConnectionOptions,
  type CreateTableOptions,
  type VectorIndex,
  type WriteOptions,
  type IndexStats,
  type UpdateArgs,
  type UpdateSqlArgs,
  makeArrowTable,
  type MergeInsertArgs,
  type ColumnAlteration
} from '../index'
import { Query } from '../query'

import { Vector, Table as ArrowTable } from 'apache-arrow'
import { HttpLancedbClient } from './client'
import { isEmbeddingFunction } from '../embedding/embedding_function'
import {
  createEmptyTable,
  fromRecordsToStreamBuffer,
  fromTableToStreamBuffer
} from '../arrow'
import { toSQL, TTLCache } from '../util'
import { type HttpMiddleware } from '../middleware'

/**
 * Remote connection.
 */
export class RemoteConnection implements Connection {
  private _client: HttpLancedbClient
  private readonly _dbName: string
  private readonly _tableCache = new TTLCache(300_000)

  constructor (opts: ConnectionOptions) {
    if (!opts.uri.startsWith('db://')) {
      throw new Error(`Invalid remote DB URI: ${opts.uri}`)
    }
    if (opts.apiKey == null || opts.apiKey === '') {
      opts = Object.assign({}, opts, { apiKey: process.env.LANCEDB_API_KEY })
    }
    if (opts.apiKey === undefined || opts.region === undefined) {
      throw new Error(
        'API key and region must be passed for remote connections. ' +
        'The API key can also be set through the LANCEDB_API_KEY env variable.')
    }

    this._dbName = opts.uri.slice('db://'.length)
    let server: string
    if (opts.hostOverride === undefined) {
      server = `https://${this._dbName}.${opts.region}.api.lancedb.com`
    } else {
      server = opts.hostOverride
    }
    this._client = new HttpLancedbClient(
      server,
      opts.apiKey,
      opts.timeout,
      opts.hostOverride === undefined ? undefined : this._dbName
    )
  }

  get uri (): string {
    // add the db:// prefix back
    return 'db://' + this._client.uri
  }

  async tableNames (
    pageToken: string = '',
    limit: number = 10
  ): Promise<string[]> {
    const response = await this._client.get('/v1/table/', {
      limit: `${limit}`,
      page_token: pageToken
    })
    const body = await response.body()
    for (const table of body.tables) {
      this._tableCache.set(table, true)
    }
    return body.tables
  }

  async openTable (name: string): Promise<Table>
  async openTable<T>(
    name: string,
    embeddings: EmbeddingFunction<T>
  ): Promise<Table<T>>
  async openTable<T>(
    name: string,
    embeddings?: EmbeddingFunction<T>
  ): Promise<Table<T>> {
    // check if the table exists
    if (this._tableCache.get(name) === undefined) {
      await this._client.post(`/v1/table/${encodeURIComponent(name)}/describe/`)
      this._tableCache.set(name, true)
    }

    if (embeddings !== undefined) {
      return new RemoteTable(this._client, name, embeddings)
    } else {
      return new RemoteTable(this._client, name)
    }
  }

  async createTable<T>(
    nameOrOpts: string | CreateTableOptions<T>,
    data?: Array<Record<string, unknown>> | ArrowTable,
    optsOrEmbedding?: WriteOptions | EmbeddingFunction<T>,
    opt?: WriteOptions
  ): Promise<Table<T>> {
    // Logic copied from LocalConnection; refactor these into a base class + connectionImpl pattern
    let schema
    let embeddings: undefined | EmbeddingFunction<T>
    let tableName: string
    if (typeof nameOrOpts === 'string') {
      if (
        optsOrEmbedding !== undefined &&
        isEmbeddingFunction(optsOrEmbedding)
      ) {
        embeddings = optsOrEmbedding
      }
      tableName = nameOrOpts
    } else {
      schema = nameOrOpts.schema
      embeddings = nameOrOpts.embeddingFunction
      tableName = nameOrOpts.name
      if (data === undefined) {
        data = nameOrOpts.data
      }
    }

    let buffer: Buffer

    function isEmpty (
      data: Array<Record<string, unknown>> | ArrowTable<any>
    ): boolean {
      if (data instanceof ArrowTable) {
        return data.numRows === 0
      }
      return data.length === 0
    }

    if (data === undefined || isEmpty(data)) {
      if (schema === undefined) {
        throw new Error('Either data or schema needs to be defined')
      }
      buffer = await fromTableToStreamBuffer(createEmptyTable(schema))
    } else if (data instanceof ArrowTable) {
      buffer = await fromTableToStreamBuffer(data, embeddings)
    } else {
      // data is Array<Record<...>>
      buffer = await fromRecordsToStreamBuffer(data, embeddings)
    }

    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(tableName)}/create/`,
      buffer,
      undefined,
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }

    this._tableCache.set(tableName, true)
    if (embeddings === undefined) {
      return new RemoteTable(this._client, tableName)
    } else {
      return new RemoteTable(this._client, tableName, embeddings)
    }
  }

  async dropTable (name: string): Promise<void> {
    await this._client.post(`/v1/table/${encodeURIComponent(name)}/drop/`)
    this._tableCache.delete(name)
  }

  withMiddleware (middleware: HttpMiddleware): Connection {
    const wrapped = this.clone()
    wrapped._client = wrapped._client.withMiddleware(middleware)
    return wrapped
  }

  private clone (): RemoteConnection {
    const clone: RemoteConnection = Object.create(RemoteConnection.prototype)
    return Object.assign(clone, this)
  }
}

export class RemoteQuery<T = number[]> extends Query<T> {
  constructor (
    query: T,
    private readonly _client: HttpLancedbClient,
    private readonly _name: string,
    embeddings?: EmbeddingFunction<T>
  ) {
    super(query, undefined, embeddings)
  }

  // TODO: refactor this to a base class + queryImpl pattern
  async execute<T = Record<string, unknown>>(): Promise<T[]> {
    const embeddings = this._embeddings
    const query = (this as any)._query
    let queryVector: number[]

    if (embeddings !== undefined) {
      queryVector = (await embeddings.embed([query]))[0]
    } else {
      queryVector = query as number[]
    }

    const data = await this._client.search(
      this._name,
      queryVector,
      (this as any)._limit,
      (this as any)._nprobes,
      (this as any)._prefilter,
      (this as any)._refineFactor,
      (this as any)._select,
      (this as any)._filter,
      (this as any)._metricType,
      (this as any)._fastSearch
    )

    return data.toArray().map((entry: Record<string, unknown>) => {
      const newObject: Record<string, unknown> = {}
      Object.keys(entry).forEach((key: string) => {
        if (entry[key] instanceof Vector) {
          newObject[key] = (entry[key] as any).toArray()
        } else {
          newObject[key] = entry[key] as any
        }
      })
      return newObject as unknown as T
    })
  }
}

// we are using extend until we have the next version release;
// Table and Connection have both been refactored to interfaces
export class RemoteTable<T = number[]> implements Table<T> {
  private _client: HttpLancedbClient
  private readonly _embeddings?: EmbeddingFunction<T>
  private readonly _name: string

  constructor (client: HttpLancedbClient, name: string)
  constructor (
    client: HttpLancedbClient,
    name: string,
    embeddings: EmbeddingFunction<T>
  )
  constructor (
    client: HttpLancedbClient,
    name: string,
    embeddings?: EmbeddingFunction<T>
  ) {
    this._client = client
    this._name = name
    this._embeddings = embeddings
  }

  get name (): string {
    return this._name
  }

  get schema (): Promise<any> {
    return this._client
      .post(`/v1/table/${encodeURIComponent(this._name)}/describe/`)
      .then(async (res) => {
        if (res.status !== 200) {
          throw new Error(
            `Server Error, status: ${res.status}, ` +
            // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
            `message: ${res.statusText}: ${await res.body()}`
          )
        }
        return (await res.body())?.schema
      })
  }

  search (query: T): Query<T> {
    return new RemoteQuery(query, this._client, encodeURIComponent(this._name))
  }

  filter (where: string): Query<T> {
    throw new Error('Not implemented')
  }

  async mergeInsert (on: string, data: Array<Record<string, unknown>> | ArrowTable, args: MergeInsertArgs): Promise<void> {
    let tbl: ArrowTable
    if (data instanceof ArrowTable) {
      tbl = data
    } else {
      tbl = makeArrowTable(data, await this.schema)
    }

    const queryParams: any = {
      on
    }
    if (args.whenMatchedUpdateAll !== false && args.whenMatchedUpdateAll !== null && args.whenMatchedUpdateAll !== undefined) {
      queryParams.when_matched_update_all = 'true'
      if (typeof args.whenMatchedUpdateAll === 'string') {
        queryParams.when_matched_update_all_filt = args.whenMatchedUpdateAll
      }
    } else {
      queryParams.when_matched_update_all = 'false'
    }
    if (args.whenNotMatchedInsertAll ?? false) {
      queryParams.when_not_matched_insert_all = 'true'
    } else {
      queryParams.when_not_matched_insert_all = 'false'
    }
    if (args.whenNotMatchedBySourceDelete !== false && args.whenNotMatchedBySourceDelete !== null && args.whenNotMatchedBySourceDelete !== undefined) {
      queryParams.when_not_matched_by_source_delete = 'true'
      if (typeof args.whenNotMatchedBySourceDelete === 'string') {
        queryParams.when_not_matched_by_source_delete_filt = args.whenNotMatchedBySourceDelete
      }
    } else {
      queryParams.when_not_matched_by_source_delete = 'false'
    }

    const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/merge_insert/`,
      buffer,
      queryParams,
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async add (data: Array<Record<string, unknown>> | ArrowTable): Promise<number> {
    let tbl: ArrowTable
    if (data instanceof ArrowTable) {
      tbl = data
    } else {
      tbl = makeArrowTable(data, await this.schema)
    }

    const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/insert/`,
      buffer,
      {
        mode: 'append'
      },
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
    return tbl.numRows
  }

  async overwrite (data: Array<Record<string, unknown>> | ArrowTable): Promise<number> {
    let tbl: ArrowTable
    if (data instanceof ArrowTable) {
      tbl = data
    } else {
      tbl = makeArrowTable(data)
    }
    const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/insert/`,
      buffer,
      {
        mode: 'overwrite'
      },
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
    return tbl.numRows
  }

  async createIndex (indexParams: VectorIndexParams): Promise<void> {
    const unsupportedParams = [
      'index_name',
      'num_partitions',
      'max_iters',
      'use_opq',
      'num_sub_vectors',
      'num_bits',
      'max_opq_iters',
      'replace'
    ]
    for (const param of unsupportedParams) {
      // eslint-disable-next-line @typescript-eslint/strict-boolean-expressions
      if (indexParams[param as keyof VectorIndexParams]) {
        throw new Error(`${param} is not supported for remote connections`)
      }
    }

    const column = indexParams.column ?? 'vector'
    const indexType = 'vector'
    const metricType = indexParams.metric_type ?? 'L2'
    const indexCacheSize = indexParams.index_cache_size ?? null

    const data = {
      column,
      index_type: indexType,
      metric_type: metricType,
      index_cache_size: indexCacheSize
    }
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/create_index/`,
      data
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async createScalarIndex (column: string): Promise<void> {
    const indexType = 'scalar'

    const data = {
      column,
      index_type: indexType,
      replace: true
    }
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/create_scalar_index/`,
      data
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async dropIndex (index_name: string): Promise<void> {
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/index/${encodeURIComponent(index_name)}/drop/`
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async countRows (filter?: string): Promise<number> {
    const result = await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/count_rows/`, {
      predicate: filter
    })
    return (await result.body())
  }

  async delete (filter: string): Promise<void> {
    await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/delete/`, {
      predicate: filter
    })
  }

  async update (args: UpdateArgs | UpdateSqlArgs): Promise<void> {
    let filter: string | null
    let updates: Record<string, string>

    if ('valuesSql' in args) {
      filter = args.where ?? null
      updates = args.valuesSql
    } else {
      filter = args.where ?? null
      updates = {}
      for (const [key, value] of Object.entries(args.values)) {
        updates[key] = toSQL(value)
      }
    }
    await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/update/`, {
      predicate: filter,
      updates: Object.entries(updates).map(([key, value]) => [key, value])
    })
  }

  async listIndices (): Promise<VectorIndex[]> {
    const results = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/index/list/`
    )
    return (await results.body()).indexes?.map((index: any) => ({
      columns: index.columns,
      name: index.index_name,
      uuid: index.index_uuid,
      status: index.status
    }))
  }

  async indexStats (indexName: string): Promise<IndexStats> {
    const results = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/index/${indexName}/stats/`
    )
    const body = await results.body()
    return {
      numIndexedRows: body?.num_indexed_rows,
      numUnindexedRows: body?.num_unindexed_rows,
      indexType: body?.index_type,
      distanceType: body?.distance_type
    }
  }

  async addColumns (newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void> {
    throw new Error('Add columns is not yet supported in LanceDB Cloud.')
  }

  async alterColumns (columnAlterations: ColumnAlteration[]): Promise<void> {
    throw new Error('Alter columns is not yet supported in LanceDB Cloud.')
  }

  async dropColumns (columnNames: string[]): Promise<void> {
    throw new Error('Drop columns is not yet supported in LanceDB Cloud.')
  }

  withMiddleware (middleware: HttpMiddleware): Table<T> {
    const wrapped = this.clone()
    wrapped._client = wrapped._client.withMiddleware(middleware)
    return wrapped
  }

  private clone (): RemoteTable<T> {
    const clone: RemoteTable<T> = Object.create(RemoteTable.prototype)
    return Object.assign(clone, this)
  }
}
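A brief usage sketch for the merge_insert parameter encoding above; the table handle `tbl`, the merge key, and the record contents are hypothetical, but the argument-to-query-param mapping follows the code as written.

// Hypothetical: `tbl` is a RemoteTable opened via RemoteConnection.openTable.
// A string passed as whenMatchedUpdateAll is sent as
// when_matched_update_all=true plus when_matched_update_all_filt=<filter>;
// plain booleans map to the 'true'/'false' string params.
await tbl.mergeInsert(
  'id',
  [{ id: 1, vector: [0.1, 0.2], rev: 2 }],
  {
    whenMatchedUpdateAll: 'target.rev < source.rev', // conditional update
    whenNotMatchedInsertAll: true
  }
)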
@@ -1,508 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The utilities in this file help sanitize data from the user's arrow
// library into the types expected by vectordb's arrow library. Node
// generally allows for multiple versions of the same library (and sometimes
// even multiple copies of the same version) to be installed at the same
// time. However, arrow-js uses instanceof, which expects that the input
// comes from the exact same library instance. This is not always the case,
// and so we must sanitize the input to ensure that it is compatible.

import {
  Field,
  Utf8,
  FixedSizeBinary,
  FixedSizeList,
  Schema,
  List,
  Struct,
  Float,
  Bool,
  Date_,
  Decimal,
  type DataType,
  Dictionary,
  Binary,
  Float32,
  Interval,
  Map_,
  Duration,
  Union,
  Time,
  Timestamp,
  Type,
  Null,
  Int,
  type Precision,
  type DateUnit,
  Int8,
  Int16,
  Int32,
  Int64,
  Uint8,
  Uint16,
  Uint32,
  Uint64,
  Float16,
  Float64,
  DateDay,
  DateMillisecond,
  DenseUnion,
  SparseUnion,
  TimeNanosecond,
  TimeMicrosecond,
  TimeMillisecond,
  TimeSecond,
  TimestampNanosecond,
  TimestampMicrosecond,
  TimestampMillisecond,
  TimestampSecond,
  IntervalDayTime,
  IntervalYearMonth,
  DurationNanosecond,
  DurationMicrosecond,
  DurationMillisecond,
  DurationSecond
} from "apache-arrow";
import type { IntBitWidth, TimeBitWidth } from "apache-arrow/type";

function sanitizeMetadata(
  metadataLike?: unknown
): Map<string, string> | undefined {
  if (metadataLike === undefined || metadataLike === null) {
    return undefined;
  }
  if (!(metadataLike instanceof Map)) {
    throw Error("Expected metadata, if present, to be a Map<string, string>");
  }
  for (const item of metadataLike) {
    if (typeof item[0] !== "string" || typeof item[1] !== "string") {
      throw Error(
        "Expected metadata, if present, to be a Map<string, string> but it had non-string keys or values"
      );
    }
  }
  return metadataLike as Map<string, string>;
}

function sanitizeInt(typeLike: object) {
  if (
    !("bitWidth" in typeLike) ||
    typeof typeLike.bitWidth !== "number" ||
    !("isSigned" in typeLike) ||
    typeof typeLike.isSigned !== "boolean"
  ) {
    throw Error(
      "Expected an Int Type to have a `bitWidth` and `isSigned` property"
    );
  }
  return new Int(typeLike.isSigned, typeLike.bitWidth as IntBitWidth);
}

function sanitizeFloat(typeLike: object) {
  if (!("precision" in typeLike) || typeof typeLike.precision !== "number") {
    throw Error("Expected a Float Type to have a `precision` property");
  }
  return new Float(typeLike.precision as Precision);
}

function sanitizeDecimal(typeLike: object) {
  if (
    !("scale" in typeLike) ||
    typeof typeLike.scale !== "number" ||
    !("precision" in typeLike) ||
    typeof typeLike.precision !== "number" ||
    !("bitWidth" in typeLike) ||
    typeof typeLike.bitWidth !== "number"
  ) {
    throw Error(
      "Expected a Decimal Type to have `scale`, `precision`, and `bitWidth` properties"
    );
  }
  return new Decimal(typeLike.scale, typeLike.precision, typeLike.bitWidth);
}

function sanitizeDate(typeLike: object) {
  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
    throw Error("Expected a Date type to have a `unit` property");
  }
  return new Date_(typeLike.unit as DateUnit);
}

function sanitizeTime(typeLike: object) {
  if (
    !("unit" in typeLike) ||
    typeof typeLike.unit !== "number" ||
    !("bitWidth" in typeLike) ||
    typeof typeLike.bitWidth !== "number"
  ) {
    throw Error(
      "Expected a Time type to have `unit` and `bitWidth` properties"
    );
  }
  return new Time(typeLike.unit, typeLike.bitWidth as TimeBitWidth);
}

function sanitizeTimestamp(typeLike: object) {
  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
    throw Error("Expected a Timestamp type to have a `unit` property");
  }
  let timezone = null;
  if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
    timezone = typeLike.timezone;
  }
  return new Timestamp(typeLike.unit, timezone);
}

function sanitizeTypedTimestamp(
  typeLike: object,
  Datatype:
    | typeof TimestampNanosecond
    | typeof TimestampMicrosecond
    | typeof TimestampMillisecond
    | typeof TimestampSecond
) {
  let timezone = null;
  if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
    timezone = typeLike.timezone;
  }
  return new Datatype(timezone);
}

function sanitizeInterval(typeLike: object) {
  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
    throw Error("Expected an Interval type to have a `unit` property");
  }
  return new Interval(typeLike.unit);
}

function sanitizeList(typeLike: object) {
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a List type to have an array-like `children` property"
    );
  }
  if (typeLike.children.length !== 1) {
    throw Error("Expected a List type to have exactly one child");
  }
  return new List(sanitizeField(typeLike.children[0]));
}

function sanitizeStruct(typeLike: object) {
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a Struct type to have an array-like `children` property"
    );
  }
  return new Struct(typeLike.children.map((child) => sanitizeField(child)));
}

function sanitizeUnion(typeLike: object) {
  if (
    !("typeIds" in typeLike) ||
    !("mode" in typeLike) ||
    typeof typeLike.mode !== "number"
  ) {
    throw Error(
      "Expected a Union type to have `typeIds` and `mode` properties"
    );
  }
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a Union type to have an array-like `children` property"
    );
  }

  return new Union(
    typeLike.mode,
    typeLike.typeIds as any,
    typeLike.children.map((child) => sanitizeField(child))
  );
}

function sanitizeTypedUnion(
  typeLike: object,
  UnionType: typeof DenseUnion | typeof SparseUnion
) {
  if (!("typeIds" in typeLike)) {
    throw Error(
      "Expected a DenseUnion/SparseUnion type to have a `typeIds` property"
    );
  }
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a DenseUnion/SparseUnion type to have an array-like `children` property"
    );
  }

  return new UnionType(
    typeLike.typeIds as any,
    typeLike.children.map((child) => sanitizeField(child))
  );
}

function sanitizeFixedSizeBinary(typeLike: object) {
  if (!("byteWidth" in typeLike) || typeof typeLike.byteWidth !== "number") {
    throw Error(
      "Expected a FixedSizeBinary type to have a `byteWidth` property"
    );
  }
  return new FixedSizeBinary(typeLike.byteWidth);
}

function sanitizeFixedSizeList(typeLike: object) {
  if (!("listSize" in typeLike) || typeof typeLike.listSize !== "number") {
    throw Error("Expected a FixedSizeList type to have a `listSize` property");
  }
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a FixedSizeList type to have an array-like `children` property"
    );
  }
  if (typeLike.children.length !== 1) {
    throw Error("Expected a FixedSizeList type to have exactly one child");
  }
  return new FixedSizeList(
    typeLike.listSize,
    sanitizeField(typeLike.children[0])
  );
}

function sanitizeMap(typeLike: object) {
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a Map type to have an array-like `children` property"
    );
  }
  if (!("keysSorted" in typeLike) || typeof typeLike.keysSorted !== "boolean") {
    throw Error("Expected a Map type to have a `keysSorted` property");
  }
  return new Map_(
    typeLike.children.map((field) => sanitizeField(field)) as any,
    typeLike.keysSorted
  );
}

function sanitizeDuration(typeLike: object) {
  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
    throw Error("Expected a Duration type to have a `unit` property");
  }
  return new Duration(typeLike.unit);
}

function sanitizeDictionary(typeLike: object) {
  if (!("id" in typeLike) || typeof typeLike.id !== "number") {
    throw Error("Expected a Dictionary type to have an `id` property");
  }
  if (!("indices" in typeLike) || typeof typeLike.indices !== "object") {
    throw Error("Expected a Dictionary type to have an `indices` property");
  }
  if (!("dictionary" in typeLike) || typeof typeLike.dictionary !== "object") {
    throw Error("Expected a Dictionary type to have a `dictionary` property");
  }
  if (!("isOrdered" in typeLike) || typeof typeLike.isOrdered !== "boolean") {
    throw Error("Expected a Dictionary type to have an `isOrdered` property");
  }
  return new Dictionary(
    sanitizeType(typeLike.dictionary),
    sanitizeType(typeLike.indices) as any,
    typeLike.id,
    typeLike.isOrdered
  );
}

function sanitizeType(typeLike: unknown): DataType<any> {
  if (typeof typeLike !== "object" || typeLike === null) {
    throw Error("Expected a Type but object was null/undefined");
  }
  if (!("typeId" in typeLike)) {
    throw Error("Expected a Type to have a typeId property");
  }
  let typeId: Type;
  if (typeof typeLike.typeId === "function") {
    typeId = (typeLike.typeId as () => unknown)() as Type;
  } else if (typeof typeLike.typeId === "number") {
    typeId = typeLike.typeId as Type;
  } else {
    throw Error("Type's typeId property was not a function or number");
  }

  switch (typeId) {
    case Type.NONE:
      throw Error("Received a Type with a typeId of NONE");
    case Type.Null:
      return new Null();
    case Type.Int:
      return sanitizeInt(typeLike);
    case Type.Float:
      return sanitizeFloat(typeLike);
    case Type.Binary:
      return new Binary();
    case Type.Utf8:
      return new Utf8();
    case Type.Bool:
      return new Bool();
    case Type.Decimal:
      return sanitizeDecimal(typeLike);
    case Type.Date:
      return sanitizeDate(typeLike);
    case Type.Time:
      return sanitizeTime(typeLike);
    case Type.Timestamp:
      return sanitizeTimestamp(typeLike);
    case Type.Interval:
      return sanitizeInterval(typeLike);
    case Type.List:
      return sanitizeList(typeLike);
    case Type.Struct:
      return sanitizeStruct(typeLike);
    case Type.Union:
      return sanitizeUnion(typeLike);
    case Type.FixedSizeBinary:
      return sanitizeFixedSizeBinary(typeLike);
    case Type.FixedSizeList:
      return sanitizeFixedSizeList(typeLike);
    case Type.Map:
      return sanitizeMap(typeLike);
    case Type.Duration:
      return sanitizeDuration(typeLike);
    case Type.Dictionary:
      return sanitizeDictionary(typeLike);
    case Type.Int8:
      return new Int8();
    case Type.Int16:
      return new Int16();
    case Type.Int32:
      return new Int32();
    case Type.Int64:
      return new Int64();
    case Type.Uint8:
      return new Uint8();
    case Type.Uint16:
      return new Uint16();
    case Type.Uint32:
      return new Uint32();
    case Type.Uint64:
      return new Uint64();
    case Type.Float16:
      return new Float16();
    case Type.Float32:
      return new Float32();
    case Type.Float64:
      return new Float64();
    case Type.DateMillisecond:
      return new DateMillisecond();
    case Type.DateDay:
      return new DateDay();
    case Type.TimeNanosecond:
      return new TimeNanosecond();
    case Type.TimeMicrosecond:
      return new TimeMicrosecond();
    case Type.TimeMillisecond:
      return new TimeMillisecond();
    case Type.TimeSecond:
      return new TimeSecond();
    case Type.TimestampNanosecond:
      return sanitizeTypedTimestamp(typeLike, TimestampNanosecond);
    case Type.TimestampMicrosecond:
      return sanitizeTypedTimestamp(typeLike, TimestampMicrosecond);
    case Type.TimestampMillisecond:
      return sanitizeTypedTimestamp(typeLike, TimestampMillisecond);
    case Type.TimestampSecond:
      return sanitizeTypedTimestamp(typeLike, TimestampSecond);
    case Type.DenseUnion:
      return sanitizeTypedUnion(typeLike, DenseUnion);
    case Type.SparseUnion:
      return sanitizeTypedUnion(typeLike, SparseUnion);
    case Type.IntervalDayTime:
      return new IntervalDayTime();
    case Type.IntervalYearMonth:
      return new IntervalYearMonth();
    case Type.DurationNanosecond:
      return new DurationNanosecond();
    case Type.DurationMicrosecond:
      return new DurationMicrosecond();
    case Type.DurationMillisecond:
      return new DurationMillisecond();
    case Type.DurationSecond:
      return new DurationSecond();
  }
}

function sanitizeField(fieldLike: unknown): Field {
  if (fieldLike instanceof Field) {
    return fieldLike;
  }
  if (typeof fieldLike !== "object" || fieldLike === null) {
    throw Error("Expected a Field but object was null/undefined");
  }
  if (
    !("type" in fieldLike) ||
    !("name" in fieldLike) ||
    !("nullable" in fieldLike)
  ) {
    throw Error(
      "The field passed in is missing a `type`/`name`/`nullable` property"
    );
  }
  const type = sanitizeType(fieldLike.type);
  const name = fieldLike.name;
  if (!(typeof name === "string")) {
    throw Error("The field passed in had a non-string `name` property");
  }
  const nullable = fieldLike.nullable;
  if (!(typeof nullable === "boolean")) {
    throw Error("The field passed in had a non-boolean `nullable` property");
  }
  let metadata;
  if ("metadata" in fieldLike) {
    metadata = sanitizeMetadata(fieldLike.metadata);
  }
  return new Field(name, type, nullable, metadata);
}

/**
 * Convert something schemaLike into a Schema instance
 *
 * This method is often needed even when the caller is using a Schema
 * instance because they might be using a different instance of apache-arrow
 * than lancedb is using.
 */
export function sanitizeSchema(schemaLike: unknown): Schema {
  if (schemaLike instanceof Schema) {
    return schemaLike;
  }
  if (typeof schemaLike !== "object" || schemaLike === null) {
    throw Error("Expected a Schema but object was null/undefined");
  }
  if (!("fields" in schemaLike)) {
    throw Error(
      "The schema passed in does not appear to be a schema (no 'fields' property)"
    );
  }
  let metadata;
  if ("metadata" in schemaLike) {
    metadata = sanitizeMetadata(schemaLike.metadata);
  }
  if (!Array.isArray(schemaLike.fields)) {
    throw Error(
      "The schema passed in had a 'fields' property but it was not an array"
    );
  }
  const sanitizedFields = schemaLike.fields.map((field) =>
    sanitizeField(field)
  );
  return new Schema(sanitizedFields, metadata);
}
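To illustrate why the file above rebuilds types rather than trusting instanceof, here is a minimal sketch; the './sanitize' import path and the 'apache-arrow-old' alias are assumptions borrowed from this repo's test setup, not a guaranteed public API.

import { Schema } from 'apache-arrow'
// A second installed copy of arrow, aliased as in the tests below (assumption).
import { Field as OldField, Schema as OldSchema, Utf8 as OldUtf8 } from 'apache-arrow-old'
import { sanitizeSchema } from './sanitize' // hypothetical local path

const foreign = new OldSchema([new OldField('name', new OldUtf8(), true)])
// instanceof fails across library copies, even though the shape is identical
console.log(foreign instanceof Schema) // false

// sanitizeSchema walks the fields and rebuilds each type with this copy's classes
const local = sanitizeSchema(foreign as unknown)
console.log(local instanceof Schema) // true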
@@ -1,360 +0,0 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { describe } from 'mocha'
import { assert, expect, use as chaiUse } from 'chai'
import * as chaiAsPromised from 'chai-as-promised'

import { convertToTable, fromTableToBuffer, makeArrowTable, makeEmptyTable } from '../arrow'
import {
  Field,
  FixedSizeList,
  Float16,
  Float32,
  Int32,
  tableFromIPC,
  Schema,
  Float64,
  type Table,
  Binary,
  Bool,
  Utf8,
  Struct,
  List,
  DataType,
  Dictionary,
  Int64,
  MetadataVersion
} from 'apache-arrow'
import {
  Dictionary as OldDictionary,
  Field as OldField,
  FixedSizeList as OldFixedSizeList,
  Float32 as OldFloat32,
  Int32 as OldInt32,
  Struct as OldStruct,
  Schema as OldSchema,
  TimestampNanosecond as OldTimestampNanosecond,
  Utf8 as OldUtf8
} from 'apache-arrow-old'
import { type EmbeddingFunction } from '../embedding/embedding_function'

chaiUse(chaiAsPromised)

function sampleRecords (): Array<Record<string, any>> {
  return [
    {
      binary: Buffer.alloc(5),
      boolean: false,
      number: 7,
      string: 'hello',
      struct: { x: 0, y: 0 },
      list: ['anime', 'action', 'comedy']
    }
  ]
}

// Helper method to verify various ways to create a table
async function checkTableCreation (tableCreationMethod: (records: any, recordsReversed: any, schema: Schema) => Promise<Table>): Promise<void> {
  const records = sampleRecords()
  const recordsReversed = [{
    list: ['anime', 'action', 'comedy'],
    struct: { x: 0, y: 0 },
    string: 'hello',
    number: 7,
    boolean: false,
    binary: Buffer.alloc(5)
  }]
  const schema = new Schema([
    new Field('binary', new Binary(), false),
    new Field('boolean', new Bool(), false),
    new Field('number', new Float64(), false),
    new Field('string', new Utf8(), false),
    new Field('struct', new Struct([
      new Field('x', new Float64(), false),
      new Field('y', new Float64(), false)
    ])),
    new Field('list', new List(new Field('item', new Utf8(), false)), false)
  ])

  const table = await tableCreationMethod(records, recordsReversed, schema)
  schema.fields.forEach((field, idx) => {
    const actualField = table.schema.fields[idx]
    assert.isFalse(actualField.nullable)
    assert.equal(table.getChild(field.name)?.type.toString(), field.type.toString())
    assert.equal(table.getChildAt(idx)?.type.toString(), field.type.toString())
  })
}

describe('The function makeArrowTable', function () {
  it('will use data types from a provided schema instead of inference', async function () {
    const schema = new Schema([
      new Field('a', new Int32()),
      new Field('b', new Float32()),
      new Field('c', new FixedSizeList(3, new Field('item', new Float16()))),
      new Field('d', new Int64())
    ])
    const table = makeArrowTable(
      [
        { a: 1, b: 2, c: [1, 2, 3], d: 9 },
        { a: 4, b: 5, c: [4, 5, 6], d: 10 },
        { a: 7, b: 8, c: [7, 8, 9], d: null }
      ],
      { schema }
    )

    const buf = await fromTableToBuffer(table)
    assert.isAbove(buf.byteLength, 0)

    const actual = tableFromIPC(buf)
    assert.equal(actual.numRows, 3)
    const actualSchema = actual.schema
    assert.deepEqual(actualSchema, schema)
  })

  it('will assume the column `vector` is FixedSizeList<Float32> by default', async function () {
    const schema = new Schema([
      new Field('a', new Float64()),
      new Field('b', new Float64()),
      new Field(
        'vector',
        new FixedSizeList(3, new Field('item', new Float32(), true))
      )
    ])
    const table = makeArrowTable([
      { a: 1, b: 2, vector: [1, 2, 3] },
      { a: 4, b: 5, vector: [4, 5, 6] },
      { a: 7, b: 8, vector: [7, 8, 9] }
    ])

    const buf = await fromTableToBuffer(table)
    assert.isAbove(buf.byteLength, 0)

    const actual = tableFromIPC(buf)
    assert.equal(actual.numRows, 3)
    const actualSchema = actual.schema
    assert.deepEqual(actualSchema, schema)
  })

  it('can support multiple vector columns', async function () {
    const schema = new Schema([
      new Field('a', new Float64()),
      new Field('b', new Float64()),
      new Field('vec1', new FixedSizeList(3, new Field('item', new Float16(), true))),
      new Field('vec2', new FixedSizeList(3, new Field('item', new Float16(), true)))
    ])
    const table = makeArrowTable(
      [
        { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
        { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
        { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
      ],
      {
        vectorColumns: {
          vec1: { type: new Float16() },
          vec2: { type: new Float16() }
        }
      }
    )

    const buf = await fromTableToBuffer(table)
    assert.isAbove(buf.byteLength, 0)

    const actual = tableFromIPC(buf)
    assert.equal(actual.numRows, 3)
    const actualSchema = actual.schema
    assert.deepEqual(actualSchema, schema)
  })

  it('will allow different vector column types', async function () {
    const table = makeArrowTable(
      [
        { fp16: [1], fp32: [1], fp64: [1] }
      ],
      {
        vectorColumns: {
          fp16: { type: new Float16() },
          fp32: { type: new Float32() },
          fp64: { type: new Float64() }
        }
      }
    )

    assert.equal(table.getChild('fp16')?.type.children[0].type.toString(), new Float16().toString())
    assert.equal(table.getChild('fp32')?.type.children[0].type.toString(), new Float32().toString())
    assert.equal(table.getChild('fp64')?.type.children[0].type.toString(), new Float64().toString())
  })

  it('will use dictionary encoded strings if asked', async function () {
    const table = makeArrowTable([{ str: 'hello' }])
    assert.isTrue(DataType.isUtf8(table.getChild('str')?.type))

    const tableWithDict = makeArrowTable([{ str: 'hello' }], { dictionaryEncodeStrings: true })
    assert.isTrue(DataType.isDictionary(tableWithDict.getChild('str')?.type))

    const schema = new Schema([
      new Field('str', new Dictionary(new Utf8(), new Int32()))
    ])

    const tableWithDict2 = makeArrowTable([{ str: 'hello' }], { schema })
    assert.isTrue(DataType.isDictionary(tableWithDict2.getChild('str')?.type))
  })

  it('will infer data types correctly', async function () {
    await checkTableCreation(async (records) => makeArrowTable(records))
  })

  it('will allow a schema to be provided', async function () {
    await checkTableCreation(async (records, _, schema) => makeArrowTable(records, { schema }))
  })

  it('will use the field order of any provided schema', async function () {
    await checkTableCreation(async (_, recordsReversed, schema) => makeArrowTable(recordsReversed, { schema }))
  })

  it('will make an empty table', async function () {
    await checkTableCreation(async (_, __, schema) => makeArrowTable([], { schema }))
  })
})

class DummyEmbedding implements EmbeddingFunction<string> {
  public readonly sourceColumn = 'string'
  public readonly embeddingDimension = 2
  public readonly embeddingDataType = new Float16()

  async embed (data: string[]): Promise<number[][]> {
    return data.map(
      () => [0.0, 0.0]
    )
  }
}

class DummyEmbeddingWithNoDimension implements EmbeddingFunction<string> {
  public readonly sourceColumn = 'string'

  async embed (data: string[]): Promise<number[][]> {
    return data.map(
      () => [0.0, 0.0]
    )
  }
}

describe('convertToTable', function () {
  it('will infer data types correctly', async function () {
    await checkTableCreation(async (records) => await convertToTable(records))
  })

  it('will allow a schema to be provided', async function () {
    await checkTableCreation(async (records, _, schema) => await convertToTable(records, undefined, { schema }))
  })

  it('will use the field order of any provided schema', async function () {
    await checkTableCreation(async (_, recordsReversed, schema) => await convertToTable(recordsReversed, undefined, { schema }))
  })

  it('will make an empty table', async function () {
    await checkTableCreation(async (_, __, schema) => await convertToTable([], undefined, { schema }))
  })

  it('will apply embeddings', async function () {
    const records = sampleRecords()
    const table = await convertToTable(records, new DummyEmbedding())
    assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
    assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
  })

  it('will fail if missing the embedding source column', async function () {
    return await expect(convertToTable([{ id: 1 }], new DummyEmbedding())).to.be.rejectedWith("'string' was not present")
  })

  it('will use embeddingDimension if the embedding is missing from the table', async function () {
    const schema = new Schema([
      new Field('string', new Utf8(), false)
    ])
    // Simulate getting an empty Arrow table (minus embedding) from some other source
    // In other words, we aren't starting with records
    const table = makeEmptyTable(schema)

    // If the embedding specifies the dimension we are fine
    await fromTableToBuffer(table, new DummyEmbedding())

    // We can also supply a schema and should be ok
    const schemaWithEmbedding = new Schema([
      new Field('string', new Utf8(), false),
      new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
    ])
    await fromTableToBuffer(table, new DummyEmbeddingWithNoDimension(), schemaWithEmbedding)

    // Otherwise we will get an error
    return await expect(fromTableToBuffer(table, new DummyEmbeddingWithNoDimension())).to.be.rejectedWith('does not specify `embeddingDimension`')
  })

  it('will apply embeddings to an empty table', async function () {
    const schema = new Schema([
      new Field('string', new Utf8(), false),
      new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
    ])
    const table = await convertToTable([], new DummyEmbedding(), { schema })
    assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
    assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
  })

  it('will complain if embeddings present but schema missing embedding column', async function () {
    const schema = new Schema([
      new Field('string', new Utf8(), false)
    ])
    return await expect(convertToTable([], new DummyEmbedding(), { schema })).to.be.rejectedWith('column vector was missing')
  })

  it('will provide a nice error if run twice', async function () {
    const records = sampleRecords()
    const table = await convertToTable(records, new DummyEmbedding())
    // fromTableToBuffer will try and apply the embeddings again
    return await expect(fromTableToBuffer(table, new DummyEmbedding())).to.be.rejectedWith('already existed')
  })
})

describe('makeEmptyTable', function () {
  it('will make an empty table', async function () {
    await checkTableCreation(async (_, __, schema) => makeEmptyTable(schema))
  })
})

describe('when using two versions of arrow', function () {
  it('can still import data', async function () {
    const schema = new OldSchema([
      new OldField('id', new OldInt32()),
      new OldField('vector', new OldFixedSizeList(1024, new OldField('item', new OldFloat32(), true))),
      new OldField('struct', new OldStruct([
        new OldField('nested', new OldDictionary(new OldUtf8(), new OldInt32(), 1, true)),
        new OldField('ts_with_tz', new OldTimestampNanosecond('some_tz')),
        new OldField('ts_no_tz', new OldTimestampNanosecond(null))
      ]))
    ]) as any
    // We use arrow version 13 to emulate a "foreign arrow" and this version doesn't have metadataVersion
|
|
||||||
// In theory, this wouldn't matter. We don't rely on that property. However, it causes deepEqual to
|
|
||||||
// fail so we patch it back in
|
|
||||||
schema.metadataVersion = MetadataVersion.V5
|
|
||||||
const table = makeArrowTable(
|
|
||||||
[],
|
|
||||||
{ schema }
|
|
||||||
)
|
|
||||||
|
|
||||||
const buf = await fromTableToBuffer(table)
|
|
||||||
assert.isAbove(buf.byteLength, 0)
|
|
||||||
const actual = tableFromIPC(buf)
|
|
||||||
const actualSchema = actual.schema
|
|
||||||
assert.deepEqual(actualSchema, schema)
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -1,55 +0,0 @@
-// Copyright 2023 Lance Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-import { describe } from 'mocha'
-import { assert } from 'chai'
-
-import { OpenAIEmbeddingFunction } from '../../embedding/openai'
-import { isEmbeddingFunction } from '../../embedding/embedding_function'
-
-// eslint-disable-next-line @typescript-eslint/no-var-requires
-const OpenAIApi = require('openai')
-// eslint-disable-next-line @typescript-eslint/no-var-requires
-const { stub } = require('sinon')
-
-describe('OpenAPIEmbeddings', function () {
-  const stubValue = {
-    data: [
-      {
-        embedding: Array(1536).fill(1.0)
-      },
-      {
-        embedding: Array(1536).fill(2.0)
-      }
-    ]
-  }
-
-  describe('#embed', function () {
-    it('should create vector embeddings', async function () {
-      const openAIStub = stub(OpenAIApi.Embeddings.prototype, 'create').returns(stubValue)
-      const f = new OpenAIEmbeddingFunction('text', 'sk-key')
-      const vectors = await f.embed(['abc', 'def'])
-      assert.isTrue(openAIStub.calledOnce)
-      assert.equal(vectors.length, 2)
-      assert.deepEqual(vectors[0], stubValue.data[0].embedding)
-      assert.deepEqual(vectors[1], stubValue.data[1].embedding)
-    })
-  })
-
-  describe('isEmbeddingFunction', function () {
-    it('should match the isEmbeddingFunction guard', function () {
-      assert.isTrue(isEmbeddingFunction(new OpenAIEmbeddingFunction('text', 'sk-key')))
-    })
-  })
-})
@@ -1,76 +0,0 @@
-// Copyright 2023 Lance Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// IO tests
-
-import { describe } from 'mocha'
-import { assert } from 'chai'
-
-import * as lancedb from '../index'
-import { type ConnectionOptions } from '../index'
-
-describe('LanceDB S3 client', function () {
-  if (process.env.TEST_S3_BASE_URL != null) {
-    const baseUri = process.env.TEST_S3_BASE_URL
-    it('should have a valid url', async function () {
-      const opts = { uri: `${baseUri}/valid_url` }
-      const table = await createTestDB(opts, 2, 20)
-      const con = await lancedb.connect(opts)
-      assert.equal(con.uri, opts.uri)
-
-      const results = await table.search([0.1, 0.3]).limit(5).execute()
-      assert.equal(results.length, 5)
-    }).timeout(10_000)
-  } else {
-    describe.skip('Skip S3 test', function () {})
-  }
-
-  if (process.env.TEST_S3_BASE_URL != null && process.env.TEST_AWS_ACCESS_KEY_ID != null && process.env.TEST_AWS_SECRET_ACCESS_KEY != null) {
-    const baseUri = process.env.TEST_S3_BASE_URL
-    it('use custom credentials', async function () {
-      const opts: ConnectionOptions = {
-        uri: `${baseUri}/custom_credentials`,
-        awsCredentials: {
-          accessKeyId: process.env.TEST_AWS_ACCESS_KEY_ID as string,
-          secretKey: process.env.TEST_AWS_SECRET_ACCESS_KEY as string
-        }
-      }
-      const table = await createTestDB(opts, 2, 20)
-      console.log(table)
-      const con = await lancedb.connect(opts)
-      console.log(con)
-      assert.equal(con.uri, opts.uri)
-
-      const results = await table.search([0.1, 0.3]).limit(5).execute()
-      assert.equal(results.length, 5)
-    }).timeout(10_000)
-  } else {
-    describe.skip('Skip S3 test', function () {})
-  }
-})
-
-async function createTestDB (opts: ConnectionOptions, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
-  const con = await lancedb.connect(opts)
-
-  const data = []
-  for (let i = 0; i < numRows; i++) {
-    const vector = []
-    for (let j = 0; j < numDimensions; j++) {
-      vector.push(i + (j * 0.1))
-    }
-    data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
-  }
-
-  return await con.createTable('vectors_2', data)
-}
File diff suppressed because it is too large
@@ -1,45 +0,0 @@
-// Copyright 2023 LanceDB Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-import { toSQL } from '../util'
-import * as chai from 'chai'
-
-const expect = chai.expect
-
-describe('toSQL', function () {
-  it('should turn string to SQL expression', function () {
-    expect(toSQL('foo')).to.equal("'foo'")
-  })
-
-  it('should turn number to SQL expression', function () {
-    expect(toSQL(123)).to.equal('123')
-  })
-
-  it('should turn boolean to SQL expression', function () {
-    expect(toSQL(true)).to.equal('TRUE')
-  })
-
-  it('should turn null to SQL expression', function () {
-    expect(toSQL(null)).to.equal('NULL')
-  })
-
-  it('should turn Date to SQL expression', function () {
-    const date = new Date('05 October 2011 14:48 UTC')
-    expect(toSQL(date)).to.equal("'2011-10-05T14:48:00.000Z'")
-  })
-
-  it('should turn array to SQL expression', function () {
-    expect(toSQL(['foo', 'bar', true, 1])).to.equal("['foo', 'bar', TRUE, 1]")
-  })
-})
@@ -1,77 +0,0 @@
-// Copyright 2023 LanceDB Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-export type Literal = string | number | boolean | null | Date | Literal[]
-
-export function toSQL (value: Literal): string {
-  if (typeof value === 'string') {
-    return `'${value}'`
-  }
-
-  if (typeof value === 'number') {
-    return value.toString()
-  }
-
-  if (typeof value === 'boolean') {
-    return value ? 'TRUE' : 'FALSE'
-  }
-
-  if (value === null) {
-    return 'NULL'
-  }
-
-  if (value instanceof Date) {
-    return `'${value.toISOString()}'`
-  }
-
-  if (Array.isArray(value)) {
-    return `[${value.map(toSQL).join(', ')}]`
-  }
-
-  // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
-  throw new Error(`Unsupported value type: ${typeof value} value: (${value})`)
-}
-
-export class TTLCache {
-  private readonly cache: Map<string, { value: any, expires: number }>
-
-  /**
-   * @param ttl Time to live in milliseconds
-   */
-  constructor (private readonly ttl: number) {
-    this.cache = new Map()
-  }
-
-  get (key: string): any | undefined {
-    const entry = this.cache.get(key)
-    if (entry === undefined) {
-      return undefined
-    }
-
-    if (entry.expires < Date.now()) {
-      this.cache.delete(key)
-      return undefined
-    }
-
-    return entry.value
-  }
-
-  set (key: string, value: any): void {
-    this.cache.set(key, { value, expires: Date.now() + this.ttl })
-  }
-
-  delete (key: string): void {
-    this.cache.delete(key)
-  }
-}
@@ -1,14 +0,0 @@
-{
-  "include": [
-    "src/**/*.ts",
-    "src/*.ts"
-  ],
-  "compilerOptions": {
-    "target": "ES2020",
-    "module": "commonjs",
-    "declaration": true,
-    "outDir": "./dist",
-    "strict": true,
-    "sourceMap": true,
-  }
-}
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.21.2-beta.1"
+version = "0.21.2"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
@@ -1,7 +1,16 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The LanceDB Authors

-import { Bool, Field, Int32, List, Schema, Struct, Utf8 } from "apache-arrow";
+import {
+  Bool,
+  Field,
+  Int32,
+  List,
+  Schema,
+  Struct,
+  Uint8,
+  Utf8,
+} from "apache-arrow";

 import * as arrow15 from "apache-arrow-15";
 import * as arrow16 from "apache-arrow-16";
@@ -255,6 +264,98 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
     expect(actualSchema).toEqual(schema);
   });

+  it("will detect vector columns when name contains 'vector' or 'embedding'", async function () {
+    // Test various naming patterns that should be detected as vector columns
+    const floatVectorTable = makeArrowTable([
+      {
+        // Float vectors (use decimal values to ensure they're treated as floats)
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        user_vector: [1.1, 2.2],
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        text_embedding: [3.3, 4.4],
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        doc_embeddings: [5.5, 6.6],
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        my_vector_field: [7.7, 8.8],
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        embedding_model: [9.9, 10.1],
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        VECTOR_COL: [11.1, 12.2], // uppercase
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        Vector_Mixed: [13.3, 14.4], // mixed case
+      },
+    ]);
+
+    // Check that columns with 'vector' or 'embedding' in name are converted to FixedSizeList
+    const floatVectorColumns = [
+      "user_vector",
+      "text_embedding",
+      "doc_embeddings",
+      "my_vector_field",
+      "embedding_model",
+      "VECTOR_COL",
+      "Vector_Mixed",
+    ];
+
+    for (const columnName of floatVectorColumns) {
+      expect(
+        DataType.isFixedSizeList(
+          floatVectorTable.getChild(columnName)?.type,
+        ),
+      ).toBe(true);
+      // Check that float vectors use Float32 by default
+      expect(
+        floatVectorTable
+          .getChild(columnName)
+          ?.type.children[0].type.toString(),
+      ).toEqual(new Float32().toString());
+    }
+
+    // Test that regular integer arrays still get treated as float vectors
+    // (since JavaScript doesn't distinguish integers from floats at runtime)
+    const integerArrayTable = makeArrowTable([
+      {
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        vector_int: [1, 2], // Regular array with integers - should be Float32
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        embedding_int: [3, 4], // Regular array with integers - should be Float32
+      },
+    ]);
+
+    const integerArrayColumns = ["vector_int", "embedding_int"];
+
+    for (const columnName of integerArrayColumns) {
+      expect(
+        DataType.isFixedSizeList(
+          integerArrayTable.getChild(columnName)?.type,
+        ),
+      ).toBe(true);
+      // Regular integer arrays should use Float32 (avoiding false positives)
+      expect(
+        integerArrayTable
+          .getChild(columnName)
+          ?.type.children[0].type.toString(),
+      ).toEqual(new Float32().toString());
+    }
+
+    // Test normal list should NOT be converted to FixedSizeList
+    const normalListTable = makeArrowTable([
+      {
+        // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
+        normal_list: [15.5, 16.6], // should NOT be detected as vector
+      },
+    ]);
+
+    expect(
+      DataType.isFixedSizeList(
+        normalListTable.getChild("normal_list")?.type,
+      ),
+    ).toBe(false);
+    expect(
+      DataType.isList(normalListTable.getChild("normal_list")?.type),
+    ).toBe(true);
+  });
+
   it("will allow different vector column types", async function () {
     const table = makeArrowTable([{ fp16: [1], fp32: [1], fp64: [1] }], {
       vectorColumns: {
@@ -42,6 +42,28 @@ describe("remote connection", () => {
     });
   });

+  it("should accept overall timeout configuration", async () => {
+    await connect("db://test", {
+      apiKey: "fake",
+      clientConfig: {
+        timeoutConfig: { timeout: 30 },
+      },
+    });
+
+    // Test with all timeout parameters
+    await connect("db://test", {
+      apiKey: "fake",
+      clientConfig: {
+        timeoutConfig: {
+          timeout: 60,
+          connectTimeout: 10,
+          readTimeout: 20,
+          poolIdleTimeout: 300,
+        },
+      },
+    });
+  });
+
   it("should pass down apiKey and userAgent", async () => {
     await withMockDatabase(
       (req, res) => {
@@ -287,6 +287,12 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
     expect(res2[1].id).toEqual(data2.id);
   });

+  it("should support take queries", async () => {
+    await table.add([{ id: 1 }, { id: 2 }, { id: 3 }]);
+    const res = await table.takeOffsets([1, 2]).toArrow();
+    expect(res.getChild("id")?.toJSON()).toEqual([2, 3]);
+  });
+
   it("should return the table as an instance of an arrow table", async () => {
     const arrowTbl = await table.toArrow();
     expect(arrowTbl).toBeInstanceOf(ArrowTable);
@@ -557,7 +563,7 @@ describe("When creating an index", () => {

     // test offset
     rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
-    expect(rst.numRows).toBe(1);
+    expect(rst.numRows).toBe(2);

     // test nprobes
     rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
@@ -582,7 +588,7 @@ describe("When creating an index", () => {
       "Invalid input, minimum_nprobes must be greater than 0",
     );
     expect(() => tbl.query().nearestTo(queryVec).maximumNprobes(5)).toThrow(
-      "Invalid input, maximum_nprobes must be greater than minimum_nprobes",
+      "Invalid input, maximum_nprobes must be greater than or equal to minimum_nprobes",
     );

     await tbl.dropIndex("vec_idx");
@@ -696,7 +702,7 @@ describe("When creating an index", () => {

     // test offset
     rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
-    expect(rst.numRows).toBe(1);
+    expect(rst.numRows).toBe(2);

     // test ef
     rst = await tbl.query().limit(2).nearestTo(queryVec).ef(100).toArrow();
@@ -12,7 +12,7 @@ test("ann index examples", async () => {
   // --8<-- [start:ingest]
   const db = await lancedb.connect(databaseDir);

-  const data = Array.from({ length: 5_000 }, (_, i) => ({
+  const data = Array.from({ length: 1_000 }, (_, i) => ({
     vector: Array(128).fill(i),
     id: `${i}`,
     content: "",
@@ -24,8 +24,8 @@ test("ann index examples", async () => {
   });
   await table.createIndex("vector", {
     config: lancedb.Index.ivfPq({
-      numPartitions: 10,
-      numSubVectors: 16,
+      numPartitions: 30,
+      numSubVectors: 8,
    }),
  });
  // --8<-- [end:ingest]
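An editorial aside on the retuned example values (my note, not part of the changeset): for IVF_PQ the sub-vector count has to divide the vector dimension evenly (128 / 8 = 16 dimensions per sub-vector), and the partition count is conventionally sized near the square root of the row count (sqrt(1000) is about 32, hence 30). A minimal sketch of that rule of thumb, with a hypothetical helper name:

```ts
// Sketch only: a parameter-picking heuristic, not a LanceDB API.
function suggestIvfPqParams(numRows: number, dim: number) {
  // Partitions ~ sqrt(row count) is a common IVF starting point.
  const numPartitions = Math.max(1, Math.round(Math.sqrt(numRows)));
  // The sub-vector count must divide the dimension; aim for 16 dims each.
  const numSubVectors = dim % 16 === 0 ? dim / 16 : 1;
  return { numPartitions, numSubVectors };
}

// suggestIvfPqParams(1_000, 128) -> { numPartitions: 32, numSubVectors: 8 }
```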
nodejs/examples/package-lock.json (generated)
@@ -30,7 +30,7 @@
     "x64",
     "arm64"
   ],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "os": [
     "darwin",
     "linux",
@@ -34,6 +34,7 @@ import {
   Struct,
   Timestamp,
   Type,
+  Uint8,
   Utf8,
   Vector,
   makeVector as arrowMakeVector,
@@ -51,6 +52,15 @@ import {
   sanitizeTable,
   sanitizeType,
 } from "./sanitize";

+/**
+ * Check if a field name indicates a vector column.
+ */
+function nameSuggestsVectorColumn(fieldName: string): boolean {
+  const nameLower = fieldName.toLowerCase();
+  return nameLower.includes("vector") || nameLower.includes("embedding");
+}
+
 export * from "apache-arrow";
 export type SchemaLike =
   | Schema
@@ -591,10 +601,17 @@ function inferType(
     return undefined;
   }
   // Try to automatically detect embedding columns.
-  if (valueType instanceof Float && path[path.length - 1] === "vector") {
-    // We default to Float32 for vectors.
-    const child = new Field("item", new Float32(), true);
-    return new FixedSizeList(value.length, child);
+  if (nameSuggestsVectorColumn(path[path.length - 1])) {
+    // Check if value is a Uint8Array for integer vector type determination
+    if (value instanceof Uint8Array) {
+      // For integer vectors, we default to Uint8 (matching Python implementation)
+      const child = new Field("item", new Uint8(), true);
+      return new FixedSizeList(value.length, child);
+    } else {
+      // For float vectors, we default to Float32
+      const child = new Field("item", new Float32(), true);
+      return new FixedSizeList(value.length, child);
+    }
   } else {
     const child = new Field("item", valueType, true);
     return new List(child);
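To make the new inference rule concrete, here is a small sketch (mine, not from the changeset; it assumes `makeArrowTable` and the Arrow re-exports are available from the package root): any field whose name contains "vector" or "embedding", case-insensitively, becomes a fixed-size list, with `Uint8` children for `Uint8Array` values and `Float32` children otherwise.

```ts
import { DataType, makeArrowTable } from "@lancedb/lancedb"; // assumed re-exports

const tbl = makeArrowTable([
  {
    text_embedding: [0.1, 0.2, 0.3], // -> FixedSizeList<Float32>(3)
    img_vector: new Uint8Array([1, 2, 3]), // -> FixedSizeList<Uint8>(3)
    scores: [0.5, 0.6], // no "vector"/"embedding" in the name -> plain List
  },
]);

console.log(DataType.isFixedSizeList(tbl.getChild("text_embedding")?.type)); // true
console.log(DataType.isFixedSizeList(tbl.getChild("img_vector")?.type)); // true
console.log(DataType.isList(tbl.getChild("scores")?.type)); // true
```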
@@ -59,6 +59,7 @@ export {
   Query,
   QueryBase,
   VectorQuery,
+  TakeQuery,
   QueryExecutionOptions,
   FullTextSearchOptions,
   RecordBatchIterator,
@@ -15,6 +15,7 @@ import {
   RecordBatchIterator as NativeBatchIterator,
   Query as NativeQuery,
   Table as NativeTable,
+  TakeQuery as NativeTakeQuery,
   VectorQuery as NativeVectorQuery,
 } from "./native";
 import { Reranker } from "./rerankers";
@@ -50,7 +51,7 @@ export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
 /* eslint-enable */

 class RecordBatchIterable<
-  NativeQueryType extends NativeQuery | NativeVectorQuery,
+  NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
 > implements AsyncIterable<RecordBatch>
 {
   private inner: NativeQueryType;
@@ -107,8 +108,9 @@ export interface FullTextSearchOptions {
  *
  * @hideconstructor
  */
-export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
-  implements AsyncIterable<RecordBatch>
+export class QueryBase<
+  NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
+> implements AsyncIterable<RecordBatch>
 {
   /**
    * @hidden
@@ -133,56 +135,6 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
       fn(this.inner);
     }
   }
-  /**
-   * A filter statement to be applied to this query.
-   *
-   * The filter should be supplied as an SQL query string. For example:
-   * @example
-   * x > 10
-   * y > 0 AND y < 100
-   * x > 5 OR y = 'test'
-   *
-   * Filtering performance can often be improved by creating a scalar index
-   * on the filter column(s).
-   */
-  where(predicate: string): this {
-    this.doCall((inner: NativeQueryType) => inner.onlyIf(predicate));
-    return this;
-  }
-  /**
-   * A filter statement to be applied to this query.
-   * @see where
-   * @deprecated Use `where` instead
-   */
-  filter(predicate: string): this {
-    return this.where(predicate);
-  }
-
-  fullTextSearch(
-    query: string | FullTextQuery,
-    options?: Partial<FullTextSearchOptions>,
-  ): this {
-    let columns: string[] | null = null;
-    if (options) {
-      if (typeof options.columns === "string") {
-        columns = [options.columns];
-      } else if (Array.isArray(options.columns)) {
-        columns = options.columns;
-      }
-    }
-
-    this.doCall((inner: NativeQueryType) => {
-      if (typeof query === "string") {
-        inner.fullTextSearch({
-          query: query,
-          columns: columns,
-        });
-      } else {
-        inner.fullTextSearch({ query: query.inner });
-      }
-    });
-    return this;
-  }
-
   /**
    * Return only the specified columns.
@@ -241,33 +193,6 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
     return this;
   }

-  /**
-   * Set the maximum number of results to return.
-   *
-   * By default, a plain search has no limit. If this method is not
-   * called then every valid row from the table will be returned.
-   */
-  limit(limit: number): this {
-    this.doCall((inner: NativeQueryType) => inner.limit(limit));
-    return this;
-  }
-
-  offset(offset: number): this {
-    this.doCall((inner: NativeQueryType) => inner.offset(offset));
-    return this;
-  }
-
-  /**
-   * Skip searching un-indexed data. This can make search faster, but will miss
-   * any data that is not yet indexed.
-   *
-   * Use {@link Table#optimize} to index all un-indexed data.
-   */
-  fastSearch(): this {
-    this.doCall((inner: NativeQueryType) => inner.fastSearch());
-    return this;
-  }
-
   /**
    * Whether to return the row id in the results.
    *
@@ -403,6 +328,100 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
   }
 }

+export class StandardQueryBase<
+  NativeQueryType extends NativeQuery | NativeVectorQuery,
+>
+  extends QueryBase<NativeQueryType>
+  implements ExecutableQuery
+{
+  constructor(inner: NativeQueryType | Promise<NativeQueryType>) {
+    super(inner);
+  }
+
+  /**
+   * A filter statement to be applied to this query.
+   *
+   * The filter should be supplied as an SQL query string. For example:
+   * @example
+   * x > 10
+   * y > 0 AND y < 100
+   * x > 5 OR y = 'test'
+   *
+   * Filtering performance can often be improved by creating a scalar index
+   * on the filter column(s).
+   */
+  where(predicate: string): this {
+    this.doCall((inner: NativeQueryType) => inner.onlyIf(predicate));
+    return this;
+  }
+  /**
+   * A filter statement to be applied to this query.
+   * @see where
+   * @deprecated Use `where` instead
+   */
+  filter(predicate: string): this {
+    return this.where(predicate);
+  }
+
+  fullTextSearch(
+    query: string | FullTextQuery,
+    options?: Partial<FullTextSearchOptions>,
+  ): this {
+    let columns: string[] | null = null;
+    if (options) {
+      if (typeof options.columns === "string") {
+        columns = [options.columns];
+      } else if (Array.isArray(options.columns)) {
+        columns = options.columns;
+      }
+    }
+
+    this.doCall((inner: NativeQueryType) => {
+      if (typeof query === "string") {
+        inner.fullTextSearch({
+          query: query,
+          columns: columns,
+        });
+      } else {
+        inner.fullTextSearch({ query: query.inner });
+      }
+    });
+    return this;
+  }
+
+  /**
+   * Set the maximum number of results to return.
+   *
+   * By default, a plain search has no limit. If this method is not
+   * called then every valid row from the table will be returned.
+   */
+  limit(limit: number): this {
+    this.doCall((inner: NativeQueryType) => inner.limit(limit));
+    return this;
+  }
+
+  /**
+   * Set the number of rows to skip before returning results.
+   *
+   * This is useful for pagination.
+   */
+  offset(offset: number): this {
+    this.doCall((inner: NativeQueryType) => inner.offset(offset));
+    return this;
+  }
+
+  /**
+   * Skip searching un-indexed data. This can make search faster, but will miss
+   * any data that is not yet indexed.
+   *
+   * Use {@link Table#optimize} to index all un-indexed data.
+   */
+  fastSearch(): this {
+    this.doCall((inner: NativeQueryType) => inner.fastSearch());
+    return this;
+  }
+}
+
 /**
  * An interface for a query that can be executed
  *
@@ -419,7 +438,7 @@ export interface ExecutableQuery {}
  *
  * @hideconstructor
  */
-export class VectorQuery extends QueryBase<NativeVectorQuery> {
+export class VectorQuery extends StandardQueryBase<NativeVectorQuery> {
   /**
    * @hidden
   */
@@ -679,13 +698,24 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
   }
 }

+/**
+ * A query that returns a subset of the rows in the table.
+ *
+ * @hideconstructor
+ */
+export class TakeQuery extends QueryBase<NativeTakeQuery> {
+  constructor(inner: NativeTakeQuery) {
+    super(inner);
+  }
+}
+
 /** A builder for LanceDB queries.
  *
  * @see {@link Table#query}, {@link Table#search}
  *
  * @hideconstructor
  */
-export class Query extends QueryBase<NativeQuery> {
+export class Query extends StandardQueryBase<NativeQuery> {
   /**
    * @hidden
    */
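Reading the refactor as a whole (my gloss): the `where`/`filter`/`fullTextSearch`/`limit`/`offset`/`fastSearch` builders move from `QueryBase` into the new `StandardQueryBase`, which `Query` and `VectorQuery` extend, while `TakeQuery` extends only `QueryBase` and so keeps just the shared surface (`select`, `withRowId`, execution). A type-level sketch of the intended consequence, assuming these exports:

```ts
import type { Query, TakeQuery } from "@lancedb/lancedb";

declare const q: Query;
declare const t: TakeQuery;

q.where("id > 1").limit(10); // fine: Query extends StandardQueryBase
t.select(["id"]); // fine: select() lives on the shared QueryBase
// @ts-expect-error TakeQuery returns exactly the requested rows, so no limit()
t.limit(10);
```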
@@ -35,6 +35,7 @@ import {
 import {
   FullTextQuery,
   Query,
+  TakeQuery,
   VectorQuery,
   instanceOfFullTextQuery,
 } from "./query";
@@ -336,6 +337,20 @@ export abstract class Table {
    */
   abstract query(): Query;

+  /**
+   * Create a query that returns a subset of the rows in the table.
+   * @param offsets The offsets of the rows to return.
+   * @returns A builder that can be used to parameterize the query.
+   */
+  abstract takeOffsets(offsets: number[]): TakeQuery;
+
+  /**
+   * Create a query that returns a subset of the rows in the table.
+   * @param rowIds The row ids of the rows to return.
+   * @returns A builder that can be used to parameterize the query.
+   */
+  abstract takeRowIds(rowIds: number[]): TakeQuery;
+
   /**
    * Create a search query to find the nearest neighbors
    * of the given query
@@ -665,6 +680,14 @@ export class LocalTable extends Table {
     await this.inner.waitForIndex(indexNames, timeoutSeconds);
   }

+  takeOffsets(offsets: number[]): TakeQuery {
+    return new TakeQuery(this.inner.takeOffsets(offsets));
+  }
+
+  takeRowIds(rowIds: number[]): TakeQuery {
+    return new TakeQuery(this.inner.takeRowIds(rowIds));
+  }
+
   query(): Query {
     return new Query(this.inner);
   }
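A usage sketch for the new take API (illustrative; the path and table name are made up): `takeOffsets` addresses rows by position, while `takeRowIds` addresses them by the stable row ids that `withRowId()` surfaces, which Lance exposes as a `_rowid` column.

```ts
import * as lancedb from "@lancedb/lancedb";

async function takeDemo() {
  const db = await lancedb.connect("/tmp/take-demo"); // made-up path
  const tbl = await db.createTable("items", [{ id: 1 }, { id: 2 }, { id: 3 }]);

  // By position: rows at offsets 0 and 2.
  const byOffset = await tbl.takeOffsets([0, 2]).toArrow();

  // By stable row id: fetch _rowid first, then take by it.
  const withIds = await tbl.query().withRowId().toArrow();
  const firstRowId = withIds.getChild("_rowid")?.get(0);
  const byRowId = await tbl.takeRowIds([Number(firstRowId)]).toArrow();

  return { byOffset: byOffset.numRows, byRowId: byRowId.numRows };
}
```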
@@ -1,11 +1,11 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",
   "files": ["lancedb.darwin-arm64.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   }
@@ -1,11 +1,11 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",
   "files": ["lancedb.darwin-x64.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   }
@@ -1,11 +1,11 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",
   "files": ["lancedb.linux-arm64-gnu.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   },
@@ -1,13 +1,13 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",
   "files": ["lancedb.linux-arm64-musl.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   },
   "libc": ["musl"]
 }
@@ -1,11 +1,11 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",
   "files": ["lancedb.linux-x64-gnu.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   },
@@ -1,13 +1,13 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",
   "files": ["lancedb.linux-x64-musl.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   },
   "libc": ["musl"]
 }
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": [
     "win32"
   ],
@@ -11,7 +11,7 @@
   "files": [
     "lancedb.win32-arm64-msvc.node"
   ],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   }
@@ -1,11 +1,11 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",
   "files": ["lancedb.win32-x64-msvc.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "engines": {
     "node": ">= 18"
   }
nodejs/package-lock.json (generated)
@@ -1,17 +1,17 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.21.2-beta.1",
+      "version": "0.21.2",
       "cpu": [
         "x64",
         "arm64"
       ],
-      "license": "Apache 2.0",
+      "license": "Apache-2.0",
       "os": [
         "darwin",
         "linux",
@@ -11,7 +11,7 @@
     "ann"
   ],
   "private": false,
-  "version": "0.21.2-beta.1",
+  "version": "0.21.2",
   "main": "dist/index.js",
   "exports": {
     ".": "./dist/index.js",
@@ -36,7 +36,7 @@
     ]
   }
 },
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
   "devDependencies": {
     "@aws-sdk/client-dynamodb": "^3.33.0",
     "@aws-sdk/client-kms": "^3.33.0",
@@ -12,6 +12,7 @@ use lancedb::query::Query as LanceDbQuery;
 use lancedb::query::QueryBase;
 use lancedb::query::QueryExecutionOptions;
 use lancedb::query::Select;
+use lancedb::query::TakeQuery as LanceDbTakeQuery;
 use lancedb::query::VectorQuery as LanceDbVectorQuery;
 use napi::bindgen_prelude::*;
 use napi_derive::napi;
@@ -319,6 +320,79 @@ impl VectorQuery {
     }
 }

+#[napi]
+pub struct TakeQuery {
+    inner: LanceDbTakeQuery,
+}
+
+#[napi]
+impl TakeQuery {
+    pub fn new(query: LanceDbTakeQuery) -> Self {
+        Self { inner: query }
+    }
+
+    #[napi]
+    pub fn select(&mut self, columns: Vec<(String, String)>) {
+        self.inner = self.inner.clone().select(Select::dynamic(&columns));
+    }
+
+    #[napi]
+    pub fn select_columns(&mut self, columns: Vec<String>) {
+        self.inner = self.inner.clone().select(Select::columns(&columns));
+    }
+
+    #[napi]
+    pub fn with_row_id(&mut self) {
+        self.inner = self.inner.clone().with_row_id();
+    }
+
+    #[napi(catch_unwind)]
+    pub async fn execute(
+        &self,
+        max_batch_length: Option<u32>,
+        timeout_ms: Option<u32>,
+    ) -> napi::Result<RecordBatchIterator> {
+        let mut execution_opts = QueryExecutionOptions::default();
+        if let Some(max_batch_length) = max_batch_length {
+            execution_opts.max_batch_length = max_batch_length;
+        }
+        if let Some(timeout_ms) = timeout_ms {
+            execution_opts.timeout = Some(std::time::Duration::from_millis(timeout_ms as u64))
+        }
+        let inner_stream = self
+            .inner
+            .execute_with_options(execution_opts)
+            .await
+            .map_err(|e| {
+                napi::Error::from_reason(format!(
+                    "Failed to execute query stream: {}",
+                    convert_error(&e)
+                ))
+            })?;
+        Ok(RecordBatchIterator::new(inner_stream))
+    }
+
+    #[napi]
+    pub async fn explain_plan(&self, verbose: bool) -> napi::Result<String> {
+        self.inner.explain_plan(verbose).await.map_err(|e| {
+            napi::Error::from_reason(format!(
+                "Failed to retrieve the query plan: {}",
+                convert_error(&e)
+            ))
+        })
+    }
+
+    #[napi(catch_unwind)]
+    pub async fn analyze_plan(&self) -> napi::Result<String> {
+        self.inner.analyze_plan().await.map_err(|e| {
+            napi::Error::from_reason(format!(
+                "Failed to execute analyze plan: {}",
+                convert_error(&e)
+            ))
+        })
+    }
+}
+
 #[napi]
 #[derive(Debug, Clone)]
 pub struct JsFullTextQuery {
@@ -9,6 +9,12 @@ use napi_derive::*;
 #[napi(object)]
 #[derive(Debug)]
 pub struct TimeoutConfig {
+    /// The overall timeout for the entire request in seconds. This includes
+    /// connection, send, and read time. If the entire request doesn't complete
+    /// within this time, it will fail. Default is None (no overall timeout).
+    /// This can also be set via the environment variable `LANCE_CLIENT_TIMEOUT`,
+    /// as an integer number of seconds.
+    pub timeout: Option<f64>,
     /// The timeout for establishing a connection in seconds. Default is 120
     /// seconds (2 minutes). This can also be set via the environment variable
     /// `LANCE_CLIENT_CONNECT_TIMEOUT`, as an integer number of seconds.
@@ -75,6 +81,7 @@ pub struct ClientConfig {
 impl From<TimeoutConfig> for lancedb::remote::TimeoutConfig {
     fn from(config: TimeoutConfig) -> Self {
         Self {
+            timeout: config.timeout.map(std::time::Duration::from_secs_f64),
             connect_timeout: config
                 .connect_timeout
                 .map(std::time::Duration::from_secs_f64),
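Tying the Rust plumbing back to the JS options (a sketch; the URI and values are illustrative): the new `timeout` field caps the whole request, on top of the existing per-phase limits, and `LANCE_CLIENT_TIMEOUT` offers an environment-variable override.

```ts
import { connect } from "@lancedb/lancedb";

// All values are in seconds; `timeout` bounds connect + send + read together.
const db = await connect("db://example-db", {
  apiKey: process.env.LANCEDB_API_KEY ?? "fake",
  clientConfig: {
    timeoutConfig: { timeout: 30, connectTimeout: 10, readTimeout: 20 },
  },
});
```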
@@ -15,7 +15,7 @@ use napi_derive::napi;
 use crate::error::NapiErrorExt;
 use crate::index::Index;
 use crate::merge::NativeMergeInsertBuilder;
-use crate::query::{Query, VectorQuery};
+use crate::query::{Query, TakeQuery, VectorQuery};

 #[napi]
 pub struct Table {
@@ -187,6 +187,44 @@ impl Table {
         Ok(Query::new(self.inner_ref()?.query()))
     }

+    #[napi(catch_unwind)]
+    pub fn take_offsets(&self, offsets: Vec<i64>) -> napi::Result<TakeQuery> {
+        Ok(TakeQuery::new(
+            self.inner_ref()?.take_offsets(
+                offsets
+                    .into_iter()
+                    .map(|o| {
+                        u64::try_from(o).map_err(|e| {
+                            napi::Error::from_reason(format!(
+                                "Failed to convert offset to u64: {}",
+                                e
+                            ))
+                        })
+                    })
+                    .collect::<Result<Vec<_>>>()?,
+            ),
+        ))
+    }
+
+    #[napi(catch_unwind)]
+    pub fn take_row_ids(&self, row_ids: Vec<i64>) -> napi::Result<TakeQuery> {
+        Ok(TakeQuery::new(
+            self.inner_ref()?.take_row_ids(
+                row_ids
+                    .into_iter()
+                    .map(|o| {
+                        u64::try_from(o).map_err(|e| {
+                            napi::Error::from_reason(format!(
+                                "Failed to convert row id to u64: {}",
+                                e
+                            ))
+                        })
+                    })
+                    .collect::<Result<Vec<_>>>()?,
+            ),
+        ))
+    }
+
     #[napi(catch_unwind)]
     pub fn vector_search(&self, vector: Float32Array) -> napi::Result<VectorQuery> {
         self.query()?.nearest_to(vector)
@@ -2,6 +2,7 @@
   "intentionallyNotExported": [
     "lancedb/native.d.ts:Query",
     "lancedb/native.d.ts:VectorQuery",
+    "lancedb/native.d.ts:TakeQuery",
     "lancedb/native.d.ts:RecordBatchIterator",
    "lancedb/native.d.ts:NativeMergeInsertBuilder"
   ],
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.24.2"
+current_version = "0.24.3"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
python/.gitignore (vendored)
@@ -1,2 +1,3 @@
 # Test data created by some example tests
 data/
+_lancedb.pyd
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.24.2"
+version = "0.24.3"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -68,8 +68,9 @@ dev = [
     "pyright",
     'typing-extensions>=4.0.0; python_version < "3.11"',
 ]
-docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
+docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings-python"]
 clip = ["torch", "pillow", "open-clip-torch"]
+siglip = ["torch", "pillow", "transformers>=4.41.0", "sentencepiece"]
 embeddings = [
     "requests>=2.31.0",
     "openai>=1.6.1",
@@ -87,6 +88,7 @@ embeddings = [
     "botocore>=1.31.57",
     'ibm-watsonx-ai>=1.1.2; python_version >= "3.10"',
     "ollama>=0.3.0",
+    "sentencepiece",
 ]
 azure = ["adlfs>=2024.2.0"]
@@ -241,4 +241,4 @@ def __warn_on_fork():


 if hasattr(os, "register_at_fork"):
-    os.register_at_fork(before=__warn_on_fork)
+    os.register_at_fork(before=__warn_on_fork)  # type: ignore[attr-defined]
@@ -20,3 +20,4 @@ from .jinaai import JinaEmbeddings
 from .watsonx import WatsonxEmbeddings
 from .voyageai import VoyageAIEmbeddingFunction
 from .colpali import ColPaliEmbeddings
+from .siglip import SigLipEmbeddings
python/python/lancedb/embeddings/siglip.py (new file, 148 lines)
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+import concurrent.futures
+import io
+import os
+from typing import TYPE_CHECKING, List, Union
+import urllib.parse as urlparse
+
+import numpy as np
+import pyarrow as pa
+from tqdm import tqdm
+from pydantic import PrivateAttr
+
+from ..util import attempt_import_or_raise
+from .base import EmbeddingFunction
+from .registry import register
+from .utils import IMAGES, url_retrieve
+
+if TYPE_CHECKING:
+    import PIL
+    import torch
+
+
+@register("siglip")
+class SigLipEmbeddings(EmbeddingFunction):
+    model_name: str = "google/siglip-base-patch16-224"
+    device: str = "cpu"
+    batch_size: int = 64
+    normalize: bool = True
+
+    _model = PrivateAttr()
+    _processor = PrivateAttr()
+    _tokenizer = PrivateAttr()
+    _torch = PrivateAttr()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        transformers = attempt_import_or_raise("transformers")
+        self._torch = attempt_import_or_raise("torch")
+
+        self._processor = transformers.AutoProcessor.from_pretrained(self.model_name)
+        self._model = transformers.SiglipModel.from_pretrained(self.model_name)
+        self._model.to(self.device)
+        self._model.eval()
+        self._ndims = None
+
+    def ndims(self):
+        if self._ndims is None:
+            self._ndims = self.generate_text_embeddings("foo").shape[0]
+        return self._ndims
+
+    def compute_query_embeddings(
+        self, query: Union[str, "PIL.Image.Image"], *args, **kwargs
+    ) -> List[np.ndarray]:
+        if isinstance(query, str):
+            return [self.generate_text_embeddings(query)]
+        else:
+            PIL = attempt_import_or_raise("PIL", "pillow")
+            if isinstance(query, PIL.Image.Image):
+                return [self.generate_image_embedding(query)]
+            else:
+                raise TypeError("SigLIP supports str or PIL Image as query")
+
+    def generate_text_embeddings(self, text: str) -> np.ndarray:
+        torch = self._torch
+        text_inputs = self._processor(
+            text=text,
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=64,
+        ).to(self.device)
+
+        with torch.no_grad():
+            text_features = self._model.get_text_features(**text_inputs)
+            if self.normalize:
+                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+            return text_features.cpu().detach().numpy().squeeze()
+
+    def sanitize_input(self, images: IMAGES) -> Union[List[bytes], np.ndarray]:
+        if isinstance(images, (str, bytes)):
+            images = [images]
+        elif isinstance(images, pa.Array):
+            images = images.to_pylist()
+        elif isinstance(images, pa.ChunkedArray):
+            images = images.combine_chunks().to_pylist()
+        return images
+
+    def compute_source_embeddings(
+        self, images: IMAGES, *args, **kwargs
+    ) -> List[np.ndarray]:
+        images = self.sanitize_input(images)
+        embeddings = []
+
+        for i in range(0, len(images), self.batch_size):
+            j = min(i + self.batch_size, len(images))
+            batch = images[i:j]
+            embeddings.extend(self._parallel_get(batch))
+        return embeddings
+
+    def _parallel_get(self, images: Union[List[str], List[bytes]]) -> List[np.ndarray]:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.generate_image_embedding, image)
+                for image in images
+            ]
+            return [f.result() for f in tqdm(futures, desc="SigLIP Embedding")]
+
+    def generate_image_embedding(
+        self, image: Union[str, bytes, "PIL.Image.Image"]
+    ) -> np.ndarray:
+        image = self._to_pil(image)
+        image = self._processor(images=image, return_tensors="pt")["pixel_values"]
+        return self._encode_and_normalize_image(image)
+
+    def _encode_and_normalize_image(self, image_tensor: "torch.Tensor") -> np.ndarray:
+        torch = self._torch
+        with torch.no_grad():
+            image_features = self._model.get_image_features(
+                image_tensor.to(self.device)
+            )
+            if self.normalize:
+                image_features = image_features / image_features.norm(
+                    dim=-1, keepdim=True
+                )
+            return image_features.cpu().detach().numpy().squeeze()
+
+    def _to_pil(self, image: Union[str, bytes, "PIL.Image.Image"]):
+        PIL = attempt_import_or_raise("PIL", "pillow")
+        if isinstance(image, PIL.Image.Image):
+            return image.convert("RGB") if image.mode != "RGB" else image
+        elif isinstance(image, bytes):
+            return PIL.Image.open(io.BytesIO(image)).convert("RGB")
+        elif isinstance(image, str):
+            parsed = urlparse.urlparse(image)
+            if parsed.scheme == "file":
+                return PIL.Image.open(parsed.path).convert("RGB")
+            elif parsed.scheme == "":
+                path = image if os.name == "nt" else parsed.path
+                return PIL.Image.open(path).convert("RGB")
+            elif parsed.scheme.startswith("http"):
+                image_bytes = url_retrieve(image)
+                return PIL.Image.open(io.BytesIO(image_bytes)).convert("RGB")
+            else:
+                raise NotImplementedError("Only local and http(s) urls are supported")
+        else:
+            raise ValueError(f"Unsupported image type: {type(image)}")
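For reviewers, a minimal usage sketch of the new function as registered above under "siglip". It assumes the standard `EmbeddingFunctionRegistry` / `LanceModel` pattern used by the other embedding integrations; the schema, table name, and image URI are illustrative, and the new `siglip` extra from `pyproject.toml` above must be installed.

```python
# A sketch, not part of the diff. Assumes the usual registry/LanceModel
# pattern; requires the new `siglip` extra (torch, transformers, sentencepiece).
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

siglip = get_registry().get("siglip").create(device="cpu")

class Pets(LanceModel):  # illustrative schema
    image_uri: str = siglip.SourceField()
    vector: Vector(siglip.ndims()) = siglip.VectorField()

db = lancedb.connect("./.lancedb")
table = db.create_table("pets", schema=Pets)
table.add([{"image_uri": "http://example.com/dog.png"}])  # illustrative URI

# Text-to-image search: the query string goes through SigLIP's text tower;
# images were embedded through the image tower at ingest time.
hits = table.search("a golden retriever").limit(5).to_pandas()
```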
@@ -28,6 +28,7 @@ import pyarrow.fs as pa_fs
 import pydantic

 from lancedb.pydantic import PYDANTIC_VERSION
+from lancedb.background_loop import LOOP

 from . import __version__
 from .arrow import AsyncRecordBatchReader
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
     from ._lancedb import FTSQuery as LanceFTSQuery
     from ._lancedb import HybridQuery as LanceHybridQuery
     from ._lancedb import VectorQuery as LanceVectorQuery
+    from ._lancedb import TakeQuery as LanceTakeQuery
     from ._lancedb import PyQueryRequest
     from .common import VEC
     from .pydantic import LanceModel
@@ -910,7 +912,7 @@ class LanceQueryBuilder(ABC):
         ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
           GlobalLimitExec: skip=0, fetch=10
             FilterExec: _distance@2 IS NOT NULL
-              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
+              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                 KNNVectorDistance: metric=l2
                   LanceRead: uri=..., projection=[vector], ...

@@ -2041,11 +2043,11 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         >>> plan = table.search(query).explain_plan(True)
         >>> print(plan)  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
         ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
           GlobalLimitExec: skip=0, fetch=10
             FilterExec: _distance@2 IS NOT NULL
-              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
+              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                 KNNVectorDistance: metric=l2
                   LanceRead: uri=..., projection=[vector], ...

         Parameters
         ----------
@@ -2139,7 +2141,11 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):


 class AsyncQueryBase(object):
-    def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
+    """
+    Base class for all async queries (take, scan, vector, fts, hybrid)
+    """
+
+    def __init__(self, inner: Union[LanceQuery, LanceVectorQuery, LanceTakeQuery]):
         """
         Construct an AsyncQueryBase
@@ -2149,27 +2155,14 @@ class AsyncQueryBase(object):
         self._inner = inner

     def to_query_object(self) -> Query:
+        """
+        Convert the query into a query object
+
+        This is currently experimental but can be useful as the query object is pure
+        python and more easily serializable.
+        """
         return Query.from_inner(self._inner.to_query_request())

-    def where(self, predicate: str) -> Self:
-        """
-        Only return rows matching the given predicate
-
-        The predicate should be supplied as an SQL query string.
-
-        Examples
-        --------
-
-        >>> predicate = "x > 10"
-        >>> predicate = "y > 0 AND y < 100"
-        >>> predicate = "x > 5 OR y = 'test'"
-
-        Filtering performance can often be improved by creating a scalar index
-        on the filter column(s).
-        """
-        self._inner.where(predicate)
-        return self
-
     def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
         """
         Return only the specified columns.
@@ -2208,42 +2201,6 @@ class AsyncQueryBase(object):
             raise TypeError("columns must be a list of column names or a dict")
         return self

-    def limit(self, limit: int) -> Self:
-        """
-        Set the maximum number of results to return.
-
-        By default, a plain search has no limit. If this method is not
-        called then every valid row from the table will be returned.
-        """
-        self._inner.limit(limit)
-        return self
-
-    def offset(self, offset: int) -> Self:
-        """
-        Set the offset for the results.
-
-        Parameters
-        ----------
-        offset: int
-            The offset to start fetching results from.
-        """
-        self._inner.offset(offset)
-        return self
-
-    def fast_search(self) -> Self:
-        """
-        Skip searching un-indexed data.
-
-        This can make queries faster, but will miss any data that has not been
-        indexed.
-
-        !!! tip
-            You can add new data into an existing index by calling
-            [AsyncTable.optimize][lancedb.table.AsyncTable.optimize].
-        """
-        self._inner.fast_search()
-        return self
-
     def with_row_id(self) -> Self:
         """
         Include the _rowid column in the results.
@@ -2251,27 +2208,6 @@ class AsyncQueryBase(object):
         self._inner.with_row_id()
         return self

-    def postfilter(self) -> Self:
-        """
-        If this is called then filtering will happen after the search instead of
-        before.
-        By default filtering will be performed before the search. This is how
-        filtering is typically understood to work. This prefilter step does add some
-        additional latency. Creating a scalar index on the filter column(s) can
-        often improve this latency. However, sometimes a filter is too complex or
-        scalar indices cannot be applied to the column. In these cases postfiltering
-        can be used instead of prefiltering to improve latency.
-        Post filtering applies the filter to the results of the search. This
-        means we only run the filter on a much smaller set of data. However, it can
-        cause the query to return fewer than `limit` results (or even no results) if
-        none of the nearest results match the filter.
-        Post filtering happens during the "refine stage" (described in more detail in
-        @see {@link VectorQuery#refineFactor}). This means that setting a higher refine
-        factor can often help restore some of the results lost by post filtering.
-        """
-        self._inner.postfilter()
-        return self
-
     async def to_batches(
         self,
         *,
@@ -2295,7 +2231,9 @@ class AsyncQueryBase(object):
             complete within the specified time, an error will be raised.
         """
         return AsyncRecordBatchReader(
-            await self._inner.execute(max_batch_length, timeout)
+            await self._inner.execute(
+                max_batch_length=max_batch_length, timeout=timeout
+            )
         )

     async def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
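The `execute` call above now passes its arguments by keyword. For reference, consuming the async reader looks like this (a sketch; the table name and data are illustrative, the `to_batches` signature is the one shown in the hunk above):

```python
import asyncio
from lancedb import connect_async

async def main():
    conn = await connect_async("./.lancedb")
    table = await conn.create_table("items", data=[{"x": i} for i in range(100)])
    # to_batches returns an AsyncRecordBatchReader; batches stream lazily.
    async for batch in await table.query().to_batches(max_batch_length=32):
        print(batch.num_rows)

asyncio.run(main())
```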
@@ -2429,7 +2367,7 @@ class AsyncQueryBase(object):
         ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
           GlobalLimitExec: skip=0, fetch=10
             FilterExec: _distance@2 IS NOT NULL
-              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
+              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                 KNNVectorDistance: metric=l2
                   LanceRead: uri=..., projection=[vector], ...

@@ -2454,7 +2392,98 @@ class AsyncQueryBase(object):
         return await self._inner.analyze_plan()


-class AsyncQuery(AsyncQueryBase):
+class AsyncStandardQuery(AsyncQueryBase):
+    """
+    Base class for "standard" async queries (all but take currently)
+    """
+
+    def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
+        """
+        Construct an AsyncStandardQuery
+
+        This method is not intended to be called directly. Instead, use the
+        [AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
+        """
+        super().__init__(inner)
+
+    def where(self, predicate: str) -> Self:
+        """
+        Only return rows matching the given predicate
+
+        The predicate should be supplied as an SQL query string.
+
+        Examples
+        --------
+
+        >>> predicate = "x > 10"
+        >>> predicate = "y > 0 AND y < 100"
+        >>> predicate = "x > 5 OR y = 'test'"
+
+        Filtering performance can often be improved by creating a scalar index
+        on the filter column(s).
+        """
+        self._inner.where(predicate)
+        return self
+
+    def limit(self, limit: int) -> Self:
+        """
+        Set the maximum number of results to return.
+
+        By default, a plain search has no limit. If this method is not
+        called then every valid row from the table will be returned.
+        """
+        self._inner.limit(limit)
+        return self
+
+    def offset(self, offset: int) -> Self:
+        """
+        Set the offset for the results.
+
+        Parameters
+        ----------
+        offset: int
+            The offset to start fetching results from.
+        """
+        self._inner.offset(offset)
+        return self
+
+    def fast_search(self) -> Self:
+        """
+        Skip searching un-indexed data.
+
+        This can make queries faster, but will miss any data that has not been
+        indexed.
+
+        !!! tip
+            You can add new data into an existing index by calling
+            [AsyncTable.optimize][lancedb.table.AsyncTable.optimize].
+        """
+        self._inner.fast_search()
+        return self
+
+    def postfilter(self) -> Self:
+        """
+        If this is called then filtering will happen after the search instead of
+        before.
+        By default filtering will be performed before the search. This is how
+        filtering is typically understood to work. This prefilter step does add some
+        additional latency. Creating a scalar index on the filter column(s) can
+        often improve this latency. However, sometimes a filter is too complex or
+        scalar indices cannot be applied to the column. In these cases postfiltering
+        can be used instead of prefiltering to improve latency.
+        Post filtering applies the filter to the results of the search. This
+        means we only run the filter on a much smaller set of data. However, it can
+        cause the query to return fewer than `limit` results (or even no results) if
+        none of the nearest results match the filter.
+        Post filtering happens during the "refine stage" (described in more detail in
+        @see {@link VectorQuery#refineFactor}). This means that setting a higher refine
+        factor can often help restore some of the results lost by post filtering.
+        """
+        self._inner.postfilter()
+        return self
+
+
+class AsyncQuery(AsyncStandardQuery):
     def __init__(self, inner: LanceQuery):
         """
         Construct an AsyncQuery
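With this move, `where`/`limit`/`offset`/`fast_search`/`postfilter` remain available on every standard query type (scan, vector, fts, hybrid) but not on take queries. A sketch of the unchanged chaining style (the data and predicate are illustrative):

```python
import asyncio
from lancedb import connect_async

async def main():
    conn = await connect_async("./.lancedb")
    table = await conn.create_table(
        "docs", data=[{"x": i, "vector": [float(i), float(i)]} for i in range(10)]
    )
    rows = (
        await table.query()
        .nearest_to([1.0, 2.0])  # -> AsyncVectorQuery, an AsyncStandardQuery
        .where("x > 2")
        .postfilter()            # apply the filter after the vector search
        .limit(3)
        .to_list()
    )
    print(rows)

asyncio.run(main())
```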
@@ -2588,7 +2617,7 @@ class AsyncQuery(AsyncQueryBase):
         return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))


-class AsyncFTSQuery(AsyncQueryBase):
+class AsyncFTSQuery(AsyncStandardQuery):
     """A query for full text search for LanceDB."""

     def __init__(self, inner: LanceFTSQuery):
@@ -2867,7 +2896,7 @@ class AsyncVectorQueryBase:
         return self


-class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
+class AsyncVectorQuery(AsyncStandardQuery, AsyncVectorQueryBase):
     def __init__(self, inner: LanceVectorQuery):
         """
         Construct an AsyncVectorQuery
@@ -2950,7 +2979,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
         return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)


-class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
+class AsyncHybridQuery(AsyncStandardQuery, AsyncVectorQueryBase):
     """
     A query builder that performs hybrid vector and full text search.
     Results are combined and reranked based on the specified reranker.
@@ -3054,7 +3083,7 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
         CoalesceBatchesExec: target_batch_size=1024
           GlobalLimitExec: skip=0, fetch=10
             FilterExec: _distance@2 IS NOT NULL
-              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
+              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                 KNNVectorDistance: metric=l2
                   LanceRead: uri=..., projection=[vector], ...
         <BLANKLINE>
@@ -3102,3 +3131,252 @@ class AsyncHybridQuery(AsyncQueryBase, AsyncVectorQueryBase):
         results.append(await self._inner.to_fts_query().analyze_plan())

         return "\n".join(results)
+
+
+class AsyncTakeQuery(AsyncQueryBase):
+    """
+    Builder for parameterizing and executing take queries.
+    """
+
+    def __init__(self, inner: LanceTakeQuery):
+        super().__init__(inner)
+
+
+class BaseQueryBuilder(object):
+    """
+    Wraps AsyncQueryBase and provides a synchronous interface
+    """
+
+    def __init__(self, inner: AsyncQueryBase):
+        self._inner = inner
+
+    def to_query_object(self) -> Query:
+        return self._inner.to_query_object()
+
+    def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
+        """
+        Return only the specified columns.
+
+        By default a query will return all columns from the table. However, this can
+        have a very significant impact on latency. LanceDb stores data in a columnar
+        fashion. This
+        means we can finely tune our I/O to select exactly the columns we need.
+
+        As a best practice you should always limit queries to the columns that you need.
+        If you pass in a list of column names then only those columns will be
+        returned.
+
+        You can also use this method to create new "dynamic" columns based on your
+        existing columns. For example, you may not care about "a" or "b" but instead
+        simply want "a + b". This is often seen in the SELECT clause of an SQL query
+        (e.g. `SELECT a+b FROM my_table`).
+
+        To create dynamic columns you can pass in a dict[str, str]. A column will be
+        returned for each entry in the map. The key provides the name of the column.
+        The value is an SQL string used to specify how the column is calculated.
+
+        For example, an SQL query might state `SELECT a + b AS combined, c`. The
+        equivalent input to this method would be `{"combined": "a + b", "c": "c"}`.
+
+        Columns will always be returned in the order given, even if that order is
+        different than the order used when adding the data.
+        """
+        self._inner.select(columns)
+        return self
+
+    def with_row_id(self) -> Self:
+        """
+        Include the _rowid column in the results.
+        """
+        self._inner.with_row_id()
+        return self
+
+    def to_batches(
+        self,
+        *,
+        max_batch_length: Optional[int] = None,
+        timeout: Optional[timedelta] = None,
+    ) -> pa.RecordBatchReader:
+        """
+        Execute the query and return the results as an Apache Arrow RecordBatchReader.
+
+        Parameters
+        ----------
+
+        max_batch_length: Optional[int]
+            The maximum number of selected records in a single RecordBatch object.
+            If not specified, a default batch length is used.
+            It is possible for batches to be smaller than the provided length if the
+            underlying data is stored in smaller chunks.
+        timeout: Optional[timedelta]
+            The maximum time to wait for the query to complete.
+            If not specified, no timeout is applied. If the query does not
+            complete within the specified time, an error will be raised.
+        """
+        async_iter = LOOP.run(self._inner.execute(max_batch_length, timeout))
+
+        def iter_sync():
+            try:
+                while True:
+                    yield LOOP.run(async_iter.__anext__())
+            except StopAsyncIteration:
+                return
+
+        return pa.RecordBatchReader.from_batches(async_iter.schema, iter_sync())
+
+    def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
+        """
+        Execute the query and collect the results into an Apache Arrow Table.
+
+        This method will collect all results into memory before returning. If
+        you expect a large number of results, you may want to use
+        [to_batches][lancedb.query.AsyncQueryBase.to_batches]
+
+        Parameters
+        ----------
+        timeout: Optional[timedelta]
+            The maximum time to wait for the query to complete.
+            If not specified, no timeout is applied. If the query does not
+            complete within the specified time, an error will be raised.
+        """
+        return LOOP.run(self._inner.to_arrow(timeout))
+
+    def to_list(self, timeout: Optional[timedelta] = None) -> List[dict]:
+        """
+        Execute the query and return the results as a list of dictionaries.
+
+        Each list entry is a dictionary with the selected column names as keys,
+        or all table columns if `select` is not called. The vector and the "_distance"
+        fields are returned whether or not they're explicitly selected.
+
+        Parameters
+        ----------
+        timeout: Optional[timedelta]
+            The maximum time to wait for the query to complete.
+            If not specified, no timeout is applied. If the query does not
+            complete within the specified time, an error will be raised.
+        """
+        return LOOP.run(self._inner.to_list(timeout))
+
+    def to_pandas(
+        self,
+        flatten: Optional[Union[int, bool]] = None,
+        timeout: Optional[timedelta] = None,
+    ) -> "pd.DataFrame":
+        """
+        Execute the query and collect the results into a pandas DataFrame.
+
+        This method will collect all results into memory before returning. If you
+        expect a large number of results, you may want to use
+        [to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
+        pandas separately.
+
+        Examples
+        --------
+
+        >>> import asyncio
+        >>> from lancedb import connect_async
+        >>> async def doctest_example():
+        ...     conn = await connect_async("./.lancedb")
+        ...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
+        ...     async for batch in await table.query().to_batches():
+        ...         batch_df = batch.to_pandas()
+        >>> asyncio.run(doctest_example())
+
+        Parameters
+        ----------
+        flatten: Optional[Union[int, bool]]
+            If flatten is True, flatten all nested columns.
+            If flatten is an integer, flatten the nested columns up to the
+            specified depth.
+            If unspecified, do not flatten the nested columns.
+        timeout: Optional[timedelta]
+            The maximum time to wait for the query to complete.
+            If not specified, no timeout is applied. If the query does not
+            complete within the specified time, an error will be raised.
+        """
+        return LOOP.run(self._inner.to_pandas(flatten, timeout))
+
+    def to_polars(
+        self,
+        timeout: Optional[timedelta] = None,
+    ) -> "pl.DataFrame":
+        """
+        Execute the query and collect the results into a Polars DataFrame.
+
+        This method will collect all results into memory before returning. If you
+        expect a large number of results, you may want to use
+        [to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
+        polars separately.
+
+        Parameters
+        ----------
+        timeout: Optional[timedelta]
+            The maximum time to wait for the query to complete.
+            If not specified, no timeout is applied. If the query does not
+            complete within the specified time, an error will be raised.
+
+        Examples
+        --------
+
+        >>> import asyncio
+        >>> import polars as pl
+        >>> from lancedb import connect_async
+        >>> async def doctest_example():
+        ...     conn = await connect_async("./.lancedb")
+        ...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
+        ...     async for batch in await table.query().to_batches():
+        ...         batch_df = pl.from_arrow(batch)
+        >>> asyncio.run(doctest_example())
+        """
+        return LOOP.run(self._inner.to_polars(timeout))
+
+    def explain_plan(self, verbose: Optional[bool] = False):
+        """Return the execution plan for this query.
+
+        Examples
+        --------
+        >>> import asyncio
+        >>> from lancedb import connect_async
+        >>> async def doctest_example():
+        ...     conn = await connect_async("./.lancedb")
+        ...     table = await conn.create_table("my_table", [{"vector": [99, 99]}])
+        ...     query = [100, 100]
+        ...     plan = await table.query().nearest_to([1, 2]).explain_plan(True)
+        ...     print(plan)
+        >>> asyncio.run(doctest_example())  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+        ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
+          GlobalLimitExec: skip=0, fetch=10
+            FilterExec: _distance@2 IS NOT NULL
+              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
+                KNNVectorDistance: metric=l2
+                  LanceRead: uri=..., projection=[vector], ...
+
+        Parameters
+        ----------
+        verbose : bool, default False
+            Use a verbose output format.
+
+        Returns
+        -------
+        plan : str
+        """  # noqa: E501
+        return LOOP.run(self._inner.explain_plan(verbose))
+
+    def analyze_plan(self):
+        """Execute the query and display with runtime metrics.
+
+        Returns
+        -------
+        plan : str
+        """
+        return LOOP.run(self._inner.analyze_plan())
+
+
+class LanceTakeQueryBuilder(BaseQueryBuilder):
+    """
+    Builder for parameterizing and executing take queries.
+    """
+
+    def __init__(self, inner: AsyncTakeQuery):
+        super().__init__(inner)
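The notable piece of `BaseQueryBuilder` is `to_batches`, which drives an async iterator from synchronous code via the background loop. The same pattern in isolation (independent of LanceDB; every name here is illustrative):

```python
import asyncio

# Illustrative stand-in for lancedb.background_loop.LOOP: a dedicated event
# loop that synchronous code submits coroutines to, one at a time.
_loop = asyncio.new_event_loop()

def run(coro):
    return _loop.run_until_complete(coro)

async def produce():
    for i in range(3):
        yield i

def iter_sync(agen):
    # Pull one item per __anext__ call; the StopAsyncIteration raised by the
    # exhausted async generator ends the sync generator.
    try:
        while True:
            yield run(agen.__anext__())
    except StopAsyncIteration:
        return

print(list(iter_sync(produce())))  # [0, 1, 2]
```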
@@ -17,6 +17,12 @@ class TimeoutConfig:

     Attributes
     ----------
+    timeout: Optional[timedelta]
+        The overall timeout for the entire request. This includes connection,
+        send, and read time. If the entire request doesn't complete within
+        this time, it will fail. Default is None (no overall timeout).
+        This can also be set via the environment variable
+        `LANCE_CLIENT_TIMEOUT`, as an integer number of seconds.
     connect_timeout: Optional[timedelta]
         The timeout for establishing a connection. Default is 120 seconds (2 minutes).
         This can also be set via the environment variable
@@ -31,6 +37,7 @@ class TimeoutConfig:
         `LANCE_CLIENT_CONNECTION_TIMEOUT`, as an integer number of seconds.
     """

+    timeout: Optional[timedelta] = None
     connect_timeout: Optional[timedelta] = None
     read_timeout: Optional[timedelta] = None
     pool_idle_timeout: Optional[timedelta] = None
@@ -50,6 +57,7 @@ class TimeoutConfig:
         )

     def __post_init__(self):
+        self.timeout = self.__to_timedelta(self.timeout)
         self.connect_timeout = self.__to_timedelta(self.connect_timeout)
         self.read_timeout = self.__to_timedelta(self.read_timeout)
         self.pool_idle_timeout = self.__to_timedelta(self.pool_idle_timeout)
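A sketch of opting into the new overall timeout. `TimeoutConfig` is the class changed above; the `ClientConfig` wiring and import path follow the usual remote-connection pattern and should be treated as assumptions here:

```python
from datetime import timedelta

from lancedb.remote import ClientConfig, TimeoutConfig  # import path assumed

# `timeout` bounds the whole request (connect + send + read); the existing
# per-phase timeouts can still be set alongside it.
config = ClientConfig(
    timeout_config=TimeoutConfig(
        timeout=timedelta(seconds=60),
        connect_timeout=timedelta(seconds=10),
    )
)
# Typically passed as lancedb.connect(..., client_config=config) for db:// URIs.
# Equivalent environment knob: LANCE_CLIENT_TIMEOUT=60 (integer seconds).
```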
@@ -26,7 +26,7 @@ from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
 from lancedb.merge import LanceMergeInsertBuilder
 from lancedb.embeddings import EmbeddingFunctionRegistry

-from ..query import LanceVectorQueryBuilder, LanceQueryBuilder
+from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
 from ..table import AsyncTable, IndexStatistics, Query, Table, Tags


@@ -617,6 +617,12 @@ class RemoteTable(Table):
     def stats(self):
         return LOOP.run(self._table.stats())

+    def take_offsets(self, offsets: list[int]) -> LanceTakeQueryBuilder:
+        return LanceTakeQueryBuilder(self._table.take_offsets(offsets))
+
+    def take_row_ids(self, row_ids: list[int]) -> LanceTakeQueryBuilder:
+        return LanceTakeQueryBuilder(self._table.take_row_ids(row_ids))
+
     def uses_v2_manifest_paths(self) -> bool:
         raise NotImplementedError(
             "uses_v2_manifest_paths() is not supported on the LanceDB Cloud"
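The two new methods give Cloud tables the synchronous take API; a sketch (connection details, ids, and column names are illustrative):

```python
import lancedb

db = lancedb.connect(  # illustrative Cloud connection
    "db://my-project", api_key="sk-...", region="us-east-1"
)
table = db.open_table("my_table")

# Fetch specific rows by physical offset or by stable row id, projecting
# only the columns that are needed; both return a LanceTakeQueryBuilder.
subset = table.take_offsets([0, 5, 42]).select(["id", "text"]).to_arrow()
by_id = table.take_row_ids([101, 202]).with_row_id().to_list()
```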
Some files were not shown because too many files have changed in this diff.