Mirror of https://github.com/lancedb/lancedb.git, synced 2025-12-23 05:19:58 +00:00

Compare commits: `python-v0.` ... `myriel/doc` (99 commits)
Commit table (the Author, Date, and message cells were rendered as avatars and links and did not survive extraction; the 99 commit SHAs, in page order):

3e9f0ac784  8ffe992a6f  9d683e4f0b  0a1ea1858d  7d0127b376  02595dc475
f23327af79  c7afa724dd  c359cec504  fe76496a59  67ec1fe75c  70d9b04ba5
b0d4a79c35  f79295c697  381fad9b65  055bf91d3e  050f0086b8  10fa23e0d6
43d9fc28b0  f45f0d0431  b9e3c36d82  3cd7dd3375  12d4ce4cfe  3d1f102087
81afd8a42f  c2aa03615a  d2c6759e7f  94fb9f364a  fbff244ed8  7e7466d224
cceaf27d79  7a15337e03  96c66fd087  0579303602  75edb8756c  88283110f4
b3a637fdeb  ce24457531  087fe6343d  ab8cbe62dd  f076bb41f4  902fb83d54
779118339f  03b62599d7  4c999fb651  6d23d32ab5  704cec34e1  a300a238db
a41ff1df0a  77b005d849  167fccc427  2bffbcefa5  905552f993  e4898c9313
cab36d94b2  b64252d4fd  6fc006072c  d4bb59b542  6b2dd6de51  dbccd9e4f1
b12ebfed4c  1dadb2aefa  eb9784d7f2  ba755626cc  7760799cb8  4beb2d2877
a00b8595d1  9c8314b4fd  c625b6f2b2  bec8fe6547  dc1150c011  afaefc6264
cb70ff8cee  cbb5a841b1  c72f6770fd  e5a80a5e86  8d0a7fad1f  b80d4d0134
9645fe52c2  b77314168d  e08d45e090  2e3ddb8382  627ca4c810  f8dae4ffe9
9eb6119468  59b57e30ed  fec8d58f06  84ded9d678  65696d9713  e2f2ea32e4
d5f2eca754  7fa455a8a5  8f42b5874e  274f19f560  fbcbc75b5b  008f389bd0
91af6518d9  af6819762c  7acece493d
(The compare page dropped the file header for this first diff; the content is the repository's bump-my-version configuration.)

```diff
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.20.0-beta.1"
+current_version = "0.21.2"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
@@ -50,11 +50,6 @@ pre_commit_hooks = [
 optional_value = "final"
 values = ["beta", "final"]
 
-[[tool.bumpversion.files]]
-filename = "node/package.json"
-replace = "\"version\": \"{new_version}\","
-search = "\"version\": \"{current_version}\","
-
 [[tool.bumpversion.files]]
 filename = "nodejs/package.json"
 replace = "\"version\": \"{new_version}\","
@@ -66,39 +61,8 @@ glob = "nodejs/npm/*/package.json"
 replace = "\"version\": \"{new_version}\","
 search = "\"version\": \"{current_version}\","
 
-# vectodb node binary packages
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-darwin-arm64\": \"{new_version}\""
-search = "\"@lancedb/vectordb-darwin-arm64\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-darwin-x64\": \"{new_version}\""
-search = "\"@lancedb/vectordb-darwin-x64\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-linux-arm64-gnu\": \"{new_version}\""
-search = "\"@lancedb/vectordb-linux-arm64-gnu\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
-search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
-
-[[tool.bumpversion.files]]
-glob = "node/package.json"
-replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
-search = "\"@lancedb/vectordb-win32-x64-msvc\": \"{current_version}\""
-
 # Cargo files
 # ------------
-[[tool.bumpversion.files]]
-filename = "rust/ffi/node/Cargo.toml"
-replace = "\nversion = \"{new_version}\""
-search = "\nversion = \"{current_version}\""
-
 [[tool.bumpversion.files]]
 filename = "rust/lancedb/Cargo.toml"
 replace = "\nversion = \"{new_version}\""
```
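An aside on the `parse` value: it is a verbose-mode (`(?x)`) regular expression that bump-my-version uses to split the version string into named parts; the hunk shows only the `major` and `minor` groups, and the doubled backslashes are TOML escaping. A minimal sketch of how such a pattern matches the new `0.21.2`, with an assumed `patch` group standing in for the portion outside the diff context:

```python
import re

# The two groups visible in the hunk, plus an assumed patch group
# (the remainder of the real pattern lies outside the diff context).
PARSE = re.compile(
    r"""(?x)
    (?P<major>0|[1-9]\d*)\.
    (?P<minor>0|[1-9]\d*)\.
    (?P<patch>0|[1-9]\d*)   # assumption: not shown in the hunk
    """
)

print(PARSE.match("0.21.2").groupdict())
# -> {'major': '0', 'minor': '21', 'patch': '2'}
```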
.github/workflows/cargo-publish.yml (vendored, 10 changed lines)

```diff
@@ -5,8 +5,8 @@ on:
   tags-ignore:
     # We don't publish pre-releases for Rust. Crates.io is just a source
     # distribution, so we don't need to publish pre-releases.
-    - 'v*-beta*'
-    - '*-v*' # for example, python-vX.Y.Z
+    - "v*-beta*"
+    - "*-v*" # for example, python-vX.Y.Z
 
 env:
   # This env var is used by Swatinem/rust-cache@v2 for the cache
@@ -19,6 +19,8 @@ env:
 jobs:
   build:
     runs-on: ubuntu-22.04
+    permissions:
+      id-token: write
     timeout-minutes: 30
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
@@ -31,6 +33,8 @@ jobs:
       run: |
         sudo apt update
         sudo apt install -y protobuf-compiler libssl-dev
+    - uses: rust-lang/crates-io-auth-action@v1
+      id: auth
     - name: Publish the package
       run: |
-        cargo publish -p lancedb --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }}
+        cargo publish -p lancedb --all-features --token ${{ steps.auth.outputs.token }}
```
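This swaps the long-lived `CARGO_REGISTRY_TOKEN` repository secret for a short-lived token minted by `rust-lang/crates-io-auth-action`, which authenticates to crates.io through GitHub's OIDC identity; the new `id-token: write` permission is what lets the job request that OIDC token. This is crates.io's trusted-publishing flow.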
.github/workflows/make-release-commit.yml (vendored, 10 changed lines)

```diff
@@ -84,7 +84,7 @@ jobs:
       run: |
         pip install bump-my-version PyGithub packaging
         bash ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} v $COMMIT_BEFORE_BUMP
-        bash ci/update_lockfiles.sh
+        bash ci/update_lockfiles.sh --amend
     - name: Push new version tag
       if: ${{ !inputs.dry_run }}
       uses: ad-m/github-push-action@master
@@ -93,11 +93,3 @@ jobs:
         github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
         branch: ${{ github.ref }}
         tags: true
-    - uses: ./.github/workflows/update_package_lock
-      if: ${{ !inputs.dry_run && inputs.other }}
-      with:
-        github_token: ${{ secrets.GITHUB_TOKEN }}
-    - uses: ./.github/workflows/update_package_lock_nodejs
-      if: ${{ !inputs.dry_run && inputs.other }}
-      with:
-        github_token: ${{ secrets.GITHUB_TOKEN }}
```
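Passing `--amend` makes `ci/update_lockfiles.sh` fold the refreshed lockfiles into the version-bump commit rather than leaving them for a follow-up push; consistent with that, the two composite `update_package_lock` steps that previously ran after the tag push are removed here, and the actions themselves are deleted later in this compare.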
.github/workflows/node.yml (vendored; deleted, `@@ -1,147 +0,0 @@`)

```yaml
name: Node

on:
  push:
    branches:
      - main
  pull_request:
    paths:
      - node/**
      - rust/ffi/node/**
      - .github/workflows/node.yml
      - docker-compose.yml

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  # Disable full debug symbol generation to speed up CI build and keep memory down
  # "1" means line tables only, which is useful for panic tracebacks.
  #
  # Use native CPU to accelerate tests if possible, especially for f16
  # target-cpu=haswell fixes failing ci build
  RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma"
  RUST_BACKTRACE: "1"

jobs:
  linux:
    name: Linux (Node ${{ matrix.node-version }})
    timeout-minutes: 30
    strategy:
      matrix:
        node-version: [ "18", "20" ]
    runs-on: "ubuntu-22.04"
    defaults:
      run:
        shell: bash
        working-directory: node
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true
      - uses: actions/setup-node@v3
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'
          cache-dependency-path: node/package-lock.json
      - uses: Swatinem/rust-cache@v2
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          npm ci
          npm run build
          npm run pack-build
          npm install --no-save ./dist/lancedb-vectordb-*.tgz
          # Remove index.node to test with dependency installed
          rm index.node
      - name: Test
        run: npm run test
  macos:
    timeout-minutes: 30
    runs-on: "macos-13"
    defaults:
      run:
        shell: bash
        working-directory: node
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true
      - uses: actions/setup-node@v3
        with:
          node-version: 20
          cache: 'npm'
          cache-dependency-path: node/package-lock.json
      - uses: Swatinem/rust-cache@v2
      - name: Install dependencies
        run: brew install protobuf
      - name: Build
        run: |
          npm ci
          npm run build
          npm run pack-build
          npm install --no-save ./dist/lancedb-vectordb-*.tgz
          # Remove index.node to test with dependency installed
          rm index.node
      - name: Test
        run: |
          npm run test
  aws-integtest:
    timeout-minutes: 45
    runs-on: "ubuntu-22.04"
    defaults:
      run:
        shell: bash
        working-directory: node
    env:
      AWS_ACCESS_KEY_ID: ACCESSKEY
      AWS_SECRET_ACCESS_KEY: SECRETKEY
      AWS_DEFAULT_REGION: us-west-2
      # this one is for s3
      AWS_ENDPOINT: http://localhost:4566
      # this one is for dynamodb
      DYNAMODB_ENDPOINT: http://localhost:4566
      ALLOW_HTTP: true
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true
      - uses: actions/setup-node@v3
        with:
          node-version: 20
          cache: 'npm'
          cache-dependency-path: node/package-lock.json
      - name: start local stack
        run: docker compose -f ../docker-compose.yml up -d --wait
      - name: create s3
        run: aws s3 mb s3://lancedb-integtest --endpoint $AWS_ENDPOINT
      - name: create ddb
        run: |
          aws dynamodb create-table \
            --table-name lancedb-integtest \
            --attribute-definitions '[{"AttributeName": "base_uri", "AttributeType": "S"}, {"AttributeName": "version", "AttributeType": "N"}]' \
            --key-schema '[{"AttributeName": "base_uri", "KeyType": "HASH"}, {"AttributeName": "version", "KeyType": "RANGE"}]' \
            --provisioned-throughput '{"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}' \
            --endpoint-url $DYNAMODB_ENDPOINT
      - uses: Swatinem/rust-cache@v2
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          npm ci
          npm run build
          npm run pack-build
          npm install --no-save ./dist/lancedb-vectordb-*.tgz
          # Remove index.node to test with dependency installed
          rm index.node
      - name: Test
        run: npm run integration-test
```
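This workflow covered the legacy `vectordb` package living under `node/` and `rust/ffi/node`; the new CLAUDE.md below calls that package deprecated, and this compare removes its CI and release plumbing wholesale rather than keeping it on life support.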
.github/workflows/npm-publish.yml (vendored, 181 lines removed)

```diff
@@ -365,184 +365,3 @@
           ARGS="$ARGS --tag preview"
         fi
         npm publish $ARGS
-
-  # ----------------------------------------------------------------------------
-  # vectordb release (legacy)
-  # ----------------------------------------------------------------------------
-  # TODO: delete this when we drop vectordb
-  node:
-    name: vectordb Typescript
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: bash
-        working-directory: node
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - uses: actions/setup-node@v3
-        with:
-          node-version: 20
-          cache: "npm"
-          cache-dependency-path: node/package-lock.json
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y protobuf-compiler libssl-dev
-      - name: Build
-        run: |
-          npm ci
-          npm run tsc
-          npm pack
-      - name: Upload Linux Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-package
-          path: |
-            node/vectordb-*.tgz
-
-  node-macos:
-    name: vectordb ${{ matrix.config.arch }}
-    strategy:
-      matrix:
-        config:
-          - arch: x86_64-apple-darwin
-            runner: macos-13
-          - arch: aarch64-apple-darwin
-            # xlarge is implicitly arm64.
-            runner: macos-14
-    runs-on: ${{ matrix.config.runner }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install system dependencies
-        run: brew install protobuf
-      - name: Install npm dependencies
-        run: |
-          cd node
-          npm ci
-      - name: Build MacOS native node modules
-        run: bash ci/build_macos_artifacts.sh ${{ matrix.config.arch }}
-      - name: Upload Darwin Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-native-darwin-${{ matrix.config.arch }}
-          path: |
-            node/dist/lancedb-vectordb-darwin*.tgz
-
-  node-linux-gnu:
-    name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
-    runs-on: ${{ matrix.config.runner }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - arch: x86_64
-            runner: ubuntu-latest
-          - arch: aarch64
-            # For successful fat LTO builds, we need a large runner to avoid OOM errors.
-            runner: warp-ubuntu-latest-arm64-4x
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      # To avoid OOM errors on ARM, we create a swap file.
-      - name: Configure aarch64 build
-        if: ${{ matrix.config.arch == 'aarch64' }}
-        run: |
-          free -h
-          sudo fallocate -l 16G /swapfile
-          sudo chmod 600 /swapfile
-          sudo mkswap /swapfile
-          sudo swapon /swapfile
-          echo "/swapfile swap swap defaults 0 0" >> sudo /etc/fstab
-          # print info
-          swapon --show
-          free -h
-      - name: Build Linux Artifacts
-        run: |
-          bash ci/build_linux_artifacts.sh ${{ matrix.config.arch }} ${{ matrix.config.arch }}-unknown-linux-gnu
-      - name: Upload Linux Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-native-linux-${{ matrix.config.arch }}-gnu
-          path: |
-            node/dist/lancedb-vectordb-linux*.tgz
-
-  node-windows:
-    name: vectordb ${{ matrix.target }}
-    runs-on: windows-2022
-    strategy:
-      fail-fast: false
-      matrix:
-        target: [x86_64-pc-windows-msvc]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Protoc v21.12
-        working-directory: C:\
-        run: |
-          New-Item -Path 'C:\protoc' -ItemType Directory
-          Set-Location C:\protoc
-          Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
-          7z x protoc.zip
-          Add-Content $env:GITHUB_PATH "C:\protoc\bin"
-        shell: powershell
-      - name: Install npm dependencies
-        run: |
-          cd node
-          npm ci
-      - name: Build Windows native node modules
-        run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
-      - name: Upload Windows Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: node-native-windows
-          path: |
-            node/dist/lancedb-vectordb-win32*.tgz
-
-  release:
-    name: vectordb NPM Publish
-    needs: [node, node-macos, node-linux-gnu, node-windows]
-    runs-on: ubuntu-latest
-    # Only runs on tags that matches the make-release action
-    if: startsWith(github.ref, 'refs/tags/v')
-    steps:
-      - uses: actions/download-artifact@v4
-        with:
-          pattern: node-*
-      - name: Display structure of downloaded files
-        run: ls -R
-      - uses: actions/setup-node@v3
-        with:
-          node-version: 20
-          registry-url: "https://registry.npmjs.org"
-      - name: Publish to NPM
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
-        run: |
-          # Tag beta as "preview" instead of default "latest". See lancedb
-          # npm publish step for more info.
-          if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
-            PUBLISH_ARGS="--tag preview"
-          fi
-
-          mv */*.tgz .
-          for filename in *.tgz; do
-            npm publish $PUBLISH_ARGS $filename
-          done
-      - name: Deprecate
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
-        # We need to deprecate the old package to avoid confusion.
-        # Each time we publish a new version, it gets undeprecated.
-        run: npm deprecate vectordb "Use @lancedb/lancedb instead."
-      - name: Notify Slack Action
-        uses: ravsamhq/notify-slack-action@2.3.0
-        if: ${{ always() }}
-        with:
-          status: ${{ job.status }}
-          notify_when: "failure"
-          notification_title: "{workflow} is failing"
-        env:
-          SLACK_WEBHOOK_URL: ${{ secrets.ACTION_MONITORING_SLACK }}
```
.github/workflows/update_package_lock/action.yml (vendored; deleted, `@@ -1,33 +0,0 @@`)

```yaml
name: update_package_lock
description: "Update node's package.lock"

inputs:
  github_token:
    required: true
    description: "github token for the repo"

runs:
  using: "composite"
  steps:
    - uses: actions/setup-node@v3
      with:
        node-version: 20
    - name: Set git configs
      shell: bash
      run: |
        git config user.name 'Lance Release'
        git config user.email 'lance-dev@lancedb.com'
    - name: Update package-lock.json file
      working-directory: ./node
      run: |
        npm install
        git add package-lock.json
        git commit -m "Updating package-lock.json"
      shell: bash
    - name: Push changes
      if: ${{ inputs.dry_run }} == "false"
      uses: ad-m/github-push-action@master
      with:
        github_token: ${{ inputs.github_token }}
        branch: main
        tags: true
```
(The compare page dropped the file header for this second deleted action; from its `name:` field and the reference in make-release-commit.yml, it is .github/workflows/update_package_lock_nodejs/action.yml; deleted, `@@ -1,33 +0,0 @@`.)

```yaml
name: update_package_lock_nodejs
description: "Update nodejs's package.lock"

inputs:
  github_token:
    required: true
    description: "github token for the repo"

runs:
  using: "composite"
  steps:
    - uses: actions/setup-node@v3
      with:
        node-version: 20
    - name: Set git configs
      shell: bash
      run: |
        git config user.name 'Lance Release'
        git config user.email 'lance-dev@lancedb.com'
    - name: Update package-lock.json file
      working-directory: ./nodejs
      run: |
        npm install
        git add package-lock.json
        git commit -m "Updating package-lock.json"
      shell: bash
    - name: Push changes
      if: ${{ inputs.dry_run }} == "false"
      uses: ad-m/github-push-action@master
      with:
        github_token: ${{ inputs.github_token }}
        branch: main
        tags: true
```
CLAUDE.md (new file, `@@ -0,0 +1,24 @@`)

```md
LanceDB is a database designed for retrieval, including vector, full-text, and hybrid search.
It is a wrapper around Lance. There are two backends: local (in-process, like SQLite) and
remote (against LanceDB Cloud).

The core of LanceDB is written in Rust. There are bindings in Python, Typescript, and Java.

Project layout:

* `rust/lancedb`: The LanceDB core Rust implementation.
* `python`: The Python bindings, using PyO3.
* `nodejs`: The Typescript bindings, using napi-rs.
* `java`: The Java bindings.

(`rust/ffi` and `node/` are for a deprecated package. You can ignore them.)

Common commands:

* Check for compiler errors: `cargo check --features remote --tests --examples`
* Run tests: `cargo test --features remote --tests`
* Run specific test: `cargo test --features remote -p <package_name> --test <test_name>`
* Lint: `cargo clippy --features remote --tests --examples`
* Format: `cargo fmt --all`

Before committing changes, run formatting.
```
Cargo.lock (generated, 1222 changed lines): file diff suppressed because it is too large.
Cargo.toml (35 changed lines)

```diff
@@ -1,6 +1,5 @@
 [workspace]
 members = [
-    "rust/ffi/node",
     "rust/lancedb",
     "nodejs",
     "python",
@@ -21,14 +20,16 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"
 
 [workspace.dependencies]
-lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-index = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-linalg = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-table = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-testing = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-datafusion = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
-lance-encoding = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
+lance = { "version" = "=0.32.1", "features" = [
+    "dynamodb",
+], "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-io = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-index = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-linalg = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-table = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-testing = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-datafusion = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
+lance-encoding = { "version" = "=0.32.1", "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
 # Note that this one does not include pyarrow
 arrow = { version = "55.1", optional = false }
 arrow-array = "55.1"
@@ -39,20 +40,20 @@ arrow-schema = "55.1"
 arrow-arith = "55.1"
 arrow-cast = "55.1"
 async-trait = "0"
-datafusion = { version = "47.0", default-features = false }
-datafusion-catalog = "47.0"
-datafusion-common = { version = "47.0", default-features = false }
-datafusion-execution = "47.0"
-datafusion-expr = "47.0"
-datafusion-physical-plan = "47.0"
+datafusion = { version = "48.0", default-features = false }
+datafusion-catalog = "48.0"
+datafusion-common = { version = "48.0", default-features = false }
+datafusion-execution = "48.0"
+datafusion-expr = "48.0"
+datafusion-physical-plan = "48.0"
 env_logger = "0.11"
-half = { "version" = "=2.5.0", default-features = false, features = [
+half = { "version" = "2.6.0", default-features = false, features = [
     "num-traits",
 ] }
 futures = "0"
 log = "0.4"
 moka = { version = "0.12", features = ["future"] }
-object_store = "0.11.0"
+object_store = "0.12.0"
 pin-project = "1.0.7"
 snafu = "0.8"
 url = "2"
```
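The quoted-key style of the rewritten `lance*` lines (`"version" = ...`, `"tag" = ...`, `"git" = ...`) is not hand-written: it matches exactly what the new `ci/set_lance_version.py` script below emits, pinning each Lance crate to a crates.io version plus a git tag for preview (beta) builds.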
(File header lost in extraction; npm-publish.yml and the manylinux build script both refer to this script as ci/build_linux_artifacts.sh. Deleted, `@@ -1,22 +0,0 @@`.)

```bash
#!/bin/bash
set -e
ARCH=${1:-x86_64}
TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}

# We pass down the current user so that when we later mount the local files
# into the container, the files are accessible by the current user.
pushd ci/manylinux_node
docker build \
    -t lancedb-node-manylinux \
    --build-arg="ARCH=$ARCH" \
    --build-arg="DOCKER_USER=$(id -u)" \
    --progress=plain \
    .
popd

# We turn on memory swap to avoid OOM killer
docker run \
    -v $(pwd):/io -w /io \
    --memory-swap=-1 \
    lancedb-node-manylinux \
    bash ci/manylinux_node/build_vectordb.sh $ARCH $TARGET_TRIPLE
```
ci/build_macos_artifacts.sh (name from the script's own usage comment; deleted, `@@ -1,34 +0,0 @@`)

```bash
# Builds the macOS artifacts (node binaries).
# Usage: ./ci/build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
set -e

prebuild_rust() {
  # Building here for the sake of easier debugging.
  pushd rust/ffi/node
  echo "Building rust library for $1"
  export RUST_BACKTRACE=1
  cargo build --release --target $1
  popd
}

build_node_binaries() {
  pushd node
  echo "Building node library for $1"
  npm run build-release -- --target $1
  npm run pack-build -- --target $1
  popd
}

if [ -n "$1" ]; then
  targets=$1
else
  targets="x86_64-apple-darwin aarch64-apple-darwin"
fi

echo "Building artifacts for targets: $targets"
for target in $targets
do
  prebuild_rust $target
  build_node_binaries $target
done
```
ci/build_windows_artifacts.ps1 (name from the script's own usage comment; deleted, `@@ -1,42 +0,0 @@`)

```powershell
# Builds the Windows artifacts (node binaries).
# Usage: .\ci\build_windows_artifacts.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
# - aarch64-pc-windows-msvc

function Prebuild-Rust {
    param (
        [string]$target
    )

    # Building here for the sake of easier debugging.
    Push-Location -Path "rust/ffi/node"
    Write-Host "Building rust library for $target"
    $env:RUST_BACKTRACE=1
    cargo build --release --target $target
    Pop-Location
}

function Build-NodeBinaries {
    param (
        [string]$target
    )

    Push-Location -Path "node"
    Write-Host "Building node library for $target"
    npm run build-release -- --target $target
    npm run pack-build -- --target $target
    Pop-Location
}

$targets = $args[0]
if (-not $targets) {
    $targets = "x86_64-pc-windows-msvc", "aarch64-pc-windows-msvc"
}

Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
    Prebuild-Rust $target
    Build-NodeBinaries $target
}
```
ci/build_windows_artifacts_nodejs.ps1 (name from the script's own usage comment; deleted, `@@ -1,42 +0,0 @@`)

```powershell
# Builds the Windows artifacts (nodejs binaries).
# Usage: .\ci\build_windows_artifacts_nodejs.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
# - aarch64-pc-windows-msvc

function Prebuild-Rust {
    param (
        [string]$target
    )

    # Building here for the sake of easier debugging.
    Push-Location -Path "rust/lancedb"
    Write-Host "Building rust library for $target"
    $env:RUST_BACKTRACE=1
    cargo build --release --target $target
    Pop-Location
}

function Build-NodeBinaries {
    param (
        [string]$target
    )

    Push-Location -Path "nodejs"
    Write-Host "Building nodejs library for $target"
    $env:RUST_TARGET=$target
    npm run build-release
    Pop-Location
}

$targets = $args[0]
if (-not $targets) {
    $targets = "x86_64-pc-windows-msvc", "aarch64-pc-windows-msvc"
}

Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
    Prebuild-Rust $target
    Build-NodeBinaries $target
}
```
(File header lost in extraction; the Linux artifacts script builds this image from ci/manylinux_node/, so this is most likely ci/manylinux_node/Dockerfile. Deleted, `@@ -1,27 +0,0 @@`.)

```dockerfile
# Many linux dockerfile with Rust, Node, and Lance dependencies installed.
# This container allows building the node modules native libraries in an
# environment with a very old glibc, so that we are compatible with a wide
# range of linux distributions.
ARG ARCH=x86_64

FROM quay.io/pypa/manylinux_2_28_${ARCH}

ARG ARCH=x86_64
ARG DOCKER_USER=default_user

# Protobuf is also installed as root.
COPY install_protobuf.sh install_protobuf.sh
RUN ./install_protobuf.sh ${ARCH}

ENV DOCKER_USER=${DOCKER_USER}
# Create a group and user, but only if it doesn't exist
RUN echo ${ARCH} && id -u ${DOCKER_USER} >/dev/null 2>&1 || adduser --user-group --create-home --uid ${DOCKER_USER} build_user

# We switch to the user to install Rust and Node, since those like to be
# installed at the user level.
USER ${DOCKER_USER}

COPY prepare_manylinux_node.sh prepare_manylinux_node.sh
RUN cp /prepare_manylinux_node.sh $HOME/ && \
    cd $HOME && \
    ./prepare_manylinux_node.sh ${ARCH}
```
ci/manylinux_node/build_vectordb.sh (name from its invocation in the Linux artifacts script; deleted, `@@ -1,13 +0,0 @@`)

```bash
#!/bin/bash
# Builds the node module for manylinux. Invoked by ci/build_linux_artifacts.sh.
set -e
ARCH=${1:-x86_64}
TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}

# Alpine doesn't have .bashrc
FILE=$HOME/.bashrc && test -f $FILE && source $FILE

cd node
npm ci
npm run build-release
npm run pack-build -- -t $TARGET_TRIPLE
```
ci/manylinux_node/install_protobuf.sh (name from the Dockerfile's COPY; deleted, `@@ -1,15 +0,0 @@`)

```bash
#!/bin/bash
# Installs protobuf compiler. Should be run as root.
set -e

if [[ $1 == x86_64* ]]; then
  ARCH=x86_64
else
  # gnu target
  ARCH=aarch_64
fi

PB_REL=https://github.com/protocolbuffers/protobuf/releases
PB_VERSION=23.1
curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
```
ci/manylinux_node/prepare_manylinux_node.sh (name from the Dockerfile's COPY; deleted, `@@ -1,21 +0,0 @@`)

```bash
#!/bin/bash
set -e

install_node() {
  echo "Installing node..."

  curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash

  source "$HOME"/.bashrc

  nvm install --no-progress 18
}

install_rust() {
  echo "Installing rust..."
  curl https://sh.rustup.rs -sSf | bash -s -- -y
  export PATH="$PATH:/root/.cargo/bin"
}

install_node
install_rust
```
ci/set_lance_version.py (new file, `@@ -0,0 +1,188 @@`)

```python
import argparse
import sys
import json


def run_command(command: str) -> str:
    """
    Run a shell command and return stdout as a string.
    If exit code is not 0, raise an exception with the stderr output.
    """
    import subprocess

    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception(f"Command failed with error: {result.stderr.strip()}")
    return result.stdout.strip()


def get_latest_stable_version() -> str:
    version_line = run_command("cargo info lance | grep '^version:'")
    version = version_line.split(" ")[1].strip()
    return version


def get_latest_preview_version() -> str:
    lance_tags = run_command(
        "git ls-remote --tags https://github.com/lancedb/lance.git | grep 'refs/tags/v[0-9beta.-]\\+$'"
    ).splitlines()
    lance_tags = (
        tag.split("refs/tags/")[1]
        for tag in lance_tags
        if "refs/tags/" in tag and "beta" in tag
    )
    from packaging.version import Version

    latest = max(
        (tag[1:] for tag in lance_tags if tag.startswith("v")), key=lambda t: Version(t)
    )
    return str(latest)


def extract_features(line: str) -> list:
    """
    Extracts the features from a line in Cargo.toml.
    Example: 'lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }'
    Returns: ['dynamodb']
    """
    import re

    match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
    if match:
        features_str = match.group(1)
        return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
    return []


def update_cargo_toml(line_updater):
    """
    Updates the Cargo.toml file by applying the line_updater function to each line.
    The line_updater function should take a line as input and return the updated line.
    """
    with open("Cargo.toml", "r") as f:
        lines = f.readlines()

    new_lines = []
    lance_line = ""
    is_parsing_lance_line = False
    for line in lines:
        if line.startswith("lance"):
            # Update the line using the provided function
            if line.strip().endswith("}"):
                new_lines.append(line_updater(line))
            else:
                lance_line = line
                is_parsing_lance_line = True
        elif is_parsing_lance_line:
            lance_line += line
            if line.strip().endswith("}"):
                new_lines.append(line_updater(lance_line))
                lance_line = ""
                is_parsing_lance_line = False
            else:
                print("doesn't end with }:", line)
        else:
            # Keep the line unchanged
            new_lines.append(line)

    with open("Cargo.toml", "w") as f:
        f.writelines(new_lines)


def set_stable_version(version: str):
    """
    Sets lines to
    lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }
    lance-io = "=0.29.0"
    ...
    """

    def line_updater(line: str) -> str:
        package_name = line.split("=", maxsplit=1)[0].strip()
        features = extract_features(line)
        if features:
            return f'{package_name} = {{ "version" = "={version}", "features" = {json.dumps(features)} }}\n'
        else:
            return f'{package_name} = "={version}"\n'

    update_cargo_toml(line_updater)


def set_preview_version(version: str):
    """
    Sets lines to
    lance = { "version" = "=0.29.0", "features" = ["dynamodb"], tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
    lance-io = { version = "=0.29.0", tag = "v0.29.0-beta.2", git="https://github.com/lancedb/lance.git" }
    ...
    """

    def line_updater(line: str) -> str:
        package_name = line.split("=", maxsplit=1)[0].strip()
        features = extract_features(line)
        base_version = version.split("-")[0]  # Get the base version without beta suffix
        if features:
            return f'{package_name} = {{ "version" = "={base_version}", "features" = {json.dumps(features)}, "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'
        else:
            return f'{package_name} = {{ "version" = "={base_version}", "tag" = "v{version}", "git" = "https://github.com/lancedb/lance.git" }}\n'

    update_cargo_toml(line_updater)


def set_local_version():
    """
    Sets lines to
    lance = { path = "../lance/rust/lance", features = ["dynamodb"] }
    lance-io = { path = "../lance/rust/lance-io" }
    ...
    """

    def line_updater(line: str) -> str:
        package_name = line.split("=", maxsplit=1)[0].strip()
        features = extract_features(line)
        if features:
            return f'{package_name} = {{ "path" = "../lance/rust/{package_name}", "features" = {json.dumps(features)} }}\n'
        else:
            return f'{package_name} = {{ "path" = "../lance/rust/{package_name}" }}\n'

    update_cargo_toml(line_updater)


parser = argparse.ArgumentParser(description="Set the version of the Lance package.")
parser.add_argument(
    "version",
    type=str,
    help="The version to set for the Lance package. Use 'stable' for the latest stable version, 'preview' for latest preview version, or a specific version number (e.g., '0.1.0'). You can also specify 'local' to use a local path.",
)
args = parser.parse_args()

if args.version == "stable":
    latest_stable_version = get_latest_stable_version()
    print(
        f"Found latest stable version: \033[1mv{latest_stable_version}\033[0m",
        file=sys.stderr,
    )
    set_stable_version(latest_stable_version)
elif args.version == "preview":
    latest_preview_version = get_latest_preview_version()
    print(
        f"Found latest preview version: \033[1mv{latest_preview_version}\033[0m",
        file=sys.stderr,
    )
    set_preview_version(latest_preview_version)
elif args.version == "local":
    set_local_version()
else:
    # Parse the version number.
    version = args.version
    # Ignore initial v if present.
    if version.startswith("v"):
        version = version[1:]

    if "beta" in version:
        set_preview_version(version)
    else:
        set_stable_version(version)

print("Updating lockfiles...", file=sys.stderr, end="")
run_command("cargo metadata > /dev/null")
print(" done.", file=sys.stderr)
```
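Judging from its argparse definition, the script is invoked as `python ci/set_lance_version.py stable`, `... preview`, `... local`, or with an explicit version such as `v0.32.1-beta.2`, and it finishes by running `cargo metadata` to refresh Cargo.lock. To make the rewrite concrete, here is a small standalone sketch (not part of the commit) that applies the same transformation as the script's `set_preview_version` line updater to one dependency line:

```python
import json
import re

def extract_features(line: str) -> list:
    # Same regex as extract_features() in ci/set_lance_version.py.
    match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
    if match:
        return [f.strip('"') for f in match.group(1).split(",") if len(f) > 0]
    return []

line = 'lance = { "version" = "=0.29.0", "features" = ["dynamodb"] }'
version = "0.32.1-beta.2"
base_version = version.split("-")[0]                    # "0.32.1"
package_name = line.split("=", maxsplit=1)[0].strip()   # "lance"
features = extract_features(line)                       # ["dynamodb"]

print(
    f'{package_name} = {{ "version" = "={base_version}", '
    f'"features" = {json.dumps(features)}, "tag" = "v{version}", '
    f'"git" = "https://github.com/lancedb/lance.git" }}'
)
# -> lance = { "version" = "=0.32.1", "features" = ["dynamodb"], "tag" = "v0.32.1-beta.2", "git" = "https://github.com/lancedb/lance.git" }
```

The printed line matches the rewritten `lance` entries in the Cargo.toml diff above.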
(File header lost in extraction; make-release-commit.yml invokes this script as ci/update_lockfiles.sh.)

```diff
@@ -1,18 +1,30 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
+AMEND=false
+
+for arg in "$@"; do
+    if [[ "$arg" == "--amend" ]]; then
+        AMEND=true
+    fi
+done
+
 # This updates the lockfile without building
-cargo metadata > /dev/null
+cargo metadata --quiet > /dev/null
 
 pushd nodejs || exit 1
-npm install --package-lock-only
+npm install --package-lock-only --silent
 popd
 pushd node || exit 1
-npm install --package-lock-only
+npm install --package-lock-only --silent
 popd
 
 if git diff --quiet --exit-code; then
     echo "No lockfile changes to commit; skipping amend."
-else
+elif $AMEND; then
+    git add Cargo.lock nodejs/package-lock.json node/package-lock.json
     git commit --amend --no-edit
+else
+    git add Cargo.lock nodejs/package-lock.json node/package-lock.json
+    git commit -m "Update lockfiles"
 fi
```
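The reworked script separates two behaviors: by default, lockfile changes now land in their own "Update lockfiles" commit, while `--amend` (which make-release-commit.yml now passes) folds them into the preceding commit. Both branches also stage `Cargo.lock` and the two `package-lock.json` files explicitly, which the old amend-only version did not do.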
docs/mkdocs.yml (258 changed lines; nesting indentation below is reconstructed, since the compare view flattened it)

```diff
@@ -103,264 +103,6 @@ markdown_extensions:
     permalink: ""
 
 nav:
-  - Home:
-      - LanceDB: index.md
-      - 🏃🏼♂️ Quick start: basic.md
-      - 📚 Concepts:
-          - Vector search: concepts/vector_search.md
-          - Indexing:
-              - IVFPQ: concepts/index_ivfpq.md
-              - HNSW: concepts/index_hnsw.md
-          - Storage: concepts/storage.md
-          - Data management: concepts/data_management.md
-      - 🔨 Guides:
-          - Working with tables: guides/tables.md
-          - Building a vector index: ann_indexes.md
-          - Vector Search: search.md
-          - Full-text search (native): fts.md
-          - Full-text search (tantivy-based): fts_tantivy.md
-          - Building a scalar index: guides/scalar_index.md
-          - Hybrid search:
-              - Overview: hybrid_search/hybrid_search.md
-              - Comparing Rerankers: hybrid_search/eval.md
-              - Airbnb financial data example: notebooks/hybrid_search.ipynb
-          - Late interaction with MultiVector search:
-              - Overview: guides/multi-vector.md
-              - Example: notebooks/Multivector_on_LanceDB.ipynb
-          - RAG:
-              - Vanilla RAG: rag/vanilla_rag.md
-              - Multi-head RAG: rag/multi_head_rag.md
-              - Corrective RAG: rag/corrective_rag.md
-              - Agentic RAG: rag/agentic_rag.md
-              - Graph RAG: rag/graph_rag.md
-              - Self RAG: rag/self_rag.md
-              - Adaptive RAG: rag/adaptive_rag.md
-              - SFR RAG: rag/sfr_rag.md
-              - Advanced Techniques:
-                  - HyDE: rag/advanced_techniques/hyde.md
-                  - FLARE: rag/advanced_techniques/flare.md
-          - Reranking:
-              - Quickstart: reranking/index.md
-              - Cohere Reranker: reranking/cohere.md
-              - Linear Combination Reranker: reranking/linear_combination.md
-              - Reciprocal Rank Fusion Reranker: reranking/rrf.md
-              - Cross Encoder Reranker: reranking/cross_encoder.md
-              - ColBERT Reranker: reranking/colbert.md
-              - Jina Reranker: reranking/jina.md
-              - OpenAI Reranker: reranking/openai.md
-              - AnswerDotAi Rerankers: reranking/answerdotai.md
-              - Voyage AI Rerankers: reranking/voyageai.md
-              - Building Custom Rerankers: reranking/custom_reranker.md
-              - Example: notebooks/lancedb_reranking.ipynb
-          - Filtering: sql.md
-          - Versioning & Reproducibility:
-              - sync API: notebooks/reproducibility.ipynb
-              - async API: notebooks/reproducibility_async.ipynb
-          - Configuring Storage: guides/storage.md
-          - Migration Guide: migration.md
-          - Tuning retrieval performance:
-              - Choosing right query type: guides/tuning_retrievers/1_query_types.md
-              - Reranking: guides/tuning_retrievers/2_reranking.md
-              - Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
-      - 🧬 Managing embeddings:
-          - Understand Embeddings: embeddings/understanding_embeddings.md
-          - Get Started: embeddings/index.md
-          - Embedding functions: embeddings/embedding_functions.md
-          - Available models:
-              - Overview: embeddings/default_embedding_functions.md
-              - Text Embedding Functions:
-                  - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
-                  - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
-                  - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
-                  - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
-                  - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
-                  - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
-                  - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
-                  - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
-                  - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
-                  - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
-                  - Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
-              - Multimodal Embedding Functions:
-                  - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
-                  - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
-                  - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
-          - User-defined embedding functions: embeddings/custom_embedding_function.md
-          - Variables and secrets: embeddings/variables_and_secrets.md
-          - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
-          - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
-      - 🔌 Integrations:
-          - Tools and data formats: integrations/index.md
-          - Pandas and PyArrow: python/pandas_and_pyarrow.md
-          - Polars: python/polars_arrow.md
-          - DuckDB: python/duckdb.md
-          - Datafusion: python/datafusion.md
-          - LangChain:
-              - LangChain 🔗: integrations/langchain.md
-              - LangChain demo: notebooks/langchain_demo.ipynb
-              - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
-          - LlamaIndex 🦙:
-              - LlamaIndex docs: integrations/llamaIndex.md
-              - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb
-          - Pydantic: python/pydantic.md
-          - Voxel51: integrations/voxel51.md
-          - PromptTools: integrations/prompttools.md
-          - dlt: integrations/dlt.md
-          - phidata: integrations/phidata.md
-          - Genkit: integrations/genkit.md
-      - 🎯 Examples:
-          - Overview: examples/index.md
-          - 🐍 Python:
-              - Overview: examples/examples_python.md
-              - Build From Scratch: examples/python_examples/build_from_scratch.md
-              - Multimodal: examples/python_examples/multimodal.md
-              - Rag: examples/python_examples/rag.md
-              - Vector Search: examples/python_examples/vector_search.md
-              - Chatbot: examples/python_examples/chatbot.md
-              - Evaluation: examples/python_examples/evaluations.md
-              - AI Agent: examples/python_examples/aiagent.md
-              - Recommender System: examples/python_examples/recommendersystem.md
-              - Miscellaneous:
-                  - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
-                  - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
-          - 👾 JavaScript:
-              - Overview: examples/examples_js.md
-              - Serverless Website Chatbot: examples/serverless_website_chatbot.md
-              - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
-              - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
-          - 🦀 Rust:
-              - Overview: examples/examples_rust.md
-      - 📓 Studies:
-          - ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
-      - 💭 FAQs: faq.md
-      - 🔍 Troubleshooting: troubleshooting.md
-      - ⚙️ API reference:
-          - 🐍 Python: python/python.md
-          - 👾 JavaScript (vectordb): javascript/modules.md
-          - 👾 JavaScript (lancedb): js/globals.md
-          - 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
-
+  - Quick start: basic.md
+  - Concepts:
+      - Vector search: concepts/vector_search.md
+      - Indexing:
+          - IVFPQ: concepts/index_ivfpq.md
+          - HNSW: concepts/index_hnsw.md
+      - Storage: concepts/storage.md
+      - Data management: concepts/data_management.md
+  - Guides:
+      - Working with tables: guides/tables.md
+      - Working with SQL: guides/sql_querying.md
+      - Building an ANN index: ann_indexes.md
+      - Vector Search: search.md
+      - Full-text search (native): fts.md
+      - Full-text search (tantivy-based): fts_tantivy.md
+      - Building a scalar index: guides/scalar_index.md
+      - Hybrid search:
+          - Overview: hybrid_search/hybrid_search.md
+          - Comparing Rerankers: hybrid_search/eval.md
+          - Airbnb financial data example: notebooks/hybrid_search.ipynb
+      - Late interaction with MultiVector search:
+          - Overview: guides/multi-vector.md
+          - Document search Example: notebooks/Multivector_on_LanceDB.ipynb
+      - RAG:
+          - Vanilla RAG: rag/vanilla_rag.md
+          - Multi-head RAG: rag/multi_head_rag.md
+          - Corrective RAG: rag/corrective_rag.md
+          - Agentic RAG: rag/agentic_rag.md
+          - Graph RAG: rag/graph_rag.md
+          - Self RAG: rag/self_rag.md
+          - Adaptive RAG: rag/adaptive_rag.md
+          - SFR RAG: rag/sfr_rag.md
+          - Advanced Techniques:
+              - HyDE: rag/advanced_techniques/hyde.md
+              - FLARE: rag/advanced_techniques/flare.md
+      - Reranking:
+          - Quickstart: reranking/index.md
+          - Cohere Reranker: reranking/cohere.md
+          - Linear Combination Reranker: reranking/linear_combination.md
+          - Reciprocal Rank Fusion Reranker: reranking/rrf.md
+          - Cross Encoder Reranker: reranking/cross_encoder.md
+          - ColBERT Reranker: reranking/colbert.md
+          - Jina Reranker: reranking/jina.md
+          - OpenAI Reranker: reranking/openai.md
+          - AnswerDotAi Rerankers: reranking/answerdotai.md
+          - Building Custom Rerankers: reranking/custom_reranker.md
+          - Example: notebooks/lancedb_reranking.ipynb
+      - Filtering: sql.md
+      - Versioning & Reproducibility:
+          - sync API: notebooks/reproducibility.ipynb
+          - async API: notebooks/reproducibility_async.ipynb
+      - Configuring Storage: guides/storage.md
+      - Migration Guide: migration.md
+      - Tuning retrieval performance:
+          - Choosing right query type: guides/tuning_retrievers/1_query_types.md
+          - Reranking: guides/tuning_retrievers/2_reranking.md
+          - Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
+  - Managing Embeddings:
+      - Understand Embeddings: embeddings/understanding_embeddings.md
+      - Get Started: embeddings/index.md
+      - Embedding functions: embeddings/embedding_functions.md
+      - Available models:
+          - Overview: embeddings/default_embedding_functions.md
+          - Text Embedding Functions:
+              - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
+              - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
+              - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
+              - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
+              - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
+              - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
+              - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
+              - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
+              - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
+              - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
+          - Multimodal Embedding Functions:
+              - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
+              - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
+              - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
+      - User-defined embedding functions: embeddings/custom_embedding_function.md
+      - Variables and secrets: embeddings/variables_and_secrets.md
+      - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
+      - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
+  - Integrations:
+      - Overview: integrations/index.md
+      - Pandas and PyArrow: python/pandas_and_pyarrow.md
+      - Polars: python/polars_arrow.md
+      - DuckDB: python/duckdb.md
+      - Datafusion: python/datafusion.md
+      - LangChain 🦜️🔗↗: integrations/langchain.md
+      - LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
+      - LlamaIndex 🦙↗: integrations/llamaIndex.md
+      - Pydantic: python/pydantic.md
+      - Voxel51: integrations/voxel51.md
+      - PromptTools: integrations/prompttools.md
+      - dlt: integrations/dlt.md
+      - phidata: integrations/phidata.md
+      - Genkit: integrations/genkit.md
+  - Examples:
+      - examples/index.md
+      - 🐍 Python:
+          - Overview: examples/examples_python.md
+          - Build From Scratch: examples/python_examples/build_from_scratch.md
+          - Multimodal: examples/python_examples/multimodal.md
+          - Rag: examples/python_examples/rag.md
+          - Vector Search: examples/python_examples/vector_search.md
+          - Chatbot: examples/python_examples/chatbot.md
+          - Evaluation: examples/python_examples/evaluations.md
+          - AI Agent: examples/python_examples/aiagent.md
+          - Recommender System: examples/python_examples/recommendersystem.md
+          - Miscellaneous:
+              - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
+              - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
+      - 👾 JavaScript:
+          - Overview: examples/examples_js.md
+          - Serverless Website Chatbot: examples/serverless_website_chatbot.md
+          - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
+          - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
+      - 🦀 Rust:
+          - Overview: examples/examples_rust.md
+  - Studies:
+      - studies/overview.md
+      - ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
   - API reference:
       - Overview: api_reference.md
       - Python: python/python.md
```
docs/package-lock.json (generated, 12 lines changed)

@@ -19,7 +19,7 @@
     },
     "../node": {
       "name": "vectordb",
-      "version": "0.12.0",
+      "version": "0.21.2-beta.0",
       "cpu": [
         "x64",
         "arm64"
@@ -65,11 +65,11 @@
       "uuid": "^9.0.0"
     },
     "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.12.0",
-      "@lancedb/vectordb-darwin-x64": "0.12.0",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
-      "@lancedb/vectordb-linux-x64-gnu": "0.12.0",
-      "@lancedb/vectordb-win32-x64-msvc": "0.12.0"
+      "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
+      "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
+      "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
+      "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
     },
     "peerDependencies": {
       "@apache-arrow/ts": "^14.0.2",
@@ -1,7 +1,9 @@
+# SQL Querying
+
 You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
 This guide will show how to query Lance tables using both.

-We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
+We will re-use the dataset [created previously](./tables.md):

 ```python
 import lancedb
@@ -27,21 +29,17 @@ arrow_table = table.to_lance()
 duckdb.query("SELECT * FROM arrow_table")
 ```

-```
-┌─────────────┬─────────┬────────┐
-│   vector    │  item   │ price  │
-│   float[]   │ varchar │ double │
-├─────────────┼─────────┼────────┤
-│ [3.1, 4.1]  │ foo     │ 10.0   │
-│ [5.9, 26.5] │ bar     │ 20.0   │
-└─────────────┴─────────┴────────┘
-```
+| vector      | item | price |
+| ----------- | ---- | ----- |
+| [3.1, 4.1]  | foo  | 10.0  |
+| [5.9, 26.5] | bar  | 20.0  |

 ## Querying a LanceDB Table with Apache Datafusion

 Have the required imports before doing any querying.

 === "Python"

     ```python
     --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb"
     --8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context"
@@ -51,16 +49,12 @@ Have the required imports before doing any querying.
 Register the table created with the Datafusion session context.

 === "Python"

     ```python
     --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
     ```

-```
-┌─────────────┬─────────┬────────┐
-│   vector    │  item   │ price  │
-│   float[]   │ varchar │ double │
-├─────────────┼─────────┼────────┤
-│ [3.1, 4.1]  │ foo     │ 10.0   │
-│ [5.9, 26.5] │ bar     │ 20.0   │
-└─────────────┴─────────┴────────┘
-```
+| vector      | item | price |
+| ----------- | ---- | ----- |
+| [3.1, 4.1]  | foo  | 10.0  |
+| [5.9, 26.5] | bar  | 20.0  |
docs/src/js/classes/BooleanQuery.md (new file, 53 lines)

[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / BooleanQuery

# Class: BooleanQuery

Represents a full-text query interface.
This interface defines the structure and behavior for full-text queries,
including methods to retrieve the query type and convert the query to a dictionary format.

## Implements

- [`FullTextQuery`](../interfaces/FullTextQuery.md)

## Constructors

### new BooleanQuery()

```ts
new BooleanQuery(queries): BooleanQuery
```

Creates an instance of BooleanQuery.

#### Parameters

* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][]
  An array of (Occur, FullTextQuery) pairs to combine.
  Occur specifies whether each sub-query must match or should match.

#### Returns

[`BooleanQuery`](BooleanQuery.md)

## Methods

### queryType()

```ts
queryType(): FullTextQueryType
```

The type of the full-text query.

#### Returns

[`FullTextQueryType`](../enumerations/FullTextQueryType.md)

#### Implementation of

[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype)
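To make the combination semantics concrete, here is a minimal sketch of composing clauses with `BooleanQuery`. The database path and table name are hypothetical, and it assumes the table already has a full-text index on a `text` column:

```ts
import { connect, BooleanQuery, MatchQuery, Occur } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb"); // hypothetical path
const table = await db.openTable("docs"); // hypothetical table

// "lance" must match, "deprecated" must not match,
// and "vector" only contributes to the score.
const query = new BooleanQuery([
  [Occur.Must, new MatchQuery("lance", "text")],
  [Occur.MustNot, new MatchQuery("deprecated", "text")],
  [Occur.Should, new MatchQuery("vector", "text")],
]);

const results = await table.search(query).limit(10).toArray();
```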
docs/src/js/classes/MatchQuery.md

@@ -40,6 +40,8 @@ Creates an instance of MatchQuery.
   - `boost`: The boost factor for the query (default is 1.0).
   - `fuzziness`: The fuzziness level for the query (default is 0).
   - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
+  - `operator`: The logical operator to use for combining terms in the query (default is "OR").
+  - `prefixLength`: The number of initial characters that must remain unchanged for fuzzy matching.

 * **options.boost?**: `number`
@@ -47,6 +49,10 @@ Creates an instance of MatchQuery.

 * **options.maxExpansions?**: `number`

+* **options.operator?**: [`Operator`](../enumerations/Operator.md)
+
+* **options.prefixLength?**: `number`
+
 #### Returns

 [`MatchQuery`](MatchQuery.md)
docs/src/js/classes/MultiMatchQuery.md

@@ -38,9 +38,12 @@ Creates an instance of MultiMatchQuery.
 * **options?**
   Optional parameters for the multi-match query.
   - `boosts`: An array of boost factors for each column (default is 1.0 for all).
+  - `operator`: The logical operator to use for combining terms in the query (default is "OR").

 * **options.boosts?**: `number`[]

+* **options.operator?**: [`Operator`](../enumerations/Operator.md)
+
 #### Returns

 [`MultiMatchQuery`](MultiMatchQuery.md)
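As a sketch of the new `operator` option, assuming `MatchQuery` takes `(query, column, options?)` in the same shape as `PhraseQuery` below, with hypothetical table and column names:

```ts
import { connect, MatchQuery, Operator } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb"); // hypothetical path
const table = await db.openTable("docs"); // hypothetical table

// Require every term to match (AND instead of the default OR), tolerate
// one edit per term, and keep the first character of each term fixed.
const q = new MatchQuery("vectr databse", "text", {
  operator: Operator.And,
  fuzziness: 1,
  prefixLength: 1,
});
const rows = await table.search(q).limit(5).toArray();
```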
docs/src/js/classes/PhraseQuery.md

@@ -19,7 +19,10 @@ including methods to retrieve the query type and convert the query to a dictionary format.
 ### new PhraseQuery()

 ```ts
-new PhraseQuery(query, column): PhraseQuery
+new PhraseQuery(
+   query,
+   column,
+   options?): PhraseQuery
 ```

 Creates an instance of `PhraseQuery`.
@@ -32,6 +35,12 @@ Creates an instance of `PhraseQuery`.
 * **column**: `string`
   The name of the column to search within.

+* **options?**
+  Optional parameters for the phrase query.
+  - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
+
+* **options.slop?**: `number`
+
 #### Returns

 [`PhraseQuery`](PhraseQuery.md)
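A short sketch of the new `slop` option (table and column names are hypothetical):

```ts
import { connect, PhraseQuery } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb"); // hypothetical path
const table = await db.openTable("docs"); // hypothetical table

// Matches "vector database" and also, e.g., "vector oriented database",
// because one intervening unmatched word is allowed.
const q = new PhraseQuery("vector database", "text", { slop: 1 });
const hits = await table.search(q).limit(5).toArray();
```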
docs/src/js/classes/Session.md (new file, 84 lines)

[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / Session

# Class: Session

A session for managing caches and object stores across LanceDB operations.

Sessions allow you to configure cache sizes for index and metadata caches,
which can significantly impact performance for large datasets.

## Constructors

### new Session()

```ts
new Session(indexCacheSizeBytes?, metadataCacheSizeBytes?): Session
```

Create a new session with custom cache sizes.

- `indexCacheSizeBytes`: The size of the index cache in bytes.
  Defaults to 6GB if not specified.
- `metadataCacheSizeBytes`: The size of the metadata cache in bytes.
  Defaults to 1GB if not specified.

#### Parameters

* **indexCacheSizeBytes?**: `null` \| `bigint`

* **metadataCacheSizeBytes?**: `null` \| `bigint`

#### Returns

[`Session`](Session.md)

## Methods

### approxNumItems()

```ts
approxNumItems(): number
```

Get the approximate number of items cached in the session.

#### Returns

`number`

***

### sizeBytes()

```ts
sizeBytes(): bigint
```

Get the current size of the session caches in bytes.

#### Returns

`bigint`

***

### default()

```ts
static default(): Session
```

Create a session with default cache sizes.

This is equivalent to creating a session with a 6GB index cache
and a 1GB metadata cache.

#### Returns

[`Session`](Session.md)
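A minimal sketch of wiring a custom `Session` into `connect()`, using the positional `session` argument documented in the `connect()` changes further down (the database path is hypothetical):

```ts
import { connect, Session } from "@lancedb/lancedb";

// 2 GiB index cache and 512 MiB metadata cache; sizes are bigint bytes.
const session = new Session(
  2n * 1024n * 1024n * 1024n,
  512n * 1024n * 1024n,
);
const db = await connect("/path/to/database", {}, session); // hypothetical path

// Inspect cache usage after running some queries.
console.log(session.sizeBytes(), session.approxNumItems());
```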
@@ -612,7 +612,7 @@ of the given query

 #### Parameters

-* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
+* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
   the query, a vector or string

 * **queryType?**: `string`
@@ -799,7 +799,7 @@ by `query`.

 #### Parameters

-* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
+* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)

 #### Returns
docs/src/js/classes/VectorQuery.md

@@ -386,6 +386,53 @@ called then every valid row from the table will be returned.

 ***

+### maximumNprobes()
+
+```ts
+maximumNprobes(maximumNprobes): VectorQuery
+```
+
+Set the maximum number of probes used.
+
+This controls the maximum number of partitions that will be searched. If this
+number is greater than minimumNprobes then the excess partitions will _only_ be
+searched if we have not found enough results. This can be useful when there is
+a narrow filter to allow these queries to spend more time searching and avoid
+potential false negatives.
+
+#### Parameters
+
+* **maximumNprobes**: `number`
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
+
+***
+
+### minimumNprobes()
+
+```ts
+minimumNprobes(minimumNprobes): VectorQuery
+```
+
+Set the minimum number of probes used.
+
+This controls the minimum number of partitions that will be searched. This
+parameter will impact every query against a vector index, regardless of the
+filter. See `nprobes` for more details. Higher values will increase recall
+but will also increase latency.
+
+#### Parameters
+
+* **minimumNprobes**: `number`
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
+
+***
+
 ### nprobes()

 ```ts
@@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against
 your actual data to find the smallest possible value that will still give
 you the desired recall.

+For more fine grained control over behavior when you have a very narrow filter
+you can use `minimumNprobes` and `maximumNprobes`. This method sets both
+the minimum and maximum to the same value.
+
 #### Parameters

 * **nprobes**: `number`
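A sketch of how the two knobs combine on a `VectorQuery` (the table, query vector, and filter column are hypothetical):

```ts
import { connect } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb");       // hypothetical path
const table = await db.openTable("embeddings"); // hypothetical table

// Always probe 20 partitions; probe up to 40 only when the narrow
// filter leaves too few candidates after the first pass.
const results = await table
  .search([0.1, 0.3, 0.7])
  .where("category = 'rare'") // hypothetical column
  .minimumNprobes(20)
  .maximumNprobes(40)
  .limit(10)
  .toArray();
```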
docs/src/js/enumerations/FullTextQueryType.md

@@ -15,6 +15,14 @@ Enum representing the types of full-text queries supported.

 ## Enumeration Members

+### Boolean
+
+```ts
+Boolean: "boolean";
+```
+
+***
+
 ### Boost

 ```ts
docs/src/js/enumerations/Occur.md (new file, 37 lines)

[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / Occur

# Enumeration: Occur

Enum representing the occurrence of terms in full-text queries.

- `Must`: The term must be present in the document.
- `Should`: The term should contribute to the document score, but is not required.
- `MustNot`: The term must not be present in the document.

## Enumeration Members

### Must

```ts
Must: "MUST";
```

***

### MustNot

```ts
MustNot: "MUST_NOT";
```

***

### Should

```ts
Should: "SHOULD";
```
docs/src/js/enumerations/Operator.md (new file, 28 lines)

[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / Operator

# Enumeration: Operator

Enum representing the logical operators used in full-text queries.

- `And`: All terms must match.
- `Or`: At least one term must match.

## Enumeration Members

### And

```ts
And: "AND";
```

***

### Or

```ts
Or: "OR";
```
@@ -6,10 +6,13 @@

 # Function: connect()

-## connect(uri, options)
+## connect(uri, options, session)

 ```ts
-function connect(uri, options?): Promise<Connection>
+function connect(
+   uri,
+   options?,
+   session?): Promise<Connection>
 ```

 Connect to a LanceDB instance at the given URI.
@@ -29,6 +32,8 @@ Accepted formats:
 * **options?**: `Partial`<[`ConnectionOptions`](../interfaces/ConnectionOptions.md)>
   The options to use when connecting to the database

+* **session?**: [`Session`](../classes/Session.md)
+
 ### Returns

 `Promise`<[`Connection`](../classes/Connection.md)>
@@ -77,7 +82,7 @@ Accepted formats:

 [ConnectionOptions](../interfaces/ConnectionOptions.md) for more details on the URI format.

-### Example
+### Examples

 ```ts
 const conn = await connect({
@@ -85,3 +90,11 @@ const conn = await connect({
   storageOptions: {timeout: "60s"}
 });
 ```
+
+```ts
+const session = Session.default();
+const conn = await connect({
+  uri: "/path/to/database",
+  session: session
+});
+```
docs/src/js/globals.md

@@ -12,9 +12,12 @@
 ## Enumerations

 - [FullTextQueryType](enumerations/FullTextQueryType.md)
+- [Occur](enumerations/Occur.md)
+- [Operator](enumerations/Operator.md)

 ## Classes

+- [BooleanQuery](classes/BooleanQuery.md)
 - [BoostQuery](classes/BoostQuery.md)
 - [Connection](classes/Connection.md)
 - [Index](classes/Index.md)
@@ -26,6 +29,7 @@
 - [Query](classes/Query.md)
 - [QueryBase](classes/QueryBase.md)
 - [RecordBatchIterator](classes/RecordBatchIterator.md)
+- [Session](classes/Session.md)
 - [Table](classes/Table.md)
 - [TagContents](classes/TagContents.md)
 - [Tags](classes/Tags.md)
@@ -81,6 +85,7 @@
 - [FieldLike](type-aliases/FieldLike.md)
 - [IntoSql](type-aliases/IntoSql.md)
 - [IntoVector](type-aliases/IntoVector.md)
+- [MultiVector](type-aliases/MultiVector.md)
 - [RecordBatchLike](type-aliases/RecordBatchLike.md)
 - [SchemaLike](type-aliases/SchemaLike.md)
 - [TableLike](type-aliases/TableLike.md)
docs/src/js/interfaces/ConnectionOptions.md

@@ -70,6 +70,17 @@ Defaults to 'us-east-1'.

 ***

+### session?
+
+```ts
+optional session: Session;
+```
+
+(For LanceDB OSS only): the session to use for this connection. Holds
+shared caches and other session-specific state.
+
+***
+
 ### storageOptions?

 ```ts
@@ -23,7 +23,7 @@ whether to remove punctuation
 ### baseTokenizer?

 ```ts
-optional baseTokenizer: "raw" | "simple" | "whitespace";
+optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
 ```

 The tokenizer to use when building the index.
@@ -71,6 +71,36 @@ tokens longer than this length will be ignored

 ***

+### ngramMaxLength?
+
+```ts
+optional ngramMaxLength: number;
+```
+
+ngram max length
+
+***
+
+### ngramMinLength?
+
+```ts
+optional ngramMinLength: number;
+```
+
+ngram min length
+
+***
+
+### prefixOnly?
+
+```ts
+optional prefixOnly: boolean;
+```
+
+whether to only index the prefix of the token for the ngram tokenizer
+
+***
+
 ### removeStopWords?

 ```ts
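A sketch of building a full-text index with the new ngram tokenizer fields, assuming the JS SDK's usual `createIndex` + `Index.fts(...)` pattern (table and column names are hypothetical):

```ts
import { connect, Index } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb"); // hypothetical path
const table = await db.openTable("docs"); // hypothetical table

// Index 3- to 5-character ngrams. Setting prefixOnly to true would keep
// only ngrams at the start of each token, which suits prefix matching.
await table.createIndex("text", {
  config: Index.fts({
    baseTokenizer: "ngram",
    ngramMinLength: 3,
    ngramMaxLength: 5,
    prefixOnly: false,
  }),
});
```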
@@ -8,7 +8,7 @@

 ## Properties

-### indexCacheSize?
+### ~~indexCacheSize?~~

 ```ts
 optional indexCacheSize: number;
@@ -16,6 +16,11 @@ optional indexCacheSize: number;

 Set the size of the index cache, specified as a number of entries

+#### Deprecated
+
+Use session-level cache configuration instead.
+Create a Session with custom cache sizes and pass it to the connect() function.
+
 The exact meaning of an "entry" will depend on the type of index:
 - IVF: there is one entry for each IVF partition
 - BTREE: there is one entry for the entire index
@@ -24,10 +24,10 @@ The default is 7 days
 // Delete all versions older than 1 day
 const olderThan = new Date();
 olderThan.setDate(olderThan.getDate() - 1);
-tbl.cleanupOlderVersions(olderThan);
+tbl.optimize({cleanupOlderThan: olderThan});

 // Delete all versions except the current version
-tbl.cleanupOlderVersions(new Date());
+tbl.optimize({cleanupOlderThan: new Date()});
 ```

 ***
docs/src/js/type-aliases/MultiVector.md (new file, 11 lines)

[**@lancedb/lancedb**](../README.md) • **Docs**

***

[@lancedb/lancedb](../globals.md) / MultiVector

# Type Alias: MultiVector

```ts
type MultiVector: IntoVector[];
```
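Since `MultiVector` is just `IntoVector[]`, a multivector (late-interaction) search passes an array of query vectors where a single vector would otherwise go. A sketch with hypothetical table and vectors:

```ts
import { connect } from "@lancedb/lancedb";

const db = await connect("/tmp/lancedb"); // hypothetical path
const table = await db.openTable("docs"); // hypothetical table

// Each sub-array is one query vector; rows are scored against all of them.
const queryVectors: number[][] = [
  [0.12, 0.88, 0.33],
  [0.91, 0.05, 0.47],
];
const rows = await table.search(queryVectors).limit(10).toArray();
```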
@@ -428,7 +428,7 @@
 "\n",
 "**Why?** \n",
 "Embedding the UFO dataset and ingesting it into LanceDB takes **~2 hours on a T4 GPU**. To save time: \n",
-"- **Use the pre-prepared table with index created ** (provided below) to proceed directly to step7: search. \n",
+"- **Use the pre-prepared table with index created** (provided below) to proceed directly to **Step 7**: search. \n",
 "- **Step 5a** contains the full ingestion code for reference (run it only if necessary). \n",
 "- **Step 6** contains the details on creating the index on the multivector column"
 ]
@@ -30,7 +30,8 @@ excluded_globs = [
     "../src/rag/advanced_techniques/*.md",
     "../src/guides/scalar_index.md",
     "../src/guides/storage.md",
-    "../src/search.md"
+    "../src/search.md",
+    "../src/guides/sql_querying.md",
 ]

 python_prefix = "py"
@@ -7,3 +7,4 @@ tantivy==0.20.1
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch
 polars>=0.19, <=1.3.0
+datafusion
java/.mvn/wrapper/maven-wrapper.properties (new file, vendored, 19 lines)

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
wrapperVersion=3.3.2
distributionType=only-script
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.9/apache-maven-3.9.9-bin.zip
java/README.md (new file, 37 lines)

# LanceDB Java SDK

## Configuration and Initialization

### LanceDB Cloud

For LanceDB Cloud, use the simplified builder API:

```java
import com.lancedb.lance.namespace.LanceRestNamespace;
import com.lancedb.lancedb.LanceDbRestNamespaces;

// If your DB url is db://example-db, then your database here is example-db
LanceRestNamespace namespace = LanceDbRestNamespaces.builder()
    .apiKey("your_lancedb_cloud_api_key")
    .database("your_database_name")
    .build();
```

### LanceDB Enterprise

For Enterprise deployments, use your VPC endpoint:

```java
LanceRestNamespace namespace = LanceDbRestNamespaces.builder()
    .apiKey("your_lancedb_enterprise_api_key")
    .database("your-top-dir") // Your top-level folder under your cloud bucket, e.g. s3://your-bucket/your-top-dir/
    .hostOverride("http://<vpc_endpoint_dns_name>:80")
    .build();
```

## Development

Build:

```shell
./mvnw install
```
@@ -19,7 +19,7 @@ lancedb = { path = "../../../rust/lancedb" }
 lance = { workspace = true }
 arrow = { workspace = true, features = ["ffi"] }
 arrow-schema.workspace = true
-tokio = "1.23"
+tokio = "1.46"
 jni = "0.21.1"
 snafu.workspace = true
 lazy_static.workspace = true
@@ -8,18 +8,24 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.20.0-beta.1</version>
+    <version>0.21.2-final.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

   <artifactId>lancedb-core</artifactId>
-  <name>LanceDB Core</name>
+  <name>${project.artifactId}</name>
+  <description>LanceDB Core</description>
   <packaging>jar</packaging>
   <properties>
     <rust.release.build>false</rust.release.build>
   </properties>

   <dependencies>
+    <dependency>
+      <groupId>com.lancedb</groupId>
+      <artifactId>lance-namespace-core</artifactId>
+      <version>0.0.1</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-vector</artifactId>
java/lance-namespace/pom.xml (new file, 26 lines)

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <parent>
    <groupId>com.lancedb</groupId>
    <artifactId>lancedb-parent</artifactId>
    <version>0.21.2-final.0</version>
    <relativePath>../pom.xml</relativePath>
  </parent>

  <artifactId>lancedb-lance-namespace</artifactId>
  <name>${project.artifactId}</name>
  <description>LanceDB Java Integration with Lance Namespace</description>
  <packaging>jar</packaging>

  <dependencies>
    <dependency>
      <groupId>com.lancedb</groupId>
      <artifactId>lance-namespace-core</artifactId>
    </dependency>
  </dependencies>
</project>
@@ -0,0 +1,146 @@ (new file)

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.lancedb.lancedb;

import com.lancedb.lance.namespace.LanceRestNamespace;
import com.lancedb.lance.namespace.client.apache.ApiClient;

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

/** Util class to help construct a {@link LanceRestNamespace} for LanceDB. */
public class LanceDbRestNamespaces {
  private static final String DEFAULT_REGION = "us-east-1";
  private static final String CLOUD_URL_PATTERN = "https://%s.%s.api.lancedb.com";

  private String apiKey;
  private String database;
  private Optional<String> hostOverride = Optional.empty();
  private Optional<String> region = Optional.empty();
  private Map<String, String> additionalConfig = new HashMap<>();

  private LanceDbRestNamespaces() {}

  /**
   * Create a new builder instance.
   *
   * @return A new LanceRestNamespaceBuilder
   */
  public static LanceDbRestNamespaces builder() {
    return new LanceDbRestNamespaces();
  }

  /**
   * Set the API key (required).
   *
   * @param apiKey The LanceDB API key
   * @return This builder
   */
  public LanceDbRestNamespaces apiKey(String apiKey) {
    if (apiKey == null || apiKey.trim().isEmpty()) {
      throw new IllegalArgumentException("API key cannot be null or empty");
    }
    this.apiKey = apiKey;
    return this;
  }

  /**
   * Set the database name (required).
   *
   * @param database The database name
   * @return This builder
   */
  public LanceDbRestNamespaces database(String database) {
    if (database == null || database.trim().isEmpty()) {
      throw new IllegalArgumentException("Database cannot be null or empty");
    }
    this.database = database;
    return this;
  }

  /**
   * Set a custom host override (optional). When set, this overrides the default LanceDB Cloud URL
   * construction. Use this for LanceDB Enterprise deployments.
   *
   * @param hostOverride The complete base URL (e.g., "http://your-vpc-endpoint:80")
   * @return This builder
   */
  public LanceDbRestNamespaces hostOverride(String hostOverride) {
    this.hostOverride = Optional.ofNullable(hostOverride);
    return this;
  }

  /**
   * Set the region for LanceDB Cloud (optional). Defaults to "us-east-1" if not specified. This is
   * ignored when hostOverride is set.
   *
   * @param region The AWS region (e.g., "us-east-1", "eu-west-1")
   * @return This builder
   */
  public LanceDbRestNamespaces region(String region) {
    this.region = Optional.ofNullable(region);
    return this;
  }

  /**
   * Add additional configuration parameters.
   *
   * @param key The configuration key
   * @param value The configuration value
   * @return This builder
   */
  public LanceDbRestNamespaces config(String key, String value) {
    this.additionalConfig.put(key, value);
    return this;
  }

  /**
   * Build the LanceRestNamespace instance.
   *
   * @return A configured LanceRestNamespace
   * @throws IllegalStateException if required parameters are missing
   */
  public LanceRestNamespace build() {
    // Validate required fields
    if (apiKey == null) {
      throw new IllegalStateException("API key is required");
    }
    if (database == null) {
      throw new IllegalStateException("Database is required");
    }

    // Build configuration map
    Map<String, String> config = new HashMap<>(additionalConfig);
    config.put("headers.x-lancedb-database", database);
    config.put("headers.x-api-key", apiKey);

    // Determine base URL
    String baseUrl;
    if (hostOverride.isPresent()) {
      baseUrl = hostOverride.get();
      config.put("host_override", hostOverride.get());
    } else {
      String effectiveRegion = region.orElse(DEFAULT_REGION);
      baseUrl = String.format(CLOUD_URL_PATTERN, database, effectiveRegion);
      config.put("region", effectiveRegion);
    }

    // Create and configure ApiClient
    ApiClient apiClient = new ApiClient();
    apiClient.setBasePath(baseUrl);

    return new LanceRestNamespace(apiClient, config);
  }
}
java/mvnw (new executable file, vendored, 259 lines)

#!/bin/sh
# ----------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------

# ----------------------------------------------------------------------------
# Apache Maven Wrapper startup batch script, version 3.3.2
#
# Optional ENV vars
# -----------------
#   JAVA_HOME - location of a JDK home dir, required when download maven via java source
#   MVNW_REPOURL - repo url base for downloading maven distribution
#   MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
#   MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output
# ----------------------------------------------------------------------------

set -euf
[ "${MVNW_VERBOSE-}" != debug ] || set -x

# OS specific support.
native_path() { printf %s\\n "$1"; }
case "$(uname)" in
CYGWIN* | MINGW*)
  [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")"
  native_path() { cygpath --path --windows "$1"; }
  ;;
esac

# set JAVACMD and JAVACCMD
set_java_home() {
  # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched
  if [ -n "${JAVA_HOME-}" ]; then
    if [ -x "$JAVA_HOME/jre/sh/java" ]; then
      # IBM's JDK on AIX uses strange locations for the executables
      JAVACMD="$JAVA_HOME/jre/sh/java"
      JAVACCMD="$JAVA_HOME/jre/sh/javac"
    else
      JAVACMD="$JAVA_HOME/bin/java"
      JAVACCMD="$JAVA_HOME/bin/javac"

      if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then
        echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2
        echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2
        return 1
      fi
    fi
  else
    JAVACMD="$(
      'set' +e
      'unset' -f command 2>/dev/null
      'command' -v java
    )" || :
    JAVACCMD="$(
      'set' +e
      'unset' -f command 2>/dev/null
      'command' -v javac
    )" || :

    if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then
      echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2
      return 1
    fi
  fi
}

# hash string like Java String::hashCode
hash_string() {
  str="${1:-}" h=0
  while [ -n "$str" ]; do
    char="${str%"${str#?}"}"
    h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296))
    str="${str#?}"
  done
  printf %x\\n $h
}

verbose() { :; }
[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; }

die() {
  printf %s\\n "$1" >&2
  exit 1
}

trim() {
  # MWRAPPER-139:
  #   Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds.
  #   Needed for removing poorly interpreted newline sequences when running in more
  #   exotic environments such as mingw bash on Windows.
  printf "%s" "${1}" | tr -d '[:space:]'
}

# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties
while IFS="=" read -r key value; do
  case "${key-}" in
  distributionUrl) distributionUrl=$(trim "${value-}") ;;
  distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;;
  esac
done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties"
[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties"

case "${distributionUrl##*/}" in
maven-mvnd-*bin.*)
  MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/
  case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in
  *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;;
  :Darwin*x86_64) distributionPlatform=darwin-amd64 ;;
  :Darwin*arm64) distributionPlatform=darwin-aarch64 ;;
  :Linux*x86_64*) distributionPlatform=linux-amd64 ;;
  *)
    echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2
    distributionPlatform=linux-amd64
    ;;
  esac
  distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip"
  ;;
maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;;
*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;;
esac

# apply MVNW_REPOURL and calculate MAVEN_HOME
# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}"
distributionUrlName="${distributionUrl##*/}"
distributionUrlNameMain="${distributionUrlName%.*}"
distributionUrlNameMain="${distributionUrlNameMain%-bin}"
MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}"
MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")"

exec_maven() {
  unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || :
  exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD"
}

if [ -d "$MAVEN_HOME" ]; then
  verbose "found existing MAVEN_HOME at $MAVEN_HOME"
  exec_maven "$@"
fi

case "${distributionUrl-}" in
*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;;
*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;;
esac

# prepare tmp dir
if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then
  clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; }
  trap clean HUP INT TERM EXIT
else
  die "cannot create temp dir"
fi

mkdir -p -- "${MAVEN_HOME%/*}"

# Download and Install Apache Maven
verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
verbose "Downloading from: $distributionUrl"
verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"

# select .zip or .tar.gz
if ! command -v unzip >/dev/null; then
  distributionUrl="${distributionUrl%.zip}.tar.gz"
  distributionUrlName="${distributionUrl##*/}"
fi

# verbose opt
__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR=''
[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v

# normalize http auth
case "${MVNW_PASSWORD:+has-password}" in
'') MVNW_USERNAME='' MVNW_PASSWORD='' ;;
has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;;
esac

if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then
  verbose "Found wget ... using wget"
  wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl"
elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then
  verbose "Found curl ... using curl"
  curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl"
elif set_java_home; then
  verbose "Falling back to use Java to download"
  javaSource="$TMP_DOWNLOAD_DIR/Downloader.java"
  targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName"
  cat >"$javaSource" <<-END
	public class Downloader extends java.net.Authenticator
	{
	  protected java.net.PasswordAuthentication getPasswordAuthentication()
	  {
	    return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() );
	  }
	  public static void main( String[] args ) throws Exception
	  {
	    setDefault( new Downloader() );
	    java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() );
	  }
	}
	END
  # For Cygwin/MinGW, switch paths to Windows format before running javac and java
  verbose " - Compiling Downloader.java ..."
  "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java"
  verbose " - Running Downloader.java ..."
  "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")"
fi

# If specified, validate the SHA-256 sum of the Maven distribution zip file
if [ -n "${distributionSha256Sum-}" ]; then
  distributionSha256Result=false
  if [ "$MVN_CMD" = mvnd.sh ]; then
    echo "Checksum validation is not supported for maven-mvnd." >&2
    echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
    exit 1
  elif command -v sha256sum >/dev/null; then
    if echo "$distributionSha256Sum  $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then
      distributionSha256Result=true
    fi
  elif command -v shasum >/dev/null; then
    if echo "$distributionSha256Sum  $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then
      distributionSha256Result=true
    fi
  else
    echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
    echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
    exit 1
  fi
  if [ $distributionSha256Result = false ]; then
    echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2
    echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2
    exit 1
  fi
fi

# unzip and move
if command -v unzip >/dev/null; then
  unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip"
else
  tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar"
fi
printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url"
mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME"

clean || :
exec_maven "$@"
java/pom.xml (14 lines changed)

@@ -6,11 +6,10 @@

   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.20.0-beta.1</version>
+  <version>0.21.2-final.0</version>
   <packaging>pom</packaging>
-  <name>LanceDB Parent</name>
-  <description>LanceDB vector database Java API</description>
+  <name>${project.artifactId}</name>
+  <description>LanceDB Java SDK Parent POM</description>
   <url>http://lancedb.com/</url>

   <developers>
@@ -29,6 +28,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <arrow.version>15.0.0</arrow.version>
+    <lance-namespace.version>0.0.1</lance-namespace.version>
     <spotless.skip>false</spotless.skip>
     <spotless.version>2.30.0</spotless.version>
     <spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
@@ -52,6 +52,7 @@

   <modules>
     <module>core</module>
+    <module>lance-namespace</module>
   </modules>

   <scm>
@@ -62,6 +63,11 @@

   <dependencyManagement>
     <dependencies>
+      <dependency>
+        <groupId>com.lancedb</groupId>
+        <artifactId>lance-namespace-core</artifactId>
+        <version>${lance-namespace.version}</version>
+      </dependency>
       <dependency>
         <groupId>org.apache.arrow</groupId>
         <artifactId>arrow-vector</artifactId>
|
|||||||
module.exports = {
|
|
||||||
env: {
|
|
||||||
browser: true,
|
|
||||||
es2021: true
|
|
||||||
},
|
|
||||||
extends: 'standard-with-typescript',
|
|
||||||
overrides: [
|
|
||||||
],
|
|
||||||
parserOptions: {
|
|
||||||
project: './tsconfig.json',
|
|
||||||
ecmaVersion: 'latest',
|
|
||||||
sourceType: 'module'
|
|
||||||
},
|
|
||||||
rules: {
|
|
||||||
"@typescript-eslint/method-signature-style": "off",
|
|
||||||
"@typescript-eslint/quotes": "off",
|
|
||||||
"@typescript-eslint/semi": "off",
|
|
||||||
"@typescript-eslint/explicit-function-return-type": "off",
|
|
||||||
"@typescript-eslint/space-before-function-paren": "off",
|
|
||||||
"@typescript-eslint/indent": "off",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,4 +0,0 @@
gen_test_data.py
index.node
dist/lancedb*.tgz
vectordb*.tgz
@@ -1,64 +0,0 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.5] - 2023-06-00

### Added

- Support for macOS X86

## [0.1.4] - 2023-06-03

### Added

- Select / Project query API

### Changed

- Deprecated created_index in favor of createIndex

## [0.1.3] - 2023-06-01

### Added

- Support S3 and Google Cloud Storage
- Embedding functions support
- OpenAI embedding function

## [0.1.2] - 2023-05-27

### Added

- Append records API
- Extra query params to to nodejs client
- Create_index API

### Fixed

- bugfix: string columns should be converted to Utf8Array (#94)

## [0.1.1] - 2023-05-16

### Added

- create_table API
- limit parameter for queries
- Typescript / JavaScript examples
- Linux support

## [0.1.0] - 2023-05-16

### Added

- Initial JavaScript / Node.js library for LanceDB
- Read-only api to query LanceDB datasets
- Supports macOS arm only

## [pre-0.1.0]

- Various prototypes / test builds
@@ -1,66 +0,0 @@
# LanceDB

A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb).

**DEPRECATED: This library is deprecated. Please use the new client,
[@lancedb/lancedb](https://www.npmjs.com/package/@lancedb/lancedb).**

## Installation

```bash
npm install vectordb
```

This will download the appropriate native library for your platform. We currently
support:

* Linux (x86_64 and aarch64)
* MacOS (Intel and ARM/M1/M2)
* Windows (x86_64 only)

We do not yet support musl-based Linux (such as Alpine Linux) or aarch64 Windows.

## Usage

### Basic Example

```javascript
const lancedb = require('vectordb');
const db = await lancedb.connect('data/sample-lancedb');
const table = await db.createTable("my_table",
      [{ id: 1, vector: [0.1, 1.0], item: "foo", price: 10.0 },
      { id: 2, vector: [3.9, 0.5], item: "bar", price: 20.0 }])
const results = await table.search([0.1, 0.3]).limit(20).execute();
console.log(results);
```

The [examples](./examples) folder contains complete examples.

## Development

To build everything fresh:

```bash
npm install
npm run build
```

Then you should be able to run the tests with:

```bash
npm test
```

### Fix lints

To run the linter and have it automatically fix all errors

```bash
npm run lint -- --fix
```

To build documentation

```bash
npx typedoc --plugin typedoc-plugin-markdown --out ../docs/src/javascript src/index.ts
```
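For anyone migrating off the deleted `vectordb` package, here is a minimal sketch of the same basic example written against the replacement client `@lancedb/lancedb` that the README points to. The `connect`/`createTable`/`search(...).toArray()` shape reflects that package's published API and is an assumption on top of this diff, not part of it:

```typescript
// Hedged migration sketch: the deprecated `vectordb` basic example,
// rewritten for the @lancedb/lancedb client it is deprecated in favor of.
import * as lancedb from "@lancedb/lancedb";

async function main() {
  const db = await lancedb.connect("data/sample-lancedb");
  const table = await db.createTable("my_table", [
    { id: 1, vector: [0.1, 1.0], item: "foo", price: 10.0 },
    { id: 2, vector: [3.9, 0.5], item: "bar", price: 20.0 },
  ]);
  // The old fluent .execute() is replaced by .toArray() on the query builder
  const results = await table.search([0.1, 0.3]).limit(20).toArray();
  console.log(results);
}

main().then(() => console.log("All done!"));
```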
@@ -1,41 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

async function example () {
  const lancedb = require('vectordb')
  // You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable
  const apiKey = process.env.OPENAI_API_KEY
  // The embedding function will create embeddings for the 'text' column(text in this case)
  const embedding = new lancedb.OpenAIEmbeddingFunction('text', apiKey)

  const db = await lancedb.connect('data/sample-lancedb')

  const data = [
    { id: 1, text: 'Black T-Shirt', price: 10 },
    { id: 2, text: 'Leather Jacket', price: 50 }
  ]

  const table = await db.createTable('vectors', data, embedding)
  console.log(await db.tableNames())

  const results = await table
    .search('keeps me warm')
    .limit(1)
    .execute()
  console.log(results[0].text)
}

example().then(_ => { console.log('All done!') })
@@ -1,15 +0,0 @@
{
  "name": "vectordb-example-js-openai",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "vectordb": "file:../..",
    "openai": "^3.2.1"
  }
}
@@ -1,66 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

async function example() {
  const lancedb = require('vectordb')

  // Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2)
  const { pipeline } = await import('@xenova/transformers')
  const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');

  // Create embedding function from pipeline which returns a list of vectors from batch
  // sourceColumn is the name of the column in the data to be embedded
  //
  // Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector
  const embed_fun = {}
  embed_fun.sourceColumn = 'text'
  embed_fun.embed = async function (batch) {
    let result = []
    for (let text of batch) {
      const res = await pipe(text, { pooling: 'mean', normalize: true })
      result.push(Array.from(res['data']))
    }
    return (result)
  }

  // Link a folder and create a table with data
  const db = await lancedb.connect('data/sample-lancedb')

  const data = [
    { id: 1, text: 'Cherry', type: 'fruit' },
    { id: 2, text: 'Carrot', type: 'vegetable' },
    { id: 3, text: 'Potato', type: 'vegetable' },
    { id: 4, text: 'Apple', type: 'fruit' },
    { id: 5, text: 'Banana', type: 'fruit' }
  ]

  const table = await db.createTable('food_table', data, embed_fun)

  // Query the table
  const results = await table
    .search("a sweet fruit to eat")
    .metricType("cosine")
    .limit(2)
    .execute()
  console.log(results.map(r => r.text))
}

example().then(_ => { console.log("Done!") })
@@ -1,16 +0,0 @@
{
  "name": "vectordb-example-js-transformers",
  "version": "1.0.0",
  "description": "Example for using transformers.js with lancedb",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "@xenova/transformers": "^2.4.1",
    "vectordb": "file:../.."
  }

}
@@ -1,122 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

const lancedb = require('vectordb')
const fs = require('fs/promises')
const readline = require('readline/promises')
const { stdin: input, stdout: output } = require('process')
const { Configuration, OpenAIApi } = require('openai')

// Download file from XYZ
const INPUT_FILE_NAME = 'data/youtube-transcriptions_sample.jsonl';

(async () => {
  // You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable
  const apiKey = process.env.OPENAI_API_KEY
  // The embedding function will create embeddings for the 'context' column
  const embedFunction = new lancedb.OpenAIEmbeddingFunction('context', apiKey)

  // Connects to LanceDB
  const db = await lancedb.connect('data/youtube-lancedb')

  // Open the vectors table or create one if it does not exist
  let tbl
  if ((await db.tableNames()).includes('vectors')) {
    tbl = await db.openTable('vectors', embedFunction)
  } else {
    tbl = await createEmbeddingsTable(db, embedFunction)
  }

  // Use OpenAI Completion API to generate and answer based on the context that LanceDB provides
  const configuration = new Configuration({ apiKey })
  const openai = new OpenAIApi(configuration)
  const rl = readline.createInterface({ input, output })
  try {
    while (true) {
      const query = await rl.question('Prompt: ')
      const results = await tbl
        .search(query)
        .select(['title', 'text', 'context'])
        .limit(3)
        .execute()

      // console.table(results)

      const response = await openai.createCompletion({
        model: 'text-davinci-003',
        prompt: createPrompt(query, results),
        max_tokens: 400,
        temperature: 0,
        top_p: 1,
        frequency_penalty: 0,
        presence_penalty: 0
      })
      console.log(response.data.choices[0].text)
    }
  } catch (err) {
    console.log('Error: ', err)
  } finally {
    rl.close()
  }
  process.exit(1)
})()

async function createEmbeddingsTable (db, embedFunction) {
  console.log(`Creating embeddings from ${INPUT_FILE_NAME}`)
  // read the input file into a JSON array, skipping empty lines
  const lines = (await fs.readFile(INPUT_FILE_NAME, 'utf-8'))
    .toString()
    .split('\n')
    .filter(line => line.length > 0)
    .map(line => JSON.parse(line))

  const data = contextualize(lines, 20, 'video_id')
  return await db.createTable('vectors', data, embedFunction)
}

// Each transcript has a small text column, we include previous transcripts in order to
// have more context information when creating embeddings
function contextualize (rows, contextSize, groupColumn) {
  const grouped = []
  rows.forEach(row => {
    if (!grouped[row[groupColumn]]) {
      grouped[row[groupColumn]] = []
    }
    grouped[row[groupColumn]].push(row)
  })

  const data = []
  Object.keys(grouped).forEach(key => {
    for (let i = 0; i < grouped[key].length; i++) {
      const start = i - contextSize > 0 ? i - contextSize : 0
      grouped[key][i].context = grouped[key].slice(start, i + 1).map(r => r.text).join(' ')
    }
    data.push(...grouped[key])
  })
  return data
}

// Creates a prompt by aggregating all relevant contexts
function createPrompt (query, context) {
  let prompt =
    'Answer the question based on the context below.\n\n' +
    'Context:\n'

  // need to make sure our prompt is not larger than max size
  prompt = prompt + context.map(c => c.context).join('\n\n---\n\n').substring(0, 3750)
  prompt = prompt + `\n\nQuestion: ${query}\nAnswer:`
  return prompt
}
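The sliding-window logic in `contextualize` above is easiest to see on a tiny input. A sketch of what it produces for a hypothetical three-row transcript with a `contextSize` of 1 (rows and values are illustrative only):

```typescript
// Hypothetical three-row transcript, all from the same video:
const rows = [
  { video_id: 'a', text: 'hello' },
  { video_id: 'a', text: 'world' },
  { video_id: 'a', text: 'again' }
]

// contextualize(rows, 1, 'video_id') sets each row's `context` to its own
// text plus at most one preceding row from the same video_id group:
//   rows[0].context === 'hello'
//   rows[1].context === 'hello world'
//   rows[2].context === 'world again'
```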
@@ -1,15 +0,0 @@
{
  "name": "vectordb-example-js-openai",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "vectordb": "file:../..",
    "openai": "^3.2.1"
  }
}
@@ -1,36 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict'

async function example () {
  const lancedb = require('vectordb')
  const db = await lancedb.connect('data/sample-lancedb')

  const data = [
    { id: 1, vector: [0.1, 0.2], price: 10 },
    { id: 2, vector: [1.1, 1.2], price: 50 }
  ]

  const table = await db.createTable('vectors', data)
  console.log(await db.tableNames())

  const results = await table
    .search([0.1, 0.3])
    .limit(20)
    .execute()
  console.log(results)
}

example()
@@ -1,14 +0,0 @@
{
  "name": "vectordb-example-js",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "dependencies": {
    "vectordb": "file:../.."
  }
}
@@ -1,22 +0,0 @@
{
  "name": "vectordb-example-ts",
  "version": "1.0.0",
  "description": "",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "tsc": "tsc -b",
    "build": "tsc"
  },
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "devDependencies": {
    "@types/node": "^18.16.2",
    "ts-node": "^10.9.1",
    "ts-node-dev": "^2.0.0",
    "typescript": "*"
  },
  "dependencies": {
    "vectordb": "file:../.."
  }
}
@@ -1,35 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import * as vectordb from 'vectordb';

async function example () {
  const db = await vectordb.connect('data/sample-lancedb')

  const data = [
    { id: 1, vector: [0.1, 0.2], price: 10 },
    { id: 2, vector: [1.1, 1.2], price: 50 }
  ]

  const table = await db.createTable('vectors', data)
  console.log(await db.tableNames())

  const results = await table
    .search([0.1, 0.3])
    .limit(20)
    .execute()
  console.log(results)
}

example().then(_ => { console.log ("All done!") })
@@ -1,10 +0,0 @@
{
  "include": ["src/**/*.ts"],
  "compilerOptions": {
    "target": "es2016",
    "module": "commonjs",
    "declaration": true,
    "outDir": "./dist",
    "strict": true
  }
}
@@ -1,36 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

const { currentTarget } = require('@neon-rs/load')

let nativeLib

try {
  // When developing locally, give preference to the local built library
  nativeLib = require('./index.node')
} catch {
  try {
    nativeLib = require(`@lancedb/vectordb-${currentTarget()}`)
  } catch (e) {
    throw new Error(`vectordb: failed to load native library.
You may need to run \`npm install @lancedb/vectordb-${currentTarget()}\`.

If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues

Source error: ${e}`)
  }
}

// Dynamic require for runtime.
module.exports = nativeLib
5239 node/package-lock.json (generated)
File diff suppressed because it is too large.
@@ -1,98 +0,0 @@
{
  "name": "vectordb",
  "version": "0.20.0-beta.1",
  "description": " Serverless, low-latency vector database for AI applications",
  "private": false,
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "tsc": "tsc -b",
    "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb_node index.node -- cargo build -p lancedb-node --message-format=json",
    "build-release": "npm run build -- --release",
    "test": "npm run tsc && mocha -recursive dist/test",
    "integration-test": "npm run tsc && mocha -recursive dist/integration_test",
    "lint": "eslint native.js src --ext .js,.ts",
    "clean": "rm -rf node_modules *.node dist/",
    "pack-build": "neon pack-build",
    "check-npm": "printenv && which node && which npm && npm --version"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/lancedb/lancedb.git"
  },
  "homepage": "https://lancedb.github.io/lancedb/",
  "bugs": {
    "url": "https://github.com/lancedb/lancedb/issues"
  },
  "keywords": [
    "data-format",
    "data-science",
    "machine-learning",
    "data-analytics"
  ],
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "devDependencies": {
    "@neon-rs/cli": "^0.0.160",
    "@types/chai": "^4.3.4",
    "@types/chai-as-promised": "^7.1.5",
    "@types/mocha": "^10.0.1",
    "@types/node": "^18.16.2",
    "@types/sinon": "^10.0.15",
    "@types/temp": "^0.9.1",
    "@types/uuid": "^9.0.3",
    "@typescript-eslint/eslint-plugin": "^5.59.1",
    "apache-arrow-old": "npm:apache-arrow@13.0.0",
    "cargo-cp-artifact": "^0.1",
    "chai": "^4.3.7",
    "chai-as-promised": "^7.1.1",
    "eslint": "^8.39.0",
    "eslint-config-standard-with-typescript": "^34.0.1",
    "eslint-plugin-import": "^2.26.0",
    "eslint-plugin-n": "^15.7.0",
    "eslint-plugin-promise": "^6.1.1",
    "mocha": "^10.2.0",
    "openai": "^4.24.1",
    "sinon": "^15.1.0",
    "temp": "^0.9.4",
    "ts-node": "^10.9.1",
    "ts-node-dev": "^2.0.0",
    "typedoc": "^0.24.7",
    "typedoc-plugin-markdown": "^3.15.3",
    "typescript": "^5.1.0",
    "uuid": "^9.0.0"
  },
  "dependencies": {
    "@neon-rs/load": "^0.0.74",
    "axios": "^1.4.0"
  },
  "peerDependencies": {
    "@apache-arrow/ts": "^14.0.2",
    "apache-arrow": "^14.0.2"
  },
  "os": [
    "darwin",
    "linux",
    "win32"
  ],
  "cpu": [
    "x64",
    "arm64"
  ],
  "neon": {
    "targets": {
      "x86_64-apple-darwin": "@lancedb/vectordb-darwin-x64",
      "aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
      "x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
      "aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
      "x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc"
    }
  },
  "optionalDependencies": {
    "@lancedb/vectordb-darwin-x64": "0.20.0-beta.1",
    "@lancedb/vectordb-darwin-arm64": "0.20.0-beta.1",
    "@lancedb/vectordb-linux-x64-gnu": "0.20.0-beta.1",
    "@lancedb/vectordb-linux-arm64-gnu": "0.20.0-beta.1",
    "@lancedb/vectordb-win32-x64-msvc": "0.20.0-beta.1"
  }
}
@@ -1,635 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import {
  Field,
  makeBuilder,
  RecordBatchFileWriter,
  Utf8,
  type Vector,
  FixedSizeList,
  vectorFromArray,
  Schema,
  Table as ArrowTable,
  RecordBatchStreamWriter,
  List,
  RecordBatch,
  makeData,
  Struct,
  type Float,
  DataType,
  Binary,
  Float32
} from "apache-arrow";
import { type EmbeddingFunction } from "./index";
import { sanitizeSchema } from "./sanitize";

/*
 * Options to control how a column should be converted to a vector array
 */
export class VectorColumnOptions {
  /** Vector column type. */
  type: Float = new Float32();

  constructor(values?: Partial<VectorColumnOptions>) {
    Object.assign(this, values);
  }
}

/** Options to control the makeArrowTable call. */
export class MakeArrowTableOptions {
  /*
   * Schema of the data.
   *
   * If this is not provided then the data type will be inferred from the
   * JS type. Integer numbers will become int64, floating point numbers
   * will become float64 and arrays will become variable sized lists with
   * the data type inferred from the first element in the array.
   *
   * The schema must be specified if there are no records (e.g. to make
   * an empty table)
   */
  schema?: Schema;

  /*
   * Mapping from vector column name to expected type
   *
   * Lance expects vector columns to be fixed size list arrays (i.e. tensors)
   * However, `makeArrowTable` will not infer this by default (it creates
   * variable size list arrays). This field can be used to indicate that a column
   * should be treated as a vector column and converted to a fixed size list.
   *
   * The keys should be the names of the vector columns. The value specifies the
   * expected data type of the vector columns.
   *
   * If `schema` is provided then this field is ignored.
   *
   * By default, the column named "vector" will be assumed to be a float32
   * vector column.
   */
  vectorColumns: Record<string, VectorColumnOptions> = {
    vector: new VectorColumnOptions()
  };

  embeddings?: EmbeddingFunction<any>;

  /**
   * If true then string columns will be encoded with dictionary encoding
   *
   * Set this to true if your string columns tend to repeat the same values
   * often. For more precise control use the `schema` property to specify the
   * data type for individual columns.
   *
   * If `schema` is provided then this property is ignored.
   */
  dictionaryEncodeStrings: boolean = false;

  constructor(values?: Partial<MakeArrowTableOptions>) {
    Object.assign(this, values);
  }
}

/**
 * An enhanced version of the {@link makeTable} function from Apache Arrow
 * that supports nested fields and embeddings columns.
 *
 * This function converts an array of Record<String, any> (row-major JS objects)
 * to an Arrow Table (a columnar structure)
 *
 * Note that it currently does not support nulls.
 *
 * If a schema is provided then it will be used to determine the resulting array
 * types. Fields will also be reordered to fit the order defined by the schema.
 *
 * If a schema is not provided then the types will be inferred and the field order
 * will be controlled by the order of properties in the first record.
 *
 * If the input is empty then a schema must be provided to create an empty table.
 *
 * When a schema is not specified then data types will be inferred. The inference
 * rules are as follows:
 *
 *  - boolean => Bool
 *  - number => Float64
 *  - String => Utf8
 *  - Buffer => Binary
 *  - Record<String, any> => Struct
 *  - Array<any> => List
 *
 * @param data input data
 * @param options options to control the makeArrowTable call.
 *
 * @example
 *
 * ```ts
 *
 * import { fromTableToBuffer, makeArrowTable } from "../arrow";
 * import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
 *
 * const schema = new Schema([
 *   new Field("a", new Int32()),
 *   new Field("b", new Float32()),
 *   new Field("c", new FixedSizeList(3, new Field("item", new Float16()))),
 * ]);
 * const table = makeArrowTable([
 *   { a: 1, b: 2, c: [1, 2, 3] },
 *   { a: 4, b: 5, c: [4, 5, 6] },
 *   { a: 7, b: 8, c: [7, 8, 9] },
 * ], { schema });
 * ```
 *
 * By default it assumes that the column named `vector` is a vector column
 * and it will be converted into a fixed size list array of type float32.
 * The `vectorColumns` option can be used to support other vector column
 * names and data types.
 *
 * ```ts
 *
 * const schema = new Schema([
     new Field("a", new Float64()),
     new Field("b", new Float64()),
     new Field(
       "vector",
       new FixedSizeList(3, new Field("item", new Float32()))
     ),
   ]);
   const table = makeArrowTable([
     { a: 1, b: 2, vector: [1, 2, 3] },
     { a: 4, b: 5, vector: [4, 5, 6] },
     { a: 7, b: 8, vector: [7, 8, 9] },
   ]);
   assert.deepEqual(table.schema, schema);
 * ```
 *
 * You can specify the vector column types and names using the options as well
 *
 * ```typescript
 *
 * const schema = new Schema([
     new Field('a', new Float64()),
     new Field('b', new Float64()),
     new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
     new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
   ]);
 * const table = makeArrowTable([
     { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
     { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
     { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
   ], {
     vectorColumns: {
       vec1: { type: new Float16() },
       vec2: { type: new Float16() }
     }
   }
 * assert.deepEqual(table.schema, schema)
 * ```
 */
export function makeArrowTable(
  data: Array<Record<string, any>>,
  options?: Partial<MakeArrowTableOptions>
): ArrowTable {
  if (
    data.length === 0 &&
    (options?.schema === undefined || options?.schema === null)
  ) {
    throw new Error("At least one record or a schema needs to be provided");
  }

  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
  if (opt.schema !== undefined && opt.schema !== null) {
    opt.schema = sanitizeSchema(opt.schema);
    opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
  }

  const columns: Record<string, Vector> = {};
  // TODO: sample dataset to find missing columns
  // Prefer the field ordering of the schema, if present
  const columnNames =
    opt.schema != null ? (opt.schema.names as string[]) : Object.keys(data[0]);
  for (const colName of columnNames) {
    if (
      data.length !== 0 &&
      !Object.prototype.hasOwnProperty.call(data[0], colName)
    ) {
      // The field is present in the schema, but not in the data, skip it
      continue;
    }
    // Extract a single column from the records (transpose from row-major to col-major)
    let values = data.map((datum) => datum[colName]);

    // By default (type === undefined) arrow will infer the type from the JS type
    let type;
    if (opt.schema !== undefined) {
      // If there is a schema provided, then use that for the type instead
      type = opt.schema?.fields.filter((f) => f.name === colName)[0]?.type;
      if (DataType.isInt(type) && type.bitWidth === 64) {
        // wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
        values = values.map((v) => {
          if (v === null) {
            return v;
          }
          return BigInt(v);
        });
      }
    } else {
      // Otherwise, check to see if this column is one of the vector columns
      // defined by opt.vectorColumns and, if so, use the fixed size list type
      const vectorColumnOptions = opt.vectorColumns[colName];
      if (vectorColumnOptions !== undefined) {
        type = newVectorType(values[0].length, vectorColumnOptions.type);
      }
    }

    try {
      // Convert an Array of JS values to an arrow vector
      columns[colName] = makeVector(values, type, opt.dictionaryEncodeStrings);
    } catch (error: unknown) {
      // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
      throw Error(`Could not convert column "${colName}" to Arrow: ${error}`);
    }
  }

  if (opt.schema != null) {
    // `new ArrowTable(columns)` infers a schema which may sometimes have
    // incorrect nullability (it assumes nullable=true if there are 0 rows)
    //
    // `new ArrowTable(schema, columns)` will also fail because it will create a
    // batch with an inferred schema and then complain that the batch schema
    // does not match the provided schema.
    //
    // To work around this we first create a table with the wrong schema and
    // then patch the schema of the batches so we can use
    // `new ArrowTable(schema, batches)` which does not do any schema inference
    const firstTable = new ArrowTable(columns);
    const batchesFixed = firstTable.batches.map(
      // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
      (batch) => new RecordBatch(opt.schema!, batch.data)
    );
    return new ArrowTable(opt.schema, batchesFixed);
  } else {
    return new ArrowTable(columns);
  }
}

/**
 * Create an empty Arrow table with the provided schema
 */
export function makeEmptyTable(schema: Schema): ArrowTable {
  return makeArrowTable([], { schema });
}

// Helper function to convert Array<Array<any>> to a variable sized list array
function makeListVector(lists: any[][]): Vector<any> {
  if (lists.length === 0 || lists[0].length === 0) {
    throw Error("Cannot infer list vector from empty array or empty list");
  }
  const sampleList = lists[0];
  let inferredType;
  try {
    const sampleVector = makeVector(sampleList);
    inferredType = sampleVector.type;
  } catch (error: unknown) {
    // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
    throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`);
  }

  const listBuilder = makeBuilder({
    type: new List(new Field("item", inferredType, true))
  });
  for (const list of lists) {
    listBuilder.append(list);
  }
  return listBuilder.finish().toVector();
}

// Helper function to convert an Array of JS values to an Arrow Vector
function makeVector(
  values: any[],
  type?: DataType,
  stringAsDictionary?: boolean
): Vector<any> {
  if (type !== undefined) {
    // No need for inference, let Arrow create it
    return vectorFromArray(values, type);
  }
  if (values.length === 0) {
    throw Error(
      "makeVector requires at least one value or the type must be specfied"
    );
  }
  const sampleValue = values.find((val) => val !== null && val !== undefined);
  if (sampleValue === undefined) {
    throw Error(
      "makeVector cannot infer the type if all values are null or undefined"
    );
  }
  if (Array.isArray(sampleValue)) {
    // Default Arrow inference doesn't handle list types
    return makeListVector(values);
  } else if (Buffer.isBuffer(sampleValue)) {
    // Default Arrow inference doesn't handle Buffer
    return vectorFromArray(values, new Binary());
  } else if (
    !(stringAsDictionary ?? false) &&
    (typeof sampleValue === "string" || sampleValue instanceof String)
  ) {
    // If the type is string then don't use Arrow's default inference unless dictionaries are requested
    // because it will always use dictionary encoding for strings
    return vectorFromArray(values, new Utf8());
  } else {
    // Convert a JS array of values to an arrow vector
    return vectorFromArray(values);
  }
}

async function applyEmbeddings<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<ArrowTable> {
  if (embeddings == null) {
    return table;
  }
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }

  // Convert from ArrowTable to Record<String, Vector>
  const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
    const name = table.schema.fields[idx].name;
    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
    const vec = table.getChildAt(idx)!;
    return [name, vec];
  });
  const newColumns = Object.fromEntries(colEntries);

  const sourceColumn = newColumns[embeddings.sourceColumn];
  const destColumn = embeddings.destColumn ?? "vector";
  const innerDestType = embeddings.embeddingDataType ?? new Float32();
  if (sourceColumn === undefined) {
    throw new Error(
      `Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`
    );
  }

  if (table.numRows === 0) {
    if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
      // We have an empty table and it already has the embedding column so no work needs to be done
      // Note: we don't return an error like we did below because this is a common occurrence. For example,
      // if we call convertToTable with 0 records and a schema that includes the embedding
      return table;
    }
    if (embeddings.embeddingDimension !== undefined) {
      const destType = newVectorType(
        embeddings.embeddingDimension,
        innerDestType
      );
      newColumns[destColumn] = makeVector([], destType);
    } else if (schema != null) {
      const destField = schema.fields.find((f) => f.name === destColumn);
      if (destField != null) {
        newColumns[destColumn] = makeVector([], destField.type);
      } else {
        throw new Error(
          `Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`
        );
      }
    } else {
      throw new Error(
        "Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`"
      );
    }
  } else {
    if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
      throw new Error(
        `Attempt to apply embeddings to table failed because column ${destColumn} already existed`
      );
    }
    if (table.batches.length > 1) {
      throw new Error(
        "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch"
      );
    }
    const values = sourceColumn.toArray();
    const vectors = await embeddings.embed(values as T[]);
    if (vectors.length !== values.length) {
      throw new Error(
        "Embedding function did not return an embedding for each input element"
      );
    }
    const destType = newVectorType(vectors[0].length, innerDestType);
    newColumns[destColumn] = makeVector(vectors, destType);
  }

  const newTable = new ArrowTable(newColumns);
  if (schema != null) {
    if (schema.fields.find((f) => f.name === destColumn) === undefined) {
      throw new Error(
        `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`
      );
    }
    return alignTable(newTable, schema);
  }
  return newTable;
}

/*
 * Convert an Array of records into an Arrow Table, optionally applying an
 * embeddings function to it.
 *
 * This function calls `makeArrowTable` first to create the Arrow Table.
 * Any provided `makeTableOptions` (e.g. a schema) will be passed on to
 * that call.
 *
 * The embedding function will be passed a column of values (based on the
 * `sourceColumn` of the embedding function) and expects to receive back
 * number[][] which will be converted into a fixed size list column. By
 * default this will be a fixed size list of Float32 but that can be
 * customized by the `embeddingDataType` property of the embedding function.
 *
 * If a schema is provided in `makeTableOptions` then it should include the
 * embedding columns. If no schema is provded then embedding columns will
 * be placed at the end of the table, after all of the input columns.
 */
export async function convertToTable<T>(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunction<T>,
  makeTableOptions?: Partial<MakeArrowTableOptions>
): Promise<ArrowTable> {
  const table = makeArrowTable(data, makeTableOptions);
  return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
}

// Creates the Arrow Type for a Vector column with dimension `dim`
function newVectorType<T extends Float>(
  dim: number,
  innerType: T
): FixedSizeList<T> {
  // Somewhere we always default to have the elements nullable, so we need to set it to true
  // otherwise we often get schema mismatches because the stored data always has schema with nullable elements
  const children = new Field<T>("item", innerType, true);
  return new FixedSizeList(dim, children);
}

/**
 * Serialize an Array of records into a buffer using the Arrow IPC File serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
export async function fromRecordsToBuffer<T>(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== undefined && schema !== null) {
    schema = sanitizeSchema(schema);
  }
  const table = await convertToTable(data, embeddings, { schema, embeddings });
  const writer = RecordBatchFileWriter.writeAll(table);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
export async function fromRecordsToStreamBuffer<T>(
  data: Array<Record<string, unknown>>,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== null && schema !== undefined) {
    schema = sanitizeSchema(schema);
  }
  const table = await convertToTable(data, embeddings, { schema });
  const writer = RecordBatchStreamWriter.writeAll(table);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
export async function fromTableToBuffer<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== null && schema !== undefined) {
    schema = sanitizeSchema(schema);
  }
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
  const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings);
  return Buffer.from(await writer.toUint8Array());
}

/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
export async function fromTableToStreamBuffer<T>(
  table: ArrowTable,
  embeddings?: EmbeddingFunction<T>,
  schema?: Schema
): Promise<Buffer> {
  if (schema !== null && schema !== undefined) {
    schema = sanitizeSchema(schema);
  }
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
  const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
  return Buffer.from(await writer.toUint8Array());
}

function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
  const alignedChildren = [];
  for (const field of schema.fields) {
    const indexInBatch = batch.schema.fields?.findIndex(
      (f) => f.name === field.name
    );
    if (indexInBatch < 0) {
      throw new Error(
        `The column ${field.name} was not found in the Arrow Table`
      );
    }
    alignedChildren.push(batch.data.children[indexInBatch]);
  }
  const newData = makeData({
    type: new Struct(schema.fields),
    length: batch.numRows,
    nullCount: batch.nullCount,
    children: alignedChildren
  });
  return new RecordBatch(schema, newData);
}

function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
  const alignedBatches = table.batches.map((batch) =>
    alignBatch(batch, schema)
  );
  return new ArrowTable(schema, alignedBatches);
}

// Creates an empty Arrow Table
export function createEmptyTable(schema: Schema): ArrowTable {
  return new ArrowTable(sanitizeSchema(schema));
}

function validateSchemaEmbeddings(
  schema: Schema<any>,
  data: Array<Record<string, unknown>>,
  embeddings: EmbeddingFunction<any> | undefined
) {
  const fields = [];
  const missingEmbeddingFields = [];

  // First we check if the field is a `FixedSizeList`
  // Then we check if the data contains the field
  // if it does not, we add it to the list of missing embedding fields
  // Finally, we check if those missing embedding fields are `this._embeddings`
  // if they are not, we throw an error
  for (const field of schema.fields) {
    if (field.type instanceof FixedSizeList) {
      if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
        missingEmbeddingFields.push(field);
      } else {
        fields.push(field);
      }
    } else {
      fields.push(field);
    }
  }

  if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
    throw new Error(
      `Table has embeddings: "${missingEmbeddingFields
        .map((f) => f.name)
        .join(",")}", but no embedding function was provided`
    );
  }

  return new Schema(fields, schema.metadata);
}
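The helpers in this file compose into a single pipeline: `makeArrowTable` builds the columnar table, `applyEmbeddings` (reached via `convertToTable` or the serializers) fills in the vector column, and the `fromTableToBuffer` family produces the IPC bytes handed to the native layer. A minimal sketch of that flow, with a stub embedding function standing in for a real model (the three-element zero vectors and module paths are illustrative assumptions):

```typescript
// Minimal sketch of the arrow.ts pipeline using a stub embedding function.
import { makeArrowTable, fromTableToBuffer } from "./arrow";
import { type EmbeddingFunction } from "./index";

// Illustrative stub: returns one fixed-length vector per input string.
const embedFn: EmbeddingFunction<string> = {
  sourceColumn: "text",
  embed: async (batch: string[]) => batch.map(() => [0.0, 0.0, 0.0])
};

async function demo(): Promise<Buffer> {
  // No schema given, so types are inferred from the JS values
  const table = makeArrowTable([
    { id: 1, text: "foo" },
    { id: 2, text: "bar" }
  ]);
  // applyEmbeddings runs inside fromTableToBuffer and appends a
  // FixedSizeList<Float32> "vector" column before IPC serialization
  return await fromTableToBuffer(table, embedFn);
}
```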
@@ -1,68 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { type Float } from 'apache-arrow'

/**
 * An embedding function that automatically creates vector representation for a given column.
 */
export interface EmbeddingFunction<T> {
  /**
   * The name of the column that will be used as input for the Embedding Function.
   */
  sourceColumn: string

  /**
   * The data type of the embedding
   *
   * The embedding function should return `number`. This will be converted into
   * an Arrow float array. By default this will be Float32 but this property can
   * be used to control the conversion.
   */
  embeddingDataType?: Float

  /**
   * The dimension of the embedding
   *
   * This is optional, normally this can be determined by looking at the results of
   * `embed`. If this is not specified, and there is an attempt to apply the embedding
   * to an empty table, then that process will fail.
   */
  embeddingDimension?: number

  /**
   * The name of the column that will contain the embedding
   *
   * By default this is "vector"
   */
  destColumn?: string

  /**
   * Should the source column be excluded from the resulting table
   *
   * By default the source column is included. Set this to true and
   * only the embedding will be stored.
   */
  excludeSource?: boolean

  /**
   * Creates a vector representation for the given values.
   */
  embed: (data: T[]) => Promise<number[][]>
}

export function isEmbeddingFunction<T> (value: any): value is EmbeddingFunction<T> {
  return typeof value.sourceColumn === 'string' &&
    typeof value.embed === 'function'
}
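Since `isEmbeddingFunction` only duck-types `sourceColumn` and `embed`, any object or class exposing those members qualifies. A self-contained sketch of a custom implementation (the character-code "model" below is purely illustrative, not a real embedding):

```typescript
import { type EmbeddingFunction } from '../index'

// Illustrative toy implementation so the sketch is self-contained;
// a real class would call an embedding model or service in embed().
export class ToyEmbeddingFunction implements EmbeddingFunction<string> {
  sourceColumn = 'text'
  embeddingDimension = 4

  async embed (data: string[]): Promise<number[][]> {
    return data.map(text => {
      const v = new Array(4).fill(0)
      for (let i = 0; i < text.length; i++) {
        v[i % 4] += text.charCodeAt(i) / 255
      }
      return v
    })
  }
}

// isEmbeddingFunction(new ToyEmbeddingFunction()) === true, so instances
// can be passed anywhere this client accepted an embedding function.
```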
@@ -1,57 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { type EmbeddingFunction } from '../index'
import type OpenAI from 'openai'

export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
  private readonly _openai: OpenAI
  private readonly _modelName: string

  constructor (sourceColumn: string, openAIKey: string, modelName: string = 'text-embedding-ada-002') {
    /**
     * @type {import("openai").default}
     */
    let Openai
    try {
      // eslint-disable-next-line @typescript-eslint/no-var-requires
      Openai = require('openai')
    } catch {
      throw new Error('please install openai@^4.24.1 using npm install openai')
    }

    this.sourceColumn = sourceColumn
    const configuration = {
      apiKey: openAIKey
    }

    this._openai = new Openai(configuration)
    this._modelName = modelName
  }

  async embed (data: string[]): Promise<number[][]> {
    const response = await this._openai.embeddings.create({
      model: this._modelName,
      input: data
    })

    const embeddings: number[][] = []
    for (let i = 0; i < response.data.length; i++) {
      embeddings.push(response.data[i].embedding)
    }
    return embeddings
  }

  sourceColumn: string
}
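A sketch of how this class is typically wired into the package; the import path, table name, and data are illustrative assumptions:

import * as lancedb from 'vectordb'

// Reads the "text" column and embeds it with the given OpenAI model.
// Assumes OpenAIEmbeddingFunction (above) is re-exported from the package root.
const embedFn = new lancedb.OpenAIEmbeddingFunction('text', process.env.OPENAI_API_KEY ?? '')
const db = await lancedb.connect('/tmp/demo.lance')
// Rows only need the source column; the vector column is filled in by embedFn.
const table = await db.createTable('docs', [{ text: 'hello world' }], embedFn)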
node/src/index.ts (1399 lines): file diff suppressed because it is too large.
@@ -1,180 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { describe } from 'mocha'
import * as chai from 'chai'
import { assert } from 'chai'
import * as chaiAsPromised from 'chai-as-promised'
import { v4 as uuidv4 } from 'uuid'

import * as lancedb from '../index'
import { tmpdir } from 'os'
import * as fs from 'fs'
import * as path from 'path'

chai.use(chaiAsPromised)

describe('LanceDB AWS Integration test', function () {
  it('s3+ddb schema is processed correctly', async function () {
    this.timeout(15000)

    // WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
    // THE API WILL CHANGE
    const conn = await lancedb.connect('s3://lancedb-integtest?engine=ddb&ddbTableName=lancedb-integtest')
    const data = [{ vector: Array(128).fill(1.0) }]

    const tableName = uuidv4()
    let table = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })

    const futs = [table.add(data), table.add(data), table.add(data), table.add(data), table.add(data)]
    await Promise.allSettled(futs)

    table = await conn.openTable(tableName)
    assert.equal(await table.countRows(), 6)
  })
})

describe('LanceDB Mirrored Store Integration test', function () {
  it('s3://...?mirroredStore=... param is processed correctly', async function () {
    this.timeout(600000)

    const dir = tmpdir()
    console.log(dir)
    const conn = await lancedb.connect({ uri: `s3://lancedb-integtest?mirroredStore=${dir}`, storageOptions: { allowHttp: 'true' } })
    const data = Array(200).fill({ vector: Array(128).fill(1.0), id: 0 })
    data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 1 }))
    data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 2 }))
    data.push(...Array(200).fill({ vector: Array(128).fill(1.0), id: 3 }))

    const tableName = uuidv4()

    // try create table and check if it's mirrored
    const t = await conn.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite })

    const mirroredPath = path.join(dir, `${tableName}.lance`)
    fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
      if (err != null) throw err
      // there should be three dirs
      assert.equal(files.length, 3)
      assert.isTrue(files[0].isDirectory())
      assert.isTrue(files[1].isDirectory())

      fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].name.endsWith('.txn'))
      })

      fs.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].name.endsWith('.manifest'))
      })

      fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].name.endsWith('.lance'))
      })
    })

    // try create index and check if it's mirrored
    await t.createIndex({ column: 'vector', type: 'ivf_pq' })

    fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
      if (err != null) throw err
      // there should be four dirs
      assert.equal(files.length, 4)
      assert.isTrue(files[0].isDirectory())
      assert.isTrue(files[1].isDirectory())
      assert.isTrue(files[2].isDirectory())

      // Two TXs now
      fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 2)
        assert.isTrue(files[0].name.endsWith('.txn'))
        assert.isTrue(files[1].name.endsWith('.txn'))
      })

      fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].name.endsWith('.lance'))
      })

      fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].isDirectory())

        fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
          if (err != null) throw err

          assert.equal(files.length, 1)
          assert.isTrue(files[0].isFile())
          assert.isTrue(files[0].name.endsWith('.idx'))
        })
      })
    })

    // try delete and check if it's mirrored
    await t.delete('id = 0')

    fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
      if (err != null) throw err
      // there should be five dirs
      assert.equal(files.length, 5)
      assert.isTrue(files[0].isDirectory())
      assert.isTrue(files[1].isDirectory())
      assert.isTrue(files[2].isDirectory())
      assert.isTrue(files[3].isDirectory())
      assert.isTrue(files[4].isDirectory())

      // Three TXs now
      fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 3)
        assert.isTrue(files[0].name.endsWith('.txn'))
        assert.isTrue(files[1].name.endsWith('.txn'))
      })

      fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].name.endsWith('.lance'))
      })

      fs.readdir(path.join(mirroredPath, '_indices'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].isDirectory())

        fs.readdir(path.join(mirroredPath, '_indices', files[0].name), { withFileTypes: true }, (err, files) => {
          if (err != null) throw err

          assert.equal(files.length, 1)
          assert.isTrue(files[0].isFile())
          assert.isTrue(files[0].name.endsWith('.idx'))
        })
      })

      fs.readdir(path.join(mirroredPath, '_deletions'), { withFileTypes: true }, (err, files) => {
        if (err != null) throw err
        assert.equal(files.length, 1)
        assert.isTrue(files[0].name.endsWith('.arrow'))
      })
    })
  })
})
@@ -1,58 +0,0 @@
// Copyright 2024 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * Middleware for Remote LanceDB Connection or Table
 */
export interface HttpMiddleware {
  /**
   * A callback that can be used to instrument the behavior of http requests to remote
   * tables. It can be used to add headers, modify the request, or even short-circuit
   * the request and return a response without making the request to the remote endpoint.
   * It can also be used to modify the response from the remote endpoint.
   *
   * @param {RemoteRequest} req - Request to the remote endpoint
   * @param {onRemoteRequestNext} next - Callback to advance the middleware chain
   */
  onRemoteRequest(
    req: RemoteRequest,
    next: (req: RemoteRequest) => Promise<RemoteResponse>,
  ): Promise<RemoteResponse>
};

export enum Method {
  GET,
  POST
}

/**
 * A LanceDB Remote HTTP Request
 */
export interface RemoteRequest {
  uri: string
  method: Method
  headers: Map<string, string>
  params?: Map<string, string>
  body?: any
}

/**
 * A LanceDB Remote HTTP Response
 */
export interface RemoteResponse {
  status: number
  statusText: string
  headers: Map<string, string>
  body: () => Promise<any>
}
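Since `onRemoteRequest` receives both the request and a `next` continuation, a middleware can rewrite either side of the exchange. A minimal sketch that injects a tracing header; the import path and header name are assumptions:

import { type HttpMiddleware, type RemoteRequest, type RemoteResponse } from 'vectordb/middleware' // path is an assumption

const tracingMiddleware: HttpMiddleware = {
  async onRemoteRequest (
    req: RemoteRequest,
    next: (req: RemoteRequest) => Promise<RemoteResponse>
  ): Promise<RemoteResponse> {
    // Copy the headers rather than mutating the shared map.
    const headers = new Map(req.headers)
    headers.set('x-trace-id', `${Date.now()}`) // illustrative header
    return await next({ ...req, headers })
  }
}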
@@ -1,163 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { Vector, tableFromIPC } from 'apache-arrow'
import { type EmbeddingFunction } from './embedding/embedding_function'
import { type MetricType } from '.'

// eslint-disable-next-line @typescript-eslint/no-var-requires
const { tableSearch } = require('../native.js')

/**
 * A builder for nearest neighbor queries for LanceDB.
 */
export class Query<T = number[]> {
  private readonly _query?: T
  private readonly _tbl?: any
  private _queryVector?: number[]
  private _limit?: number
  private _refineFactor?: number
  private _nprobes: number
  private _select?: string[]
  private _filter?: string
  private _metricType?: MetricType
  private _prefilter: boolean
  private _fastSearch: boolean
  protected readonly _embeddings?: EmbeddingFunction<T>

  constructor (query?: T, tbl?: any, embeddings?: EmbeddingFunction<T>) {
    this._tbl = tbl
    this._query = query
    this._limit = 10
    this._nprobes = 20
    this._refineFactor = undefined
    this._select = undefined
    this._filter = undefined
    this._metricType = undefined
    this._embeddings = embeddings
    this._prefilter = false
    this._fastSearch = false
  }

  /**
   * Sets the number of results that will be returned. The default value is 10.
   * @param value number of results
   */
  limit (value: number): Query<T> {
    this._limit = value
    return this
  }

  /**
   * Refine the results by reading extra elements and re-ranking them in memory.
   * @param value refine factor to use in this query.
   */
  refineFactor (value: number): Query<T> {
    this._refineFactor = value
    return this
  }

  /**
   * The number of probes used. A higher number makes search more accurate but also slower.
   * @param value The number of probes used.
   */
  nprobes (value: number): Query<T> {
    this._nprobes = value
    return this
  }

  /**
   * A filter statement to be applied to this query.
   * @param value A filter in the same format used by a sql WHERE clause.
   */
  filter (value: string): Query<T> {
    this._filter = value
    return this
  }

  where = this.filter

  /**
   * Return only the specified columns.
   *
   * @param value Only select the specified columns. If not specified, all columns will be returned.
   */
  select (value: string[]): Query<T> {
    this._select = value
    return this
  }

  /**
   * The MetricType used for this Query.
   * @param value The metric to use. @see MetricType for the different options
   */
  metricType (value: MetricType): Query<T> {
    this._metricType = value
    return this
  }

  prefilter (value: boolean): Query<T> {
    this._prefilter = value
    return this
  }

  /**
   * Skip searching un-indexed data. This can make search faster, but will miss
   * any data that is not yet indexed.
   */
  fastSearch (value: boolean): Query<T> {
    this._fastSearch = value
    return this
  }

  /**
   * Execute the query and return the results as an Array of Objects
   */
  async execute<T = Record<string, unknown>> (): Promise<T[]> {
    if (this._query !== undefined) {
      if (this._embeddings !== undefined) {
        this._queryVector = (await this._embeddings.embed([this._query]))[0]
      } else {
        this._queryVector = this._query as number[]
      }
    }

    const isElectron = this.isElectron()
    const buffer = await tableSearch.call(this._tbl, this, isElectron)
    const data = tableFromIPC(buffer)

    return data.toArray().map((entry: Record<string, unknown>) => {
      const newObject: Record<string, unknown> = {}
      Object.keys(entry).forEach((key: string) => {
        if (entry[key] instanceof Vector) {
          // toJSON() returns f16 array correctly
          newObject[key] = (entry[key] as any).toJSON()
        } else {
          newObject[key] = entry[key] as any
        }
      })
      return newObject as unknown as T
    })
  }

  // See https://github.com/electron/electron/issues/2288
  private isElectron (): boolean {
    try {
      // eslint-disable-next-line no-prototype-builtins
      return (process?.versions?.hasOwnProperty('electron') || navigator?.userAgent?.toLowerCase()?.includes(' electron'))
    } catch (e) {
      return false
    }
  }
}
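Because every setter returns `this`, a query composes into one chain. A sketch against an already opened table; the names and values are illustrative:

const results = await table
  .search(Array(128).fill(0.1)) // query vector
  .filter("label = 'cat'")      // SQL-style WHERE clause; .where() is an alias
  .select(['id', 'label'])
  .nprobes(32)                  // more probes: more accurate, slower
  .limit(5)
  .execute()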
@@ -1,302 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import axios, { type AxiosError, type AxiosResponse, type ResponseType } from 'axios'

import { tableFromIPC, type Table as ArrowTable } from 'apache-arrow'

import { type RemoteResponse, type RemoteRequest, Method } from '../middleware'
import type { MetricType } from '..'

interface HttpLancedbClientMiddleware {
  onRemoteRequest(
    req: RemoteRequest,
    next: (req: RemoteRequest) => Promise<RemoteResponse>,
  ): Promise<RemoteResponse>
}

/**
 * Invoke the middleware chain and at the end call the remote endpoint
 */
async function callWithMiddlewares (
  req: RemoteRequest,
  middlewares: HttpLancedbClientMiddleware[],
  opts?: MiddlewareInvocationOptions
): Promise<RemoteResponse> {
  async function call (
    i: number,
    req: RemoteRequest
  ): Promise<RemoteResponse> {
    // if we have reached the end of the middleware chain, make the request
    if (i > middlewares.length) {
      const headers = Object.fromEntries(req.headers.entries())
      const params = Object.fromEntries(req.params?.entries() ?? [])
      const timeout = opts?.timeout
      let res
      if (req.method === Method.POST) {
        res = await axios.post(
          req.uri,
          req.body,
          {
            headers,
            params,
            timeout,
            responseType: opts?.responseType
          }
        )
      } else {
        res = await axios.get(
          req.uri,
          {
            headers,
            params,
            timeout
          }
        )
      }

      return toLanceRes(res)
    }

    // call next middleware in chain
    return await middlewares[i - 1].onRemoteRequest(
      req,
      async (req) => {
        return await call(i + 1, req)
      }
    )
  }

  return await call(1, req)
}

interface MiddlewareInvocationOptions {
  responseType?: ResponseType
  timeout?: number
}

/**
 * Marshal the library response into a LanceDB response
 */
function toLanceRes (res: AxiosResponse): RemoteResponse {
  const headers = new Map()
  for (const h in res.headers) {
    headers.set(h, res.headers[h])
  }

  return {
    status: res.status,
    statusText: res.statusText,
    headers,
    body: async () => {
      return res.data
    }
  }
}

async function decodeErrorData (
  res: RemoteResponse,
  responseType?: ResponseType
): Promise<string> {
  const errorData = await res.body()
  if (responseType === 'arraybuffer') {
    return new TextDecoder().decode(errorData)
  } else {
    if (typeof errorData === 'object') {
      return JSON.stringify(errorData)
    }

    return errorData
  }
}

export class HttpLancedbClient {
  private readonly _url: string
  private readonly _apiKey: () => string
  private readonly _middlewares: HttpLancedbClientMiddleware[]
  private readonly _timeout: number | undefined

  public constructor (
    url: string,
    apiKey: string,
    timeout?: number,
    private readonly _dbName?: string
  ) {
    this._url = url
    this._apiKey = () => apiKey
    this._middlewares = []
    this._timeout = timeout
  }

  get uri (): string {
    return this._url
  }

  public async search (
    tableName: string,
    vector: number[],
    k: number,
    nprobes: number,
    prefilter: boolean,
    refineFactor?: number,
    columns?: string[],
    filter?: string,
    metricType?: MetricType,
    fastSearch?: boolean
  ): Promise<ArrowTable<any>> {
    const result = await this.post(
      `/v1/table/${tableName}/query/`,
      {
        vector,
        k,
        nprobes,
        refine_factor: refineFactor,
        columns,
        filter,
        prefilter,
        metric: metricType,
        fast_search: fastSearch
      },
      undefined,
      undefined,
      'arraybuffer'
    )
    const table = tableFromIPC(await result.body())
    return table
  }

  /**
   * Send a GET request.
   */
  public async get (path: string, params?: Record<string, string>): Promise<RemoteResponse> {
    const req = {
      uri: `${this._url}${path}`,
      method: Method.GET,
      headers: new Map(Object.entries({
        'Content-Type': 'application/json',
        'x-api-key': this._apiKey(),
        ...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
      })),
      params: new Map(Object.entries(params ?? {}))
    }

    let response
    try {
      response = await callWithMiddlewares(req, this._middlewares)
      return response
    } catch (err: any) {
      console.error(serializeErrorAsJson(err))
      if (err.response === undefined) {
        throw new Error(`Network Error: ${err.message as string}`)
      }

      response = toLanceRes(err.response)
    }

    if (response.status !== 200) {
      const errorData = await decodeErrorData(response)
      throw new Error(
        `Server Error, status: ${response.status}, ` +
        `message: ${response.statusText}: ${errorData}`
      )
    }

    return response
  }

  /**
   * Send a POST request.
   */
  public async post (
    path: string,
    data?: any,
    params?: Record<string, string>,
    content?: string | undefined,
    responseType?: ResponseType | undefined
  ): Promise<RemoteResponse> {
    const req = {
      uri: `${this._url}${path}`,
      method: Method.POST,
      headers: new Map(Object.entries({
        'Content-Type': content ?? 'application/json',
        'x-api-key': this._apiKey(),
        ...(this._dbName !== undefined ? { 'x-lancedb-database': this._dbName } : {})
      })),
      params: new Map(Object.entries(params ?? {})),
      body: data
    }

    let response
    try {
      response = await callWithMiddlewares(req, this._middlewares, {
        responseType,
        timeout: this._timeout
      })

      // return response
    } catch (err: any) {
      console.error(serializeErrorAsJson(err))

      if (err.response === undefined) {
        throw new Error(`Network Error: ${err.message as string}`)
      }
      response = toLanceRes(err.response)
    }

    if (response.status !== 200) {
      const errorData = await decodeErrorData(response, responseType)
      throw new Error(
        `Server Error, status: ${response.status}, ` +
        `message: ${response.statusText}: ${errorData}`
      )
    }

    return response
  }

  /**
   * Instrument this client with middleware
   * @param mw - The middleware that instruments the client
   * @returns - an instance of this client instrumented with the middleware
   */
  public withMiddleware (mw: HttpLancedbClientMiddleware): HttpLancedbClient {
    const wrapped = this.clone()
    wrapped._middlewares.push(mw)
    return wrapped
  }

  /**
   * Make a clone of this client
   */
  private clone (): HttpLancedbClient {
    const clone = new HttpLancedbClient(this._url, this._apiKey(), this._timeout, this._dbName)
    for (const mw of this._middlewares) {
      clone._middlewares.push(mw)
    }
    return clone
  }
}

function serializeErrorAsJson (err: AxiosError) {
  const error = JSON.parse(JSON.stringify(err, Object.getOwnPropertyNames(err)))
  error.response = err.response != null
    ? JSON.parse(JSON.stringify(
      err.response,
      // config contains the request data, too noisy
      Object.getOwnPropertyNames(err.response).filter(prop => prop !== 'config')
    ))
    : null
  return JSON.stringify({ error })
}
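`withMiddleware` clones the client before pushing the new middleware, so the instrumented client and the original coexist. A sketch using the internal class directly; the URL and key are illustrative:

const base = new HttpLancedbClient('https://example.api.lancedb.com', 'my-api-key')
const logged = base.withMiddleware({
  async onRemoteRequest (req, next) {
    console.log(`${Method[req.method]} ${req.uri}`) // log, then continue the chain
    return await next(req)
  }
})
// `base` still has no middlewares; only requests through `logged` are printed.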
@@ -1,567 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import {
  type EmbeddingFunction,
  type Table,
  type VectorIndexParams,
  type Connection,
  type ConnectionOptions,
  type CreateTableOptions,
  type VectorIndex,
  type WriteOptions,
  type IndexStats,
  type UpdateArgs,
  type UpdateSqlArgs,
  makeArrowTable,
  type MergeInsertArgs,
  type ColumnAlteration
} from '../index'
import { Query } from '../query'

import { Vector, Table as ArrowTable } from 'apache-arrow'
import { HttpLancedbClient } from './client'
import { isEmbeddingFunction } from '../embedding/embedding_function'
import {
  createEmptyTable,
  fromRecordsToStreamBuffer,
  fromTableToStreamBuffer
} from '../arrow'
import { toSQL, TTLCache } from '../util'
import { type HttpMiddleware } from '../middleware'

/**
 * Remote connection.
 */
export class RemoteConnection implements Connection {
  private _client: HttpLancedbClient
  private readonly _dbName: string
  private readonly _tableCache = new TTLCache(300_000)

  constructor (opts: ConnectionOptions) {
    if (!opts.uri.startsWith('db://')) {
      throw new Error(`Invalid remote DB URI: ${opts.uri}`)
    }
    if (opts.apiKey == null || opts.apiKey === '') {
      opts = Object.assign({}, opts, { apiKey: process.env.LANCEDB_API_KEY })
    }
    if (opts.apiKey === undefined || opts.region === undefined) {
      throw new Error(
        'API key and region must be passed for remote connections. ' +
        'API key can also be set through LANCEDB_API_KEY env variable.')
    }

    this._dbName = opts.uri.slice('db://'.length)
    let server: string
    if (opts.hostOverride === undefined) {
      server = `https://${this._dbName}.${opts.region}.api.lancedb.com`
    } else {
      server = opts.hostOverride
    }
    this._client = new HttpLancedbClient(
      server,
      opts.apiKey,
      opts.timeout,
      opts.hostOverride === undefined ? undefined : this._dbName
    )
  }

  get uri (): string {
    // add the db:// prefix back
    return 'db://' + this._client.uri
  }

  async tableNames (
    pageToken: string = '',
    limit: number = 10
  ): Promise<string[]> {
    const response = await this._client.get('/v1/table/', {
      limit: `${limit}`,
      page_token: pageToken
    })
    const body = await response.body()
    for (const table of body.tables) {
      this._tableCache.set(table, true)
    }
    return body.tables
  }

  async openTable (name: string): Promise<Table>
  async openTable<T>(
    name: string,
    embeddings: EmbeddingFunction<T>
  ): Promise<Table<T>>
  async openTable<T>(
    name: string,
    embeddings?: EmbeddingFunction<T>
  ): Promise<Table<T>> {
    // check if the table exists
    if (this._tableCache.get(name) === undefined) {
      await this._client.post(`/v1/table/${encodeURIComponent(name)}/describe/`)
      this._tableCache.set(name, true)
    }

    if (embeddings !== undefined) {
      return new RemoteTable(this._client, name, embeddings)
    } else {
      return new RemoteTable(this._client, name)
    }
  }

  async createTable<T>(
    nameOrOpts: string | CreateTableOptions<T>,
    data?: Array<Record<string, unknown>> | ArrowTable,
    optsOrEmbedding?: WriteOptions | EmbeddingFunction<T>,
    opt?: WriteOptions
  ): Promise<Table<T>> {
    // Logic copied from LocalConnection; refactor these to a base class + connectionImpl pattern
    let schema
    let embeddings: undefined | EmbeddingFunction<T>
    let tableName: string
    if (typeof nameOrOpts === 'string') {
      if (
        optsOrEmbedding !== undefined &&
        isEmbeddingFunction(optsOrEmbedding)
      ) {
        embeddings = optsOrEmbedding
      }
      tableName = nameOrOpts
    } else {
      schema = nameOrOpts.schema
      embeddings = nameOrOpts.embeddingFunction
      tableName = nameOrOpts.name
      if (data === undefined) {
        data = nameOrOpts.data
      }
    }

    let buffer: Buffer

    function isEmpty (
      data: Array<Record<string, unknown>> | ArrowTable<any>
    ): boolean {
      if (data instanceof ArrowTable) {
        return data.numRows === 0
      }
      return data.length === 0
    }

    if (data === undefined || isEmpty(data)) {
      if (schema === undefined) {
        throw new Error('Either data or schema needs to be defined')
      }
      buffer = await fromTableToStreamBuffer(createEmptyTable(schema))
    } else if (data instanceof ArrowTable) {
      buffer = await fromTableToStreamBuffer(data, embeddings)
    } else {
      // data is Array<Record<...>>
      buffer = await fromRecordsToStreamBuffer(data, embeddings)
    }

    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(tableName)}/create/`,
      buffer,
      undefined,
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }

    this._tableCache.set(tableName, true)
    if (embeddings === undefined) {
      return new RemoteTable(this._client, tableName)
    } else {
      return new RemoteTable(this._client, tableName, embeddings)
    }
  }

  async dropTable (name: string): Promise<void> {
    await this._client.post(`/v1/table/${encodeURIComponent(name)}/drop/`)
    this._tableCache.delete(name)
  }

  withMiddleware (middleware: HttpMiddleware): Connection {
    const wrapped = this.clone()
    wrapped._client = wrapped._client.withMiddleware(middleware)
    return wrapped
  }

  private clone (): RemoteConnection {
    const clone: RemoteConnection = Object.create(RemoteConnection.prototype)
    return Object.assign(clone, this)
  }
}
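Putting the constructor rules together: the URI must use the `db://` scheme, and the API key may come from the options or from `LANCEDB_API_KEY`. A sketch; the database name, region, and key are illustrative:

import * as lancedb from 'vectordb'

const conn = await lancedb.connect({
  uri: 'db://my-database', // resolves to https://my-database.us-east-1.api.lancedb.com
  apiKey: process.env.LANCEDB_API_KEY,
  region: 'us-east-1'
})
console.log(await conn.tableNames())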
export class RemoteQuery<T = number[]> extends Query<T> {
  constructor (
    query: T,
    private readonly _client: HttpLancedbClient,
    private readonly _name: string,
    embeddings?: EmbeddingFunction<T>
  ) {
    super(query, undefined, embeddings)
  }

  // TODO: refactor this to a base class + queryImpl pattern
  async execute<T = Record<string, unknown>>(): Promise<T[]> {
    const embeddings = this._embeddings
    const query = (this as any)._query
    let queryVector: number[]

    if (embeddings !== undefined) {
      queryVector = (await embeddings.embed([query]))[0]
    } else {
      queryVector = query as number[]
    }

    const data = await this._client.search(
      this._name,
      queryVector,
      (this as any)._limit,
      (this as any)._nprobes,
      (this as any)._prefilter,
      (this as any)._refineFactor,
      (this as any)._select,
      (this as any)._filter,
      (this as any)._metricType,
      (this as any)._fastSearch
    )

    return data.toArray().map((entry: Record<string, unknown>) => {
      const newObject: Record<string, unknown> = {}
      Object.keys(entry).forEach((key: string) => {
        if (entry[key] instanceof Vector) {
          newObject[key] = (entry[key] as any).toArray()
        } else {
          newObject[key] = entry[key] as any
        }
      })
      return newObject as unknown as T
    })
  }
}

// we are using extend until the next version release;
// Table and Connection have both been refactored to interfaces
export class RemoteTable<T = number[]> implements Table<T> {
  private _client: HttpLancedbClient
  private readonly _embeddings?: EmbeddingFunction<T>
  private readonly _name: string

  constructor (client: HttpLancedbClient, name: string)
  constructor (
    client: HttpLancedbClient,
    name: string,
    embeddings: EmbeddingFunction<T>
  )
  constructor (
    client: HttpLancedbClient,
    name: string,
    embeddings?: EmbeddingFunction<T>
  ) {
    this._client = client
    this._name = name
    this._embeddings = embeddings
  }

  get name (): string {
    return this._name
  }

  get schema (): Promise<any> {
    return this._client
      .post(`/v1/table/${encodeURIComponent(this._name)}/describe/`)
      .then(async (res) => {
        if (res.status !== 200) {
          throw new Error(
            `Server Error, status: ${res.status}, ` +
            // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
            `message: ${res.statusText}: ${await res.body()}`
          )
        }
        return (await res.body())?.schema
      })
  }

  search (query: T): Query<T> {
    return new RemoteQuery(query, this._client, encodeURIComponent(this._name)) //, this._embeddings_new)
  }

  filter (where: string): Query<T> {
    throw new Error('Not implemented')
  }

  async mergeInsert (on: string, data: Array<Record<string, unknown>> | ArrowTable, args: MergeInsertArgs): Promise<void> {
    let tbl: ArrowTable
    if (data instanceof ArrowTable) {
      tbl = data
    } else {
      tbl = makeArrowTable(data, await this.schema)
    }

    const queryParams: any = {
      on
    }
    if (args.whenMatchedUpdateAll !== false && args.whenMatchedUpdateAll !== null && args.whenMatchedUpdateAll !== undefined) {
      queryParams.when_matched_update_all = 'true'
      if (typeof args.whenMatchedUpdateAll === 'string') {
        queryParams.when_matched_update_all_filt = args.whenMatchedUpdateAll
      }
    } else {
      queryParams.when_matched_update_all = 'false'
    }
    if (args.whenNotMatchedInsertAll ?? false) {
      queryParams.when_not_matched_insert_all = 'true'
    } else {
      queryParams.when_not_matched_insert_all = 'false'
    }
    if (args.whenNotMatchedBySourceDelete !== false && args.whenNotMatchedBySourceDelete !== null && args.whenNotMatchedBySourceDelete !== undefined) {
      queryParams.when_not_matched_by_source_delete = 'true'
      if (typeof args.whenNotMatchedBySourceDelete === 'string') {
        queryParams.when_not_matched_by_source_delete_filt = args.whenNotMatchedBySourceDelete
      }
    } else {
      queryParams.when_not_matched_by_source_delete = 'false'
    }

    const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/merge_insert/`,
      buffer,
      queryParams,
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async add (data: Array<Record<string, unknown>> | ArrowTable): Promise<number> {
    let tbl: ArrowTable
    if (data instanceof ArrowTable) {
      tbl = data
    } else {
      tbl = makeArrowTable(data, await this.schema)
    }

    const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/insert/`,
      buffer,
      {
        mode: 'append'
      },
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
    return tbl.numRows
  }

  async overwrite (data: Array<Record<string, unknown>> | ArrowTable): Promise<number> {
    let tbl: ArrowTable
    if (data instanceof ArrowTable) {
      tbl = data
    } else {
      tbl = makeArrowTable(data)
    }
    const buffer = await fromTableToStreamBuffer(tbl, this._embeddings)
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/insert/`,
      buffer,
      {
        mode: 'overwrite'
      },
      'application/vnd.apache.arrow.stream'
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
    return tbl.numRows
  }

  async createIndex (indexParams: VectorIndexParams): Promise<void> {
    const unsupportedParams = [
      'index_name',
      'num_partitions',
      'max_iters',
      'use_opq',
      'num_sub_vectors',
      'num_bits',
      'max_opq_iters',
      'replace'
    ]
    for (const param of unsupportedParams) {
      // eslint-disable-next-line @typescript-eslint/strict-boolean-expressions
      if (indexParams[param as keyof VectorIndexParams]) {
        throw new Error(`${param} is not supported for remote connections`)
      }
    }

    const column = indexParams.column ?? 'vector'
    const indexType = 'vector'
    const metricType = indexParams.metric_type ?? 'L2'
    const indexCacheSize = indexParams.index_cache_size ?? null

    const data = {
      column,
      index_type: indexType,
      metric_type: metricType,
      index_cache_size: indexCacheSize
    }
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/create_index/`,
      data
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async createScalarIndex (column: string): Promise<void> {
    const indexType = 'scalar'

    const data = {
      column,
      index_type: indexType,
      replace: true
    }
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/create_scalar_index/`,
      data
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async dropIndex (index_name: string): Promise<void> {
    const res = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/index/${encodeURIComponent(index_name)}/drop/`
    )
    if (res.status !== 200) {
      throw new Error(
        `Server Error, status: ${res.status}, ` +
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        `message: ${res.statusText}: ${await res.body()}`
      )
    }
  }

  async countRows (filter?: string): Promise<number> {
    const result = await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/count_rows/`, {
      predicate: filter
    })
    return (await result.body())
  }

  async delete (filter: string): Promise<void> {
    await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/delete/`, {
      predicate: filter
    })
  }

  async update (args: UpdateArgs | UpdateSqlArgs): Promise<void> {
    let filter: string | null
    let updates: Record<string, string>

    if ('valuesSql' in args) {
      filter = args.where ?? null
      updates = args.valuesSql
    } else {
      filter = args.where ?? null
      updates = {}
      for (const [key, value] of Object.entries(args.values)) {
        updates[key] = toSQL(value)
      }
    }
    await this._client.post(`/v1/table/${encodeURIComponent(this._name)}/update/`, {
      predicate: filter,
      updates: Object.entries(updates).map(([key, value]) => [key, value])
    })
  }

  async listIndices (): Promise<VectorIndex[]> {
    const results = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/index/list/`
    )
    return (await results.body()).indexes?.map((index: any) => ({
      columns: index.columns,
      name: index.index_name,
      uuid: index.index_uuid,
      status: index.status
    }))
  }

  async indexStats (indexName: string): Promise<IndexStats> {
    const results = await this._client.post(
      `/v1/table/${encodeURIComponent(this._name)}/index/${indexName}/stats/`
    )
    const body = await results.body()
    return {
      numIndexedRows: body?.num_indexed_rows,
      numUnindexedRows: body?.num_unindexed_rows,
      indexType: body?.index_type,
      distanceType: body?.distance_type
    }
  }

  async addColumns (newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void> {
    throw new Error('Add columns is not yet supported in LanceDB Cloud.')
  }

  async alterColumns (columnAlterations: ColumnAlteration[]): Promise<void> {
    throw new Error('Alter columns is not yet supported in LanceDB Cloud.')
  }

  async dropColumns (columnNames: string[]): Promise<void> {
    throw new Error('Drop columns is not yet supported in LanceDB Cloud.')
  }

  withMiddleware (middleware: HttpMiddleware): Table<T> {
    const wrapped = this.clone()
    wrapped._client = wrapped._client.withMiddleware(middleware)
    return wrapped
  }

  private clone (): RemoteTable<T> {
    const clone: RemoteTable<T> = Object.create(RemoteTable.prototype)
    return Object.assign(clone, this)
  }
}
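The flag handling in `mergeInsert` above means a boolean turns a behavior on, while a string value doubles as a filter for that behavior. A sketch of both forms; the rows and filter are illustrative:

const rows = [{ id: 1, vector: Array(128).fill(0.5), updated_at: 2 }]

// Plain upsert: update matching rows, insert the rest.
await table.mergeInsert('id', rows, {
  whenMatchedUpdateAll: true,
  whenNotMatchedInsertAll: true
})

// Conditional upsert: only update matches that also satisfy the filter.
await table.mergeInsert('id', rows, {
  whenMatchedUpdateAll: 'target.updated_at < source.updated_at',
  whenNotMatchedInsertAll: true
})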
@@ -1,508 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The utilities in this file help sanitize data from the user's arrow
// library into the types expected by vectordb's arrow library. Node
// generally allows for multiple versions of the same library (and sometimes
// even multiple copies of the same version) to be installed at the same
// time. However, arrow-js uses instanceof, which expects that the input
// comes from the exact same library instance. This is not always the case,
// and so we must sanitize the input to ensure that it is compatible.
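The failure mode being worked around is constructor identity, not structure. A self-contained sketch of the symptom, with two stand-in classes playing the role of two installed copies of apache-arrow:

// Two structurally identical classes standing in for two copies of the library.
class FieldA { constructor (public name: string) {} }
class FieldB { constructor (public name: string) {} }

const field = new FieldA('vector')
console.log(field instanceof FieldA) // true
console.log(field instanceof FieldB) // false: same shape, different constructor identity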
import {
  Field,
  Utf8,
  FixedSizeBinary,
  FixedSizeList,
  Schema,
  List,
  Struct,
  Float,
  Bool,
  Date_,
  Decimal,
  type DataType,
  Dictionary,
  Binary,
  Float32,
  Interval,
  Map_,
  Duration,
  Union,
  Time,
  Timestamp,
  Type,
  Null,
  Int,
  type Precision,
  type DateUnit,
  Int8,
  Int16,
  Int32,
  Int64,
  Uint8,
  Uint16,
  Uint32,
  Uint64,
  Float16,
  Float64,
  DateDay,
  DateMillisecond,
  DenseUnion,
  SparseUnion,
  TimeNanosecond,
  TimeMicrosecond,
  TimeMillisecond,
  TimeSecond,
  TimestampNanosecond,
  TimestampMicrosecond,
  TimestampMillisecond,
  TimestampSecond,
  IntervalDayTime,
  IntervalYearMonth,
  DurationNanosecond,
  DurationMicrosecond,
  DurationMillisecond,
  DurationSecond
} from "apache-arrow";
import type { IntBitWidth, TimeBitWidth } from "apache-arrow/type";

function sanitizeMetadata(
  metadataLike?: unknown
): Map<string, string> | undefined {
  if (metadataLike === undefined || metadataLike === null) {
    return undefined;
  }
  if (!(metadataLike instanceof Map)) {
    throw Error("Expected metadata, if present, to be a Map<string, string>");
  }
  for (const item of metadataLike) {
    // Reject any entry whose key or value is not a string.
    if (typeof item[0] !== "string" || typeof item[1] !== "string") {
      throw Error(
        "Expected metadata, if present, to be a Map<string, string> but it had non-string keys or values"
      );
    }
  }
  return metadataLike as Map<string, string>;
}

function sanitizeInt(typeLike: object) {
  if (
    !("bitWidth" in typeLike) ||
    typeof typeLike.bitWidth !== "number" ||
    !("isSigned" in typeLike) ||
    typeof typeLike.isSigned !== "boolean"
  ) {
    throw Error(
      "Expected an Int Type to have a `bitWidth` and `isSigned` property"
    );
  }
  return new Int(typeLike.isSigned, typeLike.bitWidth as IntBitWidth);
}

function sanitizeFloat(typeLike: object) {
  if (!("precision" in typeLike) || typeof typeLike.precision !== "number") {
    throw Error("Expected a Float Type to have a `precision` property");
  }
  return new Float(typeLike.precision as Precision);
}

function sanitizeDecimal(typeLike: object) {
  if (
    !("scale" in typeLike) ||
    typeof typeLike.scale !== "number" ||
    !("precision" in typeLike) ||
    typeof typeLike.precision !== "number" ||
    !("bitWidth" in typeLike) ||
    typeof typeLike.bitWidth !== "number"
  ) {
    throw Error(
      "Expected a Decimal Type to have `scale`, `precision`, and `bitWidth` properties"
    );
  }
  return new Decimal(typeLike.scale, typeLike.precision, typeLike.bitWidth);
}

function sanitizeDate(typeLike: object) {
  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
    throw Error("Expected a Date type to have a `unit` property");
  }
  return new Date_(typeLike.unit as DateUnit);
}

function sanitizeTime(typeLike: object) {
  if (
    !("unit" in typeLike) ||
    typeof typeLike.unit !== "number" ||
    !("bitWidth" in typeLike) ||
    typeof typeLike.bitWidth !== "number"
  ) {
    throw Error(
      "Expected a Time type to have `unit` and `bitWidth` properties"
    );
  }
  return new Time(typeLike.unit, typeLike.bitWidth as TimeBitWidth);
}

function sanitizeTimestamp(typeLike: object) {
  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
    throw Error("Expected a Timestamp type to have a `unit` property");
  }
  let timezone = null;
  if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
    timezone = typeLike.timezone;
  }
  return new Timestamp(typeLike.unit, timezone);
}

function sanitizeTypedTimestamp(
  typeLike: object,
  Datatype:
    | typeof TimestampNanosecond
    | typeof TimestampMicrosecond
    | typeof TimestampMillisecond
    | typeof TimestampSecond
) {
  let timezone = null;
  if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
    timezone = typeLike.timezone;
  }
  return new Datatype(timezone);
}

function sanitizeInterval(typeLike: object) {
  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
    throw Error("Expected an Interval type to have a `unit` property");
  }
  return new Interval(typeLike.unit);
}

function sanitizeList(typeLike: object) {
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a List type to have an array-like `children` property"
    );
  }
  if (typeLike.children.length !== 1) {
    throw Error("Expected a List type to have exactly one child");
  }
  return new List(sanitizeField(typeLike.children[0]));
}

function sanitizeStruct(typeLike: object) {
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a Struct type to have an array-like `children` property"
    );
  }
  return new Struct(typeLike.children.map((child) => sanitizeField(child)));
}

function sanitizeUnion(typeLike: object) {
  if (
    !("typeIds" in typeLike) ||
    !("mode" in typeLike) ||
    typeof typeLike.mode !== "number"
  ) {
    throw Error(
      "Expected a Union type to have `typeIds` and `mode` properties"
    );
  }
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a Union type to have an array-like `children` property"
    );
  }

  return new Union(
    typeLike.mode,
    typeLike.typeIds as any,
    typeLike.children.map((child) => sanitizeField(child))
  );
}

function sanitizeTypedUnion(
  typeLike: object,
  UnionType: typeof DenseUnion | typeof SparseUnion
) {
  if (!("typeIds" in typeLike)) {
    throw Error(
      "Expected a DenseUnion/SparseUnion type to have a `typeIds` property"
    );
  }
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a DenseUnion/SparseUnion type to have an array-like `children` property"
    );
  }

  return new UnionType(
    typeLike.typeIds as any,
    typeLike.children.map((child) => sanitizeField(child))
  );
}

function sanitizeFixedSizeBinary(typeLike: object) {
  if (!("byteWidth" in typeLike) || typeof typeLike.byteWidth !== "number") {
    throw Error(
      "Expected a FixedSizeBinary type to have a `byteWidth` property"
    );
  }
  return new FixedSizeBinary(typeLike.byteWidth);
}

function sanitizeFixedSizeList(typeLike: object) {
  if (!("listSize" in typeLike) || typeof typeLike.listSize !== "number") {
    throw Error("Expected a FixedSizeList type to have a `listSize` property");
  }
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a FixedSizeList type to have an array-like `children` property"
    );
  }
  if (typeLike.children.length !== 1) {
    throw Error("Expected a FixedSizeList type to have exactly one child");
  }
  return new FixedSizeList(
    typeLike.listSize,
    sanitizeField(typeLike.children[0])
  );
}

function sanitizeMap(typeLike: object) {
  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
    throw Error(
      "Expected a Map type to have an array-like `children` property"
    );
  }
  if (!("keysSorted" in typeLike) || typeof typeLike.keysSorted !== "boolean") {
    throw Error("Expected a Map type to have a `keysSorted` property");
  }
  return new Map_(
    typeLike.children.map((field) => sanitizeField(field)) as any,
    typeLike.keysSorted
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function sanitizeDuration(typeLike: object) {
|
|
||||||
if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
|
|
||||||
throw Error("Expected a Duration type to have a `unit` property");
|
|
||||||
}
|
|
||||||
return new Duration(typeLike.unit);
|
|
||||||
}
|
|
||||||
|
|
||||||
function sanitizeDictionary(typeLike: object) {
|
|
||||||
if (!("id" in typeLike) || typeof typeLike.id !== "number") {
|
|
||||||
throw Error("Expected a Dictionary type to have an `id` property");
|
|
||||||
}
|
|
||||||
if (!("indices" in typeLike) || typeof typeLike.indices !== "object") {
|
|
||||||
throw Error("Expected a Dictionary type to have an `indices` property");
|
|
||||||
}
|
|
||||||
if (!("dictionary" in typeLike) || typeof typeLike.dictionary !== "object") {
|
|
||||||
throw Error("Expected a Dictionary type to have an `dictionary` property");
|
|
||||||
  }
  if (!("isOrdered" in typeLike) || typeof typeLike.isOrdered !== "boolean") {
    throw Error("Expected a Dictionary type to have an `isOrdered` property");
  }
  return new Dictionary(
    sanitizeType(typeLike.dictionary),
    sanitizeType(typeLike.indices) as any,
    typeLike.id,
    typeLike.isOrdered
  );
}

function sanitizeType(typeLike: unknown): DataType<any> {
  if (typeof typeLike !== "object" || typeLike === null) {
    throw Error("Expected a Type but object was null/undefined");
  }
if (!("typeId" in typeLike) || !(typeof typeLike.typeId !== "function")) {
|
|
||||||
throw Error("Expected a Type to have a typeId function");
|
|
||||||
}
|
|
||||||
  let typeId: Type;
  if (typeof typeLike.typeId === "function") {
    typeId = (typeLike.typeId as () => unknown)() as Type;
  } else if (typeof typeLike.typeId === "number") {
    typeId = typeLike.typeId as Type;
  } else {
    throw Error("Type's typeId property was not a function or number");
  }

  switch (typeId) {
    case Type.NONE:
      throw Error("Received a Type with a typeId of NONE");
    case Type.Null:
      return new Null();
    case Type.Int:
      return sanitizeInt(typeLike);
    case Type.Float:
      return sanitizeFloat(typeLike);
    case Type.Binary:
      return new Binary();
    case Type.Utf8:
      return new Utf8();
    case Type.Bool:
      return new Bool();
    case Type.Decimal:
      return sanitizeDecimal(typeLike);
    case Type.Date:
      return sanitizeDate(typeLike);
    case Type.Time:
      return sanitizeTime(typeLike);
    case Type.Timestamp:
      return sanitizeTimestamp(typeLike);
    case Type.Interval:
      return sanitizeInterval(typeLike);
    case Type.List:
      return sanitizeList(typeLike);
    case Type.Struct:
      return sanitizeStruct(typeLike);
    case Type.Union:
      return sanitizeUnion(typeLike);
    case Type.FixedSizeBinary:
      return sanitizeFixedSizeBinary(typeLike);
    case Type.FixedSizeList:
      return sanitizeFixedSizeList(typeLike);
    case Type.Map:
      return sanitizeMap(typeLike);
    case Type.Duration:
      return sanitizeDuration(typeLike);
    case Type.Dictionary:
      return sanitizeDictionary(typeLike);
    case Type.Int8:
      return new Int8();
    case Type.Int16:
      return new Int16();
    case Type.Int32:
      return new Int32();
    case Type.Int64:
      return new Int64();
    case Type.Uint8:
      return new Uint8();
    case Type.Uint16:
      return new Uint16();
    case Type.Uint32:
      return new Uint32();
    case Type.Uint64:
      return new Uint64();
    case Type.Float16:
      return new Float16();
    case Type.Float32:
      return new Float32();
    case Type.Float64:
      return new Float64();
    case Type.DateMillisecond:
      return new DateMillisecond();
    case Type.DateDay:
      return new DateDay();
    case Type.TimeNanosecond:
      return new TimeNanosecond();
    case Type.TimeMicrosecond:
      return new TimeMicrosecond();
    case Type.TimeMillisecond:
      return new TimeMillisecond();
    case Type.TimeSecond:
      return new TimeSecond();
    case Type.TimestampNanosecond:
      return sanitizeTypedTimestamp(typeLike, TimestampNanosecond);
    case Type.TimestampMicrosecond:
      return sanitizeTypedTimestamp(typeLike, TimestampMicrosecond);
    case Type.TimestampMillisecond:
      return sanitizeTypedTimestamp(typeLike, TimestampMillisecond);
    case Type.TimestampSecond:
      return sanitizeTypedTimestamp(typeLike, TimestampSecond);
    case Type.DenseUnion:
      return sanitizeTypedUnion(typeLike, DenseUnion);
    case Type.SparseUnion:
      return sanitizeTypedUnion(typeLike, SparseUnion);
    case Type.IntervalDayTime:
      return new IntervalDayTime();
    case Type.IntervalYearMonth:
      return new IntervalYearMonth();
    case Type.DurationNanosecond:
      return new DurationNanosecond();
    case Type.DurationMicrosecond:
      return new DurationMicrosecond();
    case Type.DurationMillisecond:
      return new DurationMillisecond();
    case Type.DurationSecond:
      return new DurationSecond();
  }
}

function sanitizeField(fieldLike: unknown): Field {
  if (fieldLike instanceof Field) {
    return fieldLike;
  }
  if (typeof fieldLike !== "object" || fieldLike === null) {
    throw Error("Expected a Field but object was null/undefined");
  }
  if (
    !("type" in fieldLike) ||
    !("name" in fieldLike) ||
    !("nullable" in fieldLike)
  ) {
    throw Error(
      "The field passed in is missing a `type`/`name`/`nullable` property"
    );
  }
  const type = sanitizeType(fieldLike.type);
  const name = fieldLike.name;
  if (!(typeof name === "string")) {
    throw Error("The field passed in had a non-string `name` property");
  }
  const nullable = fieldLike.nullable;
  if (!(typeof nullable === "boolean")) {
    throw Error("The field passed in had a non-boolean `nullable` property");
  }
  let metadata;
  if ("metadata" in fieldLike) {
    metadata = sanitizeMetadata(fieldLike.metadata);
  }
  return new Field(name, type, nullable, metadata);
}

/**
 * Convert something schemaLike into a Schema instance
 *
 * This method is often needed even when the caller is using a Schema
 * instance because they might be using a different instance of apache-arrow
 * than lancedb is using.
 */
export function sanitizeSchema(schemaLike: unknown): Schema {
  if (schemaLike instanceof Schema) {
    return schemaLike;
  }
  if (typeof schemaLike !== "object" || schemaLike === null) {
    throw Error("Expected a Schema but object was null/undefined");
  }
  if (!("fields" in schemaLike)) {
    throw Error(
      "The schema passed in does not appear to be a schema (no 'fields' property)"
    );
  }
  let metadata;
  if ("metadata" in schemaLike) {
    metadata = sanitizeMetadata(schemaLike.metadata);
  }
  if (!Array.isArray(schemaLike.fields)) {
    throw Error(
      "The schema passed in had a 'fields' property but it was not an array"
    );
  }
  const sanitizedFields = schemaLike.fields.map((field) =>
    sanitizeField(field)
  );
  return new Schema(sanitizedFields, metadata);
}
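The doc comment above is the whole reason this module exists: a caller's `Schema` may have been built by a different installed copy of apache-arrow than the one lancedb bundles, so `instanceof` checks fail even though the object is shaped correctly, and `sanitizeSchema` rebuilds it field by field with lancedb's own classes. A minimal usage sketch (the `./sanitize` import path and the field layout are illustrative assumptions, not from this diff):

// Hedged sketch: bridging a schema built by the caller's own arrow copy.
import { Field, Schema, Utf8 } from "apache-arrow"; // the caller's arrow
import { sanitizeSchema } from "./sanitize"; // assumed module path

// Structurally a valid schema, but not an instance of lancedb's Schema class.
const foreign = new Schema([new Field("id", new Utf8(), false)]);

// Rebuilds every field and data type with lancedb's bundled arrow classes,
// so downstream instanceof checks succeed.
const usable = sanitizeSchema(foreign);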
@@ -1,360 +0,0 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { describe } from 'mocha'
import { assert, expect, use as chaiUse } from 'chai'
import * as chaiAsPromised from 'chai-as-promised'

import { convertToTable, fromTableToBuffer, makeArrowTable, makeEmptyTable } from '../arrow'
import {
  Field,
  FixedSizeList,
  Float16,
  Float32,
  Int32,
  tableFromIPC,
  Schema,
  Float64,
  type Table,
  Binary,
  Bool,
  Utf8,
  Struct,
  List,
  DataType,
  Dictionary,
  Int64,
  MetadataVersion
} from 'apache-arrow'
import {
  Dictionary as OldDictionary,
  Field as OldField,
  FixedSizeList as OldFixedSizeList,
  Float32 as OldFloat32,
  Int32 as OldInt32,
  Struct as OldStruct,
  Schema as OldSchema,
  TimestampNanosecond as OldTimestampNanosecond,
  Utf8 as OldUtf8
} from 'apache-arrow-old'
import { type EmbeddingFunction } from '../embedding/embedding_function'

chaiUse(chaiAsPromised)

function sampleRecords (): Array<Record<string, any>> {
  return [
    {
      binary: Buffer.alloc(5),
      boolean: false,
      number: 7,
      string: 'hello',
      struct: { x: 0, y: 0 },
      list: ['anime', 'action', 'comedy']
    }
  ]
}

// Helper method to verify various ways to create a table
async function checkTableCreation (tableCreationMethod: (records: any, recordsReversed: any, schema: Schema) => Promise<Table>): Promise<void> {
  const records = sampleRecords()
  const recordsReversed = [{
    list: ['anime', 'action', 'comedy'],
    struct: { x: 0, y: 0 },
    string: 'hello',
    number: 7,
    boolean: false,
    binary: Buffer.alloc(5)
  }]
  const schema = new Schema([
    new Field('binary', new Binary(), false),
    new Field('boolean', new Bool(), false),
    new Field('number', new Float64(), false),
    new Field('string', new Utf8(), false),
    new Field('struct', new Struct([
      new Field('x', new Float64(), false),
      new Field('y', new Float64(), false)
    ])),
    new Field('list', new List(new Field('item', new Utf8(), false)), false)
  ])

  const table = await tableCreationMethod(records, recordsReversed, schema)
  schema.fields.forEach((field, idx) => {
    const actualField = table.schema.fields[idx]
    assert.isFalse(actualField.nullable)
    assert.equal(table.getChild(field.name)?.type.toString(), field.type.toString())
    assert.equal(table.getChildAt(idx)?.type.toString(), field.type.toString())
  })
}

describe('The function makeArrowTable', function () {
  it('will use data types from a provided schema instead of inference', async function () {
    const schema = new Schema([
      new Field('a', new Int32()),
      new Field('b', new Float32()),
      new Field('c', new FixedSizeList(3, new Field('item', new Float16()))),
      new Field('d', new Int64())
    ])
    const table = makeArrowTable(
      [
        { a: 1, b: 2, c: [1, 2, 3], d: 9 },
        { a: 4, b: 5, c: [4, 5, 6], d: 10 },
        { a: 7, b: 8, c: [7, 8, 9], d: null }
      ],
      { schema }
    )

    const buf = await fromTableToBuffer(table)
    assert.isAbove(buf.byteLength, 0)

    const actual = tableFromIPC(buf)
    assert.equal(actual.numRows, 3)
    const actualSchema = actual.schema
    assert.deepEqual(actualSchema, schema)
  })

  it('will assume the column `vector` is FixedSizeList<Float32> by default', async function () {
    const schema = new Schema([
      new Field('a', new Float64()),
      new Field('b', new Float64()),
      new Field(
        'vector',
        new FixedSizeList(3, new Field('item', new Float32(), true))
      )
    ])
    const table = makeArrowTable([
      { a: 1, b: 2, vector: [1, 2, 3] },
      { a: 4, b: 5, vector: [4, 5, 6] },
      { a: 7, b: 8, vector: [7, 8, 9] }
    ])

    const buf = await fromTableToBuffer(table)
    assert.isAbove(buf.byteLength, 0)

    const actual = tableFromIPC(buf)
    assert.equal(actual.numRows, 3)
    const actualSchema = actual.schema
    assert.deepEqual(actualSchema, schema)
  })

  it('can support multiple vector columns', async function () {
    const schema = new Schema([
      new Field('a', new Float64()),
      new Field('b', new Float64()),
      new Field('vec1', new FixedSizeList(3, new Field('item', new Float16(), true))),
      new Field('vec2', new FixedSizeList(3, new Field('item', new Float16(), true)))
    ])
    const table = makeArrowTable(
      [
        { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
        { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
        { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
      ],
      {
        vectorColumns: {
          vec1: { type: new Float16() },
          vec2: { type: new Float16() }
        }
      }
    )

    const buf = await fromTableToBuffer(table)
    assert.isAbove(buf.byteLength, 0)

    const actual = tableFromIPC(buf)
    assert.equal(actual.numRows, 3)
    const actualSchema = actual.schema
    assert.deepEqual(actualSchema, schema)
  })

  it('will allow different vector column types', async function () {
    const table = makeArrowTable(
      [
        { fp16: [1], fp32: [1], fp64: [1] }
      ],
      {
        vectorColumns: {
          fp16: { type: new Float16() },
          fp32: { type: new Float32() },
          fp64: { type: new Float64() }
        }
      }
    )

    assert.equal(table.getChild('fp16')?.type.children[0].type.toString(), new Float16().toString())
    assert.equal(table.getChild('fp32')?.type.children[0].type.toString(), new Float32().toString())
    assert.equal(table.getChild('fp64')?.type.children[0].type.toString(), new Float64().toString())
  })

  it('will use dictionary encoded strings if asked', async function () {
    const table = makeArrowTable([{ str: 'hello' }])
    assert.isTrue(DataType.isUtf8(table.getChild('str')?.type))

    const tableWithDict = makeArrowTable([{ str: 'hello' }], { dictionaryEncodeStrings: true })
    assert.isTrue(DataType.isDictionary(tableWithDict.getChild('str')?.type))

    const schema = new Schema([
      new Field('str', new Dictionary(new Utf8(), new Int32()))
    ])

    const tableWithDict2 = makeArrowTable([{ str: 'hello' }], { schema })
    assert.isTrue(DataType.isDictionary(tableWithDict2.getChild('str')?.type))
  })

  it('will infer data types correctly', async function () {
    await checkTableCreation(async (records) => makeArrowTable(records))
  })

  it('will allow a schema to be provided', async function () {
    await checkTableCreation(async (records, _, schema) => makeArrowTable(records, { schema }))
  })

  it('will use the field order of any provided schema', async function () {
    await checkTableCreation(async (_, recordsReversed, schema) => makeArrowTable(recordsReversed, { schema }))
  })

  it('will make an empty table', async function () {
    await checkTableCreation(async (_, __, schema) => makeArrowTable([], { schema }))
  })
})

class DummyEmbedding implements EmbeddingFunction<string> {
  public readonly sourceColumn = 'string'
  public readonly embeddingDimension = 2
  public readonly embeddingDataType = new Float16()

  async embed (data: string[]): Promise<number[][]> {
    return data.map(
      () => [0.0, 0.0]
    )
  }
}

class DummyEmbeddingWithNoDimension implements EmbeddingFunction<string> {
  public readonly sourceColumn = 'string'

  async embed (data: string[]): Promise<number[][]> {
    return data.map(
      () => [0.0, 0.0]
    )
  }
}

describe('convertToTable', function () {
  it('will infer data types correctly', async function () {
    await checkTableCreation(async (records) => await convertToTable(records))
  })

  it('will allow a schema to be provided', async function () {
    await checkTableCreation(async (records, _, schema) => await convertToTable(records, undefined, { schema }))
  })

  it('will use the field order of any provided schema', async function () {
    await checkTableCreation(async (_, recordsReversed, schema) => await convertToTable(recordsReversed, undefined, { schema }))
  })

  it('will make an empty table', async function () {
    await checkTableCreation(async (_, __, schema) => await convertToTable([], undefined, { schema }))
  })

  it('will apply embeddings', async function () {
    const records = sampleRecords()
    const table = await convertToTable(records, new DummyEmbedding())
    assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
    assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
  })

  it('will fail if missing the embedding source column', async function () {
    return await expect(convertToTable([{ id: 1 }], new DummyEmbedding())).to.be.rejectedWith("'string' was not present")
  })

  it('use embeddingDimension if embedding missing from table', async function () {
    const schema = new Schema([
      new Field('string', new Utf8(), false)
    ])
    // Simulate getting an empty Arrow table (minus embedding) from some other source
    // In other words, we aren't starting with records
    const table = makeEmptyTable(schema)

    // If the embedding specifies the dimension we are fine
    await fromTableToBuffer(table, new DummyEmbedding())

    // We can also supply a schema and should be ok
    const schemaWithEmbedding = new Schema([
      new Field('string', new Utf8(), false),
      new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
    ])
    await fromTableToBuffer(table, new DummyEmbeddingWithNoDimension(), schemaWithEmbedding)

    // Otherwise we will get an error
    return await expect(fromTableToBuffer(table, new DummyEmbeddingWithNoDimension())).to.be.rejectedWith('does not specify `embeddingDimension`')
  })

  it('will apply embeddings to an empty table', async function () {
    const schema = new Schema([
      new Field('string', new Utf8(), false),
      new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
    ])
    const table = await convertToTable([], new DummyEmbedding(), { schema })
    assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
    assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
  })

  it('will complain if embeddings present but schema missing embedding column', async function () {
    const schema = new Schema([
      new Field('string', new Utf8(), false)
    ])
    return await expect(convertToTable([], new DummyEmbedding(), { schema })).to.be.rejectedWith('column vector was missing')
  })

  it('will provide a nice error if run twice', async function () {
    const records = sampleRecords()
    const table = await convertToTable(records, new DummyEmbedding())
    // fromTableToBuffer will try and apply the embeddings again
    return await expect(fromTableToBuffer(table, new DummyEmbedding())).to.be.rejectedWith('already existed')
  })
})

describe('makeEmptyTable', function () {
  it('will make an empty table', async function () {
    await checkTableCreation(async (_, __, schema) => makeEmptyTable(schema))
  })
})

describe('when using two versions of arrow', function () {
  it('can still import data', async function () {
    const schema = new OldSchema([
      new OldField('id', new OldInt32()),
      new OldField('vector', new OldFixedSizeList(1024, new OldField("item", new OldFloat32(), true))),
      new OldField('struct', new OldStruct([
        new OldField('nested', new OldDictionary(new OldUtf8(), new OldInt32(), 1, true)),
        new OldField('ts_with_tz', new OldTimestampNanosecond("some_tz")),
        new OldField('ts_no_tz', new OldTimestampNanosecond(null))
      ]))
    ]) as any
    // We use arrow version 13 to emulate a "foreign arrow" and this version doesn't have metadataVersion
    // In theory, this wouldn't matter. We don't rely on that property. However, it causes deepEqual to
    // fail so we patch it back in
    schema.metadataVersion = MetadataVersion.V5
    const table = makeArrowTable(
      [],
      { schema }
    )

    const buf = await fromTableToBuffer(table)
    assert.isAbove(buf.byteLength, 0)
    const actual = tableFromIPC(buf)
    const actualSchema = actual.schema
    assert.deepEqual(actualSchema, schema)
  })
})
@@ -1,55 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { describe } from 'mocha'
import { assert } from 'chai'

import { OpenAIEmbeddingFunction } from '../../embedding/openai'
import { isEmbeddingFunction } from '../../embedding/embedding_function'

// eslint-disable-next-line @typescript-eslint/no-var-requires
const OpenAIApi = require('openai')
// eslint-disable-next-line @typescript-eslint/no-var-requires
const { stub } = require('sinon')

describe('OpenAPIEmbeddings', function () {
  const stubValue = {
    data: [
      {
        embedding: Array(1536).fill(1.0)
      },
      {
        embedding: Array(1536).fill(2.0)
      }
    ]
  }

  describe('#embed', function () {
    it('should create vector embeddings', async function () {
      const openAIStub = stub(OpenAIApi.Embeddings.prototype, 'create').returns(stubValue)
      const f = new OpenAIEmbeddingFunction('text', 'sk-key')
      const vectors = await f.embed(['abc', 'def'])
      assert.isTrue(openAIStub.calledOnce)
      assert.equal(vectors.length, 2)
      assert.deepEqual(vectors[0], stubValue.data[0].embedding)
      assert.deepEqual(vectors[1], stubValue.data[1].embedding)
    })
  })

  describe('isEmbeddingFunction', function () {
    it('should match the isEmbeddingFunction guard', function () {
      assert.isTrue(isEmbeddingFunction(new OpenAIEmbeddingFunction('text', 'sk-key')))
    })
  })
})
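The stubbed test above pins down the embedding contract: the function is constructed with a source column name and an API key, and `embed` maps an array of strings to an array of `number[]` vectors. A hedged sketch of the same calls without the sinon stub (the key is a placeholder, and the 1536-dimension output is just what the stub models for OpenAI's default embedding model):

// Minimal sketch under the assumptions above; not part of this diff.
import { OpenAIEmbeddingFunction } from '../../embedding/openai'

async function example (): Promise<void> {
  const fn = new OpenAIEmbeddingFunction('text', 'sk-placeholder')
  const vectors = await fn.embed(['abc', 'def'])
  // vectors.length === 2; each entry is a number[] embedding
  console.log(vectors.length, vectors[0].length)
}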
@@ -1,76 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// IO tests

import { describe } from 'mocha'
import { assert } from 'chai'

import * as lancedb from '../index'
import { type ConnectionOptions } from '../index'

describe('LanceDB S3 client', function () {
  if (process.env.TEST_S3_BASE_URL != null) {
    const baseUri = process.env.TEST_S3_BASE_URL
    it('should have a valid url', async function () {
      const opts = { uri: `${baseUri}/valid_url` }
      const table = await createTestDB(opts, 2, 20)
      const con = await lancedb.connect(opts)
      assert.equal(con.uri, opts.uri)

      const results = await table.search([0.1, 0.3]).limit(5).execute()
      assert.equal(results.length, 5)
    }).timeout(10_000)
  } else {
    describe.skip('Skip S3 test', function () {})
  }

  if (process.env.TEST_S3_BASE_URL != null && process.env.TEST_AWS_ACCESS_KEY_ID != null && process.env.TEST_AWS_SECRET_ACCESS_KEY != null) {
    const baseUri = process.env.TEST_S3_BASE_URL
    it('use custom credentials', async function () {
      const opts: ConnectionOptions = {
        uri: `${baseUri}/custom_credentials`,
        awsCredentials: {
          accessKeyId: process.env.TEST_AWS_ACCESS_KEY_ID as string,
          secretKey: process.env.TEST_AWS_SECRET_ACCESS_KEY as string
        }
      }
      const table = await createTestDB(opts, 2, 20)
      console.log(table)
      const con = await lancedb.connect(opts)
      console.log(con)
      assert.equal(con.uri, opts.uri)

      const results = await table.search([0.1, 0.3]).limit(5).execute()
      assert.equal(results.length, 5)
    }).timeout(10_000)
  } else {
    describe.skip('Skip S3 test', function () {})
  }
})

async function createTestDB (opts: ConnectionOptions, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
  const con = await lancedb.connect(opts)

  const data = []
  for (let i = 0; i < numRows; i++) {
    const vector = []
    for (let j = 0; j < numDimensions; j++) {
      vector.push(i + (j * 0.1))
    }
    data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
  }

  return await con.createTable('vectors_2', data)
}
File diff suppressed because it is too large
@@ -1,45 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import { toSQL } from '../util'
import * as chai from 'chai'

const expect = chai.expect

describe('toSQL', function () {
  it('should turn string to SQL expression', function () {
    expect(toSQL('foo')).to.equal("'foo'")
  })

  it('should turn number to SQL expression', function () {
    expect(toSQL(123)).to.equal('123')
  })

  it('should turn boolean to SQL expression', function () {
    expect(toSQL(true)).to.equal('TRUE')
  })

  it('should turn null to SQL expression', function () {
    expect(toSQL(null)).to.equal('NULL')
  })

  it('should turn Date to SQL expression', function () {
    const date = new Date('05 October 2011 14:48 UTC')
    expect(toSQL(date)).to.equal("'2011-10-05T14:48:00.000Z'")
  })

  it('should turn array to SQL expression', function () {
    expect(toSQL(['foo', 'bar', true, 1])).to.equal("['foo', 'bar', TRUE, 1]")
  })
})
@@ -1,77 +0,0 @@
// Copyright 2023 LanceDB Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

export type Literal = string | number | boolean | null | Date | Literal[]

export function toSQL (value: Literal): string {
  if (typeof value === 'string') {
    return `'${value}'`
  }

  if (typeof value === 'number') {
    return value.toString()
  }

  if (typeof value === 'boolean') {
    return value ? 'TRUE' : 'FALSE'
  }

  if (value === null) {
    return 'NULL'
  }

  if (value instanceof Date) {
    return `'${value.toISOString()}'`
  }

  if (Array.isArray(value)) {
    return `[${value.map(toSQL).join(', ')}]`
  }

  // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
  throw new Error(`Unsupported value type: ${typeof value} value: (${value})`)
}

export class TTLCache {
  private readonly cache: Map<string, { value: any, expires: number }>

  /**
   * @param ttl Time to live in milliseconds
   */
  constructor (private readonly ttl: number) {
    this.cache = new Map()
  }

  get (key: string): any | undefined {
    const entry = this.cache.get(key)
    if (entry === undefined) {
      return undefined
    }

    if (entry.expires < Date.now()) {
      this.cache.delete(key)
      return undefined
    }

    return entry.value
  }

  set (key: string, value: any): void {
    this.cache.set(key, { value, expires: Date.now() + this.ttl })
  }

  delete (key: string): void {
    this.cache.delete(key)
  }
}
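`TTLCache` above is a plain `Map` with a per-entry deadline: `set` stamps each entry with `Date.now() + ttl`, and a later `get` lazily deletes and misses once that deadline has passed. A minimal sketch of those semantics (the key, value, and 5-second TTL are arbitrary choices for illustration):

// Hedged usage sketch of the class defined above.
import { TTLCache } from '../util'

const cache = new TTLCache(5000) // entries live for 5 seconds
cache.set('greeting', 'hello')
console.log(cache.get('greeting')) // 'hello' while the entry is fresh
// After 5 seconds, get() evicts the stale entry and returns undefined.
setTimeout(() => console.log(cache.get('greeting')), 6000) // undefined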
@@ -1,14 +0,0 @@
{
  "include": [
    "src/**/*.ts",
    "src/*.ts"
  ],
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "declaration": true,
    "outDir": "./dist",
    "strict": true,
    "sourceMap": true,
  }
}
nodejs/CLAUDE.md (new file)
@@ -0,0 +1,13 @@
These are the TypeScript bindings of LanceDB.
The core Rust library is in the `../rust/lancedb` directory, the Rust binding
code is in the `src/` directory, and the TypeScript bindings are in
the `lancedb/` directory.

Whenever you change the Rust code, you will need to recompile: `npm run build`.

Common commands:
* Build: `npm run build`
* Lint: `npm run lint`
* Fix lints: `npm run lint-fix`
* Test: `npm test`
* Run single test file: `npm test __test__/arrow.test.ts`
@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.21.2"
license.workspace = true
description.workspace = true
repository.workspace = true
@@ -1,7 +1,16 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import {
  Bool,
  Field,
  Int32,
  List,
  Schema,
  Struct,
  Uint8,
  Utf8,
} from "apache-arrow";

import * as arrow15 from "apache-arrow-15";
import * as arrow16 from "apache-arrow-16";
@@ -11,10 +20,12 @@ import * as arrow18 from "apache-arrow-18";
import {
  convertToTable,
  fromBufferToRecordBatch,
  fromDataToBuffer,
  fromRecordBatchToBuffer,
  fromTableToBuffer,
  makeArrowTable,
  makeEmptyTable,
  tableFromIPC,
} from "../lancedb/arrow";
import {
  EmbeddingFunction,
@@ -253,6 +264,98 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      expect(actualSchema).toEqual(schema);
    });

    it("will detect vector columns when name contains 'vector' or 'embedding'", async function () {
      // Test various naming patterns that should be detected as vector columns
      const floatVectorTable = makeArrowTable([
        {
          // Float vectors (use decimal values to ensure they're treated as floats)
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          user_vector: [1.1, 2.2],
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          text_embedding: [3.3, 4.4],
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          doc_embeddings: [5.5, 6.6],
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          my_vector_field: [7.7, 8.8],
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          embedding_model: [9.9, 10.1],
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          VECTOR_COL: [11.1, 12.2], // uppercase
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          Vector_Mixed: [13.3, 14.4], // mixed case
        },
      ]);

      // Check that columns with 'vector' or 'embedding' in name are converted to FixedSizeList
      const floatVectorColumns = [
        "user_vector",
        "text_embedding",
        "doc_embeddings",
        "my_vector_field",
        "embedding_model",
        "VECTOR_COL",
        "Vector_Mixed",
      ];

      for (const columnName of floatVectorColumns) {
        expect(
          DataType.isFixedSizeList(
            floatVectorTable.getChild(columnName)?.type,
          ),
        ).toBe(true);
        // Check that float vectors use Float32 by default
        expect(
          floatVectorTable
            .getChild(columnName)
            ?.type.children[0].type.toString(),
        ).toEqual(new Float32().toString());
      }

      // Test that regular integer arrays still get treated as float vectors
      // (since JavaScript doesn't distinguish integers from floats at runtime)
      const integerArrayTable = makeArrowTable([
        {
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          vector_int: [1, 2], // Regular array with integers - should be Float32
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          embedding_int: [3, 4], // Regular array with integers - should be Float32
        },
      ]);

      const integerArrayColumns = ["vector_int", "embedding_int"];

      for (const columnName of integerArrayColumns) {
        expect(
          DataType.isFixedSizeList(
            integerArrayTable.getChild(columnName)?.type,
          ),
        ).toBe(true);
        // Regular integer arrays should use Float32 (avoiding false positives)
        expect(
          integerArrayTable
            .getChild(columnName)
            ?.type.children[0].type.toString(),
        ).toEqual(new Float32().toString());
      }

      // Test normal list should NOT be converted to FixedSizeList
      const normalListTable = makeArrowTable([
        {
          // biome-ignore lint/style/useNamingConvention: Testing vector column detection patterns
          normal_list: [15.5, 16.6], // should NOT be detected as vector
        },
      ]);

      expect(
        DataType.isFixedSizeList(
          normalListTable.getChild("normal_list")?.type,
        ),
      ).toBe(false);
      expect(
        DataType.isList(normalListTable.getChild("normal_list")?.type),
      ).toBe(true);
    });

    it("will allow different vector column types", async function () {
      const table = makeArrowTable([{ fp16: [1], fp32: [1], fp64: [1] }], {
        vectorColumns: {
@@ -375,8 +478,221 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      expect(table2.schema).toEqual(schema);
    });

    it("will handle missing columns in schema alignment when using embeddings", async function () {
      const schema = new Schema(
        [
          new Field("domain", new Utf8(), true),
          new Field("name", new Utf8(), true),
          new Field("description", new Utf8(), true),
        ],
        new Map([["embedding_functions", JSON.stringify([])]]),
      );

      const data = [
        { domain: "google.com", name: "Google" },
        { domain: "facebook.com", name: "Facebook" },
      ];

      const table = await convertToTable(data, undefined, { schema });

      expect(table.numCols).toBe(3);
      expect(table.numRows).toBe(2);

      const descriptionColumn = table.getChild("description");
      expect(descriptionColumn).toBeDefined();
      expect(descriptionColumn?.nullCount).toBe(2);
      expect(descriptionColumn?.toArray()).toEqual([null, null]);

      expect(table.getChild("domain")?.toArray()).toEqual([
        "google.com",
        "facebook.com",
      ]);
      expect(table.getChild("name")?.toArray()).toEqual([
        "Google",
        "Facebook",
      ]);
    });

    it("will handle completely missing nested struct columns", async function () {
      const schema = new Schema(
        [
          new Field("id", new Utf8(), true),
          new Field("name", new Utf8(), true),
          new Field(
            "metadata",
            new Struct([
              new Field("version", new Int32(), true),
              new Field("author", new Utf8(), true),
              new Field(
                "tags",
                new List(new Field("item", new Utf8(), true)),
                true,
              ),
            ]),
            true,
          ),
        ],
        new Map([["embedding_functions", JSON.stringify([])]]),
      );

      const data = [
        { id: "doc1", name: "Document 1" },
        { id: "doc2", name: "Document 2" },
      ];

      const table = await convertToTable(data, undefined, { schema });

      expect(table.numCols).toBe(3);
      expect(table.numRows).toBe(2);

      const buf = await fromTableToBuffer(table);
      const retrievedTable = tableFromIPC(buf);

      const rows = [];
      for (let i = 0; i < retrievedTable.numRows; i++) {
        rows.push(retrievedTable.get(i));
      }

      expect(rows[0].metadata.version).toBe(null);
      expect(rows[0].metadata.author).toBe(null);
      expect(rows[0].metadata.tags).toBe(null);
      expect(rows[0].id).toBe("doc1");
      expect(rows[0].name).toBe("Document 1");
    });

    it("will handle partially missing nested struct fields", async function () {
      const schema = new Schema(
        [
          new Field("id", new Utf8(), true),
          new Field(
            "metadata",
            new Struct([
              new Field("version", new Int32(), true),
              new Field("author", new Utf8(), true),
              new Field("created_at", new Utf8(), true),
            ]),
            true,
          ),
        ],
        new Map([["embedding_functions", JSON.stringify([])]]),
      );

      const data = [
        { id: "doc1", metadata: { version: 1, author: "Alice" } },
        { id: "doc2", metadata: { version: 2 } },
      ];

      const table = await convertToTable(data, undefined, { schema });

      expect(table.numCols).toBe(2);
      expect(table.numRows).toBe(2);

      const metadataColumn = table.getChild("metadata");
      expect(metadataColumn).toBeDefined();
      expect(metadataColumn?.type.toString()).toBe(
        "Struct<{version:Int32, author:Utf8, created_at:Utf8}>",
      );
    });

    it("will handle multiple levels of nested structures", async function () {
      const schema = new Schema(
        [
          new Field("id", new Utf8(), true),
          new Field(
            "config",
            new Struct([
              new Field("database", new Utf8(), true),
              new Field(
                "connection",
                new Struct([
                  new Field("host", new Utf8(), true),
                  new Field("port", new Int32(), true),
                  new Field(
                    "ssl",
                    new Struct([
                      new Field("enabled", new Bool(), true),
                      new Field("cert_path", new Utf8(), true),
                    ]),
                    true,
                  ),
                ]),
                true,
              ),
            ]),
            true,
          ),
        ],
        new Map([["embedding_functions", JSON.stringify([])]]),
      );

      const data = [
        {
          id: "config1",
          config: {
            database: "postgres",
            connection: { host: "localhost" },
          },
        },
        {
          id: "config2",
          config: { database: "mysql" },
        },
        {
          id: "config3",
        },
      ];

      const table = await convertToTable(data, undefined, { schema });

      expect(table.numCols).toBe(2);
      expect(table.numRows).toBe(3);

      const configColumn = table.getChild("config");
      expect(configColumn).toBeDefined();
      expect(configColumn?.type.toString()).toBe(
        "Struct<{database:Utf8, connection:Struct<{host:Utf8, port:Int32, ssl:Struct<{enabled:Bool, cert_path:Utf8}>}>}>",
      );
    });

    it("will handle missing columns in Arrow table input when using embeddings", async function () {
      const incompleteTable = makeArrowTable([
        { domain: "google.com", name: "Google" },
        { domain: "facebook.com", name: "Facebook" },
      ]);

      const schema = new Schema(
        [
          new Field("domain", new Utf8(), true),
          new Field("name", new Utf8(), true),
          new Field("description", new Utf8(), true),
        ],
        new Map([["embedding_functions", JSON.stringify([])]]),
      );

      const buf = await fromDataToBuffer(incompleteTable, undefined, schema);

      expect(buf.byteLength).toBeGreaterThan(0);

      const retrievedTable = tableFromIPC(buf);
      expect(retrievedTable.numCols).toBe(3);
      expect(retrievedTable.numRows).toBe(2);

      const descriptionColumn = retrievedTable.getChild("description");
      expect(descriptionColumn).toBeDefined();
      expect(descriptionColumn?.nullCount).toBe(2);
      expect(descriptionColumn?.toArray()).toEqual([null, null]);

      expect(retrievedTable.getChild("domain")?.toArray()).toEqual([
        "google.com",
        "facebook.com",
      ]);
      expect(retrievedTable.getChild("name")?.toArray()).toEqual([
        "Google",
        "Facebook",
      ]);
    });

    it("should correctly retain values in nested struct fields", async function () {
      const testData = [
        {
          id: "doc1",
@@ -400,10 +716,8 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
        },
      ];

      const table = makeArrowTable(testData);

      const metadataField = table.schema.fields.find(
        (f) => f.name === "metadata",
      );
@@ -417,23 +731,17 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
        "text",
      ]);

      const buf = await fromTableToBuffer(table);
      const retrievedTable = tableFromIPC(buf);

      const rows = [];
      for (let i = 0; i < retrievedTable.numRows; i++) {
        rows.push(retrievedTable.get(i));
      }

      const firstRow = rows[0];
      expect(firstRow.id).toBe("doc1");
      expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);

      expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
      expect(firstRow.metadata.startLine).toBe(10);
      expect(firstRow.metadata.endLine).toBe(20);
@@ -592,14 +900,14 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
      ).rejects.toThrow("column vector was missing");
    });

    it("will skip embedding application if already applied", async function () {
      const records = sampleRecords();
      const table = await convertToTable(records, dummyEmbeddingConfig);

      // fromTableToBuffer will try and apply the embeddings again
      // but should skip since the column already has non-null values
      const result = await fromTableToBuffer(table, dummyEmbeddingConfig);
      expect(result.byteLength).toBeGreaterThan(0);
    });
  });

@@ -42,6 +42,28 @@ describe("remote connection", () => {
    });
  });
 
+  it("should accept overall timeout configuration", async () => {
+    await connect("db://test", {
+      apiKey: "fake",
+      clientConfig: {
+        timeoutConfig: { timeout: 30 },
+      },
+    });
+
+    // Test with all timeout parameters
+    await connect("db://test", {
+      apiKey: "fake",
+      clientConfig: {
+        timeoutConfig: {
+          timeout: 60,
+          connectTimeout: 10,
+          readTimeout: 20,
+          poolIdleTimeout: 300,
+        },
+      },
+    });
+  });
+
  it("should pass down apiKey and userAgent", async () => {
    await withMockDatabase(
      (req, res) => {
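For application code, the `timeoutConfig` block exercised above is passed straight through `connect`; a minimal sketch (URI and key are placeholders, and the numeric values mirror the test rather than documented defaults):

```ts
import { connect } from "@lancedb/lancedb";

const db = await connect("db://my-database", {
  apiKey: process.env.LANCEDB_API_KEY ?? "fake",
  clientConfig: {
    timeoutConfig: {
      connectTimeout: 10, // cap on establishing the connection
      readTimeout: 20, // cap on each read from the server
    },
  },
});
```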
nodejs/__test__/session.test.ts (new file, 46 lines)
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+import * as tmp from "tmp";
+import { Session, connect } from "../lancedb";
+
+describe("Session", () => {
+  let tmpDir: tmp.DirResult;
+  beforeEach(() => {
+    tmpDir = tmp.dirSync({ unsafeCleanup: true });
+  });
+  afterEach(() => tmpDir.removeCallback());
+
+  it("should configure cache sizes and work with database operations", async () => {
+    // Create session with small cache limits for testing
+    const indexCacheSize = BigInt(1024 * 1024); // 1MB
+    const metadataCacheSize = BigInt(512 * 1024); // 512KB
+
+    const session = new Session(indexCacheSize, metadataCacheSize);
+
+    // Record initial cache state
+    const initialCacheSize = session.sizeBytes();
+    const initialCacheItems = session.approxNumItems();
+
+    // Test session works with database connection
+    const db = await connect({ uri: tmpDir.name, session: session });
+
+    // Create and use a table to exercise the session
+    const data = Array.from({ length: 100 }, (_, i) => ({
+      id: i,
+      text: `item ${i}`,
+    }));
+    const table = await db.createTable("test", data);
+    const results = await table.query().limit(5).toArray();
+
+    expect(results).toHaveLength(5);
+
+    // Verify cache usage increased after operations
+    const finalCacheSize = session.sizeBytes();
+    const finalCacheItems = session.approxNumItems();
+
+    expect(finalCacheSize).toBeGreaterThan(initialCacheSize); // Cache should have grown
+    expect(finalCacheItems).toBeGreaterThanOrEqual(initialCacheItems); // Items should not decrease
+    expect(initialCacheSize).toBeLessThan(indexCacheSize + metadataCacheSize); // Within limits
+  });
+});
@@ -33,7 +33,12 @@ import {
  register,
 } from "../lancedb/embedding";
 import { Index } from "../lancedb/indices";
-import { instanceOfFullTextQuery } from "../lancedb/query";
+import {
+  BooleanQuery,
+  Occur,
+  Operator,
+  instanceOfFullTextQuery,
+} from "../lancedb/query";
 import exp = require("constants");
 
 describe.each([arrow15, arrow16, arrow17, arrow18])(
@@ -363,9 +368,9 @@ describe("merge insert", () => {
      { a: 4, b: "z" },
    ];
 
-    expect(
-      JSON.parse(JSON.stringify((await table.toArrow()).toArray())),
-    ).toEqual(expected);
+    const result = (await table.toArrow()).toArray().sort((a, b) => a.a - b.a);
+    expect(result.map((row) => ({ ...row }))).toEqual(expected);
  });
  test("conditional update", async () => {
    const newData = [
@@ -554,6 +559,32 @@ describe("When creating an index", () => {
    rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
    expect(rst.numRows).toBe(1);
 
+    // test nprobes
+    rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
+    expect(rst.numRows).toBe(2);
+    rst = await tbl
+      .query()
+      .nearestTo(queryVec)
+      .limit(2)
+      .minimumNprobes(15)
+      .toArrow();
+    expect(rst.numRows).toBe(2);
+    rst = await tbl
+      .query()
+      .nearestTo(queryVec)
+      .limit(2)
+      .minimumNprobes(10)
+      .maximumNprobes(20)
+      .toArrow();
+    expect(rst.numRows).toBe(2);
+
+    expect(() => tbl.query().nearestTo(queryVec).minimumNprobes(0)).toThrow(
+      "Invalid input, minimum_nprobes must be greater than 0",
+    );
+    expect(() => tbl.query().nearestTo(queryVec).maximumNprobes(5)).toThrow(
+      "Invalid input, maximum_nprobes must be greater than or equal to minimum_nprobes",
+    );
+
    await tbl.dropIndex("vec_idx");
    const indices2 = await tbl.listIndices();
    expect(indices2.length).toBe(0);
@@ -1531,6 +1562,18 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
 
    const results = await table.search("hello").toArray();
    expect(results[0].text).toBe(data[0].text);
+
+    const results2 = await table
+      .search(new MatchQuery("hello world", "text"))
+      .toArray();
+    expect(results2.length).toBe(2);
+
+    const results3 = await table
+      .search(
+        new MatchQuery("hello world", "text", { operator: Operator.And }),
+      )
+      .toArray();
+    expect(results3.length).toBe(1);
  });
 
  test("full text search without lowercase", async () => {
@@ -1607,6 +1650,114 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
    expect(resultSet.has("fob")).toBe(true);
    expect(resultSet.has("fo")).toBe(true);
    expect(resultSet.has("food")).toBe(true);
+
+    const prefixResults = await table
+      .search(
+        new MatchQuery("foo", "text", { fuzziness: 3, prefixLength: 3 }),
+      )
+      .toArray();
+    expect(prefixResults.length).toBe(2);
+    const resultSet2 = new Set(prefixResults.map((r) => r.text));
+    expect(resultSet2.has("foo")).toBe(true);
+    expect(resultSet2.has("food")).toBe(true);
+  });
+
+  test("full text search boolean query", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [
+      { text: "The cat and dog are playing" },
+      { text: "The cat is sleeping" },
+      { text: "The dog is barking" },
+      { text: "The dog chases the cat" },
+    ];
+    const table = await db.createTable("test", data);
+    await table.createIndex("text", {
+      config: Index.fts({ withPosition: false }),
+    });
+
+    const shouldResults = await table
+      .search(
+        new BooleanQuery([
+          [Occur.Should, new MatchQuery("cat", "text")],
+          [Occur.Should, new MatchQuery("dog", "text")],
+        ]),
+      )
+      .toArray();
+    expect(shouldResults.length).toBe(4);
+
+    const mustResults = await table
+      .search(
+        new BooleanQuery([
+          [Occur.Must, new MatchQuery("cat", "text")],
+          [Occur.Must, new MatchQuery("dog", "text")],
+        ]),
+      )
+      .toArray();
+    expect(mustResults.length).toBe(2);
+
+    const mustNotResults = await table
+      .search(
+        new BooleanQuery([
+          [Occur.Must, new MatchQuery("cat", "text")],
+          [Occur.MustNot, new MatchQuery("dog", "text")],
+        ]),
+      )
+      .toArray();
+    expect(mustNotResults.length).toBe(1);
+  });
+
+  test("full text search ngram", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [
+      { text: "hello world", vector: [0.1, 0.2, 0.3] },
+      { text: "lance database", vector: [0.4, 0.5, 0.6] },
+      { text: "lance is cool", vector: [0.7, 0.8, 0.9] },
+    ];
+    const table = await db.createTable("test", data);
+    await table.createIndex("text", {
+      config: Index.fts({ baseTokenizer: "ngram" }),
+    });
+
+    const results = await table.search("lan").toArray();
+    expect(results.length).toBe(2);
+    const resultSet = new Set(results.map((r) => r.text));
+    expect(resultSet.has("lance database")).toBe(true);
+    expect(resultSet.has("lance is cool")).toBe(true);
+
+    const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
+    expect(results2.length).toBe(2);
+    const resultSet2 = new Set(results2.map((r) => r.text));
+    expect(resultSet2.has("lance database")).toBe(true);
+    expect(resultSet2.has("lance is cool")).toBe(true);
+
+    // the default min_ngram_length is 3, so "la" should not match
+    const results3 = await table.search("la").toArray();
+    expect(results3.length).toBe(0);
+
+    // test setting min_ngram_length and prefix_only
+    await table.createIndex("text", {
+      config: Index.fts({
+        baseTokenizer: "ngram",
+        ngramMinLength: 2,
+        prefixOnly: true,
+      }),
+      replace: true,
+    });
+
+    const results4 = await table.search("lan").toArray();
+    expect(results4.length).toBe(2);
+    const resultSet4 = new Set(results4.map((r) => r.text));
+    expect(resultSet4.has("lance database")).toBe(true);
+    expect(resultSet4.has("lance is cool")).toBe(true);
+
+    const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
+    expect(results5.length).toBe(0);
+
+    const results6 = await table.search("la").toArray();
+    expect(results6.length).toBe(2);
+    const resultSet6 = new Set(results6.map((r) => r.text));
+    expect(resultSet6.has("lance database")).toBe(true);
+    expect(resultSet6.has("lance is cool")).toBe(true);
  });
 
  test.each([
@@ -1712,4 +1863,43 @@ describe("column name options", () => {
    expect(results[0].query_index).toBe(0);
    expect(results[1].query_index).toBe(1);
  });
+
+  test("index and search multivectors", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [];
+    // generate 512 random multivectors
+    for (let i = 0; i < 256; i++) {
+      data.push({
+        multivector: Array.from({ length: 10 }, () =>
+          Array(2).fill(Math.random()),
+        ),
+      });
+    }
+    const table = await db.createTable("multivectors", data, {
+      schema: new Schema([
+        new Field(
+          "multivector",
+          new List(
+            new Field(
+              "item",
+              new FixedSizeList(2, new Field("item", new Float32())),
+            ),
+          ),
+        ),
+      ]),
+    });
+
+    const results = await table.search(data[0].multivector).limit(10).toArray();
+    expect(results.length).toBe(10);
+
+    await table.createIndex("multivector", {
+      config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
+    });
+
+    const results2 = await table
+      .search(data[0].multivector)
+      .limit(10)
+      .toArray();
+    expect(results2.length).toBe(10);
+  });
 });
nodejs/examples/package-lock.json (generated)
@@ -30,7 +30,7 @@
        "x64",
        "arm64"
      ],
-      "license": "Apache 2.0",
+      "license": "Apache-2.0",
      "os": [
        "darwin",
        "linux",
@@ -34,6 +34,7 @@ import {
  Struct,
  Timestamp,
  Type,
+  Uint8,
  Utf8,
  Vector,
  makeVector as arrowMakeVector,
@@ -51,6 +52,15 @@ import {
  sanitizeTable,
  sanitizeType,
 } from "./sanitize";
+
+/**
+ * Check if a field name indicates a vector column.
+ */
+function nameSuggestsVectorColumn(fieldName: string): boolean {
+  const nameLower = fieldName.toLowerCase();
+  return nameLower.includes("vector") || nameLower.includes("embedding");
+}
+
 export * from "apache-arrow";
 export type SchemaLike =
  | Schema
@@ -107,6 +117,20 @@ export type IntoVector =
  | number[]
  | Promise<Float32Array | Float64Array | number[]>;
 
+export type MultiVector = IntoVector[];
+
+export function isMultiVector(value: unknown): value is MultiVector {
+  return Array.isArray(value) && isIntoVector(value[0]);
+}
+
+export function isIntoVector(value: unknown): value is IntoVector {
+  return (
+    value instanceof Float32Array ||
+    value instanceof Float64Array ||
+    (Array.isArray(value) && !Array.isArray(value[0]))
+  );
+}
+
 export function isArrowTable(value: object): value is TableLike {
  if (value instanceof ArrowTable) return true;
  return "schema" in value && "batches" in value;
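A quick illustration of the shapes the two new guards distinguish; `isIntoVector` and `isMultiVector` are the functions added in the hunk above, imported here via the same relative path the rest of the package uses:

```ts
import { isIntoVector, isMultiVector } from "./arrow";

const single = [0.1, 0.2, 0.3]; // IntoVector: one flat vector
const multi = [
  [0.1, 0.2, 0.3],
  [0.4, 0.5, 0.6],
]; // MultiVector: an array of vectors

isIntoVector(single); // true
isIntoVector(multi); // false: the first element is itself an array
isMultiVector(multi); // true
```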
@@ -417,7 +441,9 @@ function inferSchema(
  } else {
    const inferredType = inferType(value, path, opts);
    if (inferredType === undefined) {
-      throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
+      throw new Error(`Failed to infer data type for field ${path.join(
+        ".",
+      )} at row ${rowI}. \
 Consider providing an explicit schema.`);
    }
    pathTree.set(path, inferredType);
@@ -575,10 +601,17 @@ function inferType(
    return undefined;
  }
  // Try to automatically detect embedding columns.
-  if (valueType instanceof Float && path[path.length - 1] === "vector") {
-    // We default to Float32 for vectors.
-    const child = new Field("item", new Float32(), true);
-    return new FixedSizeList(value.length, child);
+  if (nameSuggestsVectorColumn(path[path.length - 1])) {
+    // Check if value is a Uint8Array for integer vector type determination
+    if (value instanceof Uint8Array) {
+      // For integer vectors, we default to Uint8 (matching Python implementation)
+      const child = new Field("item", new Uint8(), true);
+      return new FixedSizeList(value.length, child);
+    } else {
+      // For float vectors, we default to Float32
+      const child = new Field("item", new Float32(), true);
+      return new FixedSizeList(value.length, child);
+    }
  } else {
    const child = new Field("item", valueType, true);
    return new List(child);
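The widened inference rule can be summarized with a sketch (column names are illustrative; `makeArrowTable` is the function being patched here):

```ts
// Any list-valued field whose name contains "vector" or "embedding" is now
// inferred as a FixedSizeList, not only a field literally named "vector":
const table = makeArrowTable([
  { id: 1, text_embedding: [0.1, 0.2, 0.3] }, // -> FixedSizeList<Float32>
  { id: 2, text_embedding: [0.4, 0.5, 0.6] },
]);

// A Uint8Array value under such a name is inferred as an integer vector:
// { img_vector: new Uint8Array([1, 2, 3]) }  // -> FixedSizeList<Uint8>
```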
@@ -799,11 +832,17 @@ async function applyEmbeddingsFromMetadata(
        `Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
      );
    }
 
+    // Check if destination column exists and handle accordingly
    if (columns[destColumn] !== undefined) {
-      throw new Error(
-        `Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
-      );
+      const existingColumn = columns[destColumn];
+      // If the column exists but is all null, we can fill it with embeddings
+      if (existingColumn.nullCount !== existingColumn.length) {
+        // Column has non-null values, skip embedding application
+        continue;
+      }
    }
 
    if (table.batches.length > 1) {
      throw new Error(
        "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
@@ -831,6 +870,15 @@ async function applyEmbeddingsFromMetadata(
    const vector = makeVector(vectors, destType);
    columns[destColumn] = vector;
  }
+
+  // Add any missing columns from the schema as null vectors
+  for (const field of schema.fields) {
+    if (!(field.name in columns)) {
+      const nullValues = new Array(table.numRows).fill(null);
+      columns[field.name] = makeVector(nullValues, field.type);
+    }
+  }
+
  const newTable = new ArrowTable(columns);
  return alignTable(newTable, schema);
 }
@@ -903,11 +951,23 @@ async function applyEmbeddings<T>(
      );
    }
  } else {
+    // Check if destination column exists and handle accordingly
    if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
-      throw new Error(
-        `Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
-      );
+      const existingColumn = newColumns[destColumn];
+      // If the column exists but is all null, we can fill it with embeddings
+      if (existingColumn.nullCount !== existingColumn.length) {
+        // Column has non-null values, skip embedding application and return table as-is
+        let newTable = new ArrowTable(newColumns);
+        if (schema != null) {
+          newTable = alignTable(newTable, schema as Schema);
+        }
+        return new ArrowTable(
+          new Schema(newTable.schema.fields, schemaMetadata),
+          newTable.batches,
+        );
+      }
    }
  }
 
  if (table.batches.length > 1) {
    throw new Error(
      "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
@@ -967,7 +1027,21 @@ export async function convertToTable(
  embeddings?: EmbeddingFunctionConfig,
  makeTableOptions?: Partial<MakeArrowTableOptions>,
 ): Promise<ArrowTable> {
-  const table = makeArrowTable(data, makeTableOptions);
+  let processedData = data;
+
+  // If we have a schema with embedding metadata, we need to preprocess the data
+  // to ensure all nested fields are present
+  if (
+    makeTableOptions?.schema &&
+    makeTableOptions.schema.metadata?.has("embedding_functions")
+  ) {
+    processedData = ensureNestedFieldsExist(
+      data,
+      makeTableOptions.schema as Schema,
+    );
+  }
+
+  const table = makeArrowTable(processedData, makeTableOptions);
  return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
 }
 
@@ -1060,7 +1134,16 @@ export async function fromDataToBuffer(
    schema = sanitizeSchema(schema);
  }
  if (isArrowTable(data)) {
-    return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
+    const table = sanitizeTable(data);
+    // If we have a schema with embedding functions, we need to ensure all columns exist
+    // before applying embeddings, since applyEmbeddingsFromMetadata expects all columns
+    // to be present in the table
+    if (schema && schema.metadata?.has("embedding_functions")) {
+      const alignedTable = alignTableToSchema(table, schema);
+      return fromTableToBuffer(alignedTable, embeddings, schema);
+    } else {
+      return fromTableToBuffer(table, embeddings, schema);
+    }
  } else {
    const table = await convertToTable(data, embeddings, { schema });
    return fromTableToBuffer(table);
@@ -1129,7 +1212,7 @@ function alignBatch(batch: RecordBatch, schema: Schema): RecordBatch {
    type: new Struct(schema.fields),
    length: batch.numRows,
    nullCount: batch.nullCount,
-    children: alignedChildren,
+    children: alignedChildren as unknown as ArrowData<DataType>[],
  });
  return new RecordBatch(schema, newData);
 }
@@ -1201,6 +1284,79 @@ function validateSchemaEmbeddings(
  return new Schema(fields, schema.metadata);
 }
 
+/**
+ * Ensures that all nested fields defined in the schema exist in the data,
+ * filling missing fields with null values.
+ */
+export function ensureNestedFieldsExist(
+  data: Array<Record<string, unknown>>,
+  schema: Schema,
+): Array<Record<string, unknown>> {
+  return data.map((row) => {
+    const completeRow: Record<string, unknown> = {};
+
+    for (const field of schema.fields) {
+      if (field.name in row) {
+        if (
+          field.type.constructor.name === "Struct" &&
+          row[field.name] !== null &&
+          row[field.name] !== undefined
+        ) {
+          // Handle nested struct
+          const nestedValue = row[field.name] as Record<string, unknown>;
+          completeRow[field.name] = ensureStructFieldsExist(
+            nestedValue,
+            field.type,
+          );
+        } else {
+          // Non-struct field or null struct value
+          completeRow[field.name] = row[field.name];
+        }
+      } else {
+        // Field is missing from the data - set to null
+        completeRow[field.name] = null;
+      }
+    }
+
+    return completeRow;
+  });
+}
+
+/**
+ * Recursively ensures that all fields in a struct type exist in the data,
+ * filling missing fields with null values.
+ */
+function ensureStructFieldsExist(
+  data: Record<string, unknown>,
+  structType: Struct,
+): Record<string, unknown> {
+  const completeStruct: Record<string, unknown> = {};
+
+  for (const childField of structType.children) {
+    if (childField.name in data) {
+      if (
+        childField.type.constructor.name === "Struct" &&
+        data[childField.name] !== null &&
+        data[childField.name] !== undefined
+      ) {
+        // Recursively handle nested struct
+        completeStruct[childField.name] = ensureStructFieldsExist(
+          data[childField.name] as Record<string, unknown>,
+          childField.type,
+        );
+      } else {
+        // Non-struct field or null struct value
+        completeStruct[childField.name] = data[childField.name];
+      }
+    } else {
+      // Field is missing - set to null
+      completeStruct[childField.name] = null;
+    }
+  }
+
+  return completeStruct;
+}
+
 interface JsonDataType {
  type: string;
  fields?: JsonField[];
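To make the intent of `ensureNestedFieldsExist` concrete, here is a sketch with a hypothetical schema whose `metadata` field is a struct (the field names echo the nested-struct test earlier in this diff):

```ts
// Suppose `schema` declares: id (Utf8), metadata: Struct<filePath, startLine>.
const rows = [{ id: "doc1", metadata: { filePath: "/path/to/file1.ts" } }];

const filled = ensureNestedFieldsExist(rows, schema);
// -> [{ id: "doc1", metadata: { filePath: "/path/to/file1.ts", startLine: null } }]
// Missing top-level fields and missing struct members are both nulled in,
// so makeArrowTable sees a uniform shape for every row.
```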
@@ -1334,3 +1490,64 @@ function fieldToJson(field: Field): JsonField {
    metadata: field.metadata,
  };
 }
+
+function alignTableToSchema(
+  table: ArrowTable,
+  targetSchema: Schema,
+): ArrowTable {
+  const existingColumns = new Map<string, Vector>();
+
+  // Map existing columns
+  for (const field of table.schema.fields) {
+    existingColumns.set(field.name, table.getChild(field.name)!);
+  }
+
+  // Create vectors for all fields in target schema
+  const alignedColumns: Record<string, Vector> = {};
+
+  for (const field of targetSchema.fields) {
+    if (existingColumns.has(field.name)) {
+      // Column exists, use it
+      alignedColumns[field.name] = existingColumns.get(field.name)!;
+    } else {
+      // Column missing, create null vector
+      alignedColumns[field.name] = createNullVector(field, table.numRows);
+    }
+  }
+
+  // Create new table with aligned schema and columns
+  return new ArrowTable(targetSchema, alignedColumns);
+}
+
+function createNullVector(field: Field, numRows: number): Vector {
+  if (field.type.constructor.name === "Struct") {
+    // For struct types, create a struct with null fields
+    const structType = field.type as Struct;
+    const childVectors = structType.children.map((childField) =>
+      createNullVector(childField, numRows),
+    );
+
+    // Create struct data
+    const structData = makeData({
+      type: structType,
+      length: numRows,
+      nullCount: 0,
+      children: childVectors.map((v) => v.data[0]),
+    });
+
+    return arrowMakeVector(structData);
+  } else {
+    // For other types, create a vector of nulls
+    const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));
+    // All bits are 0, meaning all values are null
+
+    const data = makeData({
+      type: field.type,
+      length: numRows,
+      nullCount: numRows,
+      nullBitmap,
+    });
+
+    return arrowMakeVector(data);
+  }
+}
@@ -85,6 +85,9 @@ export interface OpenTableOptions {
  /**
   * Set the size of the index cache, specified as a number of entries
   *
+   * @deprecated Use session-level cache configuration instead.
+   * Create a Session with custom cache sizes and pass it to the connect() function.
+   *
   * The exact meaning of an "entry" will depend on the type of index:
   * - IVF: there is one entry for each IVF partition
   * - BTREE: there is one entry for the entire index
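The migration the deprecation notice points at looks roughly like this; paths and byte sizes are illustrative, and the `Session` constructor takes the same two arguments the session test above exercises:

```ts
import { Session, connect } from "@lancedb/lancedb";

// Before (deprecated): per-table cache sizing
// const table = await db.openTable("my_table", { indexCacheSize: 1024 });

// After: size the caches once on a Session and share it via connect()
const session = new Session(
  BigInt(1024 * 1024 * 1024), // index cache size in bytes
  BigInt(256 * 1024 * 1024), // metadata cache size in bytes
);
const db = await connect({ uri: "/path/to/db", session });
const table = await db.openTable("my_table");
```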
@@ -10,6 +10,7 @@ import {
 import {
  ConnectionOptions,
  Connection as LanceDbConnection,
+  Session,
 } from "./native.js";
 
 export {
@@ -51,6 +52,8 @@ export {
  OpenTableOptions,
 } from "./connection";
 
+export { Session } from "./native.js";
+
 export {
  ExecutableQuery,
  Query,
@@ -64,7 +67,10 @@ export {
  PhraseQuery,
  BoostQuery,
  MultiMatchQuery,
+  BooleanQuery,
  FullTextQueryType,
+  Operator,
+  Occur,
 } from "./query";
 
 export {
@@ -97,6 +103,7 @@ export {
  RecordBatchLike,
  DataLike,
  IntoVector,
+  MultiVector,
 } from "./arrow";
 export { IntoSql, packBits } from "./util";
 
@@ -127,6 +134,7 @@ export { IntoSql, packBits } from "./util";
 export async function connect(
  uri: string,
  options?: Partial<ConnectionOptions>,
+  session?: Session,
 ): Promise<Connection>;
 /**
  * Connect to a LanceDB instance at the given URI.
@@ -145,31 +153,43 @@ export async function connect(
  *   storageOptions: {timeout: "60s"}
  * });
  * ```
+ *
+ * @example
+ * ```ts
+ * const session = Session.default();
+ * const conn = await connect({
+ *   uri: "/path/to/database",
+ *   session: session
+ * });
+ * ```
  */
 export async function connect(
  options: Partial<ConnectionOptions> & { uri: string },
 ): Promise<Connection>;
 export async function connect(
  uriOrOptions: string | (Partial<ConnectionOptions> & { uri: string }),
-  options: Partial<ConnectionOptions> = {},
+  options?: Partial<ConnectionOptions>,
 ): Promise<Connection> {
  let uri: string | undefined;
+  let finalOptions: Partial<ConnectionOptions> = {};
+
  if (typeof uriOrOptions !== "string") {
    const { uri: uri_, ...opts } = uriOrOptions;
    uri = uri_;
-    options = opts;
+    finalOptions = opts;
  } else {
    uri = uriOrOptions;
+    finalOptions = options || {};
  }
 
  if (!uri) {
    throw new Error("uri is required");
  }
 
-  options = (options as ConnectionOptions) ?? {};
-  (<ConnectionOptions>options).storageOptions = cleanseStorageOptions(
-    (<ConnectionOptions>options).storageOptions,
+  finalOptions = (finalOptions as ConnectionOptions) ?? {};
+  (<ConnectionOptions>finalOptions).storageOptions = cleanseStorageOptions(
+    (<ConnectionOptions>finalOptions).storageOptions,
  );
-  const nativeConn = await LanceDbConnection.new(uri, options);
+  const nativeConn = await LanceDbConnection.new(uri, finalOptions);
  return new LocalConnection(nativeConn);
 }
@@ -439,7 +439,7 @@ export interface FtsOptions {
   *
   * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
   */
-  baseTokenizer?: "simple" | "whitespace" | "raw";
+  baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
 
  /**
   * language for stemming and stop words
@@ -472,6 +472,21 @@ export interface FtsOptions {
   * whether to remove punctuation
   */
  asciiFolding?: boolean;
+
+  /**
+   * ngram min length
+   */
+  ngramMinLength?: number;
+
+  /**
+   * ngram max length
+   */
+  ngramMaxLength?: number;
+
+  /**
+   * whether to only index the prefix of the token for ngram tokenizer
+   */
+  prefixOnly?: boolean;
 }
 
 export class Index {
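Putting the new options together, an ngram index configuration roughly mirrors the test added earlier in this diff (the max length value here is illustrative):

```ts
await table.createIndex("text", {
  config: Index.fts({
    baseTokenizer: "ngram",
    ngramMinLength: 2, // default is 3, per the test's comment
    ngramMaxLength: 4, // assumed value; tune per workload
    prefixOnly: true, // only index ngrams anchored at the token start
  }),
  replace: true,
});
```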
@@ -608,6 +623,9 @@ export class Index {
        options?.stem,
        options?.removeStopWords,
        options?.asciiFolding,
+        options?.ngramMinLength,
+        options?.ngramMaxLength,
+        options?.prefixOnly,
      ),
    );
  }
@@ -448,6 +448,10 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
   * For best results we recommend tuning this parameter with a benchmark against
   * your actual data to find the smallest possible value that will still give
   * you the desired recall.
+   *
+   * For more fine grained control over behavior when you have a very narrow filter
+   * you can use `minimumNprobes` and `maximumNprobes`. This method sets both
+   * the minimum and maximum to the same value.
   */
  nprobes(nprobes: number): VectorQuery {
    super.doCall((inner) => inner.nprobes(nprobes));
@@ -455,6 +459,33 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
    return this;
  }
 
+  /**
+   * Set the minimum number of probes used.
+   *
+   * This controls the minimum number of partitions that will be searched. This
+   * parameter will impact every query against a vector index, regardless of the
+   * filter. See `nprobes` for more details. Higher values will increase recall
+   * but will also increase latency.
+   */
+  minimumNprobes(minimumNprobes: number): VectorQuery {
+    super.doCall((inner) => inner.minimumNprobes(minimumNprobes));
+    return this;
+  }
+
+  /**
+   * Set the maximum number of probes used.
+   *
+   * This controls the maximum number of partitions that will be searched. If this
+   * number is greater than minimumNprobes then the excess partitions will _only_ be
+   * searched if we have not found enough results. This can be useful when there is
+   * a narrow filter to allow these queries to spend more time searching and avoid
+   * potential false negatives.
+   */
+  maximumNprobes(maximumNprobes: number): VectorQuery {
+    super.doCall((inner) => inner.maximumNprobes(maximumNprobes));
+    return this;
+  }
+
  /*
   * Set the distance range to use
   *
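A sketch contrasting the fixed and bounded forms of probing, reusing the `tbl` and `queryVec` names from the tests above (the filter expression is hypothetical):

```ts
// Fixed: search exactly 20 partitions (minimum == maximum).
let rst = await tbl.query().nearestTo(queryVec).nprobes(20).limit(10).toArrow();

// Bounded: always search at least 10 partitions, and search up to 40 only
// when a narrow filter leaves too few results after the first pass.
rst = await tbl
  .query()
  .nearestTo(queryVec)
  .where("category = 'rare'") // hypothetical narrow filter
  .minimumNprobes(10)
  .maximumNprobes(40)
  .limit(10)
  .toArrow();
```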
@@ -762,6 +793,31 @@ export enum FullTextQueryType {
  MatchPhrase = "match_phrase",
  Boost = "boost",
  MultiMatch = "multi_match",
+  Boolean = "boolean",
+}
+
+/**
+ * Enum representing the logical operators used in full-text queries.
+ *
+ * - `And`: All terms must match.
+ * - `Or`: At least one term must match.
+ */
+export enum Operator {
+  And = "AND",
+  Or = "OR",
+}
+
+/**
+ * Enum representing the occurrence of terms in full-text queries.
+ *
+ * - `Must`: The term must be present in the document.
+ * - `Should`: The term should contribute to the document score, but is not required.
+ * - `MustNot`: The term must not be present in the document.
+ */
+export enum Occur {
+  Should = "SHOULD",
+  Must = "MUST",
+  MustNot = "MUST_NOT",
 }
 
 /**
@@ -791,6 +847,7 @@ export function instanceOfFullTextQuery(obj: any): obj is FullTextQuery {
 export class MatchQuery implements FullTextQuery {
  /** @ignore */
  public readonly inner: JsFullTextQuery;
+
  /**
   * Creates an instance of MatchQuery.
   *
@@ -800,6 +857,8 @@ export class MatchQuery implements FullTextQuery {
   * - `boost`: The boost factor for the query (default is 1.0).
   * - `fuzziness`: The fuzziness level for the query (default is 0).
   * - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
+   * - `operator`: The logical operator to use for combining terms in the query (default is "OR").
+   * - `prefixLength`: The number of beginning characters being unchanged for fuzzy matching.
   */
  constructor(
    query: string,
@@ -808,6 +867,8 @@ export class MatchQuery implements FullTextQuery {
      boost?: number;
      fuzziness?: number;
      maxExpansions?: number;
+      operator?: Operator;
+      prefixLength?: number;
    },
  ) {
    let fuzziness = options?.fuzziness;
@@ -820,6 +881,8 @@ export class MatchQuery implements FullTextQuery {
      options?.boost ?? 1.0,
      fuzziness,
      options?.maxExpansions ?? 50,
+      options?.operator ?? Operator.Or,
+      options?.prefixLength ?? 0,
    );
  }
 
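With the two new options wired through, a `MatchQuery` can now require every term and pin an exact prefix during fuzzy matching; a sketch:

```ts
const query = new MatchQuery("hello world", "text", {
  operator: Operator.And, // both "hello" and "world" must match
  fuzziness: 1, // allow one edit per term
  prefixLength: 2, // the first two characters must match exactly
});
const hits = await table.search(query).toArray();
```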
@@ -836,9 +899,11 @@ export class PhraseQuery implements FullTextQuery {
   *
   * @param query - The phrase to search for in the specified column.
   * @param column - The name of the column to search within.
+   * @param options - Optional parameters for the phrase query.
+   * - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0).
   */
-  constructor(query: string, column: string) {
-    this.inner = JsFullTextQuery.phraseQuery(query, column);
+  constructor(query: string, column: string, options?: { slop?: number }) {
+    this.inner = JsFullTextQuery.phraseQuery(query, column, options?.slop ?? 0);
  }
 
  queryType(): FullTextQueryType {
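The new `slop` option relaxes strict adjacency between phrase words; a sketch (the example phrases are illustrative):

```ts
// Matches "lance database" and also "lance vector database": up to one
// unmatched position may sit between the words of the phrase.
const query = new PhraseQuery("lance database", "text", { slop: 1 });
const hits = await table.search(query).toArray();
```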
@@ -889,18 +954,21 @@ export class MultiMatchQuery implements FullTextQuery {
   * @param columns - An array of column names to search within.
   * @param options - Optional parameters for the multi-match query.
   * - `boosts`: An array of boost factors for each column (default is 1.0 for all).
+   * - `operator`: The logical operator to use for combining terms in the query (default is "OR").
   */
  constructor(
    query: string,
    columns: string[],
    options?: {
      boosts?: number[];
+      operator?: Operator;
    },
  ) {
    this.inner = JsFullTextQuery.multiMatchQuery(
      query,
      columns,
      options?.boosts,
+      options?.operator ?? Operator.Or,
    );
  }
 
@@ -908,3 +976,23 @@ export class MultiMatchQuery implements FullTextQuery {
    return FullTextQueryType.MultiMatch;
  }
 }
+
+export class BooleanQuery implements FullTextQuery {
+  /** @ignore */
+  public readonly inner: JsFullTextQuery;
+  /**
+   * Creates an instance of BooleanQuery.
+   *
+   * @param queries - An array of (Occur, FullTextQuery objects) to combine.
+   * Occur specifies whether the query must match, or should match.
+   */
+  constructor(queries: [Occur, FullTextQuery][]) {
+    this.inner = JsFullTextQuery.booleanQuery(
+      queries.map(([occur, query]) => [occur, query.inner]),
+    );
+  }
+
+  queryType(): FullTextQueryType {
+    return FullTextQueryType.Boolean;
+  }
+}
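Usage mirrors the boolean-query test added earlier in this diff:

```ts
// Documents must mention "cat" and must not mention "dog".
const query = new BooleanQuery([
  [Occur.Must, new MatchQuery("cat", "text")],
  [Occur.MustNot, new MatchQuery("dog", "text")],
]);
const hits = await table.search(query).toArray();
```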
@@ -6,9 +6,11 @@ import {
  Data,
  DataType,
  IntoVector,
+  MultiVector,
  Schema,
  dataTypeToJson,
  fromDataToBuffer,
+  isMultiVector,
  tableFromIPC,
 } from "./arrow";
 
@@ -75,10 +77,10 @@ export interface OptimizeOptions {
   * // Delete all versions older than 1 day
   * const olderThan = new Date();
   * olderThan.setDate(olderThan.getDate() - 1));
-   * tbl.cleanupOlderVersions(olderThan);
+   * tbl.optimize({cleanupOlderThan: olderThan});
   *
   * // Delete all versions except the current version
-   * tbl.cleanupOlderVersions(new Date());
+   * tbl.optimize({cleanupOlderThan: new Date()});
   */
  cleanupOlderThan: Date;
  deleteUnverified: boolean;
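The corrected doc comment translates to the following usage (seven days is an arbitrary retention window):

```ts
// Remove table versions older than 7 days as part of optimization.
const olderThan = new Date();
olderThan.setDate(olderThan.getDate() - 7);
await tbl.optimize({ cleanupOlderThan: olderThan });
```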
@@ -346,7 +348,7 @@ export abstract class Table {
   * if the query is a string and no embedding function is defined, it will be treated as a full text search query
   */
  abstract search(
-    query: string | IntoVector | FullTextQuery,
+    query: string | IntoVector | MultiVector | FullTextQuery,
    queryType?: string,
    ftsColumns?: string | string[],
  ): VectorQuery | Query;
@@ -357,7 +359,7 @@ export abstract class Table {
   * is the same thing as calling `nearestTo` on the builder returned
   * by `query`. @see {@link Query#nearestTo} for more details.
   */
-  abstract vectorSearch(vector: IntoVector): VectorQuery;
+  abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
  /**
   * Add new columns with defined values.
   * @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
@@ -668,7 +670,7 @@ export class LocalTable extends Table {
  }
 
  search(
-    query: string | IntoVector | FullTextQuery,
+    query: string | IntoVector | MultiVector | FullTextQuery,
    queryType: string = "auto",
    ftsColumns?: string | string[],
  ): VectorQuery | Query {
@@ -715,7 +717,15 @@ export class LocalTable extends Table {
    return this.query().nearestTo(queryPromise);
  }
 
-  vectorSearch(vector: IntoVector): VectorQuery {
+  vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
+    if (isMultiVector(vector)) {
+      const query = this.query().nearestTo(vector[0]);
+      for (const v of vector.slice(1)) {
+        query.addQueryVector(v);
+      }
+      return query;
+    }
+
    return this.query().nearestTo(vector);
  }
 
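End to end, the multivector path now accepts an array of query vectors, fanning each one out through `addQueryVector` as shown in the hunk above; a sketch against the multivector table created in the earlier test:

```ts
// Query with several 2-dimensional vectors at once.
const results = await table
  .vectorSearch([
    [0.1, 0.2],
    [0.3, 0.4],
  ])
  .limit(10)
  .toArray();
```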
@@ -1,11 +1,11 @@
 {
  "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.20.0-beta.1",
+  "version": "0.21.2",
  "os": ["darwin"],
  "cpu": ["arm64"],
  "main": "lancedb.darwin-arm64.node",
  "files": ["lancedb.darwin-arm64.node"],
-  "license": "Apache 2.0",
+  "license": "Apache-2.0",
  "engines": {
    "node": ">= 18"
  }
Some files were not shown because too many files have changed in this diff.