mirror of https://github.com/lancedb/lancedb.git
synced 2025-12-23 13:29:57 +00:00

Compare commits: python-v0.… → v0.1.8 (19 commits)

cc7e54298b
d1e8a97a2a
01dadb0862
0724d41c4b
cbb56e25ab
78de8f5782
a6544c2a31
39ed70896a
ae672df1b7
15c3f42387
f65d85efcc
6b5c046c3b
d00f4e51d0
fbc44d4243
b53eee42ce
7e0d6088ca
5210f40a33
5ec4a5d730
e4f64fca7b
.bumpversion.cfg (new file)
@@ -0,0 +1,12 @@
[bumpversion]
current_version = 0.1.8
commit = True
message = Bump version: {current_version} → {new_version}
tag = True
tag_name = v{new_version}

[bumpversion:file:node/package.json]

[bumpversion:file:rust/ffi/node/Cargo.toml]

[bumpversion:file:rust/vectordb/Cargo.toml]
.github/workflows/cargo-publish.yml (new file)
@@ -0,0 +1,29 @@
name: Cargo Publish

on:
  release:
    types: [ published ]

env:
  # This env var is used by Swatinem/rust-cache@v2 for the cache
  # key, so we set it to make sure it is always consistent.
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: ubuntu-22.04
    timeout-minutes: 30
    # Only runs on tags that match the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: actions/checkout@v3
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: rust
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Publish the package
        run: |
          cargo publish -p vectordb --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }}
.github/workflows/make-release-commit.yml (new file)
@@ -0,0 +1,55 @@
name: Create release commit

on:
  workflow_dispatch:
    inputs:
      dry_run:
        description: 'Dry run (create the local commit/tags but do not push it)'
        required: true
        default: "false"
        type: choice
        options:
          - "true"
          - "false"
      part:
        description: 'What kind of release is this?'
        required: true
        default: 'patch'
        type: choice
        options:
          - patch
          - minor
          - major

jobs:
  bump-version:
    runs-on: ubuntu-latest
    steps:
      - name: Check out main
        uses: actions/checkout@v3
        with:
          ref: main
          persist-credentials: false
          fetch-depth: 0
          lfs: true
      - name: Set git configs for bumpversion
        shell: bash
        run: |
          git config user.name 'Lance Release'
          git config user.email 'lance-dev@lancedb.com'
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Bump version, create tag and commit
        run: |
          pip install bump2version
          bumpversion --verbose ${{ inputs.part }}
      - name: Push new version and tag
        if: ${{ inputs.dry_run == 'false' }}
        uses: ad-m/github-push-action@master
        with:
          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
          branch: main
          tags: true
.github/workflows/node.yml
@@ -67,8 +67,12 @@ jobs:
       - name: Build
         run: |
           npm ci
-          npm run build
           npm run tsc
+          npm run build
+          npm run pack-build
+          npm install --no-save ./dist/vectordb-*.tgz
+          # Remove index.node to test with dependency installed
+          rm index.node
       - name: Test
         run: npm run test
   macos:
@@ -94,8 +98,12 @@ jobs:
       - name: Build
         run: |
           npm ci
-          npm run build
           npm run tsc
+          npm run build
+          npm run pack-build
+          npm install --no-save ./dist/vectordb-*.tgz
+          # Remove index.node to test with dependency installed
+          rm index.node
       - name: Test
         run: |
           npm run test
.github/workflows/npm-publish.yml (new file)
@@ -0,0 +1,137 @@
name: NPM Publish

on:
  release:
    types: [ published ]

jobs:
  node:
    runs-on: ubuntu-latest
    # Only runs on tags that match the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    defaults:
      run:
        shell: bash
        working-directory: node
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - uses: actions/setup-node@v3
        with:
          node-version: 20
          cache: 'npm'
          cache-dependency-path: node/package-lock.json
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          npm ci
          npm run tsc
          npm pack
      - name: Upload Linux Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: node-package
          path: |
            node/vectordb-*.tgz

  node-macos:
    runs-on: macos-12
    # Only runs on tags that match the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    strategy:
      fail-fast: false
      matrix:
        target: [x86_64-apple-darwin, aarch64-apple-darwin]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Install system dependencies
        run: brew install protobuf
      - name: Install npm dependencies
        run: |
          cd node
          npm ci
      - name: Install rustup target
        if: ${{ matrix.target == 'aarch64-apple-darwin' }}
        run: rustup target add aarch64-apple-darwin
      - name: Build MacOS native node modules
        run: bash ci/build_macos_artifacts.sh ${{ matrix.target }}
      - name: Upload Darwin Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: darwin-native
          path: |
            node/dist/vectordb-darwin*.tgz

  node-linux:
    name: node-linux (${{ matrix.arch }}-unknown-linux-${{ matrix.libc }})
    runs-on: ubuntu-latest
    # Only runs on tags that match the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    strategy:
      fail-fast: false
      matrix:
        libc:
          - gnu
          # TODO: re-enable musl once we have refactored to pre-built containers.
          # Right now we have to build node from source, which is too expensive.
          # - musl
        arch:
          - x86_64
          # Building on aarch64 is too slow for now
          # - aarch64
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Change owner to root (for npm)
        # The docker container is run as root, so we need the files to be owned by root.
        # Otherwise npm is a nightmare: https://github.com/npm/cli/issues/3773
        run: sudo chown -R root:root .
      - name: Set up QEMU
        if: ${{ matrix.arch == 'aarch64' }}
        uses: docker/setup-qemu-action@v2
        with:
          platforms: arm64
      - name: Build Linux GNU native node modules
        if: ${{ matrix.libc == 'gnu' }}
        run: |
          docker run \
            -v $(pwd):/io -w /io \
            quay.io/pypa/manylinux2014_${{ matrix.arch }} \
            bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-gnu
      - name: Build musl Linux native node modules
        if: ${{ matrix.libc == 'musl' }}
        run: |
          docker run --platform linux/arm64/v8 \
            -v $(pwd):/io -w /io \
            quay.io/pypa/musllinux_1_1_${{ matrix.arch }} \
            bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-musl
      - name: Upload Linux Artifacts
        uses: actions/upload-artifact@v3
        with:
          name: linux-native
          path: |
            node/dist/vectordb-linux*.tgz

  release:
    needs: [node, node-macos, node-linux]
    runs-on: ubuntu-latest
    # Only runs on tags that match the make-release action
    if: startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: actions/download-artifact@v3
      - name: Display structure of downloaded files
        run: ls -R
      - uses: actions/setup-node@v3
        with:
          node-version: 20
      - name: Publish to NPM
        env:
          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
        run: |
          for filename in */*.tgz; do
            npm publish $filename
          done
.github/workflows/pypi-publish.yml (new file)
@@ -0,0 +1,31 @@
name: PyPI Publish

on:
  release:
    types: [ published ]

jobs:
  publish:
    runs-on: ubuntu-latest
    # Only runs on tags that match the python-make-release action
    if: startsWith(github.ref, 'refs/tags/python-v')
    defaults:
      run:
        shell: bash
        working-directory: python
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.8"
      - name: Build distribution
        run: |
          ls -la
          pip install wheel setuptools --upgrade
          python setup.py sdist bdist_wheel
      - name: Publish
        uses: pypa/gh-action-pypi-publish@v1.8.5
        with:
          password: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
          packages-dir: python/dist
.github/workflows/python.yml
@@ -32,9 +32,11 @@ jobs:
         run: |
           pip install -e .
           pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
-          pip install pytest pytest-mock black
+          pip install pytest pytest-mock black isort
       - name: Black
         run: black --check --diff --no-color --quiet .
+      - name: isort
+        run: isort --check --diff --quiet .
       - name: Run tests
         run: pytest -x -v --durations=30 tests
       - name: doctest
.github/workflows/rust.yml (new file)
@@ -0,0 +1,67 @@
name: Rust

on:
  push:
    branches:
      - main
  pull_request:
    paths:
      - rust/**
      - .github/workflows/rust.yml

env:
  # This env var is used by Swatinem/rust-cache@v2 for the cache
  # key, so we set it to make sure it is always consistent.
  CARGO_TERM_COLOR: always
  # Disable full debug symbol generation to speed up CI build and keep memory down.
  # "1" means line tables only, which is useful for panic tracebacks.
  RUSTFLAGS: "-C debuginfo=1"
  RUST_BACKTRACE: "1"

jobs:
  linux:
    timeout-minutes: 30
    runs-on: ubuntu-22.04
    defaults:
      run:
        shell: bash
        working-directory: rust
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: rust
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: cargo build --all-features
      - name: Run tests
        run: cargo test --all-features
  macos:
    runs-on: macos-12
    timeout-minutes: 30
    defaults:
      run:
        shell: bash
        working-directory: rust
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: CPU features
        run: sysctl -a | grep cpu
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: rust
      - name: Install dependencies
        run: brew install protobuf
      - name: Build
        run: cargo build --all-features
      - name: Run tests
        run: cargo test --all-features
.gitignore
@@ -4,6 +4,8 @@
 **/__pycache__
 .DS_Store
 
+.vscode
+
 rust/target
 rust/Cargo.lock
Cargo.lock
@@ -190,6 +190,7 @@ dependencies = [
  "arrow-data",
  "arrow-schema",
  "flatbuffers",
+ "zstd",
 ]

@@ -654,6 +655,12 @@ version = "3.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8"
 
+[[package]]
+name = "bytemuck"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"

@@ -1646,9 +1653,9 @@ dependencies = [
 [[package]]
 name = "lance"
-version = "0.4.17"
+version = "0.4.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86dda8185bd1ffae7b910c1f68035af23be9b717c52e9cc4de176cd30b47f772"
+checksum = "3d6c2e7bcfc71c7167ec70cd06c6d55c644a148f6580218c5a0b66e13ac5b5cc"
 dependencies = [
  "accelerate-src",
  "arrow",

@@ -1657,7 +1664,9 @@ dependencies = [
  "arrow-buffer",
  "arrow-cast",
  "arrow-data",
+ "arrow-ipc",
  "arrow-ord",
+ "arrow-row",
  "arrow-schema",
  "arrow-select",
  "async-recursion",

@@ -1668,6 +1677,7 @@ dependencies = [
  "bytes",
  "cblas",
  "chrono",
+ "dashmap",
  "datafusion",
  "futures",
  "lapack",

@@ -1684,6 +1694,7 @@ dependencies = [
  "prost-types",
  "rand",
  "reqwest",
+ "roaring",
  "shellexpand",
  "snafu",
  "sqlparser-lance",

@@ -2598,6 +2609,12 @@ dependencies = [
  "winreg",
 ]
 
+[[package]]
+name = "retain_mut"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086"
+
 [[package]]
 name = "ring"
 version = "0.16.20"

@@ -2613,6 +2630,17 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "roaring"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef0fb5e826a8bde011ecae6a8539dd333884335c57ff0f003fbe27c25bbe8f71"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "retain_mut",
+]
+
 [[package]]
 name = "rustc_version"
 version = "0.4.0"

@@ -3358,7 +3386,7 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
 [[package]]
 name = "vectordb"
-version = "0.0.1"
+version = "0.1.6"
 dependencies = [
  "arrow-array",
  "arrow-data",

@@ -3373,7 +3401,7 @@ dependencies = [
 [[package]]
 name = "vectordb-node"
-version = "0.1.0"
+version = "0.1.6"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
ci/build_linux_artifacts.sh (new file)
@@ -0,0 +1,91 @@
#!/bin/bash
# Builds the Linux artifacts (node binaries).
# Usage: ./build_linux_artifacts.sh [target]
# Targets supported:
#  - x86_64-unknown-linux-gnu:centos
#  - aarch64-unknown-linux-gnu:centos
#  - aarch64-unknown-linux-musl
#  - x86_64-unknown-linux-musl

# TODO: refactor this into a Docker container we can pull

set -e

setup_dependencies() {
    echo "Installing system dependencies..."
    if [[ $1 == *musl ]]; then
        # musllinux
        apk add openssl-dev
    else
        # manylinux2014
        yum install -y openssl-devel unzip
    fi

    if [[ $1 == x86_64* ]]; then
        ARCH=x86_64
    else
        # gnu target
        ARCH=aarch_64
    fi

    # Install new enough protobuf (the yum-provided one is old)
    PB_REL=https://github.com/protocolbuffers/protobuf/releases
    PB_VERSION=23.1
    curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
    unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
}

install_node() {
    echo "Installing node..."
    curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
    source "$HOME"/.bashrc

    if [[ $1 == *musl ]]; then
        # The apk package only provides node 15, and we need 16 or higher:
        #   apk add nodejs-current npm
        # So instead we install from source (nvm doesn't provide binaries for musl):
        nvm install -s --no-progress 17
    else
        nvm install --no-progress 17  # latest that supports glibc 2.17
    fi
}

install_rust() {
    echo "Installing rust..."
    curl https://sh.rustup.rs -sSf | bash -s -- -y
    export PATH="$PATH:/root/.cargo/bin"
}

build_node_binary() {
    echo "Building node library for $1..."
    pushd node

    npm ci

    if [[ $1 == *musl ]]; then
        # This is needed for cargo to allow building cdylibs with musl
        export RUSTFLAGS="-C target-feature=-crt-static"
    fi

    # Cargo can run out of memory while pulling dependencies, especially when running
    # in QEMU. This is a workaround for that.
    export CARGO_NET_GIT_FETCH_WITH_CLI=true

    # We don't pass in the target, since the native target here already matches
    # and openblas-src doesn't do well with cross-compilation.
    npm run build-release
    npm run pack-build

    popd
}

TARGET=${1:-x86_64-unknown-linux-gnu}
# Others:
#   aarch64-unknown-linux-gnu
#   x86_64-unknown-linux-musl
#   aarch64-unknown-linux-musl

setup_dependencies $TARGET
install_node $TARGET
install_rust
build_node_binary $TARGET
ci/build_macos_artifacts.sh (new file)
@@ -0,0 +1,33 @@
# Builds the macOS artifacts (node binaries).
# Usage: ./ci/build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin

prebuild_rust() {
    # Building here for the sake of easier debugging.
    pushd rust/ffi/node
    echo "Building rust library for $1"
    export RUST_BACKTRACE=1
    cargo build --release --target $1
    popd
}

build_node_binaries() {
    pushd node
    echo "Building node library for $1"
    npm run build-release -- --target $1
    npm run pack-build -- --target $1
    popd
}

if [ -n "$1" ]; then
    targets=$1
else
    targets="x86_64-apple-darwin aarch64-apple-darwin"
fi

echo "Building artifacts for targets: $targets"
for target in $targets
do
    prebuild_rust $target
    build_node_binaries $target
done
@@ -67,7 +67,7 @@ There are a couple of parameters that can be used to fine-tune the search:
   e.g., for 1M vectors divided up into 256 partitions, nprobes should be set to ~20-40.<br/>
   Note: nprobes is only applicable if an ANN index is present. If specified on a table without an ANN index, it is ignored.
 - **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory.<br/>
-  A higher number makes search more accurate but also slower. If you find the recall is less than idea, try refine_factor=10 to start.<br/>
+  A higher number makes search more accurate but also slower. If you find the recall is less than ideal, try refine_factor=10 to start.<br/>
   e.g., for 1M vectors divided into 256 partitions, if you're looking for top 20, then refine_factor=200 reranks the whole partition.<br/>
   Note: refine_factor is only applicable if an ANN index is present. If specified on a table without an ANN index, it is ignored.
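Both parameters surface on the Python query builder touched later in this diff; a minimal tuning sketch (the connection path and table name are illustrative):

```python
import lancedb

db = lancedb.connect("/tmp/lancedb-demo")  # illustrative local path
tbl = db.open_table("vectors")

# Probe 20 of the IVF partitions, then re-rank 10x the requested results
# in memory. Both settings are silently ignored if the table has no ANN index.
df = (
    tbl.search([0.1, 0.3])
    .limit(20)
    .nprobes(20)
    .refine_factor(10)
    .to_df()
)
```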
@@ -1,18 +1,19 @@
-import sys
-from modal import Secret, Stub, Image, web_endpoint
-import lancedb
-import re
 import pickle
-import requests
+import re
+import sys
 import zipfile
 from pathlib import Path
 
+import requests
+from langchain.chains import RetrievalQA
 from langchain.document_loaders import UnstructuredHTMLLoader
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.llms import OpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import LanceDB
-from langchain.llms import OpenAI
-from langchain.chains import RetrievalQA
+from modal import Image, Secret, Stub, web_endpoint
+
+import lancedb
 
 lancedb_image = Image.debian_slim().pip_install(
     "lancedb", "langchain", "openai", "pandas", "tiktoken", "unstructured", "tabulate"
@@ -14,7 +14,7 @@ The key features of LanceDB include:
 * Zero-copy, automatic versioning, manage versions of your data without needing extra infrastructure.
 
-* Ecosystem integrations with [LangChain 🦜️🔗](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lanecdb.html), [LlamaIndex 🦙](https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html), Apache-Arrow, Pandas, Polars, DuckDB and more on the way.
+* Ecosystem integrations with [LangChain 🦜️🔗](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html), [LlamaIndex 🦙](https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html), Apache-Arrow, Pandas, Polars, DuckDB and more on the way.
 
 LanceDB's core is written in Rust 🦀 and is built using <a href="https://github.com/lancedb/lance">Lance</a>, an open-source columnar format designed for performant ML workloads.
@@ -21,12 +21,13 @@ from argparse import ArgumentParser
 from multiprocessing import Pool
 
 import lance
-import lancedb
 import pyarrow as pa
 from datasets import load_dataset
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast
 
+import lancedb
+
 MODEL_ID = "openai/clip-vit-base-patch32"
 
 device = "cuda"
node/.npmignore (new file)
@@ -0,0 +1,4 @@
gen_test_data.py
index.node
dist/lancedb*.tgz
vectordb*.tgz
@@ -8,6 +8,10 @@ A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb)
 npm install vectordb
 ```
 
+This will download the appropriate native library for your platform. We currently
+support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
+yet support Windows or musl-based Linux (such as Alpine Linux).
+
 ## Usage
 
 ### Basic Example
@@ -24,17 +28,33 @@ The [examples](./examples) folder contains complete examples.
 
 ## Development
 
-The LanceDB javascript is built with npm:
+To build everything fresh:
+
+```bash
+npm install
+npm run tsc
+npm run build
+```
+
+Then you should be able to run the tests with:
+
+```bash
+npm test
+```
+
+### Rebuilding Rust library
+
+```bash
+npm run build
+```
+
+### Rebuilding Typescript
 
 ```bash
 npm run tsc
 ```
 
-Run the tests with
+### Fix lints
 
-```bash
-npm test
-```
-
 To run the linter and have it automatically fix all errors
@@ -12,29 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+const { currentTarget } = require('@neon-rs/load');
+
 let nativeLib;
 
-function getPlatformLibrary() {
-  if (process.platform === "darwin" && process.arch == "arm64") {
-    return require('./aarch64-apple-darwin.node');
-  } else if (process.platform === "darwin" && process.arch == "x64") {
-    return require('./x86_64-apple-darwin.node');
-  } else if (process.platform === "linux" && process.arch == "x64") {
-    return require('./x86_64-unknown-linux-gnu.node');
-  } else {
-    throw new Error(`vectordb: unsupported platform ${process.platform}_${process.arch}. Please file a bug report at https://github.com/lancedb/lancedb/issues`)
-  }
-}
-
 try {
-  nativeLib = require('./index.node')
+  nativeLib = require(`vectordb-${currentTarget()}`);
 } catch (e) {
-  if (e.code === "MODULE_NOT_FOUND") {
-    nativeLib = getPlatformLibrary();
-  } else {
-    throw new Error('vectordb: failed to load native library. Please file a bug report at https://github.com/lancedb/lancedb/issues');
+  try {
+    // Might be developing locally, so try that. But don't expose that error
+    // to the user.
+    nativeLib = require("./index.node");
+  } catch {
+    throw new Error(`vectordb: failed to load native library.
+You may need to run \`npm install vectordb-${currentTarget()}\`.
+
+If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
+
+Source error: ${e}`);
   }
 }
 
-module.exports = nativeLib
+// Dynamic require for runtime.
+module.exports = nativeLib;
node/package-lock.json
@@ -7,12 +7,22 @@
     "": {
       "name": "vectordb",
       "version": "0.1.5",
+      "cpu": [
+        "x64",
+        "arm64"
+      ],
       "license": "Apache-2.0",
+      "os": [
+        "darwin",
+        "linux"
+      ],
       "dependencies": {
         "@apache-arrow/ts": "^12.0.0",
+        "@neon-rs/load": "^0.0.74",
         "apache-arrow": "^12.0.0"
       },
       "devDependencies": {
+        "@neon-rs/cli": "^0.0.74",
         "@types/chai": "^4.3.4",
         "@types/mocha": "^10.0.1",
         "@types/node": "^18.16.2",
@@ -35,6 +45,12 @@
         "typedoc": "^0.24.7",
         "typedoc-plugin-markdown": "^3.15.3",
         "typescript": "*"
+      },
+      "optionalDependencies": {
+        "vectordb-darwin-arm64": "0.1.2",
+        "vectordb-darwin-x64": "0.1.2",
+        "vectordb-linux-arm64-gnu": "0.1.2",
+        "vectordb-linux-x64-gnu": "0.1.2"
       }
     },
     "node_modules/@apache-arrow/ts": {
@@ -202,6 +218,20 @@
         "@jridgewell/sourcemap-codec": "^1.4.10"
       }
     },
+    "node_modules/@neon-rs/cli": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
+      "integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
+      "dev": true,
+      "bin": {
+        "neon": "index.js"
+      }
+    },
+    "node_modules/@neon-rs/load": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
+      "integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
+    },
     "node_modules/@nodelib/fs.scandir": {
       "version": "2.1.5",
       "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -4601,6 +4631,17 @@
         "@jridgewell/sourcemap-codec": "^1.4.10"
       }
     },
+    "@neon-rs/cli": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
+      "integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
+      "dev": true
+    },
+    "@neon-rs/load": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
+      "integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
+    },
     "@nodelib/fs.scandir": {
       "version": "2.1.5",
       "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -1,16 +1,19 @@
 {
   "name": "vectordb",
-  "version": "0.1.5",
+  "version": "0.1.8",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
   "scripts": {
     "tsc": "tsc -b",
-    "build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json-render-diagnostics",
+    "build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
     "build-release": "npm run build -- --release",
+    "cross-release": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cross build --message-format=json --release -p vectordb-node",
     "test": "mocha -recursive dist/test",
     "lint": "eslint src --ext .js,.ts",
-    "clean": "rm -rf node_modules *.node dist/"
+    "clean": "rm -rf node_modules *.node dist/",
+    "pack-build": "neon pack-build",
+    "check-npm": "printenv && which node && which npm && npm --version"
   },
   "repository": {
     "type": "git",
@@ -25,6 +28,7 @@
   "author": "Lance Devs",
   "license": "Apache-2.0",
   "devDependencies": {
+    "@neon-rs/cli": "^0.0.74",
     "@types/chai": "^4.3.4",
     "@types/mocha": "^10.0.1",
     "@types/node": "^18.16.2",
@@ -50,6 +54,29 @@
   },
   "dependencies": {
     "@apache-arrow/ts": "^12.0.0",
+    "@neon-rs/load": "^0.0.74",
     "apache-arrow": "^12.0.0"
+  },
+  "os": [
+    "darwin",
+    "linux"
+  ],
+  "cpu": [
+    "x64",
+    "arm64"
+  ],
+  "neon": {
+    "targets": {
+      "x86_64-apple-darwin": "vectordb-darwin-x64",
+      "aarch64-apple-darwin": "vectordb-darwin-arm64",
+      "x86_64-unknown-linux-gnu": "vectordb-linux-x64-gnu",
+      "aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu"
+    }
+  },
+  "optionalDependencies": {
+    "vectordb-darwin-arm64": "0.1.8",
+    "vectordb-darwin-x64": "0.1.8",
+    "vectordb-linux-x64-gnu": "0.1.8",
+    "vectordb-linux-arm64-gnu": "0.1.8"
   }
 }
@@ -22,7 +22,7 @@ import { fromRecordsToBuffer } from './arrow'
 import type { EmbeddingFunction } from './embedding/embedding_function'
 
 // eslint-disable-next-line @typescript-eslint/no-var-requires
-const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd, tableCreateVectorIndex } = require('../native.js')
+const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd, tableCreateVectorIndex, tableCountRows } = require('../native.js')
 
 export type { EmbeddingFunction }
 export { OpenAIEmbeddingFunction } from './embedding/openai'
@@ -178,6 +178,13 @@ export class Table<T = number[]> {
   async create_index (indexParams: VectorIndexParams): Promise<any> {
     return await this.createIndex(indexParams)
   }
+
+  /**
+   * Returns the number of rows in this table.
+   */
+  async countRows (): Promise<number> {
+    return tableCountRows.call(this._tbl)
+  }
 }
 
 interface IvfPQIndexConfig {
@@ -293,6 +300,8 @@ export class Query<T = number[]> {
     return this
   }
 
+  where = this.filter
+
   /** Return only the specified columns.
    *
    * @param value Only select the specified columns. If not specified, all columns will be returned.
@@ -64,13 +64,20 @@ describe('LanceDB client', function () {
     assert.equal(results[0].id, 1)
   })
 
-  it('uses a filter', async function () {
+  it('uses a filter / where clause', async function () {
+    // eslint-disable-next-line @typescript-eslint/explicit-function-return-type
+    const assertResults = (results: Array<Record<string, unknown>>) => {
+      assert.equal(results.length, 1)
+      assert.equal(results[0].id, 2)
+    }
+
     const uri = await createTestDB()
     const con = await lancedb.connect(uri)
     const table = await con.openTable('vectors')
-    const results = await table.search([0.1, 0.1]).filter('id == 2').execute()
-    assert.equal(results.length, 1)
-    assert.equal(results[0].id, 2)
+    let results = await table.search([0.1, 0.1]).filter('id == 2').execute()
+    assertResults(results)
+    results = await table.search([0.1, 0.1]).where('id == 2').execute()
+    assertResults(results)
   })
 
   it('select only a subset of columns', async function () {
@@ -103,9 +110,7 @@ describe('LanceDB client', function () {
     const tableName = `vectors_${Math.floor(Math.random() * 100)}`
     const table = await con.createTable(tableName, data)
     assert.equal(table.name, tableName)
-    const results = await table.search([0.1, 0.3]).execute()
-    assert.equal(results.length, 2)
+    assert.equal(await table.countRows(), 2)
   })
 
   it('appends records to an existing table ', async function () {
@@ -118,16 +123,14 @@ describe('LanceDB client', function () {
     ]
 
     const table = await con.createTable('vectors', data)
-    const results = await table.search([0.1, 0.3]).execute()
-    assert.equal(results.length, 2)
+    assert.equal(await table.countRows(), 2)
 
     const dataAdd = [
       { id: 3, vector: [2.1, 2.2], price: 10, name: 'c' },
       { id: 4, vector: [3.1, 3.2], price: 50, name: 'd' }
     ]
     await table.add(dataAdd)
-    const resultsAdd = await table.search([0.1, 0.3]).execute()
-    assert.equal(resultsAdd.length, 4)
+    assert.equal(await table.countRows(), 4)
   })
 
   it('overwrite all records in a table', async function () {
@@ -135,16 +138,14 @@ describe('LanceDB client', function () {
     const con = await lancedb.connect(uri)
 
     const table = await con.openTable('vectors')
-    const results = await table.search([0.1, 0.3]).execute()
-    assert.equal(results.length, 2)
+    assert.equal(await table.countRows(), 2)
 
     const dataOver = [
       { vector: [2.1, 2.2], price: 10, name: 'foo' },
      { vector: [3.1, 3.2], price: 50, name: 'bar' }
     ]
     await table.overwrite(dataOver)
-    const resultsAdd = await table.search([0.1, 0.3]).execute()
-    assert.equal(resultsAdd.length, 2)
+    assert.equal(await table.countRows(), 2)
   })
 })
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.7
+current_version = 0.1.8
 commit = True
 message = [python] Bump version: {current_version} → {new_version}
 tag = True
@@ -13,7 +13,8 @@
 from __future__ import annotations
 
 import pandas as pd
-from .exceptions import MissingValueError, MissingColumnError
+
+from .exceptions import MissingColumnError, MissingValueError
 
 
 def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
@@ -52,14 +53,16 @@ def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
     6            the lazy dog            1
     7              lazy dog I            1
     8              dog I love            1
-    >>> contextualize(data).window(7).stride(1).text_col('token').to_df()
+    9       I love sandwiches            2
+    10        love sandwiches            2
+    >>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_df()
                                        token  document_id
     0    The quick brown fox jumped over the            1
     1   quick brown fox jumped over the lazy            1
     2     brown fox jumped over the lazy dog            1
     3         fox jumped over the lazy dog I            1
     4        jumped over the lazy dog I love            1
+    5    over the lazy dog I love sandwiches            1
 
     ``stride`` determines how many rows to skip between each window start. This can
     be used to reduce the total number of windows generated.
@@ -70,6 +73,8 @@ def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
     2  brown fox jumped over            1
     4   jumped over the lazy            1
     6         the lazy dog I            1
+    8  dog I love sandwiches            1
+    10       love sandwiches            2
 
     ``groupby`` determines how to group the rows. For example, we would like to have
     context windows that don't cross document boundaries. In this case, we can
@@ -80,6 +85,25 @@ def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
     0    The quick brown fox            1
     2  brown fox jumped over            1
     4   jumped over the lazy            1
+    6           the lazy dog            1
+    9      I love sandwiches            2
+
+    ``min_window_size`` determines the minimum size of the context windows that are
+    generated. This can be used to trim the last few context windows which have size
+    less than ``min_window_size``. By default, context windows of size 1 are skipped.
+
+    >>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_df()
+                                 token  document_id
+    0  The quick brown fox jumped over            1
+    3     fox jumped over the lazy dog            1
+    6                     the lazy dog            1
+    9                I love sandwiches            2
+
+    >>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_df()
+                                 token  document_id
+    0  The quick brown fox jumped over            1
+    3     fox jumped over the lazy dog            1
+
     """
     return Contextualizer(raw_df)
@@ -92,6 +116,7 @@ class Contextualizer:
         self._groupby = None
         self._stride = None
         self._window = None
+        self._min_window_size = 2
         self._raw_df = raw_df
 
     def window(self, window: int) -> Contextualizer:
@@ -139,6 +164,17 @@ class Contextualizer:
         self._text_col = text_col
         return self
 
+    def min_window_size(self, min_window_size: int) -> Contextualizer:
+        """Set the (optional) minimum window size for the context windows.
+
+        Parameters
+        ----------
+        min_window_size: int
+            The minimum window size.
+        """
+        self._min_window_size = min_window_size
+        return self
+
     def to_df(self) -> pd.DataFrame:
         """Create the context windows and return a DataFrame."""
@@ -159,12 +195,19 @@ class Contextualizer:
 
         def process_group(grp):
             # For each group, create the text rolling window
+            # with values of size >= min_window_size
             text = grp[self._text_col].values
-            contexts = grp.iloc[: -self._window : self._stride, :].copy()
-            contexts[self._text_col] = [
-                " ".join(text[start_i : start_i + self._window])
-                for start_i in range(0, len(grp) - self._window, self._stride)
+            contexts = grp.iloc[:: self._stride, :].copy()
+            windows = [
+                " ".join(text[start_i : min(start_i + self._window, len(grp))])
+                for start_i in range(0, len(grp), self._stride)
+                if start_i + self._window <= len(grp)
+                or len(grp) - start_i >= self._min_window_size
             ]
+            # if the last few rows were dropped, trim the index to match
+            if len(windows) < len(contexts):
+                contexts = contexts.iloc[: len(windows)]
+            contexts[self._text_col] = windows
             return contexts
 
         if self._groupby is None:
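The window/stride/``min_window_size`` interaction is easiest to see in isolation. A standalone sketch of the same rolling-window list comprehension (the token list and parameter values here are illustrative, not from the library):

```python
tokens = "The quick brown fox jumped over the lazy dog".split()
window, stride, min_window_size = 4, 2, 2

windows = [
    # Clamp the window end so trailing windows shrink instead of overflowing.
    " ".join(tokens[i : min(i + window, len(tokens))])
    for i in range(0, len(tokens), stride)
    # Keep full windows, plus trailing partial windows that still have
    # at least min_window_size tokens.
    if i + window <= len(tokens) or len(tokens) - i >= min_window_size
]
print(windows)
# ['The quick brown fox', 'brown fox jumped over',
#  'jumped over the lazy', 'the lazy dog']
```

Note how the window starting at index 8 ("dog", one token) is dropped, while the one starting at index 6 ("the lazy dog", three tokens) survives the ``min_window_size`` cutoff.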
@@ -13,16 +13,16 @@
 from __future__ import annotations
 
+import functools
 import os
 from pathlib import Path
-import os
 
 import pyarrow as pa
 from pyarrow import fs
 
 from .common import DATA, URI
 from .table import LanceTable
-from .util import get_uri_scheme, get_uri_location
+from .util import get_uri_location, get_uri_scheme
@@ -56,7 +56,16 @@
     """
 
     def __init__(self, uri: URI):
-        is_local = isinstance(uri, Path) or get_uri_scheme(uri) == "file"
+        if not isinstance(uri, Path):
+            scheme = get_uri_scheme(uri)
+        is_local = isinstance(uri, Path) or scheme == "file"
+        # managed lancedb remote uses a scheme like lancedb+[http|grpc|...]://
+        self._is_managed_remote = not is_local and scheme.startswith("lancedb")
+        if self._is_managed_remote:
+            if len(scheme.split("+")) != 2:
+                raise ValueError(
+                    f"Invalid LanceDB URI: {uri}, expected uri to have scheme like lancedb+<flavor>://..."
+                )
         if is_local:
             if isinstance(uri, str):
                 uri = Path(uri)
@@ -64,10 +73,49 @@
             Path(uri).mkdir(parents=True, exist_ok=True)
             self._uri = str(uri)
 
+        self._entered = False
+
     @property
     def uri(self) -> str:
         return self._uri
 
+    @functools.cached_property
+    def is_managed_remote(self) -> bool:
+        return self._is_managed_remote
+
+    @functools.cached_property
+    def remote_flavor(self) -> str:
+        if not self.is_managed_remote:
+            raise ValueError(
+                "Not a managed remote LanceDB, there should be no server flavor"
+            )
+        return get_uri_scheme(self.uri).split("+")[1]
+
+    @functools.cached_property
+    def _client(self) -> "lancedb.remote.LanceDBClient":
+        if not self.is_managed_remote:
+            raise ValueError("Not a managed remote LanceDB, there should be no client")
+
+        # don't import unless we are really using remote
+        from lancedb.remote.client import RestfulLanceDBClient
+
+        if self.remote_flavor == "http":
+            return RestfulLanceDBClient(self._uri)
+
+        raise ValueError("Unsupported remote flavor: " + self.remote_flavor)
+
+    async def close(self):
+        if self._entered:
+            raise ValueError("Cannot re-enter the same LanceDBConnection twice")
+        self._entered = True
+        await self._client.close()
+
+    async def __aenter__(self) -> LanceDBConnection:
+        return self
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        await self.close()
+
     def table_names(self) -> list[str]:
         """Get the names of all tables in the database.
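The managed-remote detection above hinges on the ``lancedb+<flavor>://`` scheme convention. A minimal sketch of how such a URI decomposes, using ``urllib.parse`` as a stand-in for the library's own ``get_uri_scheme`` helper (the URI itself is illustrative):

```python
from urllib.parse import urlparse

# Illustrative URI; any host/path would do.
uri = "lancedb+http://db.example.com/my-project"

scheme = urlparse(uri).scheme            # "lancedb+http"
is_managed_remote = scheme.startswith("lancedb")

base, _, flavor = scheme.partition("+")
# base == "lancedb", flavor == "http" -> pick RestfulLanceDBClient;
# any other flavor raises "Unsupported remote flavor" above.
print(is_managed_remote, base, flavor)
```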
@@ -11,7 +11,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
-from typing import Literal
+
+import asyncio
+from typing import Awaitable, Literal
 
 import numpy as np
 import pandas as pd
@@ -168,8 +170,28 @@
         and also the "score" column which is the distance between the query
         vector and the returned vector.
         """
+        return self.to_arrow().to_pandas()
+
+    def to_arrow(self) -> pa.Table:
+        """
+        Execute the query and return the results as an Arrow Table.
+        In addition to the selected columns, LanceDB also returns a vector
+        and also the "score" column which is the distance between the query
+        vector and the returned vector.
+        """
+        if self._table._conn.is_managed_remote:
+            try:
+                loop = asyncio.get_running_loop()
+            except RuntimeError:
+                loop = asyncio.get_event_loop()
+            result = self._table._conn._client.query(
+                self._table.name, self.to_remote_query()
+            )
+            return loop.run_until_complete(result).to_arrow()
+
         ds = self._table.to_lance()
-        tbl = ds.to_table(
+        return ds.to_table(
             columns=self._columns,
             filter=self._where,
             nearest={
@@ -181,7 +203,20 @@
                 "refine_factor": self._refine_factor,
             },
         )
-        return tbl.to_pandas()
+
+    def to_remote_query(self) -> "VectorQuery":
+        # don't import unless we are connecting to remote
+        from lancedb.remote.client import VectorQuery
+
+        return VectorQuery(
+            vector=self._query.tolist(),
+            filter=self._where,
+            k=self._limit,
+            _metric=self._metric,
+            columns=self._columns,
+            nprobes=self._nprobes,
+            refine_factor=self._refine_factor,
+        )
 
 
 class LanceFtsQueryBuilder(LanceQueryBuilder):
61
python/lancedb/remote/__init__.py
Normal file
61
python/lancedb/remote/__init__.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import abc
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import attr
|
||||||
|
import pandas as pd
|
||||||
|
import pyarrow as pa
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
__all__ = ["LanceDBClient", "VectorQuery", "VectorQueryResult"]
|
||||||
|
|
||||||
|
|
||||||
|
class VectorQuery(BaseModel):
|
||||||
|
# vector to search for
|
||||||
|
vector: List[float]
|
||||||
|
|
||||||
|
# sql filter to refine the query with
|
||||||
|
filter: Optional[str] = None
|
||||||
|
|
||||||
|
# top k results to return
|
||||||
|
k: int
|
||||||
|
|
||||||
|
# # metrics
|
||||||
|
_metric: str = "L2"
|
||||||
|
|
||||||
|
# which columns to return in the results
|
||||||
|
columns: Optional[List[str]] = None
|
||||||
|
|
||||||
|
# optional query parameters for tuning the results,
|
||||||
|
# e.g. `{"nprobes": "10", "refine_factor": "10"}`
|
||||||
|
nprobes: int = 10
|
||||||
|
|
||||||
|
refine_factor: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
@attr.define
|
||||||
|
class VectorQueryResult:
|
||||||
|
# for now the response is directly seralized into a pandas dataframe
|
||||||
|
tbl: pa.Table
|
||||||
|
|
||||||
|
def to_arrow(self) -> pa.Table:
|
||||||
|
return self.tbl
|
||||||
|
|
||||||
|
|
||||||
|
class LanceDBClient(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
|
||||||
|
"""Query the LanceDB server for the given table and query."""
|
||||||
|
pass
|
||||||
79
python/lancedb/remote/client.py
Normal file
79
python/lancedb/remote/client.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import functools
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
import attr
|
||||||
|
import pyarrow as pa
|
||||||
|
|
||||||
|
from lancedb.remote import VectorQuery, VectorQueryResult
|
||||||
|
from lancedb.remote.errors import LanceDBClientError
|
||||||
|
|
||||||
|
|
||||||
|
def _check_not_closed(f):
|
||||||
|
@functools.wraps(f)
|
||||||
|
def wrapped(self, *args, **kwargs):
|
||||||
|
if self.closed:
|
||||||
|
raise ValueError("Connection is closed")
|
||||||
|
return f(self, *args, **kwargs)
|
||||||
|
|
||||||
|
return wrapped
|
||||||
|
|
||||||
|
|
||||||
|
@attr.define(slots=False)
|
||||||
|
class RestfulLanceDBClient:
|
||||||
|
url: str
|
||||||
|
closed: bool = attr.field(default=False, init=False)
|
||||||
|
|
||||||
|
@functools.cached_property
|
||||||
|
def session(self) -> aiohttp.ClientSession:
|
||||||
|
parsed = urllib.parse.urlparse(self.url)
|
||||||
|
scheme = parsed.scheme
|
||||||
|
if not scheme.startswith("lancedb"):
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid scheme: {scheme}, must be like lancedb+<flavor>://"
|
||||||
|
)
|
||||||
|
flavor = scheme.split("+")[1]
|
||||||
|
url = f"{flavor}://{parsed.hostname}:{parsed.port}"
|
||||||
|
return aiohttp.ClientSession(url)
|
||||||
|
|
||||||
|
async def close(self):
|
||||||
|
await self.session.close()
|
||||||
|
self.closed = True
|
||||||
|
|
||||||
|
@_check_not_closed
|
||||||
|
async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
|
||||||
|
async with self.session.post(
|
||||||
|
f"/table/{table_name}/", json=query.dict(exclude_none=True)
|
||||||
|
) as resp:
|
||||||
|
resp: aiohttp.ClientResponse = resp
|
||||||
|
if 400 <= resp.status < 500:
|
||||||
|
raise LanceDBClientError(
|
||||||
|
f"Bad Request: {resp.status}, error: {await resp.text()}"
|
||||||
|
)
|
||||||
|
if 500 <= resp.status < 600:
|
||||||
|
raise LanceDBClientError(
|
||||||
|
f"Internal Server Error: {resp.status}, error: {await resp.text()}"
|
||||||
|
)
|
||||||
|
if resp.status != 200:
|
||||||
|
raise LanceDBClientError(
|
||||||
|
f"Unknown Error: {resp.status}, error: {await resp.text()}"
|
||||||
|
)
|
||||||
|
|
||||||
|
resp_body = await resp.read()
|
||||||
|
with pa.ipc.open_file(pa.BufferReader(resp_body)) as reader:
|
||||||
|
tbl = reader.read_all()
|
||||||
|
return VectorQueryResult(tbl)
|
||||||
16
python/lancedb/remote/errors.py
Normal file
16
python/lancedb/remote/errors.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
class LanceDBClientError(RuntimeError):
|
||||||
|
pass
|
||||||
@@ -14,7 +14,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import shutil
|
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
||||||
@@ -27,7 +26,6 @@ from lance.vector import vec_to_table
|
|||||||
|
|
||||||
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||||
from .query import LanceFtsQueryBuilder, LanceQueryBuilder
|
from .query import LanceFtsQueryBuilder, LanceQueryBuilder
|
||||||
from .util import get_uri_scheme
|
|
||||||
|
|
||||||
|
|
||||||
def _sanitize_data(data, schema):
|
def _sanitize_data(data, schema):
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.1.7"
|
version = "0.1.8"
|
||||||
dependencies = ["pylance>=0.4.17", "ratelimiter", "retry", "tqdm"]
|
dependencies = ["pylance>=0.4.20", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic", "attr"]
|
||||||
description = "lancedb"
|
description = "lancedb"
|
||||||
authors = [
|
authors = [
|
||||||
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
|
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
|
||||||
@@ -37,7 +37,7 @@ repository = "https://github.com/lancedb/lancedb"
|
|||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
tests = [
|
tests = [
|
||||||
"pytest", "pytest-mock", "doctest"
|
"pytest", "pytest-mock", "doctest", "pytest-asyncio"
|
||||||
]
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"ruff", "pre-commit", "black"
|
"ruff", "pre-commit", "black"
|
||||||
|
|||||||
77
python/tests/test_context.py
Normal file
77
python/tests/test_context.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from lancedb.context import contextualize
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def raw_df() -> pd.DataFrame:
|
||||||
|
return pd.DataFrame(
|
||||||
|
{
|
||||||
|
"token": [
|
||||||
|
"The",
|
||||||
|
"quick",
|
||||||
|
"brown",
|
||||||
|
"fox",
|
||||||
|
"jumped",
|
||||||
|
"over",
|
||||||
|
"the",
|
||||||
|
"lazy",
|
||||||
|
"dog",
|
||||||
|
"I",
|
||||||
|
"love",
|
||||||
|
"sandwiches",
|
||||||
|
],
|
||||||
|
"document_id": [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_contextualizer(raw_df: pd.DataFrame):
|
||||||
|
result = (
|
||||||
|
contextualize(raw_df)
|
||||||
|
.window(6)
|
||||||
|
.stride(3)
|
||||||
|
.text_col("token")
|
||||||
|
.groupby("document_id")
|
||||||
|
.to_df()["token"]
|
||||||
|
.to_list()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result == [
|
||||||
|
"The quick brown fox jumped over",
|
||||||
|
"fox jumped over the lazy dog",
|
||||||
|
"the lazy dog",
|
||||||
|
"I love sandwiches",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_contextualizer_with_threshold(raw_df: pd.DataFrame):
|
||||||
|
result = (
|
||||||
|
contextualize(raw_df)
|
||||||
|
.window(6)
|
||||||
|
.stride(3)
|
||||||
|
.text_col("token")
|
||||||
|
.groupby("document_id")
|
||||||
|
.min_window_size(4)
|
||||||
|
.to_df()["token"]
|
||||||
|
.to_list()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result == [
|
||||||
|
"The quick brown fox jumped over",
|
||||||
|
"fox jumped over the lazy dog",
|
||||||
|
]
|
||||||
27
python/tests/test_e2e_remote_db.py
Normal file
27
python/tests/test_e2e_remote_db.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from lancedb import LanceDBConnection
|
||||||
|
|
||||||
|
# TODO: setup integ test mark and script
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="Need to set up a local server")
|
||||||
|
def test_against_local_server():
|
||||||
|
conn = LanceDBConnection("lancedb+http://localhost:10024")
|
||||||
|
table = conn.open_table("sift1m_ivf1024_pq16")
|
||||||
|
df = table.search(np.random.rand(128)).to_df()
|
||||||
|
assert len(df) == 10
|
||||||
@@ -14,6 +14,7 @@ import sys
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
|
||||||
from lancedb.embeddings import with_embeddings
|
from lancedb.embeddings import with_embeddings
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -13,13 +13,13 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import lancedb.fts
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pytest
|
import pytest
|
||||||
import tantivy
|
import tantivy
|
||||||
|
|
||||||
import lancedb as ldb
|
import lancedb as ldb
|
||||||
|
import lancedb.fts
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import lancedb
|
import lancedb
|
||||||
|
|||||||
@@ -17,12 +17,15 @@ import pandas as pd
|
|||||||
import pandas.testing as tm
|
import pandas.testing as tm
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from lancedb.db import LanceDBConnection
|
||||||
from lancedb.query import LanceQueryBuilder
|
from lancedb.query import LanceQueryBuilder
|
||||||
|
|
||||||
|
|
||||||
class MockTable:
|
class MockTable:
|
||||||
def __init__(self, tmp_path):
|
def __init__(self, tmp_path):
|
||||||
self.uri = tmp_path
|
self.uri = tmp_path
|
||||||
|
self._conn = LanceDBConnection("/tmp/lance/")
|
||||||
|
|
||||||
def to_lance(self):
|
def to_lance(self):
|
||||||
return lance.dataset(self.uri)
|
return lance.dataset(self.uri)
|
||||||
|
|||||||
95
python/tests/test_remote_client.py
Normal file
95
python/tests/test_remote_client.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import attr
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import pyarrow as pa
|
||||||
|
import pytest
|
||||||
|
from aiohttp import web
|
||||||
|
|
||||||
|
from lancedb.remote.client import RestfulLanceDBClient, VectorQuery
|
||||||
|
|
||||||
|
|
||||||
|
@attr.define
|
||||||
|
class MockLanceDBServer:
|
||||||
|
runner: web.AppRunner = attr.field(init=False)
|
||||||
|
site: web.TCPSite = attr.field(init=False)
|
||||||
|
|
||||||
|
async def query_handler(self, request: web.Request) -> web.Response:
|
||||||
|
table_name = request.match_info["table_name"]
|
||||||
|
assert table_name == "test_table"
|
||||||
|
|
||||||
|
request_json = await request.json()
|
||||||
|
# TODO: do some matching
|
||||||
|
|
||||||
|
vecs = pd.Series([np.random.rand(128) for x in range(10)], name="vector")
|
||||||
|
ids = pd.Series(range(10), name="id")
|
||||||
|
df = pd.DataFrame([vecs, ids]).T
|
||||||
|
|
||||||
|
batch = pa.RecordBatch.from_pandas(
|
||||||
|
df,
|
||||||
|
schema=pa.schema(
|
||||||
|
[
|
||||||
|
pa.field("vector", pa.list_(pa.float32(), 128)),
|
||||||
|
pa.field("id", pa.int64()),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
sink = pa.BufferOutputStream()
|
||||||
|
with pa.ipc.new_file(sink, batch.schema) as writer:
|
||||||
|
writer.write_batch(batch)
|
||||||
|
|
||||||
|
return web.Response(body=sink.getvalue().to_pybytes())
|
||||||
|
|
||||||
|
async def setup(self):
|
||||||
|
app = web.Application()
|
||||||
|
app.add_routes([web.post("/table/{table_name}", self.query_handler)])
|
||||||
|
self.runner = web.AppRunner(app)
|
||||||
|
await self.runner.setup()
|
||||||
|
self.site = web.TCPSite(self.runner, "localhost", 8111)
|
||||||
|
|
||||||
|
async def start(self):
|
||||||
|
await self.site.start()
|
||||||
|
|
||||||
|
async def stop(self):
|
||||||
|
await self.runner.cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="flaky somehow, fix later")
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_e2e_with_mock_server():
|
||||||
|
mock_server = MockLanceDBServer()
|
||||||
|
await mock_server.setup()
|
||||||
|
await mock_server.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
client = RestfulLanceDBClient("lancedb+http://localhost:8111")
|
||||||
|
df = (
|
||||||
|
await client.query(
|
||||||
|
"test_table",
|
||||||
|
VectorQuery(
|
||||||
|
vector=np.random.rand(128).tolist(),
|
||||||
|
k=10,
|
||||||
|
_metric="L2",
|
||||||
|
columns=["id", "vector"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
).to_df()
|
||||||
|
|
||||||
|
assert "vector" in df.columns
|
||||||
|
assert "id" in df.columns
|
||||||
|
finally:
|
||||||
|
# make sure we don't leak resources
|
||||||
|
await mock_server.stop()
|
||||||
35
python/tests/test_remote_db.py
Normal file
35
python/tests/test_remote_db.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
|
||||||
|
from lancedb.db import LanceDBConnection
|
||||||
|
from lancedb.remote.client import VectorQuery, VectorQueryResult
|
||||||
|
|
||||||
|
|
||||||
|
class FakeLanceDBClient:
|
||||||
|
async def close(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
|
||||||
|
assert table_name == "test"
|
||||||
|
t = pa.schema([]).empty_table()
|
||||||
|
return VectorQueryResult(t)
|
||||||
|
|
||||||
|
|
||||||
|
def test_remote_db():
|
||||||
|
conn = LanceDBConnection("lancedb+http://client-will-be-injected")
|
||||||
|
setattr(conn, "_client", FakeLanceDBClient())
|
||||||
|
|
||||||
|
table = conn["test"]
|
||||||
|
table.search([1.0, 2.0]).to_df()
|
||||||
@@ -11,11 +11,13 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import functools
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from lancedb.table import LanceTable
|
from lancedb.table import LanceTable
|
||||||
|
|
||||||
|
|
||||||
@@ -23,6 +25,10 @@ class MockDB:
|
|||||||
def __init__(self, uri: Path):
|
def __init__(self, uri: Path):
|
||||||
self.uri = uri
|
self.uri = uri
|
||||||
|
|
||||||
|
@functools.cached_property
|
||||||
|
def is_managed_remote(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def db(tmp_path) -> MockDB:
|
def db(tmp_path) -> MockDB:
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "vectordb-node"
|
name = "vectordb-node"
|
||||||
version = "0.1.0"
|
version = "0.1.8"
|
||||||
description = "Serverless, low-latency vector database for AI applications"
|
description = "Serverless, low-latency vector database for AI applications"
|
||||||
license = "Apache-2.0"
|
license = "Apache-2.0"
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|||||||
@@ -97,6 +97,7 @@ fn get_index_params_builder(
|
|||||||
let ivf_params = IvfBuildParams {
|
let ivf_params = IvfBuildParams {
|
||||||
num_partitions: np,
|
num_partitions: np,
|
||||||
max_iters,
|
max_iters,
|
||||||
|
centroids: None,
|
||||||
};
|
};
|
||||||
index_builder.ivf_params(ivf_params)
|
index_builder.ivf_params(ivf_params)
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -264,6 +264,25 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
|||||||
Ok(promise)
|
Ok(promise)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn table_count_rows(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
|
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||||
|
let rt = runtime(&mut cx)?;
|
||||||
|
let channel = cx.channel();
|
||||||
|
|
||||||
|
let (deferred, promise) = cx.promise();
|
||||||
|
let table = js_table.table.clone();
|
||||||
|
|
||||||
|
rt.block_on(async move {
|
||||||
|
let num_rows_result = table.lock().unwrap().count_rows().await;
|
||||||
|
|
||||||
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
|
let num_rows = num_rows_result.or_else(|err| cx.throw_error(err.to_string()))?;
|
||||||
|
Ok(cx.number(num_rows as f64))
|
||||||
|
});
|
||||||
|
});
|
||||||
|
Ok(promise)
|
||||||
|
}
|
||||||
|
|
||||||
#[neon::main]
|
#[neon::main]
|
||||||
fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
||||||
cx.export_function("databaseNew", database_new)?;
|
cx.export_function("databaseNew", database_new)?;
|
||||||
@@ -272,6 +291,7 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
|||||||
cx.export_function("tableSearch", table_search)?;
|
cx.export_function("tableSearch", table_search)?;
|
||||||
cx.export_function("tableCreate", table_create)?;
|
cx.export_function("tableCreate", table_create)?;
|
||||||
cx.export_function("tableAdd", table_add)?;
|
cx.export_function("tableAdd", table_add)?;
|
||||||
|
cx.export_function("tableCountRows", table_count_rows)?;
|
||||||
cx.export_function(
|
cx.export_function(
|
||||||
"tableCreateVectorIndex",
|
"tableCreateVectorIndex",
|
||||||
index::vector::table_create_vector_index,
|
index::vector::table_create_vector_index,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "vectordb"
|
name = "vectordb"
|
||||||
version = "0.0.1"
|
version = "0.1.8"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
description = "Serverless, low-latency vector database for AI applications"
|
description = "Serverless, low-latency vector database for AI applications"
|
||||||
license = "Apache-2.0"
|
license = "Apache-2.0"
|
||||||
@@ -14,7 +14,7 @@ arrow-data = "37.0"
|
|||||||
arrow-schema = "37.0"
|
arrow-schema = "37.0"
|
||||||
object_store = "0.5.6"
|
object_store = "0.5.6"
|
||||||
snafu = "0.7.4"
|
snafu = "0.7.4"
|
||||||
lance = "0.4.17"
|
lance = "0.4.21"
|
||||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ impl Database {
|
|||||||
///
|
///
|
||||||
/// * A [Database] object.
|
/// * A [Database] object.
|
||||||
pub async fn connect(uri: &str) -> Result<Database> {
|
pub async fn connect(uri: &str) -> Result<Database> {
|
||||||
let object_store = ObjectStore::new(uri).await?;
|
let (object_store, _) = ObjectStore::from_uri(uri).await?;
|
||||||
if object_store.is_local() {
|
if object_store.is_local() {
|
||||||
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
|
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
|
||||||
}
|
}
|
||||||
@@ -69,7 +69,7 @@ impl Database {
|
|||||||
pub async fn table_names(&self) -> Result<Vec<String>> {
|
pub async fn table_names(&self) -> Result<Vec<String>> {
|
||||||
let f = self
|
let f = self
|
||||||
.object_store
|
.object_store
|
||||||
.read_dir("/")
|
.read_dir(self.uri.as_str())
|
||||||
.await?
|
.await?
|
||||||
.iter()
|
.iter()
|
||||||
.map(|fname| Path::new(fname))
|
.map(|fname| Path::new(fname))
|
||||||
|
|||||||
@@ -20,6 +20,8 @@ pub trait VectorIndexBuilder {
|
|||||||
fn get_column(&self) -> Option<String>;
|
fn get_column(&self) -> Option<String>;
|
||||||
fn get_index_name(&self) -> Option<String>;
|
fn get_index_name(&self) -> Option<String>;
|
||||||
fn build(&self) -> VectorIndexParams;
|
fn build(&self) -> VectorIndexParams;
|
||||||
|
|
||||||
|
fn get_replace(&self) -> bool;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct IvfPQIndexBuilder {
|
pub struct IvfPQIndexBuilder {
|
||||||
@@ -28,6 +30,7 @@ pub struct IvfPQIndexBuilder {
|
|||||||
metric_type: Option<MetricType>,
|
metric_type: Option<MetricType>,
|
||||||
ivf_params: Option<IvfBuildParams>,
|
ivf_params: Option<IvfBuildParams>,
|
||||||
pq_params: Option<PQBuildParams>,
|
pq_params: Option<PQBuildParams>,
|
||||||
|
replace: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IvfPQIndexBuilder {
|
impl IvfPQIndexBuilder {
|
||||||
@@ -38,6 +41,7 @@ impl IvfPQIndexBuilder {
|
|||||||
metric_type: None,
|
metric_type: None,
|
||||||
ivf_params: None,
|
ivf_params: None,
|
||||||
pq_params: None,
|
pq_params: None,
|
||||||
|
replace: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -67,6 +71,11 @@ impl IvfPQIndexBuilder {
|
|||||||
self.pq_params = Some(pq_params);
|
self.pq_params = Some(pq_params);
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn replace(&mut self, replace: bool) -> &mut IvfPQIndexBuilder {
|
||||||
|
self.replace = replace;
|
||||||
|
self
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl VectorIndexBuilder for IvfPQIndexBuilder {
|
impl VectorIndexBuilder for IvfPQIndexBuilder {
|
||||||
@@ -84,6 +93,10 @@ impl VectorIndexBuilder for IvfPQIndexBuilder {
|
|||||||
|
|
||||||
VectorIndexParams::with_ivf_pq_params(pq_params.metric_type, ivf_params, pq_params)
|
VectorIndexParams::with_ivf_pq_params(pq_params.metric_type, ivf_params, pq_params)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_replace(&self) -> bool {
|
||||||
|
self.replace
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -177,7 +177,7 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_setters_getters() {
|
async fn test_setters_getters() {
|
||||||
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
let ds = Dataset::write(&mut batches, ":memory:", None)
|
let ds = Dataset::write(&mut batches, "memory://foo", None)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
@@ -206,7 +206,7 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_execute() {
|
async fn test_execute() {
|
||||||
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
let ds = Dataset::write(&mut batches, ":memory:", None)
|
let ds = Dataset::write(&mut batches, "memory://foo", None)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -130,6 +130,7 @@ impl Table {
|
|||||||
IndexType::Vector,
|
IndexType::Vector,
|
||||||
index_builder.get_index_name(),
|
index_builder.get_index_name(),
|
||||||
&index_builder.build(),
|
&index_builder.build(),
|
||||||
|
index_builder.get_replace(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
self.dataset = Arc::new(dataset);
|
self.dataset = Arc::new(dataset);
|
||||||
@@ -233,7 +234,7 @@ mod tests {
|
|||||||
let uri = tmp_dir.path().to_str().unwrap();
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
|
||||||
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
let schema = batches.schema().clone();
|
let _ = batches.schema().clone();
|
||||||
Table::create(&uri, "test", batches).await.unwrap();
|
Table::create(&uri, "test", batches).await.unwrap();
|
||||||
|
|
||||||
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
|
|||||||
Reference in New Issue
Block a user