Compare commits

...

111 Commits

Author SHA1 Message Date
Will Jones
5d63b215e0 try manylinux again 2023-05-24 20:54:47 -07:00
Will Jones
c4237c61d5 fixes for action 2023-05-24 18:22:08 -07:00
Will Jones
47f5163768 cleanup 2023-05-23 18:43:24 -07:00
Will Jones
7ea809d8a0 match versions 2023-05-23 17:41:42 -07:00
Will Jones
8d7726b1ea fill out rest of release script 2023-05-23 17:40:34 -07:00
Will Jones
f5bb2a2096 more progress on release workflow 2023-05-23 17:14:58 -07:00
Will Jones
89aee07fa9 wip: see if we can build the lib in ci 2023-05-22 14:27:47 -07:00
Will Jones
aac2ffa4b3 Lint and test vectordb node in CI (#92)
Closes #90.
2023-05-22 14:26:06 -07:00
gsilvestrin
e28fe7b468 nodejs append records api (#85) 2023-05-18 15:13:57 -07:00
gsilvestrin
61b9479bd9 JavaScript client initial linux support (#84) 2023-05-16 17:04:06 -07:00
gsilvestrin
961d892c89 Added TypeScript example (#82) 2023-05-16 13:40:52 -07:00
Jai
0b35e6dfa9 node quickstart (#83) 2023-05-16 09:53:04 -07:00
Jai
ca96fc55f6 add link to node quickstart to readme (#81) 2023-05-16 09:24:12 -07:00
gsilvestrin
395c7460d5 nodejs create_table (#75) 2023-05-15 19:00:17 -07:00
Jai
92d810eac4 docs build (#78) 2023-05-14 10:18:28 -07:00
Jai
a55a579b7f nodejs read only example (#77) 2023-05-12 15:50:59 -07:00
gsilvestrin
202924f832 updated node example (#74) 2023-05-11 12:55:02 -07:00
gsilvestrin
648f8123ca Exposing limit parameter (#73) 2023-05-11 09:12:06 -07:00
gsilvestrin
5bb5b0a685 javascript example improvements (#72) 2023-05-10 22:06:44 -07:00
gsilvestrin
c2e73262ef bump version and skipping building the native lib during install (#71) 2023-05-10 15:10:46 -07:00
gsilvestrin
f5bf6181e3 Merge pull request #70 from lancedb/gsilvestrin/nodejs_client-merge
JavaScript / Node.js library for LanceDB
2023-05-10 13:44:52 -07:00
gsilvestrin
c2dc1da509 Removing sample db 2023-05-10 13:40:17 -07:00
gsilvestrin
38e6efc185 JavaScript / Node.js library for LanceDB
- Core rust library
- ffi bridge that exposes rust functionality to javascript
- npm package that provides a TypeScript / JavaScript library
- limitations: it only supports reading for now
2023-05-10 12:51:49 -07:00
Chang She
636a6d3761 Merge pull request #65 from lancedb/jaichopra/add-youtube-transcript-example 2023-05-08 17:45:35 -07:00
Jai Chopra
2a855c9f6a update image url 2023-05-08 17:39:52 -07:00
Jai Chopra
5c47b0c6a2 add youtube transcript example 2023-05-08 17:38:08 -07:00
Jai
d12bc24091 Merge pull request #63 from lancedb/jaichopra/update-readme-ecosystem
update ecosystem in readme
2023-05-07 09:12:25 -07:00
Jai Chopra
c4261b23e6 update blog url 2023-05-07 08:18:24 -07:00
Jai Chopra
ab0abbbfab update ecosystem in readme 2023-05-07 08:17:02 -07:00
Chang She
13c9a2e1c9 Merge pull request #61 from lancedb/jaichopra/langchain-example-doc
add langchain example
2023-05-05 16:06:40 -07:00
Jai Chopra
7e3db16225 add langchain example 2023-05-05 16:00:14 -07:00
Jai
62abe2d96f Merge pull request #57 from lancedb/jaichopra/s3-lambda-docs
S3 Lambda example
2023-05-05 14:08:24 -07:00
Chang She
59014a01e0 bump version for v0.1.2 2023-05-05 11:27:09 -07:00
Jai Chopra
11f423ccf5 clean up 2023-05-04 17:21:53 -07:00
Chang She
47ae17ea05 Merge pull request #58 from lancedb/changhiskhan/parse-schema
Add method to get the URI scheme to support cloud storage
2023-05-04 14:36:45 -07:00
Chang She
b6739f3f66 windows paths 2023-05-04 11:41:05 -07:00
Jai Chopra
6ff3c60cd1 clean up example 2023-05-04 10:14:31 -07:00
Chang She
3a2df0ce45 Add method to get the URI scheme to support cloud storage 2023-05-04 09:47:03 -07:00
Jai Chopra
6556e42e6d update lambda example to lancedb 2023-05-04 08:17:13 -07:00
Jai Chopra
c3d90b2c78 update tagline 2023-05-04 08:17:13 -07:00
Jai Chopra
66f7d5cec9 also update docs index 2023-05-04 08:17:13 -07:00
Jai Chopra
4336ed050d add new feature to readme.md 2023-05-04 08:17:13 -07:00
Lei Xu
976344257c add cargo metadata 2023-05-04 08:17:13 -07:00
Lei Xu
906551b001 initialize the rust core 2023-05-04 08:17:13 -07:00
Chang She
33ac42a51c bump version for v0.1.1 2023-05-04 08:17:13 -07:00
Chang She
c0bc65cdfa Merge pull request #55 from lancedb/jaichopra/update-tagline
update tagline
2023-05-03 21:06:41 -07:00
Jai Chopra
298b81f0b0 update tagline 2023-05-03 19:55:10 -07:00
Jai
fe7a3ccd60 Merge pull request #53 from lancedb/jaichopra/update-major-features-readme
also update docs index
2023-05-03 07:51:54 -07:00
Jai Chopra
baf8d7c1a1 also update docs index 2023-05-03 07:50:44 -07:00
Chang She
2021e1bf6d Merge pull request #52 from lancedb/jaichopra/update-major-features-readme 2023-05-03 07:36:09 -07:00
Jai Chopra
2dbe71cf88 add new feature to readme.md 2023-05-03 07:30:46 -07:00
Jai
7cd36196b4 Update langchain.md 2023-04-27 11:08:29 -07:00
Lei Xu
afe19ade7f Merge pull request #49 from lancedb/lei/rust_core
Rust core directory
2023-04-27 10:40:21 -07:00
Lei Xu
118efdce73 add cargo metadata 2023-04-27 10:36:01 -07:00
Lei Xu
b0426387e7 initialize the rust core 2023-04-27 10:31:50 -07:00
Jai
87fb4d0645 Update langchain.md 2023-04-27 07:13:18 -07:00
Jai
c930b94917 Update s3_lambda.md 2023-04-27 07:12:52 -07:00
Chang She
afa7fe19e6 bump version for v0.1.1 2023-04-26 16:55:25 -07:00
Jai
aa23d911f5 Update langchain.md 2023-04-26 14:50:09 -07:00
Jai Chopra
ca8d8e82b7 add simple langchain example 2023-04-26 14:44:20 -07:00
Jai
3d3ba913ed Update s3_lambda.md 2023-04-26 10:19:27 -07:00
Jai
0346d5319e Update s3_lambda.md 2023-04-26 10:18:47 -07:00
Jai
41eadf6fd9 Update s3_lambda.md 2023-04-26 10:18:31 -07:00
Jai Chopra
e784c6311d tree github build script from remote 2023-04-25 21:40:28 -07:00
Chang She
66080d791b Merge pull request #46 from lancedb/changhiskhan/improve-index-docs 2023-04-25 21:13:51 -07:00
Chang She
5554fddd54 Merge branch 'main' into changhiskhan/improve-index-docs 2023-04-25 21:04:01 -07:00
Chang She
f06ea935fe Merge pull request #47 from lancedb/changhiskhan/expose-metric
Make distance metric configurable in LanceDB
2023-04-25 21:02:59 -07:00
Chang She
a8db7f56d2 tolerance 2023-04-25 20:08:18 -07:00
Chang She
7a375185a1 increment lance version to include cosine distance fix 2023-04-25 19:57:58 -07:00
Chang She
6592b4c13b document metric in create_index 2023-04-24 22:46:21 -07:00
Chang She
72a44eb927 specify metric during index creation 2023-04-24 22:45:37 -07:00
Chang She
b0e578c609 add documentation for metric 2023-04-24 22:42:30 -07:00
Chang She
89e6232aeb Make distance metric configurable during search 2023-04-24 22:40:40 -07:00
Chang She
44ea687984 Merge pull request #45 from lancedb/changhiskhan/notebook-fix
Minor notebook fix. Closes #40
2023-04-24 20:12:03 -07:00
Chang She
4f2dae8a0d Add more detailed docs for the ANN index and search features 2023-04-24 19:19:55 -07:00
Chang She
5e748e6e70 Minor notebook fix. Closes #40 2023-04-24 18:46:05 -07:00
Chang She
177192f852 Merge pull request #37 from lancedb/gsilvestrin/ratelimit_3.11
skipping embeddings rate limit when python version > 3.10
2023-04-22 21:03:18 -07:00
Lei Xu
1fb596942f Merge pull request #39 from wilhelmjung/patch-1
Update index.md
2023-04-22 20:32:36 -07:00
YangWeiliang_DeepNova@Deepexi
73d3cb78e6 Update index.md
Just a typo. Fixed.
2023-04-23 09:52:23 +08:00
gsilvestrin
a1583444ec add ann_index to main doc page 2023-04-20 16:07:25 -07:00
gsilvestrin
78e4f4d1a8 add ann_index to main doc page 2023-04-20 13:19:10 -07:00
gsilvestrin
b92eb988b6 add ann_index to main doc page 2023-04-20 11:51:42 -07:00
gsilvestrin
0cd092814d skipping rate limit when python version > 3.10 2023-04-20 10:28:14 -07:00
Jai
a6294925df Update README.md 2023-04-20 10:22:03 -07:00
Chang She
342b726ed7 bump version for v0.1 2023-04-19 23:26:46 -07:00
Chang She
159b175316 Merge pull request #34 from lancedb/changhiskhan/overwrite-table
Add mode to overwrite table if already exists
2023-04-19 21:11:56 -07:00
Chang She
7876156d54 Merge pull request #36 from lancedb/gsilvestrin/ann_docs
[DOC] ann indexes documentation
2023-04-19 21:11:26 -07:00
Chang She
d64e85e9d7 Merge pull request #35 from lancedb/changhiskhan/table-versioning
expose methods to work with versioning in tables
2023-04-19 21:09:32 -07:00
gsilvestrin
3e79b4d9cb review comments 2023-04-19 20:33:23 -07:00
gsilvestrin
3eac75e61a review comments 2023-04-19 20:23:18 -07:00
gsilvestrin
b19ce10184 review comments 2023-04-19 19:28:40 -07:00
gsilvestrin
ce34d055af search docs 2023-04-19 19:26:27 -07:00
gsilvestrin
8bf4d169e2 wip ann indexes doc 2023-04-19 17:23:31 -07:00
gsilvestrin
4f7f33f7b7 wip ann indexes doc 2023-04-19 17:23:06 -07:00
gsilvestrin
32b21e1d20 wip ann indexes doc 2023-04-19 17:20:58 -07:00
Chang She
6062bfdb8f fix docs link 2023-04-19 16:59:30 -07:00
Chang She
93a5c5c15c Merge pull request #33 from lancedb/changhiskhan/doc-embedding-function
[DOC] embedding function documentation
2023-04-19 16:55:22 -07:00
Chang She
99310e099e expose methods to work with versioning in tables 2023-04-19 16:48:06 -07:00
Chang She
85dda53779 review comments 2023-04-19 16:35:48 -07:00
Chang She
d7c5793803 Add mode to overwrite table if already exists 2023-04-19 16:22:11 -07:00
Chang She
08e67d04bb [DOC] embedding function documentation 2023-04-19 15:54:32 -07:00
Lei Xu
ec197b1855 Merge pull request #31 from lancedb/lei/doc
[Doc] Pandas, Parrow, DuckDB integration
2023-04-19 14:55:42 -07:00
Lei Xu
23d4e3561f add link to basic and indexing 2023-04-19 14:53:45 -07:00
Lei Xu
de6bfab124 comments 2023-04-19 14:42:43 -07:00
Chang She
d7fb2b1d6b Merge pull request #32 from lancedb/changhiskhan/doc-basic-ops
[DOC] basic operations
2023-04-19 14:39:40 -07:00
Chang She
cdb534076f [DOC] basic operations 2023-04-19 14:29:03 -07:00
Lei Xu
c38d80cab2 remove print 2023-04-19 14:26:07 -07:00
Lei Xu
45e02bb62b no polars for now 2023-04-19 14:24:58 -07:00
Lei Xu
b3fdabdf45 use python and arrow 2023-04-19 14:15:18 -07:00
Chang She
1c3f9f1e3b Merge pull request #26 from lancedb/changhiskhan/invalidate-dataset
invalidate cached dataset after create_index and add
2023-04-19 14:02:29 -07:00
Chang She
f0ea1d898b invalidate cached dataset after create_index and add 2023-04-18 16:51:26 -07:00
64 changed files with 14377 additions and 76 deletions

View File

@@ -40,9 +40,8 @@ jobs:
python -m pip install -e .
python -m pip install -r ../docs/requirements.txt
- name: Build docs
working-directory: docs
run: |
mkdocs build
PYTHONPATH=. mkdocs build -f docs/mkdocs.yml
- name: Setup Pages
uses: actions/configure-pages@v2
- name: Upload artifact

105
.github/workflows/node.yml vendored Normal file
View File

@@ -0,0 +1,105 @@
name: Node
on:
push:
branches:
- main
pull_request:
paths:
- node/**
- rust/ffi/node/**
- .github/workflows/node.yml
env:
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
RUSTFLAGS: "-C debuginfo=1"
RUST_BACKTRACE: "1"
jobs:
lint:
name: Lint
runs-on: ubuntu-22.04
defaults:
run:
shell: bash
working-directory: node
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
with:
node-version: 18
cache: 'npm'
cache-dependency-path: node/package-lock.json
- name: Lint
run: |
npm ci
npm run lint
linux:
name: Linux (Node ${{ matrix.node-version }})
timeout-minutes: 30
strategy:
matrix:
node-version: [ "16", "18" ]
runs-on: "ubuntu-22.04"
defaults:
run:
shell: bash
working-directory: node
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
cache-dependency-path: node/package-lock.json
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
- name: Test
run: npm run test
macos:
timeout-minutes: 30
runs-on: "macos-13"
defaults:
run:
shell: bash
working-directory: node
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- uses: actions/setup-node@v3
with:
node-version: 18
cache: 'npm'
cache-dependency-path: node/package-lock.json
- uses: Swatinem/rust-cache@v2
- name: Install dependencies
run: brew install protobuf
- name: Build
run: |
npm ci
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
- name: Test
run: |
npm run test

201
.github/workflows/release.yml vendored Normal file
View File

@@ -0,0 +1,201 @@
name: Prepare Release
# Based on https://github.com/dherman/neon-prebuild-example/blob/eaa4d33d682e5eb7abbc3da7aed153a1b1acb1b3/.github/workflows/publish.yml
on:
push:
tags:
- v*
jobs:
draft-release:
runs-on: ubuntu-latest
steps:
- uses: softprops/action-gh-release@v1
with:
draft: true
prerelease: true # hardcoded on for now
generate_release_notes: true
rust:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: rust/vectordb
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Package Rust
run: cargo package --all-features
- uses: softprops/action-gh-release@v1
with:
draft: true
files: target/vectordb-*.crate
python:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Build wheel
run: |
pip install wheel
python setup.py sdist bdist_wheel
- uses: softprops/action-gh-release@v1
with:
draft: true
files: |
python/dist/lancedb-*.tar.gz
python/dist/lancedb-*.whl
node:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: node
steps:
- name: Checkout
uses: actions/checkout@v2
- uses: actions/setup-node@v3
with:
node-version: 20
cache: 'npm'
cache-dependency-path: node/package-lock.json
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run tsc
npm pack
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/vectordb-*.tgz
node-macos:
runs-on: macos-12
needs: draft-release
strategy:
fail-fast: false
matrix:
target: [x86_64-apple-darwin, aarch64-apple-darwin]
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install system dependencies
run: brew install protobuf
- name: Install npm dependencies
run: |
cd node
npm ci
- name: Build MacOS native node modules
run: bash ci/build_macos_artifacts.sh ${{ matrix.target }}
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/dist/vectordb-darwin*.tgz
node-linux:
name: Linux ${{ matrix.settings.arch}} native node module
runs-on: ubuntu-latest
needs: draft-release
strategy:
fail-fast: false
matrix:
libc:
- gnu
- musl
settings:
- container: quay.io/pypa/manylinux2014_x86_64
arch: x86_64
protoc_arch: x86_64
- container: quay.io/pypa/manylinux2014_aarch64
arch: aarch64
protoc_arch: aarch_64
container: ${{ matrix.settings.container }}
defaults:
run:
shell: bash
working-directory: node
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install system dependencies
run: |
yum install -y openssl-devel unzip
# Install new enough protobuf (yum-provided is old)
PB_REL=https://github.com/protocolbuffers/protobuf/releases \
PB_VERSION=23.1 \
curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-${{ matrix.settings.protoc_arch }}.zip && \
unzip protoc-$PB_VERSION-linux-${{ matrix.settings.protoc_arch }}.zip -d /usr/local
- uses: actions/setup-node@v3
with:
node-version: 18
- name: Setup Rust
uses: ATiltedTree/setup-rust@v1
with:
rust-version: stable
- name: Install npm dependencies
run: npm ci
- name: Build ${{ matrix.settings.libc }} library
run: |
export BUILD_TARGET=${{ matrix.settings.arch }}-unknown-linux-${{ matrix.settings.libc }}
rustup target add $BUILD_TARGET
export RUSTFLAGS="-C target-feature=-crt-static"
npm run build-release -- --target $BUILD_TARGET
npm run pack-build -- --target $BUILD_TARGET
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/dist/vectordb-linux*.tgz
release:
needs: [python, node, node-macos, node-linux, rust]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- name: Publish to PyPI
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
run: |
python -m twine upload --non-interactive \
--skip-existing \
--repository testpypi python/dist/*
- name: Publish to NPM
run: |
for filename in node/dist/*.tgz; do
npm publish --dry-run $filename
done
- name: Publish to crates.io
env:
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
run: |
cargo publish --dry-run --no-verify rust/target/vectordb-*.crate
# - uses: softprops/action-gh-release@v1
# with:
# draft: false

17
.gitignore vendored
View File

@@ -2,6 +2,9 @@
**/*.whl
*.egg-info
**/__pycache__
.DS_Store
.vscode
rust/target
rust/Cargo.lock
@@ -15,3 +18,17 @@ python/build
python/dist
notebooks/.ipynb_checkpoints
**/.hypothesis
## Javascript
*.node
**/node_modules
**/.DS_Store
node/dist
node/examples/**/package-lock.json
node/examples/**/dist
## Rust
target

3791
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

6
Cargo.toml Normal file
View File

@@ -0,0 +1,6 @@
[workspace]
members = [
"rust/vectordb",
"rust/ffi/node"
]
resolver = "2"

39
Cross.toml Normal file
View File

@@ -0,0 +1,39 @@
# These make sure our builds are compatible with old glibc versions.
[target.x86_64-unknown-linux-gnu]
pre-build = [
# Install newer gfortran
"yum install -y openssl-devel unzip gcc-gfortran",
"scl enable devtoolset-11 bash",
# protobuf is too old, so we directly download binaries
"PB_REL=https://github.com/protocolbuffers/protobuf/releases",
"PB_VERSION=23.1",
"curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-x86_64.zip",
"unzip protoc-$PB_VERSION-linux-x86_64.zip -d /usr/local",
]
image = "ghcr.io/cross-rs/x86_64-unknown-linux-gnu:main-centos"
[target.aarch64-unknown-linux-gnu]
pre-build = [
"yum install -y openssl-devel unzip",
# protobuf is too old, so we directly download binaries
"PB_REL=https://github.com/protocolbuffers/protobuf/releases",
"PB_VERSION=23.1",
"curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-x86_64.zip",
"unzip protoc-$PB_VERSION-linux-x86_64.zip -d /usr/local",
]
# https://github.com/cross-rs/cross/blob/main/docker/Dockerfile.aarch64-unknown-linux-gnu.centos
image = "ghcr.io/cross-rs/aarch64-unknown-linux-gnu:main-centos"
[target.x86_64-unknown-linux-musl]
# https://github.com/cross-rs/cross/blob/main/docker/Dockerfile.x86_64-unknown-linux-musl
pre-build = [
"dpkg --add-architecture $CROSS_DEB_ARCH",
"apt-get update && apt-get install --assume-yes libssl-dev:$CROSS_DEB_ARCH",
]
[target.aarch64-unknown-linux-musl]
# https://github.com/cross-rs/cross/blob/main/docker/Dockerfile.aarch64-unknown-linux-musl
pre-build = [
"dpkg --add-architecture $CROSS_DEB_ARCH",
"apt-get update && apt-get install --assume-yes libssl-dev:$CROSS_DEB_ARCH",
]

View File

@@ -3,12 +3,12 @@
<img width="275" alt="LanceDB Logo" src="https://user-images.githubusercontent.com/917119/226205734-6063d87a-1ecc-45fe-85be-1dea6383a3d8.png">
**Serverless, low-latency vector database for AI applications**
**Developer-friendly, serverless vector database for AI applications**
<a href="https://lancedb.github.io/lancedb/">Documentation</a>
<a href="https://blog.eto.ai/">Blog</a>
<a href="https://blog.lancedb.com/">Blog</a>
<a href="https://discord.gg/zMM32dvNtd">Discord</a>
<a href="https://twitter.com/etodotai">Twitter</a>
<a href="https://twitter.com/lancedb">Twitter</a>
</p>
</div>
@@ -21,23 +21,41 @@ The key features of LanceDB include:
* Production-scale vector search with no servers to manage.
* Combine attribute-based information with vectors and store them as a single source-of-truth.
* Store, query and filter vectors, metadata and multi-modal data (text, images, videos, point clouds, and more).
* Native Python and Javascript/Typescript support.
* Zero-copy, automatic versioning, manage versions of your data without needing extra infrastructure.
* Ecosystem integrations: Apache-Arrow, Pandas, Polars, DuckDB and more on the way.
* Ecosystem integrations with [LangChain 🦜️🔗](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lanecdb.html), [LlamaIndex 🦙](https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html), Apache-Arrow, Pandas, Polars, DuckDB and more on the way.
LanceDB's core is written in Rust 🦀 and is built using <a href="https://github.com/eto-ai/lance">Lance</a>, an open-source columnar format designed for performant ML workloads.
## Quick Start
**Installation**
**Javascript**
```shell
npm install vectordb
```
```javascript
const lancedb = require('vectordb');
const db = await lancedb.connect('data/sample-lancedb');
const table = await db.createTable('vectors',
[{ id: 1, vector: [0.1, 0.2], item: "foo", price: 10 },
{ id: 2, vector: [1.1, 1.2], item: "bar", price: 50 }])
const query = table.search([0.1, 0.3]);
query.limit = 20;
const results = await query.execute();
```
**Python**
```shell
pip install lancedb
```
**Quickstart**
```python
import lancedb

View File

@@ -0,0 +1,40 @@
# Builds the Linux artifacts (node binaries).
# Usage: ./build_linux_artifacts.sh [target]
# Targets supported:
# - x86_64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-musl
# - x86_64-unknown-linux-musl
# On MacOS, need to run in a linux container:
# docker run -v $(pwd):/io -w /io
# Must run rustup toolchain install stable-x86_64-unknown-linux-gnu --force-non-host
set -e
build_node_binaries() {
pushd node
for target in $1
do
echo "Building node library for $target"
# cross doesn't yet pass this down to Docker, so we do it ourselves.
export CROSS_CONTAINER_OPTS="--platform linux/amd64"
if [[ $target == *musl ]]; then
# This is needed for cargo to allow build cdylibs with musl
RUSTFLAGS="-C target-feature=-crt-static"
fi
npm run cross-release -- --target $target
npm run pack-build -- --target $target
done
popd
}
if [ -n "$1" ]; then
targets=$1
else
# targets="x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu aarch64-unknown-linux-musl x86_64-unknown-linux-musl"
targets="aarch64-unknown-linux-gnu"
fi
build_node_binaries $targets

View File

@@ -0,0 +1,22 @@
# Builds the macOS artifacts (node binaries).
# Usage: ./build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
build_node_binaries() {
pushd node
for target in $1
do
echo "Building node library for $target"
npm run build-release -- --target $target
npm run pack-build -- --target $target
done
popd
}
if [ -n "$1" ]; then
targets=$1
else
targets="x86_64-apple-darwin aarch64-apple-darwin"
fi
build_node_binaries $targets

View File

@@ -0,0 +1,31 @@
# On MacOS, need to run in a linux container:
# cat ci/ubuntu_build.dockerfile | docker build -t lancedb-node-build -
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $(pwd):/io -w /io lancedb-node-build bash ci/build_linux_artifacts.sh
FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Europe/Moscow
RUN apt update && apt install -y protobuf-compiler libssl-dev build-essential curl \
software-properties-common npm docker.io
# Install rust
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Install cross
# https://github.com/cross-rs/cross/issues/1257#issuecomment-1544553706
RUN cargo install cross --git https://github.com/cross-rs/cross
# Install additional build targets
RUN rustup target add x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu aarch64-unknown-linux-musl x86_64-unknown-linux-musl
# Install node
RUN npm install npm@latest -g && \
npm install n -g && \
n latest
# set CROSS_CONTAINER_IN_CONTAINER to inform `cross` that it is executed from within a container
ENV CROSS_CONTAINER_IN_CONTAINER=true
ENV CROSS_CONTAINER_ENGINE_NO_BUILDKIT=1

View File

@@ -3,12 +3,30 @@ docs_dir: src
theme:
name: "material"
features:
- content.code.copy
plugins:
- search
- mkdocstrings
- mkdocstrings:
handlers:
python:
paths: [../python]
- mkdocs-jupyter
nav:
- Home: index.md
- Basics: basic.md
- Embeddings: embedding.md
- Indexing: ann_indexes.md
- Integrations: integrations.md
- Python API: python.md
markdown_extensions:
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets
- pymdownx.superfences

95
docs/src/ann_indexes.md Normal file
View File

@@ -0,0 +1,95 @@
# ANN (Approximate Nearest Neighbor) Indexes
You can create an index over your vector data to make search faster.
Vector indexes are faster but less accurate than exhaustive search.
LanceDB provides many parameters to fine-tune the index's size, the speed of queries, and the accuracy of results.
Currently, LanceDB does *not* automatically create the ANN index.
LanceDB has optimized code for KNN as well. For many use-cases, datasets under 100K vectors won't require index creation at all.
If you can live with <100ms latency, skipping index creation is a simpler workflow while guaranteeing 100% recall.
In the future we will look to automatically create and configure the ANN index.
## Creating an ANN Index
Creating indexes is done via the [create_index](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.create_index) method.
```python
import lancedb
import numpy as np
uri = "~/.lancedb"
db = lancedb.connect(uri)
# Create 10,000 sample vectors
data = [{"vector": row, "item": f"item {i}"}
for i, row in enumerate(np.random.random((10_000, 768)).astype('float32'))]
# Add the vectors to a table
tbl = db.create_table("my_vectors", data=data)
# Create and train the index - you need to have enough data in the table for an effective training step
tbl.create_index(num_partitions=256, num_sub_vectors=96)
```
Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index
creation by providing the following parameters:
- **metric** (default: "L2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance.
- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table
with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional.
A higher number leads to faster queries, but it makes index generation slower.
- **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes
search more accurate, but also makes the index larger and slower to build.
## Querying an ANN Index
Querying vector indexes is done via the [search](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.search) function.
There are a couple of parameters that can be used to fine-tune the search:
- **limit** (default: 10): The amount of results that will be returned
- **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower.<br/>
Most of the time, setting nprobes to cover 5-10% of the dataset should achieve high recall with low latency.<br/>
e.g., for 1M vectors divided up into 256 partitions, nprobes should be set to ~20-40.<br/>
Note: nprobes is only applicable if an ANN index is present. If specified on a table without an ANN index, it is ignored.
- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory.<br/>
A higher number makes search more accurate but also slower. If you find the recall is less than idea, try refine_factor=10 to start.<br/>
e.g., for 1M vectors divided into 256 partitions, if you're looking for top 20, then refine_factor=200 reranks the whole partition.<br/>
Note: refine_factor is only applicable if an ANN index is present. If specified on a table without an ANN index, it is ignored.
```python
tbl.search(np.random.random((768))) \
.limit(2) \
.nprobes(20) \
.refine_factor(10) \
.to_df()
vector item score
0 [0.44949695, 0.8444449, 0.06281311, 0.23338133... item 1141 103.575333
1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867
```
The search will return the data requested in addition to the score of each item.
**Note:** The score is the distance between the query vector and the element. A lower number means that the result is more relevant.
### Filtering (where clause)
You can further filter the elements returned by a search using a where clause.
```python
tbl.search(np.random.random((768))).where("item != 'item 1141'").to_df()
```
### Projections (select clause)
You can select the columns returned by the query using a select clause.
```python
tbl.search(np.random.random((768))).select(["vector"]).to_df()
vector score
0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092
1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 95.173485
...
```

77
docs/src/basic.md Normal file
View File

@@ -0,0 +1,77 @@
# Basic LanceDB Functionality
## How to connect to a database
In local mode, LanceDB stores data in a directory on your local machine. To connect to a local database, you can use the following code:
```python
import lancedb
uri = "~/.lancedb"
db = lancedb.connect(uri)
```
LanceDB will create the directory if it doesn't exist (including parent directories).
If you need a reminder of the uri, use the `db.uri` property.
## How to create a table
To create a table, you can use the following code:
```python
tbl = db.create_table("my_table",
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
```
Under the hood, LanceDB is converting the input data into an Apache Arrow table
and persisting it to disk in [Lance format](github.com/eto-ai/lance).
If the table already exists, LanceDB will raise an error by default.
If you want to overwrite the table, you can pass in `mode="overwrite"`
to the `create_table` method.
You can also pass in a pandas DataFrame directly:
```python
import pandas as pd
df = pd.DataFrame([{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
tbl = db.create_table("table_from_df", data=df)
```
## How to open an existing table
Once created, you can open a table using the following code:
```python
tbl = db.open_table("my_table")
```
If you forget the name of your table, you can always get a listing of all table names:
```python
db.table_names()
```
## How to add data to a table
After a table has been created, you can always add more data to it using
```python
df = pd.DataFrame([{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}])
tbl.add(df)
```
## How to search for (approximate) nearest neighbors
Once you've embedded the query, you can find its nearest neighbors using the following code:
```python
tbl.search([100, 100]).limit(2).to_df()
```
This returns a pandas DataFrame with the results.
## What's next
This section covered the very basics of the LanceDB API.
LanceDB supports many additional features when creating indices to speed up search and options for search.
These are contained in the next section of the documentation.

97
docs/src/embedding.md Normal file
View File

@@ -0,0 +1,97 @@
# Embedding Functions
Embeddings are high dimensional floating-point vector representations of your data or query.
Anything can be embedded using some embedding model or function.
For a given embedding function, the output will always have the same number of dimensions.
## Creating an embedding function
Any function that takes as input a batch (list) of data and outputs a batch (list) of embeddings
can be used by LanceDB as an embedding function. The input and output batch sizes should be the same.
### HuggingFace example
One popular free option would be to use the [sentence-transformers](https://www.sbert.net/) library from HuggingFace.
You can install this using pip: `pip install sentence-transformers`.
```python
from sentence_transformers import SentenceTransformer
name="paraphrase-albert-small-v2"
model = SentenceTransformer(name)
# used for both training and querying
def embed_func(batch):
return [model.encode(sentence) for sentence in batch]
```
### OpenAI example
You can also use an external API like OpenAI to generate embeddings
```python
import openai
import os
# Configuring the environment variable OPENAI_API_KEY
if "OPENAI_API_KEY" not in os.environ:
# OR set the key here as a variable
openai.api_key = "sk-..."
# verify that the API key is working
assert len(openai.Model.list()["data"]) > 0
def embed_func(c):
rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
return [record["embedding"] for record in rs["data"]]
```
## Applying an embedding function
Using an embedding function, you can apply it to raw data
to generate embeddings for each row.
Say if you have a pandas DataFrame with a `text` column that you want to be embedded,
you can use the [with_embeddings](https://lancedb.github.io/lancedb/python/#lancedb.embeddings.with_embeddings)
function to generate embeddings and add create a combined pyarrow table:
```python
import pandas as pd
from lancedb.embeddings import with_embeddings
df = pd.DataFrame([{"text": "pepperoni"},
{"text": "pineapple"}])
data = with_embeddings(embed_func, df)
# The output is used to create / append to a table
# db.create_table("my_table", data=data)
```
If your data is in a different column, you can specify the `column` kwarg to `with_embeddings`.
By default, LanceDB calls the function with batches of 1000 rows. This can be configured
using the `batch_size` parameter to `with_embeddings`.
LanceDB automatically wraps the function with retry and rate-limit logic to ensure the OpenAI
API call is reliable.
## Searching with an embedding function
At inference time, you also need the same embedding function to embed your query text.
It's important that you use the same model / function otherwise the embedding vectors don't
belong in the same latent space and your results will be nonsensical.
```python
query = "What's the best pizza topping?"
query_vector = embed_func([query])[0]
tbl.search(query_vector).limit(10).to_df()
```
The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
## Roadmap
In the near future, we'll be integrating the embedding functions deeper into LanceDB<br/>.
The goal is that you just have to configure the function once when you create the table,
and then you'll never have to deal with embeddings / vectors after that unless you want to.
We'll also integrate more popular models and APIs.

View File

@@ -0,0 +1,7 @@
# Code documentation Q&A bot with LangChain
## use LanceDB's LangChain integration to build a Q&A bot for your documentation
<img id="splash" width="400" alt="langchain" src="https://user-images.githubusercontent.com/917119/236580868-61a246a9-e587-4c2b-8ae5-6fe5f7b7e81e.png">
This example is in a [notebook](https://github.com/lancedb/lancedb/blob/main/notebooks/code_qa_bot.ipynb)

View File

@@ -0,0 +1,99 @@
# YouTube transcript QA bot with NodeJS
## use LanceDB's Javascript API and OpenAI to build a QA bot for YouTube transcripts
<img id="splash" width="400" alt="nodejs" src="https://github.com/lancedb/lancedb/assets/917119/3a140e75-bf8e-438a-a1e4-af14a72bcf98">
This Q&A bot will allow you to search through youtube transcripts using natural language! We'll introduce how you can use LanceDB's Javascript API to store and manage your data easily.
For this example we're using a HuggingFace dataset that contains YouTube transcriptions: `jamescalam/youtube-transcriptions`, to make it easier, we've converted it to a LanceDB `db` already, which you can download and put in a working directory:
```wget -c https://eto-public.s3.us-west-2.amazonaws.com/lancedb_demo.tar.gz -O - | tar -xz -C .```
Now, we'll create a simple app that can:
1. Take a text based query and search for contexts in our corpus, using embeddings generated from the OpenAI Embedding API.
2. Create a prompt with the contexts, and call the OpenAI Completion API to answer the text based query.
Dependencies and setup of OpenAI API:
```javascript
const lancedb = require("vectordb");
const { Configuration, OpenAIApi } = require("openai");
const configuration = new Configuration({
apiKey: process.env.OPENAI_API_KEY,
});
const openai = new OpenAIApi(configuration);
```
First, let's set our question and the context amount. The context amount will be used to query similar documents in our corpus.
```javascript
const QUESTION = "who was the 12th person on the moon and when did they land?";
const CONTEXT_AMOUNT = 3;
```
Now, let's generate an embedding from this question:
```javascript
const embeddingResponse = await openai.createEmbedding({
model: "text-embedding-ada-002",
input: QUESTION,
});
const embedding = embeddingResponse.data["data"][0]["embedding"];
```
Once we have the embedding, we can connect to LanceDB (using the database we downloaded earlier), and search through the chatbot table.
We'll extract 3 similar documents found.
```javascript
const db = await lancedb.connect('./lancedb');
const tbl = await db.openTable('chatbot');
const query = tbl.search(embedding);
query.limit = CONTEXT_AMOUNT;
const context = await query.execute();
```
Let's combine the context together so we can pass it into our prompt:
```javascript
for (let i = 1; i < context.length; i++) {
context[0]["text"] += " " + context[i]["text"];
}
```
Lastly, let's construct the prompt. You could play around with this to create more accurate/better prompts to yield results.
```javascript
const prompt = "Answer the question based on the context below.\n\n" +
"Context:\n" +
`${context[0]["text"]}\n` +
`\n\nQuestion: ${QUESTION}\nAnswer:`;
```
We pass the prompt, along with the context, to the completion API.
```javascript
const completion = await openai.createCompletion({
model: "text-davinci-003",
prompt,
temperature: 0,
max_tokens: 400,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
});
```
And that's it!
```javascript
console.log(completion.data.choices[0].text);
```
The response is (which is non deterministic):
```
The 12th person on the moon was Harrison Schmitt and he landed on December 11, 1972.
```

View File

@@ -0,0 +1,106 @@
# Serverless LanceDB
## Store your data on S3 and use Lambda to compute embeddings and retrieve queries in production easily.
<img id="splash" width="400" alt="s3-lambda" src="https://user-images.githubusercontent.com/917119/234653050-305a1e90-9305-40ab-b014-c823172a948c.png">
This is a great option if you're wanting to scale with your use case and save effort and costs of maintenance.
Let's walk through how to get a simple Lambda function that queries the SIFT dataset on S3.
Before we start, you'll need to ensure you create a secure account access to AWS. We recommend using user policies, as this way AWS can share credentials securely without you having to pass around environment variables into Lambda.
We'll also use a container to ship our Lambda code. This is a good option for Lambda as you don't have the space limits that you would otherwise by building a package yourself.
# Initial setup: creating a LanceDB Table and storing it remotely on S3
We'll use the SIFT vector dataset as an example. To make it easier, we've already made a Lance-format SIFT dataset publicly available, which we can access and use to populate our LanceDB Table.
To do this, download the Lance files locally first from:
```
s3://eto-public/datasets/sift/vec_data.lance
```
Then, we can write a quick Python script to populate our LanceDB Table:
```python
import pylance
sift_dataset = pylance.dataset("/path/to/local/vec_data.lance")
df = sift_dataset.to_table().to_pandas()
import lancedb
db = lancedb.connect(".")
table = db.create_table("vector_example", df)
```
Once we've created our Table, we are free to move this data over to S3 so we can remotely host it.
# Building our Lambda app: a simple event handler for vector search
Now that we've got a remotely hosted LanceDB Table, we'll want to be able to query it from Lambda. To do so, let's create a new `Dockerfile` using the AWS python container base:
```docker
FROM public.ecr.aws/lambda/python:3.10
RUN pip3 install --upgrade pip
RUN pip3 install --no-cache-dir -U numpy --target "${LAMBDA_TASK_ROOT}"
RUN pip3 install --no-cache-dir -U lancedb --target "${LAMBDA_TASK_ROOT}"
COPY app.py ${LAMBDA_TASK_ROOT}
CMD [ "app.handler" ]
```
Now let's make a simple Lambda function that queries the SIFT dataset in `app.py`.
```python
import json
import numpy as np
import lancedb
db = lancedb.connect("s3://eto-public/tables")
table = db.open_table("vector_example")
def handler(event, context):
status_code = 200
if event['query_vector'] is None:
status_code = 404
return {
"statusCode": status_code,
"headers": {
"Content-Type": "application/json"
},
"body": json.dumps({
"Error ": "No vector to query was issued"
})
}
# Shape of SIFT is (128,1M), d=float32
query_vector = np.array(event['query_vector'], dtype=np.float32)
rs = table.search(query_vector).limit(2).to_df()
return {
"statusCode": status_code,
"headers": {
"Content-Type": "application/json"
},
"body": rs.to_json()
}
```
# Deploying the container to ECR
The next step is to build and push the container to ECR, where it can then be used to create a new Lambda function.
It's best to follow the official AWS documentation for how to do this, which you can view here:
```
https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#images-upload
```
# Final step: setting up your Lambda function
Once the container is pushed, you can create a Lambda function by selecting the container.

View File

@@ -0,0 +1,7 @@
# YouTube transcript search
## Search through youtube transcripts using natural language with LanceDB
<img id="splash" width="400" alt="youtube transcript search" src="https://user-images.githubusercontent.com/917119/236965568-def7394d-171c-45f2-939d-8edfeaadd88c.png">
This example is in a [notebook](https://github.com/lancedb/lancedb/blob/main/notebooks/youtube_transcript_search.ipynb)

View File

@@ -1,16 +1,18 @@
# Welcome to LanceDB's Documentation
LanceDB is an open-source database for vector-search built with persistent storage, which greatly simplifies retrevial, filtering and management of embeddings.
LanceDB is an open-source database for vector-search built with persistent storage, which greatly simplifies retrivial, filtering and management of embeddings.
The key features of LanceDB include:
* Production-scale vector search with no servers to manage.
* Combine attribute-based information with vectors and store them as a single source-of-truth.
* Store, query and filter vectors, metadata and multi-modal data (text, images, videos, point clouds, and more).
* Native Python and Javascript/Typescript support (coming soon).
* Zero-copy, automatic versioning, manage versions of your data without needing extra infrastructure.
* Ecosystem integrations: Apache-Arrow, Pandas, Polars, DuckDB and more on the way.
* Ecosystem integrations with [LangChain 🦜️🔗](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lanecdb.html), [LlamaIndex 🦙](https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html), Apache-Arrow, Pandas, Polars, DuckDB and more on the way.
LanceDB's core is written in Rust 🦀 and is built using Lance, an open-source columnar format designed for performant ML workloads.
@@ -33,7 +35,15 @@ table = db.create_table("my_table",
result = table.search([100, 100]).limit(2).to_df()
```
## Complete Demos
We will be adding completed demo apps built using LanceDB.
- [YouTube Transcript Search](../notebooks/youtube_transcript_search.ipynb)
## Documentation Quick Links
* [`Basic Operations`](basic.md) - basic functionality of LanceDB.
* [`Embedding Functions`](embedding.md) - functions for working with embeddings.
* [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
* [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.

111
docs/src/integrations.md Normal file
View File

@@ -0,0 +1,111 @@
# Integrations
Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, PyArrow and DuckDB.
## Pandas and PyArrow
First, we need to connect to a `LanceDB` database.
``` py
import lancedb
db = lancedb.connect("/tmp/lancedb")
```
And write a `Pandas DataFrame` to LanceDB directly.
```py
import pandas as pd
data = pd.DataFrame({
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0]
})
table = db.create_table("pd_table", data=data)
# Optionally, create a IVF_PQ index
table.create_index(num_partitions=256, num_sub_vectors=96)
```
You will find detailed instructions of creating dataset and index in [Basic Operations](basic.md) and [Indexing](indexing.md)
sections.
We can now perform similarity searches via `LanceDB`.
```py
# Open the table previously created.
table = db.open_table("pd_table")
query_vector = [100, 100]
# Pandas DataFrame
df = table.search(query_vector).limit(1).to_df()
print(df)
```
```
vector item price score
0 [5.9, 26.5] bar 20.0 14257.05957
```
If you have a simple filter, it's faster to provide a where clause to `LanceDB`'s search query.
If you have more complex criteria, you can always apply the filter to the resulting pandas `DataFrame` from the search query.
```python
# Apply the filter via LanceDB
results = table.search([100, 100]).where("price < 15").to_df()
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
# Apply the filter via Pandas
df = results = table.search([100, 100]).to_df()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
```
## DuckDB
`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow).
Let us start with installing `duckdb` and `lancedb`.
```shell
pip install duckdb lancedb
```
We will re-use the dataset created previously
```python
import lancedb
db = lancedb.connect("/tmp/lancedb")
table = db.open_table("pd_table")
arrow_table = table.to_arrow()
```
`DuckDB` can directly query the `arrow_table`:
```python
In [15]: duckdb.query("SELECT * FROM t")
Out[15]:
┌─────────────┬─────────┬────────┐
│ vector │ item │ price │
│ float[] │ varchar │ double │
├─────────────┼─────────┼────────┤
│ [3.1, 4.1] │ foo │ 10.0 │
│ [5.9, 26.5] │ bar │ 20.0 │
└─────────────┴─────────┴────────┘
In [16]: duckdb.query("SELECT mean(price) FROM t")
Out[16]:
┌─────────────┐
│ mean(price) │
│ double │
├─────────────┤
│ 15.0 │
└─────────────┘
```

View File

@@ -6,7 +6,9 @@
pip install lancedb
```
::: lancedb
::: lancedb.db
::: lancedb.table
::: lancedb.query
## ::: lancedb
## ::: lancedb.db
## ::: lancedb.table
## ::: lancedb.query
## ::: lancedb.embeddings
## ::: lancedb.context

16
node/.eslintrc.js Normal file
View File

@@ -0,0 +1,16 @@
module.exports = {
env: {
browser: true,
es2021: true
},
extends: 'standard-with-typescript',
overrides: [
],
parserOptions: {
project: './tsconfig.json',
ecmaVersion: 'latest',
sourceType: 'module'
},
rules: {
}
}

2
node/.npmignore Normal file
View File

@@ -0,0 +1,2 @@
gen_test_data.py
index.node

59
node/README.md Normal file
View File

@@ -0,0 +1,59 @@
# LanceDB
A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb).
## Installation
```bash
npm install vectordb
```
This will download the appropriate native library for your platform. We currently
support x86_64 Linux, Intel MacOS, and ARM (M1/M2) MacOS.
## Usage
### Basic Example
```javascript
const lancedb = require('vectordb');
const db = lancedb.connect('<PATH_TO_LANCEDB_DATASET>');
const table = await db.openTable('my_table');
const query = await table.search([0.1, 0.3]).setLimit(20).execute();
console.log(results);
```
The [examples](./examples) folder contains complete examples.
## Development
Build and install the rust library with:
```bash
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
```
`npm run build` builds the Rust library, `npm run pack-build` packages the Rust
binary into an npm module called `@vectordb/<platform>` (for example,
`@vectordb/darwin-arm64.node`), and then `npm run install ...` installs that
module.
The LanceDB javascript is built with npm:
```bash
npm run tsc
```
Run the tests with
```bash
npm test
```
To run the linter and have it automatically fix all errors
```bash
npm run lint -- --fix
```

36
node/examples/js/index.js Normal file
View File

@@ -0,0 +1,36 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'
async function example () {
const lancedb = require('vectordb')
const db = await lancedb.connect('data/sample-lancedb')
const data = [
{ id: 1, vector: [0.1, 0.2], price: 10 },
{ id: 2, vector: [1.1, 1.2], price: 50 }
]
const table = await db.createTable('vectors', data)
console.log(await db.tableNames())
const results = await table
.search([0.1, 0.3])
.limit(20)
.execute()
console.log(results)
}
example()

View File

@@ -0,0 +1,14 @@
{
"name": "vectordb-example-js",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"vectordb": "^0.1.0"
}
}

View File

@@ -0,0 +1,22 @@
{
"name": "vectordb-example-ts",
"version": "1.0.0",
"description": "",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "tsc"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"devDependencies": {
"@types/node": "^18.16.2",
"ts-node": "^10.9.1",
"ts-node-dev": "^2.0.0",
"typescript": "*"
},
"dependencies": {
"vectordb": "^0.1.0"
}
}

View File

@@ -0,0 +1,35 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import * as vectordb from 'vectordb';
async function example () {
const db = await vectordb.connect('data/sample-lancedb')
const data = [
{ id: 1, vector: [0.1, 0.2], price: 10 },
{ id: 2, vector: [1.1, 1.2], price: 50 }
]
const table = await db.createTable('vectors', data)
console.log(await db.tableNames())
const results = await table
.search([0.1, 0.3])
.limit(20)
.execute()
console.log(results)
}
example().then(_ => { console.log ("All done!") })

View File

@@ -0,0 +1,10 @@
{
"include": ["src/**/*.ts"],
"compilerOptions": {
"target": "es2016",
"module": "commonjs",
"declaration": true,
"outDir": "./dist",
"strict": true
}
}

8
node/gen_test_data.py Normal file
View File

@@ -0,0 +1,8 @@
import lancedb
uri = "sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table("my_table",
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])

31
node/native.js Normal file
View File

@@ -0,0 +1,31 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
const { currentTarget } = require('@neon-rs/load');
let nativeLib;
try {
nativeLib = require(`@vectordb/${currentTarget()}`);
} catch (e) {
throw new Error(`vectordb: failed to load native library.
You may need to run \`npm install @vectordb/${currentTarget()}\`.
If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
Source error: ${e}`);
}
// Dynamic require for runtime.
module.exports = nativeLib;

7001
node/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

79
node/package.json Normal file
View File

@@ -0,0 +1,79 @@
{
"name": "vectordb",
"version": "0.1.2",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release",
"cross-release": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cross build --message-format=json --release -p vectordb-node",
"test": "mocha -recursive dist/test",
"lint": "eslint src --ext .js,.ts",
"pack-build": "neon pack-build"
},
"repository": {
"type": "git",
"url": "https://github.com/lancedb/lancedb/node"
},
"keywords": [
"data-format",
"data-science",
"machine-learning",
"data-analytics"
],
"author": "Lance Devs",
"license": "Apache-2.0",
"devDependencies": {
"@neon-rs/cli": "^0.0.74",
"@types/chai": "^4.3.4",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
"@types/temp": "^0.9.1",
"@typescript-eslint/eslint-plugin": "^5.59.1",
"cargo-cp-artifact": "^0.1",
"chai": "^4.3.7",
"eslint": "^8.39.0",
"eslint-config-standard-with-typescript": "^34.0.1",
"eslint-plugin-import": "^2.27.5",
"eslint-plugin-n": "^15.7.0",
"eslint-plugin-promise": "^6.1.1",
"mocha": "^10.2.0",
"temp": "^0.9.4",
"ts-node": "^10.9.1",
"ts-node-dev": "^2.0.0",
"typescript": "*"
},
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
"@neon-rs/load": "^0.0.74",
"apache-arrow": "^12.0.0"
},
"os": [
"darwin",
"linux"
],
"cpu": [
"x64",
"arm64"
],
"neon": {
"targets": {
"x86_64-apple-darwin": "@vectordb/darwin-x64",
"aarch64-apple-darwin": "@vectordb/darwin-arm64",
"x86_64-unknown-linux-gnu": "@vectordb/linux-x64-gnu",
"x86_64-unknown-linux-musl": "@vectordb/linux-x64-musl",
"aarch64-unknown-linux-gnu": "@vectordb/linux-arm64-gnu",
"aarch64-unknown-linux-musl": "@vectordb/linux-arm64-musl"
}
},
"optionalDependencies": {
"@vectordb/darwin-arm64": "0.1.2",
"@vectordb/darwin-x64": "0.1.2",
"@vectordb/linux-x64-gnu": "0.1.2",
"@vectordb/linux-x64-musl": "0.1.2",
"@vectordb/linux-arm64-gnu": "0.1.2",
"@vectordb/linux-arm64-musl": "0.1.2"
}
}

66
node/src/arrow.ts Normal file
View File

@@ -0,0 +1,66 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import {
Field,
Float32,
List,
makeBuilder,
RecordBatchFileWriter,
Table,
type Vector,
vectorFromArray
} from 'apache-arrow'
export function convertToTable (data: Array<Record<string, unknown>>): Table {
if (data.length === 0) {
throw new Error('At least one record needs to be provided')
}
const columns = Object.keys(data[0])
const records: Record<string, Vector> = {}
for (const columnsKey of columns) {
if (columnsKey === 'vector') {
const children = new Field<Float32>('item', new Float32())
const list = new List(children)
const listBuilder = makeBuilder({
type: list
})
const vectorSize = (data[0].vector as any[]).length
for (const datum of data) {
if ((datum[columnsKey] as any[]).length !== vectorSize) {
throw new Error(`Invalid vector size, expected ${vectorSize}`)
}
listBuilder.append(datum[columnsKey])
}
records[columnsKey] = listBuilder.finish().toVector()
} else {
const values = []
for (const datum of data) {
values.push(datum[columnsKey])
}
records[columnsKey] = vectorFromArray(values)
}
}
return new Table(records)
}
export async function fromRecordsToBuffer (data: Array<Record<string, unknown>>): Promise<Buffer> {
const table = convertToTable(data)
const writer = RecordBatchFileWriter.writeAll(table)
return Buffer.from(await writer.toUint8Array())
}

179
node/src/index.ts Normal file
View File

@@ -0,0 +1,179 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import {
RecordBatchFileWriter,
type Table as ArrowTable,
tableFromIPC,
Vector
} from 'apache-arrow'
import { fromRecordsToBuffer } from './arrow'
// eslint-disable-next-line @typescript-eslint/no-var-requires
const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd } = require('../native.js')
/**
* Connect to a LanceDB instance at the given URI
* @param uri The uri of the database.
*/
export async function connect (uri: string): Promise<Connection> {
return new Connection(uri)
}
/**
* A connection to a LanceDB database.
*/
export class Connection {
private readonly _uri: string
private readonly _db: any
constructor (uri: string) {
this._uri = uri
this._db = databaseNew(uri)
}
get uri (): string {
return this._uri
}
/**
* Get the names of all tables in the database.
*/
async tableNames (): Promise<string[]> {
return databaseTableNames.call(this._db)
}
/**
* Open a table in the database.
* @param name The name of the table.
*/
async openTable (name: string): Promise<Table> {
const tbl = await databaseOpenTable.call(this._db, name)
return new Table(tbl, name)
}
async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table> {
await tableCreate.call(this._db, name, await fromRecordsToBuffer(data))
return await this.openTable(name)
}
async createTableArrow (name: string, table: ArrowTable): Promise<Table> {
const writer = RecordBatchFileWriter.writeAll(table)
await tableCreate.call(this._db, name, Buffer.from(await writer.toUint8Array()))
return await this.openTable(name)
}
}
/**
* A table in a LanceDB database.
*/
export class Table {
private readonly _tbl: any
private readonly _name: string
constructor (tbl: any, name: string) {
this._tbl = tbl
this._name = name
}
get name (): string {
return this._name
}
/**
* Create a search query to find the nearest neighbors of the given query vector.
* @param queryVector The query vector.
*/
search (queryVector: number[]): Query {
return new Query(this._tbl, queryVector)
}
/**
* Insert records into this Table
* @param data Records to be inserted into the Table
*
* @param mode Append / Overwrite existing records. Default: Append
* @return The number of rows added to the table
*/
async add (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Append.toString())
}
async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Overwrite.toString())
}
}
/**
* A builder for nearest neighbor queries for LanceDB.
*/
export class Query {
private readonly _tbl: any
private readonly _query_vector: number[]
private _limit: number
private readonly _refine_factor?: number
private readonly _nprobes: number
private readonly _columns?: string[]
private _filter?: string
private readonly _metric = 'L2'
constructor (tbl: any, queryVector: number[]) {
this._tbl = tbl
this._query_vector = queryVector
this._limit = 10
this._nprobes = 20
this._refine_factor = undefined
this._columns = undefined
this._filter = undefined
}
limit (value: number): Query {
this._limit = value
return this
}
filter (value: string): Query {
this._filter = value
return this
}
/**
* Execute the query and return the results as an Array of Objects
*/
async execute<T = Record<string, unknown>> (): Promise<T[]> {
let buffer
if (this._filter != null) {
buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit, this._filter)
} else {
buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit)
}
const data = tableFromIPC(buffer)
return data.toArray().map((entry: Record<string, unknown>) => {
const newObject: Record<string, unknown> = {}
Object.keys(entry).forEach((key: string) => {
if (entry[key] instanceof Vector) {
newObject[key] = (entry[key] as Vector).toArray()
} else {
newObject[key] = entry[key]
}
})
return newObject as unknown as T
})
}
}
export enum WriteMode {
Overwrite = 'overwrite',
Append = 'append'
}

146
node/src/test/test.ts Normal file
View File

@@ -0,0 +1,146 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { describe } from 'mocha'
import { assert } from 'chai'
import { track } from 'temp'
import * as lancedb from '../index'
describe('LanceDB client', function () {
describe('when creating a connection to lancedb', function () {
it('should have a valid url', async function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
assert.equal(con.uri, uri)
})
it('should return the existing table names', async function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
assert.deepEqual(await con.tableNames(), ['vectors'])
})
})
describe('when querying an existing dataset', function () {
it('should open a table', async function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
assert.equal(table.name, 'vectors')
})
it('execute a query', async function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
const results = await table.search([0.1, 0.3]).execute()
assert.equal(results.length, 2)
assert.equal(results[0].price, 10)
const vector = results[0].vector as Float32Array
assert.approximately(vector[0], 0.0, 0.2)
assert.approximately(vector[0], 0.1, 0.3)
})
it('limits # of results', async function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
const results = await table.search([0.1, 0.3]).limit(1).execute()
assert.equal(results.length, 1)
assert.equal(results[0].id, 1)
})
it('uses a filter', async function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
const results = await table.search([0.1, 0.3]).filter('id == 2').execute()
assert.equal(results.length, 1)
assert.equal(results[0].id, 2)
})
})
describe('when creating a new dataset', function () {
it('creates a new table from javascript objects', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const data = [
{ id: 1, vector: [0.1, 0.2], price: 10 },
{ id: 2, vector: [1.1, 1.2], price: 50 }
]
const tableName = `vectors_${Math.floor(Math.random() * 100)}`
const table = await con.createTable(tableName, data)
assert.equal(table.name, tableName)
const results = await table.search([0.1, 0.3]).execute()
assert.equal(results.length, 2)
})
it('appends records to an existing table ', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const data = [
{ id: 1, vector: [0.1, 0.2], price: 10 },
{ id: 2, vector: [1.1, 1.2], price: 50 }
]
const table = await con.createTable('vectors', data)
const results = await table.search([0.1, 0.3]).execute()
assert.equal(results.length, 2)
const dataAdd = [
{ id: 3, vector: [2.1, 2.2], price: 10 },
{ id: 4, vector: [3.1, 3.2], price: 50 }
]
await table.add(dataAdd)
const resultsAdd = await table.search([0.1, 0.3]).execute()
assert.equal(resultsAdd.length, 4)
})
it('overwrite all records in a table', async function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
const results = await table.search([0.1, 0.3]).execute()
assert.equal(results.length, 2)
const dataOver = [
{ vector: [2.1, 2.2], price: 10, name: 'foo' },
{ vector: [3.1, 3.2], price: 50, name: 'bar' }
]
await table.overwrite(dataOver)
const resultsAdd = await table.search([0.1, 0.3]).execute()
assert.equal(resultsAdd.length, 2)
})
})
})
async function createTestDB (): Promise<string> {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const data = [
{ id: 1, vector: [0.1, 0.2], name: 'foo', price: 10, is_active: true },
{ id: 2, vector: [1.1, 1.2], name: 'bar', price: 50, is_active: false }
]
await con.createTable('vectors', data)
return dir
}

10
node/tsconfig.json Normal file
View File

@@ -0,0 +1,10 @@
{
"include": ["src/**/*.ts"],
"compilerOptions": {
"target": "es2016",
"module": "commonjs",
"declaration": true,
"outDir": "./dist",
"strict": true
}
}

357
notebooks/code_qa_bot.ipynb Normal file
View File

@@ -0,0 +1,357 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "13cb272e",
"metadata": {},
"source": [
"# Code documentation Q&A bot example with LangChain\n",
"\n",
"This Q&A bot will allow you to query your own documentation easily using questions. We'll also demonstrate the use of LangChain and LanceDB using the OpenAI API. \n",
"\n",
"In this example we'll use Pandas 2.0 documentation, but, this could be replaced for your own docs as well"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "66638d6c",
"metadata": {},
"outputs": [],
"source": [
"!pip install --quiet openai langchain\n",
"!pip install --quiet -U lancedb"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d1cdcac3",
"metadata": {},
"source": [
"First, let's get some setup out of the way. As we're using the OpenAI API, ensure that you've set your key (and organization if needed):"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "58ee1868",
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"import os\n",
"\n",
"# Configuring the environment variable OPENAI_API_KEY\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n",
" # OR set the key here as a variable\n",
" openai.api_key = \"sk-...\"\n",
" \n",
"assert len(openai.Model.list()[\"data\"]) > 0"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "34f524d3",
"metadata": {},
"source": [
"# Loading in our code documentation, generating embeddings and storing our documents in LanceDB\n",
"\n",
"We're going to use the power of LangChain to help us create our Q&A bot. It comes with several APIs that can make our development much easier as well as a LanceDB integration for vectorstore."
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "b55d22f1",
"metadata": {},
"outputs": [],
"source": [
"import lancedb\n",
"import re\n",
"import pickle\n",
"from pathlib import Path\n",
"\n",
"from langchain.document_loaders import UnstructuredHTMLLoader\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores import LanceDB\n",
"from langchain.llms import OpenAI\n",
"from langchain.chains import RetrievalQA"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6ccf9b2b",
"metadata": {},
"source": [
"You can download the Pandas documentation from https://pandas.pydata.org/docs/. To make sure we're not littering our repo with docs, we won't include it in the LanceDB repo, so download this and store it locally first."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ae42496c",
"metadata": {},
"source": [
"We'll create a simple helper function that can help to extract metadata, so we can use this downstream when we're wanting to query with filters. In this case, we want to keep the lineage of the uri or path for each document that we process:"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "d171d062",
"metadata": {},
"outputs": [],
"source": [
"def get_document_title(document):\n",
" m = str(document.metadata[\"source\"])\n",
" title = re.findall(\"pandas.documentation(.*).html\", m)\n",
" if title[0] is not None:\n",
" return(title[0])\n",
" return ''"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "130162ad",
"metadata": {},
"source": [
"# Pre-processing and loading the documentation\n",
"\n",
"Next, let's pre-process and load the documentation. To make sure we don't need to do this repeatedly if we were updating code, we're caching it using pickle so we can retrieve it again (this could take a few minutes to run the first time yyou do it). We'll also add some more metadata to the docs here such as the title and version of the code:"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "33bfe7d8",
"metadata": {},
"outputs": [],
"source": [
"docs_path = Path(\"docs.pkl\")\n",
"docs = []\n",
"\n",
"if not docs_path.exists():\n",
" for p in Path(\"./pandas.documentation\").rglob(\"*.html\"):\n",
" if p.is_dir():\n",
" continue\n",
" loader = UnstructuredHTMLLoader(p)\n",
" raw_document = loader.load()\n",
" \n",
" m = {}\n",
" m[\"title\"] = get_document_title(raw_document[0])\n",
" m[\"version\"] = \"2.0rc0\"\n",
" raw_document[0].metadata = raw_document[0].metadata | m\n",
" raw_document[0].metadata[\"source\"] = str(raw_document[0].metadata[\"source\"])\n",
" docs = docs + raw_document\n",
"\n",
" with docs_path.open(\"wb\") as fh:\n",
" pickle.dump(docs, fh)\n",
"else:\n",
" with docs_path.open(\"rb\") as fh:\n",
" docs = pickle.load(fh)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "c3852dd3",
"metadata": {},
"source": [
"# Generating emebeddings from our docs\n",
"\n",
"Now that we have our raw documents loaded, we need to pre-process them to generate embeddings:"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "82230563",
"metadata": {},
"outputs": [],
"source": [
"text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=1000,\n",
" chunk_overlap=200,\n",
")\n",
"documents = text_splitter.split_documents(docs)\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "43e68215",
"metadata": {},
"source": [
"# Storing and querying with LanceDB\n",
"\n",
"Let's connect to LanceDB so we can store our documents. We'll create a Table to store them in:"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "74780a58",
"metadata": {},
"outputs": [],
"source": [
"db = lancedb.connect('/tmp/lancedb')\n",
"table = db.create_table(\"pandas_docs\", data=[\n",
" {\"vector\": embeddings.embed_query(\"Hello World\"), \"text\": \"Hello World\", \"id\": \"1\"}\n",
"], mode=\"overwrite\")\n",
"docsearch = LanceDB.from_documents(documents, embeddings, connection=table)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3cb1dc5d",
"metadata": {},
"source": [
"Now let's create our RetrievalQA chain using the LanceDB vector store:"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "6a5891ad",
"metadata": {},
"outputs": [],
"source": [
"qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type=\"stuff\", retriever=docsearch.as_retriever())"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "28d93b85",
"metadata": {},
"source": [
"And thats it! We're all setup. The next step is to run some queries, let's try a few:"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "70d88316",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"' The major differences in pandas 2.0 include installing optional dependencies with pip extras, the ability to use any numpy numeric dtype in an Index, and enhancements, notable bug fixes, backwards incompatible API changes, deprecations, and performance improvements.'"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"What are the major differences in pandas 2.0?\"\n",
"qa.run(query)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "85a0397c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"' 2.0.0rc0'"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"What's the current version of pandas?\"\n",
"qa.run(query)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "923f86c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"' Optional dependencies can be installed with pip install \"pandas[all]\" or \"pandas[performance]\". This will install all recommended performance dependencies such as numexpr, bottleneck and numba.'"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"How do I make use of installing optional dependencies?\"\n",
"qa.run(query)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "02082f83",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\" \\n\\nPandas 2.0 includes a number of API breaking changes, such as increased minimum versions for dependencies, the use of os.linesep for DataFrame.to_csv's line_terminator, and reorganization of the library. See the release notes for a full list of changes.\""
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"What are the backwards incompatible API changes in Pandas 2.0?\"\n",
"qa.run(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75cea547",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "42bf01fb",
"metadata": {},
@@ -22,10 +21,10 @@
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
@@ -88,7 +87,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "5ac2b6a3",
"metadata": {},
@@ -231,7 +229,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2106b5bb",
"metadata": {},
@@ -251,7 +248,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "39f3161f3ef54a129cd65fb296332b54",
"model_id": "c6f1c76d9567421d88911923388d2530",
"version_major": 2,
"version_minor": 0
},
@@ -574,7 +571,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "559a095b",
"metadata": {},
@@ -631,7 +627,7 @@
" <iframe\n",
" width=\"400\"\n",
" height=\"300\"\n",
" src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289.76\"\n",
" src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
@@ -639,7 +635,7 @@
" "
],
"text/plain": [
"<IPython.lib.display.YouTubeVideo at 0x177fde4d0>"
"<IPython.lib.display.YouTubeVideo at 0x13ec062c0>"
]
},
"execution_count": 15,
@@ -651,7 +647,7 @@
"from IPython.display import YouTubeVideo\n",
"\n",
"top_match = context.iloc[0]\n",
"YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=top_match[\"start\"])"
"YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=int(top_match[\"start\"]))"
]
},
{

View File

@@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .db import LanceDBConnection, URI
from .db import URI, LanceDBConnection
def connect(uri: URI) -> LanceDBConnection:

View File

@@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Union, List
from typing import List, Union
import numpy as np
import pandas as pd

View File

@@ -14,10 +14,12 @@
from __future__ import annotations
from pathlib import Path
import pyarrow as pa
from .common import URI, DATA
from .common import DATA, URI
from .table import LanceTable
from .util import get_uri_scheme
class LanceDBConnection:
@@ -26,10 +28,12 @@ class LanceDBConnection:
"""
def __init__(self, uri: URI):
if isinstance(uri, str):
uri = Path(uri)
uri = uri.expanduser().absolute()
Path(uri).mkdir(parents=True, exist_ok=True)
is_local = isinstance(uri, Path) or get_uri_scheme(uri) == "file"
if is_local:
if isinstance(uri, str):
uri = Path(uri)
uri = uri.expanduser().absolute()
Path(uri).mkdir(parents=True, exist_ok=True)
self._uri = str(uri)
@property
@@ -43,7 +47,11 @@ class LanceDBConnection:
-------
A list of table names.
"""
return [p.stem for p in Path(self.uri).glob("*.lance")]
if get_uri_scheme(self.uri) == "file":
return [p.stem for p in Path(self.uri).glob("*.lance")]
raise NotImplementedError(
"List table_names is only supported for local filesystem for now"
)
def __len__(self) -> int:
return len(self.table_names())
@@ -55,7 +63,11 @@ class LanceDBConnection:
return self.open_table(name)
def create_table(
self, name: str, data: DATA = None, schema: pa.Schema = None
self,
name: str,
data: DATA = None,
schema: pa.Schema = None,
mode: str = "create",
) -> LanceTable:
"""Create a table in the database.
@@ -67,6 +79,10 @@ class LanceDBConnection:
The data to insert into the table.
schema: pyarrow.Schema; optional
The schema of the table.
mode: str; default "create"
The mode to use when creating the table.
By default, if the table already exists, an exception is raised.
If you want to overwrite the table, use mode="overwrite".
Note
----
@@ -78,7 +94,7 @@ class LanceDBConnection:
A LanceTable object representing the table.
"""
if data is not None:
tbl = LanceTable.create(self, name, data, schema)
tbl = LanceTable.create(self, name, data, schema, mode=mode)
else:
tbl = LanceTable(self, name)
return tbl

View File

@@ -12,13 +12,14 @@
# limitations under the License.
import math
from retry import retry
import sys
from typing import Callable, Union
from lance.vector import vec_to_table
import numpy as np
import pandas as pd
import pyarrow as pa
from lance.vector import vec_to_table
from retry import retry
def with_embeddings(
@@ -64,13 +65,19 @@ class EmbeddingFunction:
return self.func(c.tolist())
if len(self.rate_limiter_kwargs) > 0:
import ratelimiter
v = int(sys.version_info.minor)
if v >= 11:
print(
"WARNING: rate limit only support up to 3.10, proceeding without rate limiter"
)
else:
import ratelimiter
max_calls = self.rate_limiter_kwargs["max_calls"]
limiter = ratelimiter.RateLimiter(
max_calls, period=self.rate_limiter_kwargs["period"]
)
embed_func = limiter(embed_func)
max_calls = self.rate_limiter_kwargs["max_calls"]
limiter = ratelimiter.RateLimiter(
max_calls, period=self.rate_limiter_kwargs["period"]
)
embed_func = limiter(embed_func)
batches = self.to_batches(text)
embeds = [emb for c in batches for emb in embed_func(c)]
return embeds
@@ -79,11 +86,6 @@ class EmbeddingFunction:
return f"EmbeddingFunction(func={self.func})"
def rate_limit(self, max_calls=0.9, period=1.0):
import sys
v = int(sys.version_info.minor)
if v >= 11:
raise ValueError("rate limit only support up to 3.10")
self.rate_limiter_kwargs = dict(max_calls=max_calls, period=period)
return self

View File

@@ -24,6 +24,7 @@ class LanceQueryBuilder:
"""
def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray):
self._metric = "L2"
self._nprobes = 20
self._refine_factor = None
self._table = table
@@ -77,6 +78,21 @@ class LanceQueryBuilder:
self._where = where
return self
def metric(self, metric: str) -> LanceQueryBuilder:
"""Set the distance metric to use.
Parameters
----------
metric: str
The distance metric to use. By default "l2" is used.
Returns
-------
The LanceQueryBuilder object.
"""
self._metric = metric
return self
def nprobes(self, nprobes: int) -> LanceQueryBuilder:
"""Set the number of probes to use.
@@ -108,7 +124,12 @@ class LanceQueryBuilder:
return self
def to_df(self) -> pd.DataFrame:
"""Execute the query and return the results as a pandas DataFrame."""
"""
Execute the query and return the results as a pandas DataFrame.
In addition to the selected columns, LanceDB also returns a vector
and also the "score" column which is the distance between the query
vector and the returned vector.
"""
ds = self._table.to_lance()
# TODO indexed search
tbl = ds.to_table(
@@ -118,6 +139,7 @@ class LanceQueryBuilder:
"column": VECTOR_COLUMN_NAME,
"q": self._query,
"k": self._limit,
"metric": self._metric,
"nprobes": self._nprobes,
"refine_factor": self._refine_factor,
},

View File

@@ -19,12 +19,12 @@ from functools import cached_property
import lance
import numpy as np
import pandas as pd
from lance import LanceDataset
import pyarrow as pa
from lance import LanceDataset
from lance.vector import vec_to_table
from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .query import LanceQueryBuilder
from .common import DATA, VECTOR_COLUMN_NAME, VEC
def _sanitize_data(data, schema):
@@ -46,15 +46,41 @@ class LanceTable:
A table in a LanceDB database.
"""
def __init__(self, connection: "lancedb.db.LanceDBConnection", name: str):
def __init__(
self, connection: "lancedb.db.LanceDBConnection", name: str, version: int = None
):
self._conn = connection
self.name = name
self._version = version
def _reset_dataset(self):
try:
del self.__dict__["_dataset"]
except AttributeError:
pass
@property
def schema(self) -> pa.Schema:
"""Return the schema of the table."""
return self._dataset.schema
def list_versions(self):
"""List all versions of the table"""
return self._dataset.versions()
@property
def version(self):
"""Get the current version of the table"""
return self._dataset.version
def checkout(self, version: int):
"""Checkout a version of the table"""
max_ver = max([v["version"] for v in self._dataset.versions()])
if version < 1 or version > max_ver:
raise ValueError(f"Invalid version {version}")
self._version = version
self._reset_dataset()
def __len__(self):
return self._dataset.count_rows()
@@ -80,11 +106,14 @@ class LanceTable:
def _dataset_uri(self) -> str:
return os.path.join(self._conn.uri, f"{self.name}.lance")
def create_index(self, num_partitions=256, num_sub_vectors=96):
def create_index(self, metric="L2", num_partitions=256, num_sub_vectors=96):
"""Create an index on the table.
Parameters
----------
metric: str, default "L2"
The distance metric to use when creating the index. Valid values are "L2" or "cosine".
L2 is euclidean distance.
num_partitions: int
The number of IVF partitions to use when creating the index.
Default is 256.
@@ -92,16 +121,18 @@ class LanceTable:
The number of PQ sub-vectors to use when creating the index.
Default is 96.
"""
return self._dataset.create_index(
self._dataset.create_index(
column=VECTOR_COLUMN_NAME,
index_type="IVF_PQ",
metric=metric,
num_partitions=num_partitions,
num_sub_vectors=num_sub_vectors,
)
self._reset_dataset()
@cached_property
def _dataset(self) -> LanceDataset:
return lance.dataset(self._dataset_uri)
return lance.dataset(self._dataset_uri, version=self._version)
def to_lance(self) -> LanceDataset:
"""Return the LanceDataset backing this table."""
@@ -123,8 +154,9 @@ class LanceTable:
The number of vectors added to the table.
"""
data = _sanitize_data(data, self.schema)
ds = lance.write_dataset(data, self._dataset_uri, mode=mode)
return ds.count_rows()
lance.write_dataset(data, self._dataset_uri, mode=mode)
self._reset_dataset()
return len(self)
def search(self, query: VEC) -> LanceQueryBuilder:
"""Create a search query to find the nearest neighbors
@@ -138,6 +170,9 @@ class LanceTable:
Returns
-------
A LanceQueryBuilder object representing the query.
Once executed, the query returns selected columns, the vector,
and also the "score" column which is the distance between the query
vector and the returned vector.
"""
if isinstance(query, list):
query = np.array(query)
@@ -148,10 +183,10 @@ class LanceTable:
return LanceQueryBuilder(self, query)
@classmethod
def create(cls, db, name, data, schema=None):
def create(cls, db, name, data, schema=None, mode="create"):
tbl = LanceTable(db, name)
data = _sanitize_data(data, schema)
lance.write_dataset(data, tbl._dataset_uri, mode="create")
lance.write_dataset(data, tbl._dataset_uri, mode=mode)
return tbl

43
python/lancedb/util.py Normal file
View File

@@ -0,0 +1,43 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from urllib.parse import ParseResult, urlparse
from pyarrow import fs
def get_uri_scheme(uri: str) -> str:
"""
Get the scheme of a URI. If the URI does not have a scheme, assume it is a file URI.
Parameters
----------
uri : str
The URI to parse.
Returns
-------
str: The scheme of the URI.
"""
parsed = urlparse(uri)
scheme = parsed.scheme
if not scheme:
scheme = "file"
elif scheme in ["s3a", "s3n"]:
scheme = "s3"
elif len(scheme) == 1:
# Windows drive names are parsed as the scheme
# e.g. "c:\path" -> ParseResult(scheme="c", netloc="", path="/path", ...)
# So we add special handling here for schemes that are a single character
scheme = "file"
return scheme

View File

@@ -1,10 +1,10 @@
[project]
name = "lancedb"
version = "0.0.4"
dependencies = ["pylance", "ratelimiter", "retry", "tqdm"]
version = "0.1.2"
dependencies = ["pylance>=0.4.6", "ratelimiter", "retry", "tqdm"]
description = "lancedb"
authors = [
{ name = "Lance Devs", email = "dev@eto.ai" },
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
]
license = { file = "LICENSE" }
readme = "README.md"

View File

@@ -11,6 +11,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import pytest
import lancedb
@@ -40,3 +43,57 @@ def test_basic(tmp_path):
assert len(db) == 1
assert db.open_table("test").name == db["test"].name
def test_ingest_pd(tmp_path):
db = lancedb.connect(tmp_path)
assert db.uri == str(tmp_path)
assert db.table_names() == []
data = pd.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
table = db.create_table("test", data=data)
rs = table.search([100, 100]).limit(1).to_df()
assert len(rs) == 1
assert rs["item"].iloc[0] == "bar"
rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
assert len(rs) == 1
assert rs["item"].iloc[0] == "foo"
assert db.table_names() == ["test"]
assert "test" in db
assert len(db) == 1
assert db.open_table("test").name == db["test"].name
def test_create_mode(tmp_path):
db = lancedb.connect(tmp_path)
data = pd.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
db.create_table("test", data=data)
with pytest.raises(Exception):
db.create_table("test", data=data)
new_data = pd.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["fizz", "buzz"],
"price": [10.0, 20.0],
}
)
tbl = db.create_table("test", data=new_data, mode="overwrite")
assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]

View File

@@ -12,13 +12,14 @@
# limitations under the License.
import lance
from lancedb.query import LanceQueryBuilder
import numpy as np
import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest
from lancedb.query import LanceQueryBuilder
class MockTable:
def __init__(self, tmp_path):
@@ -60,3 +61,21 @@ def test_query_builder_with_filter(table):
df = LanceQueryBuilder(table, [0, 0]).where("id = 2").to_df()
assert df["id"].values[0] == 2
assert all(df["vector"].values[0] == [3, 4])
def test_query_builder_with_metric(table):
query = [4, 8]
df_default = LanceQueryBuilder(table, query).to_df()
df_l2 = LanceQueryBuilder(table, query).metric("l2").to_df()
tm.assert_frame_equal(df_default, df_l2)
df_cosine = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df()
assert df_cosine.score[0] == pytest.approx(
cosine_distance(query, df_cosine.vector[0]),
abs=1e-6,
)
assert 0 <= df_cosine.score[0] <= 1
def cosine_distance(vec1, vec2):
return 1 - np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

View File

@@ -99,14 +99,41 @@ def test_add(db):
expected = pa.Table.from_arrays(
[
pa.FixedSizeListArray.from_arrays(pa.array([3.1, 4.1, 5.9, 26.5]), 2),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
pa.FixedSizeListArray.from_arrays(
pa.array([3.1, 4.1, 5.9, 26.5, 6.3, 100.5]), 2
),
pa.array(["foo", "bar", "new"]),
pa.array([10.0, 20.0, 30.0]),
],
schema=pa.schema([
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("item", pa.string()),
pa.field("price", pa.float64()),
]),
schema=pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("item", pa.string()),
pa.field("price", pa.float64()),
]
),
)
assert expected == table.to_arrow()
def test_versioning(db):
table = LanceTable.create(
db,
"test",
data=[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
],
)
assert len(table.list_versions()) == 1
assert table.version == 1
table.add([{"vector": [6.3, 100.5], "item": "new", "price": 30.0}])
assert len(table.list_versions()) == 2
assert table.version == 2
assert len(table) == 3
table.checkout(1)
assert table.version == 1
assert len(table) == 2

30
python/tests/test_util.py Normal file
View File

@@ -0,0 +1,30 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from lancedb.util import get_uri_scheme
def test_normalize_uri():
uris = [
"relative/path",
"/absolute/path",
"file:///absolute/path",
"s3://bucket/path",
"gs://bucket/path",
"c:\\windows\\path",
]
schemes = ["file", "file", "file", "s3", "gs", "file"]
for uri, expected_scheme in zip(uris, schemes):
parsed_scheme = get_uri_scheme(uri)
assert parsed_scheme == expected_scheme

21
rust/ffi/node/Cargo.toml Normal file
View File

@@ -0,0 +1,21 @@
[package]
name = "vectordb-node"
version = "0.1.2"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
edition = "2018"
exclude = ["index.node"]
[lib]
crate-type = ["cdylib"]
[dependencies]
arrow-array = "37.0"
arrow-ipc = "37.0"
arrow-schema = "37.0"
once_cell = "1"
futures = "0.3"
lance = "0.4.3"
vectordb = { path = "../../vectordb" }
tokio = { version = "1.23", features = ["rt-multi-thread"] }
neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }

3
rust/ffi/node/README.md Normal file
View File

@@ -0,0 +1,3 @@
The LanceDB node bridge (vectordb-node) allows javascript applications to access LanceDB datasets.
It is build using [Neon](https://neon-bindings.com). See the node project for an example of how it is used / tests

View File

@@ -0,0 +1,85 @@
How to release the node module
### 1. Bump the versions
<!-- TODO: we also need to bump the optional dependencies for node! -->
```shell
pushd rust/vectordb
cargo bump minor
popd
pushd rust/ffi/node
cargo bump minor
popd
pushd python
cargo bump minor
popd
pushd node
npm version minor
popd
git add -u
git commit -m "Bump versions"
git push
```
### 2. Push a new tag
```shell
git tag vX.X.X
git push --tag vX.X.X
```
When the tag is pushed, GitHub actions will start building the libraries and
will upload them to a draft release. Wait for those jobs to complete.
### 3. Publish the release
Once the jobs are complete, you can edit the
2. Push a tag, such as vX.X.X. Once the tag is pushrf, GitHub actions will start
building the native libraries and uploading them to a draft release. Wait for
those jobs to complete.
3. If the libraries are successful, edit the changelog and then publish the
release. Once you publish, a new action will start and upload all the
release artifacts to npm.
## Manual process
You can build the artifacts locally on a MacOS machine.
### Build the MacOS release libraries
One-time setup:
```shell
rustup target add x86_64-apple-darwin aarch64-apple-darwin
```
To build:
```shell
bash ci/build_macos_artifacts.sh
```
### Build the Linux release libraries
One-time setup, building the Docker container
```shell
cat ci/ubuntu_build.dockerfile | docker build -t lancedb-node-build -
```
To build:
```shell
docker run \
-v /var/run/docker.sock:/var/run/docker.sock \
-v $(pwd):/io -w /io \
lancedb-node-build \
bash ci/build_linux_artifacts.sh
```

View File

@@ -0,0 +1,60 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io::Cursor;
use std::ops::Deref;
use std::sync::Arc;
use arrow_array::cast::as_list_array;
use arrow_array::{Array, FixedSizeListArray, RecordBatch};
use arrow_ipc::reader::FileReader;
use arrow_schema::{DataType, Field, Schema};
use lance::arrow::{FixedSizeListArrayExt, RecordBatchExt};
pub(crate) fn convert_record_batch(record_batch: RecordBatch) -> RecordBatch {
let column = record_batch
.column_by_name("vector")
.expect("vector column is missing");
let arr = as_list_array(column.deref());
let list_size = arr.values().len() / record_batch.num_rows();
let r = FixedSizeListArray::try_new(arr.values(), list_size as i32).unwrap();
let schema = Arc::new(Schema::new(vec![Field::new(
"vector",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
list_size as i32,
),
true,
)]));
let mut new_batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(r)]).unwrap();
if record_batch.num_columns() > 1 {
let rb = record_batch.drop_column("vector").unwrap();
new_batch = new_batch.merge(&rb).unwrap();
}
new_batch
}
pub(crate) fn arrow_buffer_to_record_batch(slice: &[u8]) -> Vec<RecordBatch> {
let mut batches: Vec<RecordBatch> = Vec::new();
let fr = FileReader::try_new(Cursor::new(slice), None);
let file_reader = fr.unwrap();
for b in file_reader {
let record_batch = convert_record_batch(b.unwrap());
batches.push(record_batch);
}
batches
}

View File

@@ -0,0 +1,36 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use neon::prelude::*;
pub(crate) fn vec_str_to_array<'a, C: Context<'a>>(
vec: &Vec<String>,
cx: &mut C,
) -> JsResult<'a, JsArray> {
let a = JsArray::new(cx, vec.len() as u32);
for (i, s) in vec.iter().enumerate() {
let v = cx.string(s);
a.set(cx, i as u32, v)?;
}
Ok(a)
}
pub(crate) fn js_array_to_vec(array: &JsArray, cx: &mut FunctionContext) -> Vec<f32> {
let mut query_vec: Vec<f32> = Vec::new();
for i in 0..array.len(cx) {
let entry: Handle<JsNumber> = array.get(cx, i).unwrap();
query_vec.push(entry.value(cx) as f32);
}
query_vec
}

217
rust/ffi/node/src/lib.rs Normal file
View File

@@ -0,0 +1,217 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::{Arc, Mutex};
use arrow_array::{Float32Array, RecordBatchReader};
use arrow_ipc::writer::FileWriter;
use futures::{TryFutureExt, TryStreamExt};
use lance::arrow::RecordBatchBuffer;
use lance::dataset::WriteMode;
use neon::prelude::*;
use neon::types::buffer::TypedArray;
use once_cell::sync::OnceCell;
use tokio::runtime::Runtime;
use vectordb::database::Database;
use vectordb::error::Error;
use vectordb::table::Table;
use crate::arrow::arrow_buffer_to_record_batch;
mod arrow;
mod convert;
struct JsDatabase {
database: Arc<Database>,
}
struct JsTable {
table: Arc<Mutex<Table>>,
}
impl Finalize for JsDatabase {}
impl Finalize for JsTable {}
fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
static RUNTIME: OnceCell<Runtime> = OnceCell::new();
RUNTIME.get_or_try_init(|| Runtime::new().or_else(|err| cx.throw_error(err.to_string())))
}
fn database_new(mut cx: FunctionContext) -> JsResult<JsBox<JsDatabase>> {
let path = cx.argument::<JsString>(0)?.value(&mut cx);
let db = JsDatabase {
database: Arc::new(Database::connect(path).or_else(|err| cx.throw_error(err.to_string()))?),
};
Ok(cx.boxed(db))
}
fn database_table_names(mut cx: FunctionContext) -> JsResult<JsArray> {
let db = cx
.this()
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
let tables = db
.database
.table_names()
.or_else(|err| cx.throw_error(err.to_string()))?;
convert::vec_str_to_array(&tables, &mut cx)
}
fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
let db = cx
.this()
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let database = db.database.clone();
let (deferred, promise) = cx.promise();
rt.spawn(async move {
let table_rst = database.open_table(table_name).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
Ok(cx.boxed(JsTable { table }))
});
});
Ok(promise)
}
fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let query_vector = cx.argument::<JsArray>(0)?; //. .as_value(&mut cx);
let limit = cx.argument::<JsNumber>(1)?.value(&mut cx);
let filter = cx.argument_opt(2).map(|f| f.downcast_or_throw::<JsString, _>(&mut cx).unwrap().value(&mut cx));
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let (deferred, promise) = cx.promise();
let table = js_table.table.clone();
let query = convert::js_array_to_vec(query_vector.deref(), &mut cx);
rt.spawn(async move {
let builder = table
.lock()
.unwrap()
.search(Float32Array::from(query))
.limit(limit as usize)
.filter(filter);
let record_batch_stream = builder.execute();
let results = record_batch_stream
.and_then(|stream| stream.try_collect::<Vec<_>>().map_err(Error::from))
.await;
deferred.settle_with(&channel, move |mut cx| {
let results = results.or_else(|err| cx.throw_error(err.to_string()))?;
let vector: Vec<u8> = Vec::new();
if results.is_empty() {
return cx.buffer(0);
}
let schema = results.get(0).unwrap().schema();
let mut fr = FileWriter::try_new(vector, schema.deref())
.or_else(|err| cx.throw_error(err.to_string()))?;
for batch in results.iter() {
fr.write(batch)
.or_else(|err| cx.throw_error(err.to_string()))?;
}
fr.finish().or_else(|err| cx.throw_error(err.to_string()))?;
let buf = fr
.into_inner()
.or_else(|err| cx.throw_error(err.to_string()))?;
Ok(JsBuffer::external(&mut cx, buf))
});
});
Ok(promise)
}
fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
let db = cx
.this()
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
let buffer = cx.argument::<JsBuffer>(1)?;
let batches = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx));
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let (deferred, promise) = cx.promise();
let database = db.database.clone();
rt.block_on(async move {
let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(batches));
let table_rst = database.create_table(table_name, batch_reader).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
Ok(cx.boxed(JsTable { table }))
});
});
Ok(promise)
}
fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
let write_mode_map: HashMap<&str, WriteMode> = HashMap::from([
("create", WriteMode::Create),
("append", WriteMode::Append),
("overwrite", WriteMode::Overwrite),
]);
let js_table = cx
.this()
.downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let buffer = cx.argument::<JsBuffer>(0)?;
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
let batches = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx));
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let (deferred, promise) = cx.promise();
let table = js_table.table.clone();
let write_mode = write_mode_map.get(write_mode.as_str()).cloned();
rt.block_on(async move {
let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(batches));
let add_result = table.lock().unwrap().add(batch_reader, write_mode).await;
deferred.settle_with(&channel, move |mut cx| {
let added = add_result.or_else(|err| cx.throw_error(err.to_string()))?;
Ok(cx.number(added as f64))
});
});
Ok(promise)
}
#[neon::main]
fn main(mut cx: ModuleContext) -> NeonResult<()> {
cx.export_function("databaseNew", database_new)?;
cx.export_function("databaseTableNames", database_table_names)?;
cx.export_function("databaseOpenTable", database_open_table)?;
cx.export_function("tableSearch", table_search)?;
cx.export_function("tableCreate", table_create)?;
cx.export_function("tableAdd", table_add)?;
Ok(())
}

18
rust/vectordb/Cargo.toml Normal file
View File

@@ -0,0 +1,18 @@
[package]
name = "vectordb"
version = "0.1.2"
edition = "2021"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
repository = "https://github.com/lancedb/lancedb"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
arrow-array = "37.0"
arrow-schema = "37.0"
lance = "0.4.3"
tokio = { version = "1.23", features = ["rt-multi-thread"] }
[dev-dependencies]
tempfile = "3.5.0"

View File

@@ -0,0 +1,127 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow_array::RecordBatchReader;
use std::fs::create_dir_all;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::error::Result;
use crate::table::Table;
pub struct Database {
pub(crate) path: Arc<PathBuf>,
}
const LANCE_EXTENSION: &str = "lance";
/// A connection to LanceDB
impl Database {
/// Connects to LanceDB
///
/// # Arguments
///
/// * `path` - URI where the database is located, can be a local file or a supported remote cloud storage
///
/// # Returns
///
/// * A [Database] object.
pub fn connect<P: AsRef<Path>>(path: P) -> Result<Database> {
if !path.as_ref().try_exists()? {
create_dir_all(&path)?;
}
Ok(Database {
path: Arc::new(path.as_ref().to_path_buf()),
})
}
/// Get the names of all tables in the database.
///
/// # Returns
///
/// * A [Vec<String>] with all table names.
pub fn table_names(&self) -> Result<Vec<String>> {
let f = self
.path
.read_dir()?
.flatten()
.map(|dir_entry| dir_entry.path())
.filter(|path| {
let is_lance = path
.extension()
.map(|e| e.to_str().map(|e| e == LANCE_EXTENSION))
.flatten();
is_lance.unwrap_or(false)
})
.map(|p| {
p.file_stem()
.map(|s| s.to_str().map(|s| String::from(s)))
.flatten()
})
.flatten()
.collect();
Ok(f)
}
pub async fn create_table(
&self,
name: String,
batches: Box<dyn RecordBatchReader>,
) -> Result<Table> {
Table::create(self.path.clone(), name, batches).await
}
/// Open a table in the database.
///
/// # Arguments
/// * `name` - The name of the table.
///
/// # Returns
///
/// * A [Table] object.
pub async fn open_table(&self, name: String) -> Result<Table> {
Table::open(self.path.clone(), name).await
}
}
#[cfg(test)]
mod tests {
use std::fs::create_dir_all;
use tempfile::tempdir;
use crate::database::Database;
#[tokio::test]
async fn test_connect() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let db = Database::connect(&path_buf);
assert_eq!(db.unwrap().path.as_path(), path_buf.as_path())
}
#[tokio::test]
async fn test_table_names() {
let tmp_dir = tempdir().unwrap();
create_dir_all(tmp_dir.path().join("table1.lance")).unwrap();
create_dir_all(tmp_dir.path().join("table2.lance")).unwrap();
create_dir_all(tmp_dir.path().join("invalidlance")).unwrap();
let db = Database::connect(&tmp_dir.into_path()).unwrap();
let tables = db.table_names().unwrap();
assert_eq!(tables.len(), 2);
assert!(tables.contains(&String::from("table1")));
assert!(tables.contains(&String::from("table2")));
}
}

View File

@@ -0,0 +1,43 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[derive(Debug)]
pub enum Error {
IO(String),
Lance(String),
}
impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let (catalog, message) = match self {
Self::IO(s) => ("I/O", s.as_str()),
Self::Lance(s) => ("Lance", s.as_str()),
};
write!(f, "LanceDBError({catalog}): {message}")
}
}
pub type Result<T> = std::result::Result<T, Error>;
impl From<std::io::Error> for Error {
fn from(e: std::io::Error) -> Self {
Self::IO(e.to_string())
}
}
impl From<lance::Error> for Error {
fn from(e: lance::Error) -> Self {
Self::Lance(e.to_string())
}
}

18
rust/vectordb/src/lib.rs Normal file
View File

@@ -0,0 +1,18 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod database;
pub mod error;
pub mod query;
pub mod table;

218
rust/vectordb/src/query.rs Normal file
View File

@@ -0,0 +1,218 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use arrow_array::Float32Array;
use lance::dataset::scanner::{DatasetRecordBatchStream, Scanner};
use lance::dataset::Dataset;
use lance::index::vector::MetricType;
use crate::error::Result;
/// A builder for nearest neighbor queries for LanceDB.
pub struct Query {
pub dataset: Arc<Dataset>,
pub query_vector: Float32Array,
pub limit: usize,
pub filter: Option<String>,
pub nprobes: usize,
pub refine_factor: Option<u32>,
pub metric_type: MetricType,
pub use_index: bool,
}
impl Query {
/// Creates a new Query object
///
/// # Arguments
///
/// * `dataset` - The table / dataset the query will be run against.
/// * `vector` The vector used for this query.
///
/// # Returns
///
/// * A [Query] object.
pub(crate) fn new(dataset: Arc<Dataset>, vector: Float32Array) -> Self {
Query {
dataset,
query_vector: vector,
limit: 10,
nprobes: 20,
refine_factor: None,
metric_type: MetricType::L2,
use_index: false,
filter: None
}
}
/// Execute the queries and return its results.
///
/// # Returns
///
/// * A [DatasetRecordBatchStream] with the query's results.
pub async fn execute(&self) -> Result<DatasetRecordBatchStream> {
let mut scanner: Scanner = self.dataset.scan();
scanner.nearest(
crate::table::VECTOR_COLUMN_NAME,
&self.query_vector,
self.limit,
)?;
scanner.nprobs(self.nprobes);
scanner.distance_metric(self.metric_type);
scanner.use_index(self.use_index);
self.filter.as_ref().map(|f| scanner.filter(f));
self.refine_factor.map(|rf| scanner.refine(rf));
Ok(scanner.try_into_stream().await?)
}
/// Set the maximum number of results to return.
///
/// # Arguments
///
/// * `limit` - The maximum number of results to return.
pub fn limit(mut self, limit: usize) -> Query {
self.limit = limit;
self
}
/// Set the vector used for this query.
///
/// # Arguments
///
/// * `vector` - The vector that will be used for search.
pub fn query_vector(mut self, query_vector: Float32Array) -> Query {
self.query_vector = query_vector;
self
}
/// Set the number of probes to use.
///
/// # Arguments
///
/// * `nprobes` - The number of probes to use.
pub fn nprobes(mut self, nprobes: usize) -> Query {
self.nprobes = nprobes;
self
}
/// Set the refine factor to use.
///
/// # Arguments
///
/// * `refine_factor` - The refine factor to use.
pub fn refine_factor(mut self, refine_factor: Option<u32>) -> Query {
self.refine_factor = refine_factor;
self
}
/// Set the distance metric to use.
///
/// # Arguments
///
/// * `metric_type` - The distance metric to use. By default [MetricType::L2] is used.
pub fn metric_type(mut self, metric_type: MetricType) -> Query {
self.metric_type = metric_type;
self
}
/// Whether to use an ANN index if available
///
/// # Arguments
///
/// * `use_index` - Sets Whether to use an ANN index if available
pub fn use_index(mut self, use_index: bool) -> Query {
self.use_index = use_index;
self
}
pub fn filter(mut self, filter: Option<String>) -> Query {
self.filter = filter;
self
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatch, RecordBatchReader};
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use lance::arrow::RecordBatchBuffer;
use lance::dataset::Dataset;
use lance::index::vector::MetricType;
use crate::query::Query;
#[tokio::test]
async fn test_setters_getters() {
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let ds = Dataset::write(&mut batches, ":memory:", None)
.await
.unwrap();
let vector = Float32Array::from_iter_values([0.1, 0.2]);
let query = Query::new(Arc::new(ds), vector.clone());
assert_eq!(query.query_vector, vector);
let new_vector = Float32Array::from_iter_values([9.8, 8.7]);
let query = query
.query_vector(new_vector.clone())
.limit(100)
.nprobes(1000)
.use_index(true)
.metric_type(MetricType::Cosine)
.refine_factor(Some(999));
assert_eq!(query.query_vector, new_vector);
assert_eq!(query.limit, 100);
assert_eq!(query.nprobes, 1000);
assert_eq!(query.use_index, true);
assert_eq!(query.metric_type, MetricType::Cosine);
assert_eq!(query.refine_factor, Some(999));
}
#[tokio::test]
async fn test_execute() {
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let ds = Dataset::write(&mut batches, ":memory:", None)
.await
.unwrap();
let vector = Float32Array::from_iter_values([0.1; 128]);
let query = Query::new(Arc::new(ds), vector.clone());
let result = query.execute().await;
assert_eq!(result.is_ok(), true);
}
fn make_test_batches() -> RecordBatchBuffer {
let dim: usize = 128;
let schema = Arc::new(ArrowSchema::new(vec![
ArrowField::new("key", DataType::Int32, false),
ArrowField::new(
"vector",
DataType::FixedSizeList(
Arc::new(ArrowField::new("item", DataType::Float32, true)),
dim as i32,
),
true,
),
ArrowField::new("uri", DataType::Utf8, true),
]));
RecordBatchBuffer::new(vec![RecordBatch::new_empty(schema.clone())])
}
}

239
rust/vectordb/src/table.rs Normal file
View File

@@ -0,0 +1,239 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::PathBuf;
use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
use lance::dataset::{Dataset, WriteMode, WriteParams};
use crate::error::{Error, Result};
use crate::query::Query;
pub const VECTOR_COLUMN_NAME: &str = "vector";
pub const LANCE_FILE_EXTENSION: &str = "lance";
/// A table in a LanceDB database.
pub struct Table {
name: String,
path: String,
dataset: Arc<Dataset>,
}
impl Table {
/// Opens an existing Table
///
/// # Arguments
///
/// * `base_path` - The base path where the table is located
/// * `name` The Table name
///
/// # Returns
///
/// * A [Table] object.
pub async fn open(base_path: Arc<PathBuf>, name: String) -> Result<Self> {
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let ds_uri = ds_path
.to_str()
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
let dataset = Dataset::open(ds_uri).await?;
let table = Table {
name,
path: ds_uri.to_string(),
dataset: Arc::new(dataset),
};
Ok(table)
}
/// Creates a new Table
///
/// # Arguments
///
/// * `base_path` - The base path where the table is located
/// * `name` The Table name
/// * `batches` RecordBatch to be saved in the database
///
/// # Returns
///
/// * A [Table] object.
pub async fn create(
base_path: Arc<PathBuf>,
name: String,
mut batches: Box<dyn RecordBatchReader>,
) -> Result<Self> {
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let path = ds_path
.to_str()
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
let dataset =
Arc::new(Dataset::write(&mut batches, path, Some(WriteParams::default())).await?);
Ok(Table { name, path: path.to_string(), dataset })
}
/// Insert records into this Table
///
/// # Arguments
///
/// * `batches` RecordBatch to be saved in the Table
/// * `write_mode` Append / Overwrite existing records. Default: Append
/// # Returns
///
/// * The number of rows added
pub async fn add(
&mut self,
mut batches: Box<dyn RecordBatchReader>,
write_mode: Option<WriteMode>
) -> Result<usize> {
let mut params = WriteParams::default();
params.mode = write_mode.unwrap_or(WriteMode::Append);
self.dataset = Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
Ok(batches.count())
}
/// Creates a new Query object that can be executed.
///
/// # Arguments
///
/// * `vector` The vector used for this query.
///
/// # Returns
///
/// * A [Query] object.
pub fn search(&self, query_vector: Float32Array) -> Query {
Query::new(self.dataset.clone(), query_vector)
}
/// Returns the number of rows in this Table
pub async fn count_rows(&self) -> Result<usize> {
Ok(self.dataset.count_rows().await?)
}
}
#[cfg(test)]
mod tests {
use arrow_array::{Float32Array, Int32Array, RecordBatch, RecordBatchReader};
use arrow_schema::{DataType, Field, Schema};
use lance::arrow::RecordBatchBuffer;
use lance::dataset::{Dataset, WriteMode};
use std::sync::Arc;
use tempfile::tempdir;
use crate::table::Table;
#[tokio::test]
async fn test_new_table_not_exists() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let table = Table::open(Arc::new(path_buf), "test".to_string()).await;
assert!(table.is_err());
}
#[tokio::test]
async fn test_open() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write(
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await
.unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string())
.await
.unwrap();
assert_eq!(table.name, "test")
}
#[tokio::test]
async fn test_add() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))],
)
.unwrap()]));
table.add(new_batches, None).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 20);
assert_eq!(table.name, "test");
}
#[tokio::test]
async fn test_add_overwrite() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))],
).unwrap()]));
table.add(new_batches, Some(WriteMode::Overwrite)).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
assert_eq!(table.name, "test");
}
#[tokio::test]
async fn test_search() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write(
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await
.unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string())
.await
.unwrap();
let vector = Float32Array::from_iter_values([0.1, 0.2]);
let query = table.search(vector.clone());
assert_eq!(vector, query.query_vector);
}
fn make_test_batches() -> RecordBatchBuffer {
let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)]));
RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema.clone(),
vec![Arc::new(Int32Array::from_iter_values(0..10))],
)
.unwrap()])
}
}