Compare commits

...

16 Commits

Author SHA1 Message Date
Will Jones
a9dcfe7535 make node available to all users 2023-05-25 17:50:07 -07:00
Will Jones
0028b95fd8 mac debug info 2023-05-25 13:33:54 -07:00
Will Jones
102f1d7404 add dbg prints 2023-05-25 09:44:02 -07:00
Will Jones
500aa7b002 give up on musl for now 2023-05-25 09:21:40 -07:00
Will Jones
8aa0f6b4ba use manylinux containers locally 2023-05-25 09:21:40 -07:00
Will Jones
140aa32e08 try manylinux again 2023-05-25 09:21:40 -07:00
Will Jones
a067c3dc85 fixes for action 2023-05-25 09:21:40 -07:00
Will Jones
e762a4db4b cleanup 2023-05-25 09:21:40 -07:00
Will Jones
5e0ff01879 match versions 2023-05-25 09:21:40 -07:00
Will Jones
84356220dd fill out rest of release script 2023-05-25 09:21:40 -07:00
Will Jones
6c03662c68 more progress on release workflow 2023-05-25 09:21:40 -07:00
Will Jones
5e098f4fe5 wip: see if we can build the lib in ci 2023-05-25 09:21:40 -07:00
Chang She
f485378ea4 Basic full text search capabilities (#62)
This is v1 of integrating full text search index into LanceDB.

# API
The query API is roughly the same as before, except if the input is text
instead of a vector we assume that its fts search.

## Example
If `table` is a LanceDB LanceTable, then:

Build index: `table.create_fts_index("text")`

Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()`

# Implementation
Here we use the tantivy-py package to build the index. We then use the
row id's as the full-text-search index's doc id then we just do a Take
operation to fetch the rows.

# Limitations

1. don't support incremental row appends yet. New data won't show up in
search
2. local filesystem only 
3. requires building tantivy explicitly

---------

Co-authored-by: Chang She <chang@lancedb.com>
2023-05-24 22:25:31 -06:00
gsilvestrin
f923cfe47f add create index to nodejs client (#89) 2023-05-24 16:45:58 -06:00
gsilvestrin
06cb7b6458 add query params to to nodejs client (#87) 2023-05-24 15:48:31 -06:00
gsilvestrin
bdef634954 bugfix: string columns should be converted to Utf8Array (#94) 2023-05-23 14:58:49 -07:00
35 changed files with 1574 additions and 100 deletions

View File

@@ -0,0 +1,70 @@
name: Create release commit
on:
workflow_dispatch:
inputs:
dry_run:
description: 'Just create the local commit/tags but do not push it'
required: true
default: "false"
type: choice
options:
- "true"
- "false"
part:
description: 'What kind of release is this?'
required: true
default: 'patch'
type: choice
options:
- patch
- minor
- major
jobs:
bump-version:
runs-on: ubuntu-latest
steps:
- name: Check out main
uses: actions/checkout@v3
with:
ref: main
persist-credentials: false
fetch-depth: 0
lfs: true
- name: Install cargo utils
run: cargo install cargo-bump cargo-get
- name: Bump vectordb
working-directory: rust/vectordb
run: |
cargo bump ${{ inputs.part }}
echo "CRATE_VERSION=$(cargo get version)" >> $GITHUB_ENV
- name: Bump rust/ffi/node
working-directory: rust/ffi/node
run: |
cargo bump ${{ inputs.part }}
echo "FFI_CRATE_VERSION=$(cargo get version)" >> $GITHUB_ENV
- name: Bump node
working-directory: node
run: |
npm version ${{ inputs.part }}
echo "NPM_PACKAGE_VERSION=$(cat package.json | jq -r '.version')" >> $GITHUB_ENV
- name: Create tag
run: |
if [ "$CRATE_VERSION" != "$FFI_CRATE_VERSION" ]; then
echo "Version mismatch between rust/vectordb and rust/ffi/node"
exit 1
fi
if [ "$CRATE_VERSION" != "$NPM_PACKAGE_VERSION" ]; then
echo "Version mismatch between rust/vectordb and node"
exit 1
fi
export TAG="v$CRATE_VERSION'"
git tag $TAG
- name: Push new version and tag
if: ${{ inputs.dry_run }} == "false"
uses: ad-m/github-push-action@master
with:
github_token: ${{ secrets.RELEASE_TOKEN }}
branch: main
tags: true

View File

@@ -67,8 +67,10 @@ jobs:
- name: Build
run: |
npm ci
npm run build
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
- name: Test
run: npm run test
macos:
@@ -94,8 +96,10 @@ jobs:
- name: Build
run: |
npm ci
npm run build
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
- name: Test
run: |
npm run test

View File

@@ -30,7 +30,7 @@ jobs:
python-version: 3.${{ matrix.python-minor-version }}
- name: Install lancedb
run: |
pip install -e .
pip install -e ".[fts]"
pip install pytest
- name: Run tests
run: pytest -x -v --durations=30 tests
@@ -49,10 +49,10 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.11"
- name: Install lancedb
run: |
pip install -e .
pip install -e ".[fts]"
pip install pytest
- name: Run tests
run: pytest -x -v --durations=30 tests

174
.github/workflows/release.yml vendored Normal file
View File

@@ -0,0 +1,174 @@
name: Prepare Release
# TODO: bump versions in CI
# NOTE: Python is a separate release for now.
on:
push:
tags:
- v*
jobs:
draft-release:
runs-on: ubuntu-latest
steps:
- uses: softprops/action-gh-release@v1
with:
draft: true
prerelease: true # hardcoded on for now
generate_release_notes: true
rust:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: rust/vectordb
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Package Rust
run: cargo package --all-features
- uses: softprops/action-gh-release@v1
with:
draft: true
files: target/package/vectordb-*.crate
fail_on_unmatched_files: true
node:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: node
steps:
- name: Checkout
uses: actions/checkout@v2
- uses: actions/setup-node@v3
with:
node-version: 20
cache: 'npm'
cache-dependency-path: node/package-lock.json
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run tsc
npm pack
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/vectordb-*.tgz
fail_on_unmatched_files: true
node-macos:
runs-on: macos-12
needs: draft-release
strategy:
fail-fast: false
matrix:
target: [x86_64-apple-darwin, aarch64-apple-darwin]
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install system dependencies
run: brew install protobuf
- name: Install npm dependencies
run: |
cd node
npm ci
- name: Install rustup target
if: ${{ matrix.target == 'aarch64-apple-darwin' }}
run: rustup target add aarch64-apple-darwin
- name: Build MacOS native node modules
run: bash ci/build_macos_artifacts.sh ${{ matrix.target }}
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/dist/vectordb-darwin*.tgz
fail_on_unmatched_files: true
node-linux:
name: node-linux (${{ matrix.arch}}-unknown-linux-${{ matrix.libc }})
runs-on: ubuntu-latest
needs: draft-release
strategy:
fail-fast: false
matrix:
libc:
- gnu
# TODO: re-enable musl once we have refactored to pre-built containers
# Right now we have to build node from source which is too expensive.
# - musl
arch:
- x86_64
- aarch64
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Change owner to root (for npm)
# The docker container is run as root, so we need the files to be owned by root
# Otherwise npm is a nightmare: https://github.com/npm/cli/issues/3773
run: sudo chown -R root:root .
- name: Set up QEMU
if: ${{ matrix.arch == 'aarch64' }}
uses: docker/setup-qemu-action@v2
with:
platforms: arm64
- name: Build Linux GNU native node modules
if: ${{ matrix.libc == 'gnu' }}
run: |
docker run \
-v $(pwd):/io -w /io \
quay.io/pypa/manylinux2014_${{ matrix.arch }} \
bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-gnu
- name: Build musl Linux native node modules
if: ${{ matrix.libc == 'musl' }}
run: |
docker run --platform linux/arm64/v8 \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_${{ matrix.arch }} \
bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-musl
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/dist/vectordb-linux*.tgz
fail_on_unmatched_files: true
release:
needs: [rust, node, node-macos, node-linux]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- name: Publish to PyPI
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
run: |
python -m twine upload --non-interactive \
--skip-existing \
--repository testpypi python/dist/*
- name: Publish to NPM
run: |
for filename in node/dist/*.tgz; do
npm publish --dry-run $filename
done
- name: Publish to crates.io
env:
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
run: |
cargo publish --dry-run --no-verify rust/target/vectordb-*.crate
# - uses: softprops/action-gh-release@v1
# with:
# draft: false

2
.gitignore vendored
View File

@@ -4,6 +4,8 @@
**/__pycache__
.DS_Store
.vscode
rust/target
rust/Cargo.lock

6
Cargo.lock generated
View File

@@ -3356,18 +3356,20 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vectordb"
version = "0.0.1"
version = "0.1.2"
dependencies = [
"arrow-array",
"arrow-data",
"arrow-schema",
"lance",
"rand",
"tempfile",
"tokio",
]
[[package]]
name = "vectordb-node"
version = "0.1.0"
version = "0.1.2"
dependencies = [
"arrow-array",
"arrow-ipc",

View File

@@ -0,0 +1,91 @@
#!/bin/bash
# Builds the Linux artifacts (node binaries).
# Usage: ./build_linux_artifacts.sh [target]
# Targets supported:
# - x86_64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-musl
# - x86_64-unknown-linux-musl
# TODO: refactor this into a Docker container we can pull
set -e
setup_dependencies() {
echo "Installing system dependencies..."
if [[ $1 == *musl ]]; then
# musllinux
apk add openssl-dev
else
# manylinux2014
yum install -y openssl-devel unzip
fi
if [[ $1 == x86_64* ]]; then
ARCH=x86_64
else
# gnu target
ARCH=aarch_64
fi
# Install new enough protobuf (yum-provided is old)
PB_REL=https://github.com/protocolbuffers/protobuf/releases
PB_VERSION=23.1
curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
}
install_node() {
echo "Installing node..."
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
source "$HOME"/.bashrc
if [[ $1 == *musl ]]; then
# This node version is 15, we need 16 or higher:
# apk add nodejs-current npm
# So instead we install from source (nvm doesn't provide binaries for musl):
nvm install -s --no-progress 17
else
nvm install --no-progress 17 # latest that supports glibc 2.17
fi
}
install_rust() {
echo "Installing rust..."
curl https://sh.rustup.rs -sSf | bash -s -- -y
export PATH="$PATH:/root/.cargo/bin"
}
build_node_binary() {
echo "Building node library for $1..."
pushd node
npm ci
if [[ $1 == *musl ]]; then
# This is needed for cargo to allow build cdylibs with musl
export RUSTFLAGS="-C target-feature=-crt-static"
fi
# Cargo can run out of memory while pulling dependencies, espcially when running
# in QEMU. This is a workaround for that.
export CARGO_NET_GIT_FETCH_WITH_CLI=true
# We don't pass in target, since the native target here already matches
# and openblas-src doesn't do well with cross-compilation.
npm run build-release
npm run pack-build
popd
}
TARGET=${1:-x86_64-unknown-linux-gnu}
# Others:
# aarch64-unknown-linux-gnu
# x86_64-unknown-linux-musl
# aarch64-unknown-linux-musl
setup_dependencies $TARGET
install_node $TARGET
install_rust
build_node_binary $TARGET

View File

@@ -0,0 +1,38 @@
# Builds the macOS artifacts (node binaries).
# Usage: ./build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
prebuild_rust() {
# Building here for the sake of easier debugging.
pushd rust/ffi/node
for target in $1
do
echo "Building rust library for $target"
export RUST_BACKTRACE=1
cargo build --release --target $target
done
popd
}
build_node_binaries() {
pushd node
for target in $1
do
echo "Building node library for $target"
npm run build-release -- --target $target
npm run pack-build -- --target $target
done
popd
}
if [ -n "$1" ]; then
targets=$1
else
targets="x86_64-apple-darwin aarch64-apple-darwin"
fi
prebuild_rust $targets
build_node_binaries $targets

90
ci/release_process.md Normal file
View File

@@ -0,0 +1,90 @@
# How to release
This is for the Rust crate and Node module. For now, the Python module is
released separately.
The release is started by bumping the versions and pushing a new tag. To do this
automatically, use the `make_release_commit` GitHub action.
When the tag is pushed, GitHub actions will start building the libraries and
will upload them to a draft release.
While those jobs are running, edit the release notes as needed. For example,
bring relevant new features and bugfixes to the top of the notes and the testing
and CI changes to the bottom.
Once the jobs have finished, the release will be marked as not draft and the
artifacts will be released to crates.io, NPM, and PyPI.
## Manual process
You can also build the artifacts locally on a MacOS machine.
### Build the MacOS release libraries
One-time setup:
```shell
rustup target add x86_64-apple-darwin aarch64-apple-darwin
```
To build:
```shell
bash ci/build_macos_artifacts.sh
```
### Build the Linux release libraries
To build a Linux library, we need to use docker with a different build script:
```shell
ARCH=aarch64
docker run \
-v $(pwd):/io -w /io \
quay.io/pypa/manylinux2014_$ARCH \
bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-gnu
```
You can change `ARCH` to `x86_64`.
Similar script for musl binaries (not yet working):
```shell
ARCH=aarch64
docker run \
--user $(id -u) \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_$ARCH \
bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-musl
```
<!--
For debugging, use these snippets:
```shell
ARCH=aarch64
docker run -it \
-v $(pwd):/io -w /io \
quay.io/pypa/manylinux2014_$ARCH \
bash
```
```shell
ARCH=aarch64
docker run -it \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_$ARCH \
bash
```
Note: musllinux_1_1 is Alpine Linux 3.12
-->
```
docker run \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_aarch64 \
bash alpine_repro.sh
```

View File

@@ -19,6 +19,7 @@ nav:
- Basics: basic.md
- Embeddings: embedding.md
- Indexing: ann_indexes.md
- Full-text search: fts.md
- Integrations: integrations.md
- Python API: python.md

50
docs/src/fts.md Normal file
View File

@@ -0,0 +1,50 @@
# [EXPERIMENTAL] Full text search
LanceDB now provides experimental support for full text search.
This is currently Python only. We plan to push the integration down to Rust in the future
to make this available for JS as well.
## Installation
To use full text search, you must install the fts optional dependencies:
`pip install lancedb[fts]`
## Quickstart
Assume:
1. `table` is a LanceDB Table
2. `text` is the name of the Table column that we want to index
To create the index:
```python
table.create_fts_index("text")
```
To search:
```python
df = table.search("puppy").limit(10).select(["text"]).to_df()
```
LanceDB automatically looks for an FTS index if the input is str.
## Multiple text columns
If you have multiple columns to index, pass them all as a list to `create_fts_index`:
```python
table.create_fts_index(["text1", "text2"])
```
Note that the search API call does not change - you can search over all indexed columns at once.
## Current limitations
1. Currently we do not yet support incremental writes.
If you add data after fts index creation, it won't be reflected
in search results until you do a full reindex.
2. We currently only support local filesystem paths for the fts index.

View File

@@ -45,5 +45,6 @@ We will be adding completed demo apps built using LanceDB.
* [`Basic Operations`](basic.md) - basic functionality of LanceDB.
* [`Embedding Functions`](embedding.md) - functions for working with embeddings.
* [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
* [`Full text search`](fts.md) - [EXPERIMENTAL] full-text search API
* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
* [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.

2
node/.npmignore Normal file
View File

@@ -0,0 +1,2 @@
gen_test_data.py
index.node

View File

@@ -8,6 +8,10 @@ A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb)
npm install vectordb
```
This will download the appropriate native library for your platform. We currently
support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
yet support Windows or musl-based Linux (such as Alpine Linux).
## Usage
### Basic Example
@@ -24,6 +28,19 @@ The [examples](./examples) folder contains complete examples.
## Development
Build and install the rust library with:
```bash
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
```
`npm run build` builds the Rust library, `npm run pack-build` packages the Rust
binary into an npm module called `@vectordb/<platform>` (for example,
`@vectordb/darwin-arm64.node`), and then `npm run install ...` installs that
module.
The LanceDB javascript is built with npm:
```bash

View File

@@ -12,29 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
const { currentTarget } = require('@neon-rs/load');
let nativeLib;
function getPlatformLibrary() {
if (process.platform === "darwin" && process.arch == "arm64") {
return require('./aarch64-apple-darwin.node');
} else if (process.platform === "darwin" && process.arch == "x64") {
return require('./x86_64-apple-darwin.node');
} else if (process.platform === "linux" && process.arch == "x64") {
return require('./x86_64-unknown-linux-gnu.node');
} else {
throw new Error(`vectordb: unsupported platform ${process.platform}_${process.arch}. Please file a bug report at https://github.com/lancedb/lancedb/issues`)
}
}
try {
nativeLib = require('./index.node')
nativeLib = require(`@vectordb/${currentTarget()}`);
} catch (e) {
if (e.code === "MODULE_NOT_FOUND") {
nativeLib = getPlatformLibrary();
} else {
throw new Error('vectordb: failed to load native library. Please file a bug report at https://github.com/lancedb/lancedb/issues');
}
throw new Error(`vectordb: failed to load native library.
You may need to run \`npm install @vectordb/${currentTarget()}\`.
If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
Source error: ${e}`);
}
module.exports = nativeLib
// Dynamic require for runtime.
module.exports = nativeLib;

45
node/package-lock.json generated
View File

@@ -7,12 +7,26 @@
"": {
"name": "vectordb",
"version": "0.1.1",
"cpu": [
"x64",
"arm64"
],
"license": "Apache-2.0",
"os": [
"darwin",
"linux"
],
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
"@neon-rs/load": "^0.0.74",
"@vectordb/darwin-arm64": "0.1.1",
"@vectordb/darwin-x64": "0.1.1",
"@vectordb/linux-x64-gnu": "0.1.1",
"@vectordb/linux-x64-musl": "0.1.1",
"apache-arrow": "^12.0.0"
},
"devDependencies": {
"@neon-rs/cli": "^0.0.74",
"@types/chai": "^4.3.4",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
@@ -30,6 +44,12 @@
"ts-node": "^10.9.1",
"ts-node-dev": "^2.0.0",
"typescript": "*"
},
"optionalDependencies": {
"@vectordb/darwin-arm64": "0.1.1",
"@vectordb/darwin-x64": "0.1.1",
"@vectordb/linux-x64-gnu": "0.1.1",
"@vectordb/linux-x64-musl": "0.1.1"
}
},
"node_modules/@apache-arrow/ts": {
@@ -197,6 +217,20 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@neon-rs/cli": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
"integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
"dev": true,
"bin": {
"neon": "index.js"
}
},
"node_modules/@neon-rs/load": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
"integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
},
"node_modules/@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -4191,6 +4225,17 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"@neon-rs/cli": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
"integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
"dev": true
},
"@neon-rs/load": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
"integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
},
"@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",

View File

@@ -1,15 +1,18 @@
{
"name": "vectordb",
"version": "0.1.1",
"version": "0.1.2",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json-render-diagnostics",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release",
"cross-release": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cross build --message-format=json --release -p vectordb-node",
"test": "mocha -recursive dist/test",
"lint": "eslint src --ext .js,.ts"
"lint": "eslint src --ext .js,.ts",
"pack-build": "neon pack-build",
"check-npm": "printenv && which node && which npm && npm --version"
},
"repository": {
"type": "git",
@@ -24,6 +27,7 @@
"author": "Lance Devs",
"license": "Apache-2.0",
"devDependencies": {
"@neon-rs/cli": "^0.0.74",
"@types/chai": "^4.3.4",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
@@ -44,6 +48,33 @@
},
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
"@neon-rs/load": "^0.0.74",
"apache-arrow": "^12.0.0"
},
"os": [
"darwin",
"linux"
],
"cpu": [
"x64",
"arm64"
],
"neon": {
"targets": {
"x86_64-apple-darwin": "@vectordb/darwin-x64",
"aarch64-apple-darwin": "@vectordb/darwin-arm64",
"x86_64-unknown-linux-gnu": "@vectordb/linux-x64-gnu",
"x86_64-unknown-linux-musl": "@vectordb/linux-x64-musl",
"aarch64-unknown-linux-gnu": "@vectordb/linux-arm64-gnu",
"aarch64-unknown-linux-musl": "@vectordb/linux-arm64-musl"
}
},
"optionalDependencies": {
"@vectordb/darwin-arm64": "0.1.2",
"@vectordb/darwin-x64": "0.1.2",
"@vectordb/linux-x64-gnu": "0.1.2",
"@vectordb/linux-x64-musl": "0.1.2",
"@vectordb/linux-arm64-gnu": "0.1.2",
"@vectordb/linux-arm64-musl": "0.1.2"
}
}

View File

@@ -18,7 +18,7 @@ import {
List,
makeBuilder,
RecordBatchFileWriter,
Table,
Table, Utf8,
type Vector,
vectorFromArray
} from 'apache-arrow'
@@ -52,7 +52,12 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
for (const datum of data) {
values.push(datum[columnsKey])
}
records[columnsKey] = vectorFromArray(values)
if (typeof values[0] === 'string') {
// `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column
records[columnsKey] = vectorFromArray(values, new Utf8())
} else {
records[columnsKey] = vectorFromArray(values)
}
}
}

View File

@@ -21,7 +21,7 @@ import {
import { fromRecordsToBuffer } from './arrow'
// eslint-disable-next-line @typescript-eslint/no-var-requires
const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd } = require('../native.js')
const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd, tableCreateVectorIndex } = require('../native.js')
/**
* Connect to a LanceDB instance at the given URI
@@ -100,64 +100,154 @@ export class Table {
}
/**
* Insert records into this Table
* @param data Records to be inserted into the Table
* Insert records into this Table.
*
* @param mode Append / Overwrite existing records. Default: Append
* @param data Records to be inserted into the Table
* @return The number of rows added to the table
*/
async add (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Append.toString())
}
/**
* Insert records into this Table, replacing its contents.
*
* @param data Records to be inserted into the Table
* @return The number of rows added to the table
*/
async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Overwrite.toString())
}
async create_index (indexParams: VectorIndexParams): Promise<any> {
return tableCreateVectorIndex.call(this._tbl, indexParams)
}
}
interface IvfPQIndexConfig {
/**
* The column to be indexed
*/
column?: string
/**
* A unique name for the index
*/
index_name?: string
/**
* Metric type, L2 or Cosine
*/
metric_type?: MetricType
/**
* The number of partitions this index
*/
num_partitions?: number
/**
* The max number of iterations for kmeans training.
*/
max_iters?: number
/**
* Train as optimized product quantization.
*/
use_opq?: boolean
/**
* Number of subvectors to build PQ code
*/
num_sub_vectors?: number
/**
* The number of bits to present one PQ centroid.
*/
num_bits?: number
/**
* Max number of iterations to train OPQ, if `use_opq` is true.
*/
max_opq_iters?: number
type: 'ivf_pq'
}
export type VectorIndexParams = IvfPQIndexConfig
/**
* A builder for nearest neighbor queries for LanceDB.
*/
export class Query {
private readonly _tbl: any
private readonly _query_vector: number[]
private readonly _queryVector: number[]
private _limit: number
private readonly _refine_factor?: number
private readonly _nprobes: number
private _refineFactor?: number
private _nprobes: number
private readonly _columns?: string[]
private _filter?: string
private readonly _metric = 'L2'
private _metricType?: MetricType
constructor (tbl: any, queryVector: number[]) {
this._tbl = tbl
this._query_vector = queryVector
this._queryVector = queryVector
this._limit = 10
this._nprobes = 20
this._refine_factor = undefined
this._refineFactor = undefined
this._columns = undefined
this._filter = undefined
this._metricType = undefined
}
/***
* Sets the number of results that will be returned
* @param value number of results
*/
limit (value: number): Query {
this._limit = value
return this
}
/**
* Refine the results by reading extra elements and re-ranking them in memory.
* @param value refine factor to use in this query.
*/
refineFactor (value: number): Query {
this._refineFactor = value
return this
}
/**
* The number of probes used. A higher number makes search more accurate but also slower.
* @param value The number of probes used.
*/
nprobes (value: number): Query {
this._nprobes = value
return this
}
/**
* A filter statement to be applied to this query.
* @param value A filter in the same format used by a sql WHERE clause.
*/
filter (value: string): Query {
this._filter = value
return this
}
/**
* Execute the query and return the results as an Array of Objects
*/
* The MetricType used for this Query.
* @param value The metric to the. @see MetricType for the different options
*/
metricType (value: MetricType): Query {
this._metricType = value
return this
}
/**
* Execute the query and return the results as an Array of Objects
*/
async execute<T = Record<string, unknown>> (): Promise<T[]> {
let buffer
if (this._filter != null) {
buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit, this._filter)
} else {
buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit)
}
const buffer = await tableSearch.call(this._tbl, this)
const data = tableFromIPC(buffer)
return data.toArray().map((entry: Record<string, unknown>) => {
const newObject: Record<string, unknown> = {}
@@ -177,3 +267,18 @@ export enum WriteMode {
Overwrite = 'overwrite',
Append = 'append'
}
/**
* Distance metrics type.
*/
export enum MetricType {
/**
* Euclidean distance
*/
L2 = 'l2',
/**
* Cosine distance
*/
Cosine = 'cosine'
}

View File

@@ -17,6 +17,7 @@ import { assert } from 'chai'
import { track } from 'temp'
import * as lancedb from '../index'
import { MetricType, Query } from '../index'
describe('LanceDB client', function () {
describe('when creating a connection to lancedb', function () {
@@ -67,7 +68,7 @@ describe('LanceDB client', function () {
const uri = await createTestDB()
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
const results = await table.search([0.1, 0.3]).filter('id == 2').execute()
const results = await table.search([0.1, 0.1]).filter('id == 2').execute()
assert.equal(results.length, 1)
assert.equal(results[0].id, 2)
})
@@ -96,8 +97,8 @@ describe('LanceDB client', function () {
const con = await lancedb.connect(dir)
const data = [
{ id: 1, vector: [0.1, 0.2], price: 10 },
{ id: 2, vector: [1.1, 1.2], price: 50 }
{ id: 1, vector: [0.1, 0.2], price: 10, name: 'a' },
{ id: 2, vector: [1.1, 1.2], price: 50, name: 'b' }
]
const table = await con.createTable('vectors', data)
@@ -105,8 +106,8 @@ describe('LanceDB client', function () {
assert.equal(results.length, 2)
const dataAdd = [
{ id: 3, vector: [2.1, 2.2], price: 10 },
{ id: 4, vector: [3.1, 3.2], price: 50 }
{ id: 3, vector: [2.1, 2.2], price: 10, name: 'c' },
{ id: 4, vector: [3.1, 3.2], price: 50, name: 'd' }
]
await table.add(dataAdd)
const resultsAdd = await table.search([0.1, 0.3]).execute()
@@ -130,16 +131,43 @@ describe('LanceDB client', function () {
assert.equal(resultsAdd.length, 2)
})
})
describe('when creating a vector index', function () {
it('overwrite all records in a table', async function () {
const uri = await createTestDB(32, 300)
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
await table.create_index({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2 })
}).timeout(10_000) // Timeout is high partially because GH macos runner is pretty slow
})
})
async function createTestDB (): Promise<string> {
describe('Query object', function () {
it('sets custom parameters', async function () {
const query = new Query(undefined, [0.1, 0.3])
.limit(1)
.metricType(MetricType.Cosine)
.refineFactor(100)
.nprobes(20) as Record<string, any>
assert.equal(query._limit, 1)
assert.equal(query._metricType, MetricType.Cosine)
assert.equal(query._refineFactor, 100)
assert.equal(query._nprobes, 20)
})
})
async function createTestDB (numDimensions: number = 2, numRows: number = 2): Promise<string> {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const data = [
{ id: 1, vector: [0.1, 0.2], name: 'foo', price: 10, is_active: true },
{ id: 2, vector: [1.1, 1.2], name: 'bar', price: 50, is_active: false }
]
const data = []
for (let i = 0; i < numRows; i++) {
const vector = []
for (let j = 0; j < numDimensions; j++) {
vector.push(i + (j * 0.1))
}
data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
}
await con.createTable('vectors', data)
return dir

122
python/lancedb/fts.py Normal file
View File

@@ -0,0 +1,122 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Full text search index using tantivy-py"""
import os
from typing import List, Tuple
import pyarrow as pa
import tantivy
from .table import LanceTable
def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
"""
Create a new Index (not populated)
Parameters
----------
index_path : str
Path to the index directory
text_fields : List[str]
List of text fields to index
Returns
-------
index : tantivy.Index
The index object (not yet populated)
"""
# Declaring our schema.
schema_builder = tantivy.SchemaBuilder()
# special field that we'll populate with row_id
schema_builder.add_integer_field("doc_id", stored=True)
# data fields
for name in text_fields:
schema_builder.add_text_field(name, stored=True)
schema = schema_builder.build()
os.makedirs(index_path, exist_ok=True)
index = tantivy.Index(schema, path=index_path)
return index
def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
"""
Populate an index with data from a LanceTable
Parameters
----------
index : tantivy.Index
The index object
table : LanceTable
The table to index
fields : List[str]
List of fields to index
"""
# first check the fields exist and are string or large string type
for name in fields:
f = table.schema.field(name) # raises KeyError if not found
if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
raise TypeError(f"Field {name} is not a string type")
# create a tantivy writer
writer = index.writer()
# write data into index
dataset = table.to_lance()
row_id = 0
for b in dataset.to_batches(columns=fields):
for i in range(b.num_rows):
doc = tantivy.Document()
doc.add_integer("doc_id", row_id)
for name in fields:
doc.add_text(name, b[name][i].as_py())
writer.add_document(doc)
row_id += 1
# commit changes
writer.commit()
return row_id
def search_index(
index: tantivy.Index, query: str, limit: int = 10
) -> Tuple[Tuple[int], Tuple[float]]:
"""
Search an index for a query
Parameters
----------
index : tantivy.Index
The index object
query : str
The query string
limit : int
The maximum number of results to return
Returns
-------
ids_and_score: list[tuple[int], tuple[float]]
A tuple of two tuples, the first containing the document ids
and the second containing the scores
"""
searcher = index.searcher()
query = index.parse_query(query)
# get top results
results = searcher.search(query, limit)
return tuple(
zip(
*[
(searcher.doc(doc_address)["doc_id"][0], score)
for score, doc_address in results.hits
]
)
)

View File

@@ -14,6 +14,7 @@ from __future__ import annotations
import numpy as np
import pandas as pd
import pyarrow as pa
from .common import VECTOR_COLUMN_NAME
@@ -131,7 +132,6 @@ class LanceQueryBuilder:
vector and the returned vector.
"""
ds = self._table.to_lance()
# TODO indexed search
tbl = ds.to_table(
columns=self._columns,
filter=self._where,
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
},
)
return tbl.to_pandas()
class LanceFtsQueryBuilder(LanceQueryBuilder):
def to_df(self) -> pd.DataFrame:
try:
import tantivy
except ImportError:
raise ImportError(
"You need to install the `lancedb[fts]` extra to use this method."
)
from .fts import search_index
# get the index path
index_path = self._table._get_fts_index_path()
# open the index
index = tantivy.Index.open(index_path)
# get the scores and doc ids
row_ids, scores = search_index(index, self._query, self._limit)
scores = pa.array(scores)
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
output_tbl = output_tbl.append_column("score", scores)
return output_tbl.to_pandas()

View File

@@ -14,7 +14,9 @@
from __future__ import annotations
import os
import shutil
from functools import cached_property
from typing import List, Union
import lance
import numpy as np
@@ -24,7 +26,8 @@ from lance import LanceDataset
from lance.vector import vec_to_table
from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .query import LanceQueryBuilder
from .query import LanceFtsQueryBuilder, LanceQueryBuilder
from .util import get_uri_scheme
def _sanitize_data(data, schema):
@@ -130,6 +133,27 @@ class LanceTable:
)
self._reset_dataset()
def create_fts_index(self, field_names: Union[str, List[str]]):
"""Create a full-text search index on the table.
Warning - this API is highly experimental and is highly likely to change
in the future.
Parameters
----------
field_names: str or list of str
The name(s) of the field to index.
"""
from .fts import create_index, populate_index
if isinstance(field_names, str):
field_names = [field_names]
index = create_index(self._get_fts_index_path(), field_names)
populate_index(index, self, field_names)
def _get_fts_index_path(self):
return os.path.join(self._dataset_uri, "_indices", "tantivy")
@cached_property
def _dataset(self) -> LanceDataset:
return lance.dataset(self._dataset_uri, version=self._version)
@@ -158,7 +182,7 @@ class LanceTable:
self._reset_dataset()
return len(self)
def search(self, query: VEC) -> LanceQueryBuilder:
def search(self, query: Union[VEC, str]) -> LanceQueryBuilder:
"""Create a search query to find the nearest neighbors
of the given query vector.
@@ -174,6 +198,10 @@ class LanceTable:
and also the "score" column which is the distance between the query
vector and the returned vector.
"""
if isinstance(query, str):
# fts
return LanceFtsQueryBuilder(self, query)
if isinstance(query, list):
query = np.array(query)
if isinstance(query, np.ndarray):

View File

@@ -45,6 +45,10 @@ dev = [
docs = [
"mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"
]
fts = [
# tantivy 0.19.2
"tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985"
]
[build-system]
requires = [

84
python/tests/test_fts.py Normal file
View File

@@ -0,0 +1,84 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import numpy as np
import pandas as pd
import pytest
import tantivy
import lancedb as ldb
import lancedb.fts
@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
db = ldb.connect(tmp_path)
vectors = [np.random.randn(128) for _ in range(100)]
nouns = ("puppy", "car", "rabbit", "girl", "monkey")
verbs = ("runs", "hits", "jumps", "drives", "barfs")
adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
adj = ("adorable", "clueless", "dirty", "odd", "stupid")
text = [
" ".join(
[
nouns[random.randrange(0, 5)],
verbs[random.randrange(0, 5)],
adv[random.randrange(0, 5)],
adj[random.randrange(0, 5)],
]
)
for _ in range(100)
]
table = db.create_table(
"test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
)
return table
def test_create_index(tmp_path):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert isinstance(index, tantivy.Index)
assert os.path.exists(str(tmp_path / "index"))
def test_populate_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
def test_search_index(tmp_path, table):
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
ldb.fts.populate_index(index, table, ["text"])
index.reload()
results = ldb.fts.search_index(index, query="puppy", limit=10)
assert len(results) == 2
assert len(results[0]) == 10 # row_ids
assert len(results[1]) == 10 # scores
def test_create_index_from_table(tmp_path, table):
table.create_fts_index("text")
df = table.search("puppy").limit(10).select(["text"]).to_df()
assert len(df) == 10
assert "text" in df.columns
def test_create_index_multiple_columns(tmp_path, table):
table.create_fts_index(["text", "text2"])
df = table.search("puppy").limit(10).to_df()
assert len(df) == 10
assert "text" in df.columns
assert "text2" in df.columns

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb-node"
version = "0.1.0"
version = "0.1.2"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
edition = "2018"

View File

@@ -0,0 +1,15 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod vector;

View File

@@ -0,0 +1,128 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::convert::TryFrom;
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::MetricType;
use neon::context::FunctionContext;
use neon::prelude::*;
use vectordb::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
use crate::{runtime, JsTable};
pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let index_params = cx.argument::<JsObject>(0)?;
let index_params_builder = get_index_params_builder(&mut cx, index_params).unwrap();
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let (deferred, promise) = cx.promise();
let table = js_table.table.clone();
rt.block_on(async move {
let add_result = table
.lock()
.unwrap()
.create_idx(&index_params_builder)
.await;
deferred.settle_with(&channel, move |mut cx| {
add_result
.map(|_| cx.undefined())
.or_else(|err| cx.throw_error(err.to_string()))
});
});
Ok(promise)
}
fn get_index_params_builder(
cx: &mut FunctionContext,
obj: Handle<JsObject>,
) -> Result<impl VectorIndexBuilder, String> {
let idx_type = obj
.get::<JsString, _, _>(cx, "type")
.map_err(|t| t.to_string())?
.value(cx);
match idx_type.as_str() {
"ivf_pq" => {
let mut index_builder: IvfPQIndexBuilder = IvfPQIndexBuilder::new();
let mut pq_params = PQBuildParams::default();
obj.get_opt::<JsString, _, _>(cx, "column")
.map_err(|t| t.to_string())?
.map(|s| index_builder.column(s.value(cx)));
obj.get_opt::<JsString, _, _>(cx, "index_name")
.map_err(|t| t.to_string())?
.map(|s| index_builder.index_name(s.value(cx)));
obj.get_opt::<JsString, _, _>(cx, "metric_type")
.map_err(|t| t.to_string())?
.map(|s| MetricType::try_from(s.value(cx).as_str()))
.map(|mt| {
let metric_type = mt.unwrap();
index_builder.metric_type(metric_type);
pq_params.metric_type = metric_type;
});
let num_partitions = obj
.get_opt::<JsNumber, _, _>(cx, "num_partitions")
.map_err(|t| t.to_string())?
.map(|s| s.value(cx) as usize);
let max_iters = obj
.get_opt::<JsNumber, _, _>(cx, "max_iters")
.map_err(|t| t.to_string())?
.map(|s| s.value(cx) as usize);
num_partitions.map(|np| {
let max_iters = max_iters.unwrap_or(50);
let ivf_params = IvfBuildParams {
num_partitions: np,
max_iters,
};
index_builder.ivf_params(ivf_params)
});
obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")
.map_err(|t| t.to_string())?
.map(|s| pq_params.use_opq = s.value(cx));
obj.get_opt::<JsNumber, _, _>(cx, "num_sub_vectors")
.map_err(|t| t.to_string())?
.map(|s| pq_params.num_sub_vectors = s.value(cx) as usize);
obj.get_opt::<JsNumber, _, _>(cx, "num_bits")
.map_err(|t| t.to_string())?
.map(|s| pq_params.num_bits = s.value(cx) as usize);
obj.get_opt::<JsNumber, _, _>(cx, "max_iters")
.map_err(|t| t.to_string())?
.map(|s| pq_params.max_iters = s.value(cx) as usize);
obj.get_opt::<JsNumber, _, _>(cx, "max_opq_iters")
.map_err(|t| t.to_string())?
.map(|s| pq_params.max_opq_iters = s.value(cx) as usize);
Ok(index_builder)
}
t => Err(format!("{} is not a valid index type", t).to_string()),
}
}

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use std::collections::HashMap;
use std::convert::TryFrom;
use std::ops::Deref;
use std::sync::{Arc, Mutex};
@@ -21,6 +22,7 @@ use arrow_ipc::writer::FileWriter;
use futures::{TryFutureExt, TryStreamExt};
use lance::arrow::RecordBatchBuffer;
use lance::dataset::WriteMode;
use lance::index::vector::MetricType;
use neon::prelude::*;
use neon::types::buffer::TypedArray;
use once_cell::sync::OnceCell;
@@ -34,17 +36,18 @@ use crate::arrow::arrow_buffer_to_record_batch;
mod arrow;
mod convert;
mod index;
struct JsDatabase {
database: Arc<Database>,
}
impl Finalize for JsDatabase {}
struct JsTable {
table: Arc<Mutex<Table>>,
}
impl Finalize for JsDatabase {}
impl Finalize for JsTable {}
fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
@@ -87,7 +90,9 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
let table_rst = database.open_table(table_name).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
let table = Arc::new(Mutex::new(
table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
));
Ok(cx.boxed(JsTable { table }))
});
});
@@ -96,15 +101,32 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let query_vector = cx.argument::<JsArray>(0)?; //. .as_value(&mut cx);
let limit = cx.argument::<JsNumber>(1)?.value(&mut cx);
let filter = cx.argument_opt(2).map(|f| f.downcast_or_throw::<JsString, _>(&mut cx).unwrap().value(&mut cx));
let query_obj = cx.argument::<JsObject>(0)?;
let limit = query_obj
.get::<JsNumber, _, _>(&mut cx, "_limit")?
.value(&mut cx);
let filter = query_obj
.get_opt::<JsString, _, _>(&mut cx, "_filter")?
.map(|s| s.value(&mut cx));
let refine_factor = query_obj
.get_opt::<JsNumber, _, _>(&mut cx, "_refineFactor")?
.map(|s| s.value(&mut cx))
.map(|i| i as u32);
let nprobes = query_obj
.get::<JsNumber, _, _>(&mut cx, "_nprobes")?
.value(&mut cx) as usize;
let metric_type = query_obj
.get_opt::<JsString, _, _>(&mut cx, "_metricType")?
.map(|s| s.value(&mut cx))
.map(|s| MetricType::try_from(s.as_str()).unwrap());
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let (deferred, promise) = cx.promise();
let table = js_table.table.clone();
let query_vector = query_obj.get::<JsArray, _, _>(&mut cx, "_queryVector")?;
let query = convert::js_array_to_vec(query_vector.deref(), &mut cx);
rt.spawn(async move {
@@ -113,7 +135,10 @@ fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
.unwrap()
.search(Float32Array::from(query))
.limit(limit as usize)
.filter(filter);
.refine_factor(refine_factor)
.nprobes(nprobes)
.filter(filter)
.metric_type(metric_type);
let record_batch_stream = builder.execute();
let results = record_batch_stream
.and_then(|stream| stream.try_collect::<Vec<_>>().map_err(Error::from))
@@ -164,7 +189,9 @@ fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
let table_rst = database.create_table(table_name, batch_reader).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
let table = Arc::new(Mutex::new(
table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
));
Ok(cx.boxed(JsTable { table }))
});
});
@@ -178,9 +205,7 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
("overwrite", WriteMode::Overwrite),
]);
let js_table = cx
.this()
.downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let buffer = cx.argument::<JsBuffer>(0)?;
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
let batches = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx));
@@ -204,7 +229,6 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
Ok(promise)
}
#[neon::main]
fn main(mut cx: ModuleContext) -> NeonResult<()> {
cx.export_function("databaseNew", database_new)?;
@@ -213,5 +237,9 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
cx.export_function("tableSearch", table_search)?;
cx.export_function("tableCreate", table_create)?;
cx.export_function("tableAdd", table_add)?;
cx.export_function(
"tableCreateVectorIndex",
index::vector::table_create_vector_index,
)?;
Ok(())
}

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb"
version = "0.0.1"
version = "0.1.2"
edition = "2021"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
@@ -10,9 +10,11 @@ repository = "https://github.com/lancedb/lancedb"
[dependencies]
arrow-array = "37.0"
arrow-data = "37.0"
arrow-schema = "37.0"
lance = "0.4.3"
tokio = { version = "1.23", features = ["rt-multi-thread"] }
[dev-dependencies]
tempfile = "3.5.0"
rand = { version = "0.8.3", features = ["small_rng"] }

View File

@@ -0,0 +1,15 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod vector;

View File

@@ -0,0 +1,163 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::{MetricType, VectorIndexParams};
pub trait VectorIndexBuilder {
fn get_column(&self) -> Option<String>;
fn get_index_name(&self) -> Option<String>;
fn build(&self) -> VectorIndexParams;
}
pub struct IvfPQIndexBuilder {
column: Option<String>,
index_name: Option<String>,
metric_type: Option<MetricType>,
ivf_params: Option<IvfBuildParams>,
pq_params: Option<PQBuildParams>,
}
impl IvfPQIndexBuilder {
pub fn new() -> IvfPQIndexBuilder {
IvfPQIndexBuilder {
column: None,
index_name: None,
metric_type: None,
ivf_params: None,
pq_params: None,
}
}
}
impl IvfPQIndexBuilder {
pub fn column(&mut self, column: String) -> &mut IvfPQIndexBuilder {
self.column = Some(column);
self
}
pub fn index_name(&mut self, index_name: String) -> &mut IvfPQIndexBuilder {
self.index_name = Some(index_name);
self
}
pub fn metric_type(&mut self, metric_type: MetricType) -> &mut IvfPQIndexBuilder {
self.metric_type = Some(metric_type);
self
}
pub fn ivf_params(&mut self, ivf_params: IvfBuildParams) -> &mut IvfPQIndexBuilder {
self.ivf_params = Some(ivf_params);
self
}
pub fn pq_params(&mut self, pq_params: PQBuildParams) -> &mut IvfPQIndexBuilder {
self.pq_params = Some(pq_params);
self
}
}
impl VectorIndexBuilder for IvfPQIndexBuilder {
fn get_column(&self) -> Option<String> {
self.column.clone()
}
fn get_index_name(&self) -> Option<String> {
self.index_name.clone()
}
fn build(&self) -> VectorIndexParams {
let ivf_params = self.ivf_params.clone().unwrap_or(IvfBuildParams::default());
let pq_params = self.pq_params.clone().unwrap_or(PQBuildParams::default());
VectorIndexParams::with_ivf_pq_params(pq_params.metric_type, ivf_params, pq_params)
}
}
#[cfg(test)]
mod tests {
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::{MetricType, StageParams};
use crate::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
#[test]
fn test_builder_no_params() {
let index_builder = IvfPQIndexBuilder::new();
assert!(index_builder.get_column().is_none());
assert!(index_builder.get_index_name().is_none());
let index_params = index_builder.build();
assert_eq!(index_params.stages.len(), 2);
if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
let default = IvfBuildParams::default();
assert_eq!(ivf_params.num_partitions, default.num_partitions);
assert_eq!(ivf_params.max_iters, default.max_iters);
} else {
panic!("Expected first stage to be ivf")
}
if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
assert_eq!(pq_params.use_opq, false);
} else {
panic!("Expected second stage to be pq")
}
}
#[test]
fn test_builder_all_params() {
let mut index_builder = IvfPQIndexBuilder::new();
index_builder
.column("c".to_owned())
.metric_type(MetricType::Cosine)
.index_name("index".to_owned());
assert_eq!(index_builder.column.clone().unwrap(), "c");
assert_eq!(index_builder.metric_type.unwrap(), MetricType::Cosine);
assert_eq!(index_builder.index_name.clone().unwrap(), "index");
let ivf_params = IvfBuildParams::new(500);
let mut pq_params = PQBuildParams::default();
pq_params.use_opq = true;
pq_params.max_iters = 1;
pq_params.num_bits = 8;
pq_params.num_sub_vectors = 50;
pq_params.metric_type = MetricType::Cosine;
pq_params.max_opq_iters = 2;
index_builder.ivf_params(ivf_params);
index_builder.pq_params(pq_params);
let index_params = index_builder.build();
assert_eq!(index_params.stages.len(), 2);
if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
assert_eq!(ivf_params.num_partitions, 500);
} else {
assert!(false, "Expected first stage to be ivf")
}
if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
assert_eq!(pq_params.use_opq, true);
assert_eq!(pq_params.max_iters, 1);
assert_eq!(pq_params.num_bits, 8);
assert_eq!(pq_params.num_sub_vectors, 50);
assert_eq!(pq_params.metric_type, MetricType::Cosine);
assert_eq!(pq_params.max_opq_iters, 2);
} else {
assert!(false, "Expected second stage to be pq")
}
}
}

View File

@@ -14,5 +14,6 @@
pub mod database;
pub mod error;
pub mod index;
pub mod query;
pub mod table;

View File

@@ -29,7 +29,7 @@ pub struct Query {
pub filter: Option<String>,
pub nprobes: usize,
pub refine_factor: Option<u32>,
pub metric_type: MetricType,
pub metric_type: Option<MetricType>,
pub use_index: bool,
}
@@ -51,9 +51,9 @@ impl Query {
limit: 10,
nprobes: 20,
refine_factor: None,
metric_type: MetricType::L2,
metric_type: None,
use_index: false,
filter: None
filter: None,
}
}
@@ -71,10 +71,10 @@ impl Query {
self.limit,
)?;
scanner.nprobs(self.nprobes);
scanner.distance_metric(self.metric_type);
scanner.use_index(self.use_index);
self.filter.as_ref().map(|f| scanner.filter(f));
self.refine_factor.map(|rf| scanner.refine(rf));
self.metric_type.map(|mt| scanner.distance_metric(mt));
Ok(scanner.try_into_stream().await?)
}
@@ -123,7 +123,7 @@ impl Query {
/// # Arguments
///
/// * `metric_type` - The distance metric to use. By default [MetricType::L2] is used.
pub fn metric_type(mut self, metric_type: MetricType) -> Query {
pub fn metric_type(mut self, metric_type: Option<MetricType>) -> Query {
self.metric_type = metric_type;
self
}
@@ -174,14 +174,14 @@ mod tests {
.limit(100)
.nprobes(1000)
.use_index(true)
.metric_type(MetricType::Cosine)
.metric_type(Some(MetricType::Cosine))
.refine_factor(Some(999));
assert_eq!(query.query_vector, new_vector);
assert_eq!(query.limit, 100);
assert_eq!(query.nprobes, 1000);
assert_eq!(query.use_index, true);
assert_eq!(query.metric_type, MetricType::Cosine);
assert_eq!(query.metric_type, Some(MetricType::Cosine));
assert_eq!(query.refine_factor, Some(999));
}

View File

@@ -17,8 +17,10 @@ use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
use lance::dataset::{Dataset, WriteMode, WriteParams};
use lance::index::IndexType;
use crate::error::{Error, Result};
use crate::index::vector::VectorIndexBuilder;
use crate::query::Query;
pub const VECTOR_COLUMN_NAME: &str = "vector";
@@ -80,7 +82,30 @@ impl Table {
let dataset =
Arc::new(Dataset::write(&mut batches, path, Some(WriteParams::default())).await?);
Ok(Table { name, path: path.to_string(), dataset })
Ok(Table {
name,
path: path.to_string(),
dataset,
})
}
pub async fn create_idx(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
let dataset = self
.dataset
.create_index(
&[index_builder
.get_column()
.unwrap_or(VECTOR_COLUMN_NAME.to_string())
.as_str()],
IndexType::Vector,
index_builder.get_index_name(),
&index_builder.build(),
)
.await?;
self.dataset = Arc::new(dataset);
Ok(())
}
/// Insert records into this Table
@@ -95,12 +120,13 @@ impl Table {
pub async fn add(
&mut self,
mut batches: Box<dyn RecordBatchReader>,
write_mode: Option<WriteMode>
write_mode: Option<WriteMode>,
) -> Result<usize> {
let mut params = WriteParams::default();
params.mode = write_mode.unwrap_or(WriteMode::Append);
self.dataset = Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
self.dataset =
Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
Ok(batches.count())
}
@@ -125,13 +151,21 @@ impl Table {
#[cfg(test)]
mod tests {
use arrow_array::{Float32Array, Int32Array, RecordBatch, RecordBatchReader};
use arrow_array::{
Array, FixedSizeListArray, Float32Array, Int32Array, RecordBatch, RecordBatchReader,
};
use arrow_data::ArrayDataBuilder;
use arrow_schema::{DataType, Field, Schema};
use lance::arrow::RecordBatchBuffer;
use lance::dataset::{Dataset, WriteMode};
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use rand::Rng;
use std::sync::Arc;
use tempfile::tempdir;
use crate::error::Result;
use crate::index::vector::IvfPQIndexBuilder;
use crate::table::Table;
#[tokio::test]
@@ -171,14 +205,17 @@ mod tests {
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
.await
.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))],
)
.unwrap()]));
let new_batches: Box<dyn RecordBatchReader> =
Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))],
)
.unwrap()]));
table.add(new_batches, None).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 20);
@@ -192,15 +229,22 @@ mod tests {
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
.await
.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))],
).unwrap()]));
let new_batches: Box<dyn RecordBatchReader> =
Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))],
)
.unwrap()]));
table.add(new_batches, Some(WriteMode::Overwrite)).await.unwrap();
table
.add(new_batches, Some(WriteMode::Overwrite))
.await
.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
assert_eq!(table.name, "test");
}
@@ -236,4 +280,74 @@ mod tests {
)
.unwrap()])
}
#[tokio::test]
async fn test_create_index() {
use arrow_array::RecordBatch;
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use rand;
use std::iter::repeat_with;
use arrow_array::Float32Array;
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let dimension = 16;
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
"embeddings",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
dimension,
),
false,
)]));
let mut rng = rand::thread_rng();
let float_arr = Float32Array::from(
repeat_with(|| rng.gen::<f32>())
.take(512 * dimension as usize)
.collect::<Vec<f32>>(),
);
let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
let batches = RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema.clone(),
vec![vectors.clone()],
)
.unwrap()]);
let reader: Box<dyn RecordBatchReader + Send> = Box::new(batches);
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), reader)
.await
.unwrap();
let mut i = IvfPQIndexBuilder::new();
let index_builder = i
.column("embeddings".to_string())
.index_name("my_index".to_string())
.ivf_params(IvfBuildParams::new(256))
.pq_params(PQBuildParams::default());
table.create_idx(index_builder).await.unwrap();
assert_eq!(table.dataset.load_indices().await.unwrap().len(), 1);
assert_eq!(table.count_rows().await.unwrap(), 512);
assert_eq!(table.name, "test");
}
fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
let list_type = DataType::FixedSizeList(
Arc::new(Field::new("item", values.data_type().clone(), true)),
list_size,
);
let data = ArrayDataBuilder::new(list_type)
.len(values.len() / list_size as usize)
.add_child_data(values.into_data())
.build()
.unwrap();
Ok(FixedSizeListArray::from(data))
}
}