Compare commits


12 Commits

Author      SHA1        Message                                  Date
Will Jones  be910485e7  make node available to all users         2023-05-25 17:18:31 -07:00
Will Jones  0028b95fd8  mac debug info                           2023-05-25 13:33:54 -07:00
Will Jones  102f1d7404  add dbg prints                           2023-05-25 09:44:02 -07:00
Will Jones  500aa7b002  give up on musl for now                  2023-05-25 09:21:40 -07:00
Will Jones  8aa0f6b4ba  use manylinux containers locally         2023-05-25 09:21:40 -07:00
Will Jones  140aa32e08  try manylinux again                      2023-05-25 09:21:40 -07:00
Will Jones  a067c3dc85  fixes for action                         2023-05-25 09:21:40 -07:00
Will Jones  e762a4db4b  cleanup                                  2023-05-25 09:21:40 -07:00
Will Jones  5e0ff01879  match versions                           2023-05-25 09:21:40 -07:00
Will Jones  84356220dd  fill out rest of release script          2023-05-25 09:21:40 -07:00
Will Jones  6c03662c68  more progress on release workflow        2023-05-25 09:21:40 -07:00
Will Jones  5e098f4fe5  wip: see if we can build the lib in ci   2023-05-25 09:21:40 -07:00
33 changed files with 730 additions and 404 deletions


@@ -0,0 +1,70 @@
name: Create release commit
on:
  workflow_dispatch:
    inputs:
      dry_run:
        description: 'Create the local commit and tags but do not push them'
        required: true
        default: "false"
        type: choice
        options:
          - "true"
          - "false"
      part:
        description: 'What kind of release is this?'
        required: true
        default: 'patch'
        type: choice
        options:
          - patch
          - minor
          - major
jobs:
  bump-version:
    runs-on: ubuntu-latest
    steps:
      - name: Check out main
        uses: actions/checkout@v3
        with:
          ref: main
          persist-credentials: false
          fetch-depth: 0
          lfs: true
      - name: Install cargo utils
        run: cargo install cargo-bump cargo-get
      - name: Bump vectordb
        working-directory: rust/vectordb
        run: |
          cargo bump ${{ inputs.part }}
          echo "CRATE_VERSION=$(cargo get version)" >> $GITHUB_ENV
      - name: Bump rust/ffi/node
        working-directory: rust/ffi/node
        run: |
          cargo bump ${{ inputs.part }}
          echo "FFI_CRATE_VERSION=$(cargo get version)" >> $GITHUB_ENV
      - name: Bump node
        working-directory: node
        run: |
          npm version ${{ inputs.part }}
          echo "NPM_PACKAGE_VERSION=$(jq -r '.version' package.json)" >> $GITHUB_ENV
      - name: Create tag
        run: |
          if [ "$CRATE_VERSION" != "$FFI_CRATE_VERSION" ]; then
            echo "Version mismatch between rust/vectordb and rust/ffi/node"
            exit 1
          fi
          if [ "$CRATE_VERSION" != "$NPM_PACKAGE_VERSION" ]; then
            echo "Version mismatch between rust/vectordb and node"
            exit 1
          fi
          export TAG="v$CRATE_VERSION"
          git tag $TAG
      - name: Push new version and tag
        if: ${{ inputs.dry_run == 'false' }}
        uses: ad-m/github-push-action@master
        with:
          github_token: ${{ secrets.RELEASE_TOKEN }}
          branch: main
          tags: true
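
A sketch of triggering this workflow from the GitHub CLI; the workflow file name `make_release_commit.yml` is an assumption based on the name used in ci/release_process.md below:

```shell
# Dry-run a patch release (no push); "part" and "dry_run" are the inputs above.
gh workflow run make_release_commit.yml -f part=patch -f dry_run=true
gh run watch  # follow the run interactively
```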


@@ -67,8 +67,10 @@ jobs:
- name: Build
run: |
npm ci
npm run build
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
- name: Test
run: npm run test
macos:
@@ -94,8 +96,10 @@ jobs:
- name: Build
run: |
npm ci
npm run build
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
- name: Test
run: |
npm run test


@@ -30,8 +30,7 @@ jobs:
python-version: 3.${{ matrix.python-minor-version }}
- name: Install lancedb
run: |
pip install -e .
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install -e ".[fts]"
pip install pytest
- name: Run tests
run: pytest -x -v --durations=30 tests
@@ -53,8 +52,7 @@ jobs:
python-version: "3.11"
- name: Install lancedb
run: |
pip install -e .
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install -e ".[fts]"
pip install pytest
- name: Run tests
run: pytest -x -v --durations=30 tests

.github/workflows/release.yml vendored Normal file

@@ -0,0 +1,174 @@
name: Prepare Release
# TODO: bump versions in CI
# NOTE: Python is a separate release for now.
on:
  push:
    tags:
      - v*
jobs:
  draft-release:
    runs-on: ubuntu-latest
    steps:
      - uses: softprops/action-gh-release@v1
        with:
          draft: true
          prerelease: true # hardcoded on for now
          generate_release_notes: true
  rust:
    runs-on: ubuntu-latest
    needs: draft-release
    defaults:
      run:
        shell: bash
        working-directory: rust/vectordb
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Package Rust
        run: cargo package --all-features
      - uses: softprops/action-gh-release@v1
        with:
          draft: true
          files: target/package/vectordb-*.crate
          fail_on_unmatched_files: true
  node:
    runs-on: ubuntu-latest
    needs: draft-release
    defaults:
      run:
        shell: bash
        working-directory: node
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - uses: actions/setup-node@v3
        with:
          node-version: 20
          cache: 'npm'
          cache-dependency-path: node/package-lock.json
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          npm ci
          npm run tsc
          npm pack
      - uses: softprops/action-gh-release@v1
        with:
          draft: true
          files: node/vectordb-*.tgz
          fail_on_unmatched_files: true
  node-macos:
    runs-on: macos-12
    needs: draft-release
    strategy:
      fail-fast: false
      matrix:
        target: [x86_64-apple-darwin, aarch64-apple-darwin]
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Install system dependencies
        run: brew install protobuf
      - name: Install npm dependencies
        run: |
          cd node
          npm ci
      - name: Install rustup target
        if: ${{ matrix.target == 'aarch64-apple-darwin' }}
        run: rustup target add aarch64-apple-darwin
      - name: Build MacOS native node modules
        run: bash ci/build_macos_artifacts.sh ${{ matrix.target }}
      - uses: softprops/action-gh-release@v1
        with:
          draft: true
          files: node/dist/vectordb-darwin*.tgz
          fail_on_unmatched_files: true
  node-linux:
    name: node-linux (${{ matrix.arch }}-unknown-linux-${{ matrix.libc }})
    runs-on: ubuntu-latest
    needs: draft-release
    strategy:
      fail-fast: false
      matrix:
        libc:
          - gnu
          # TODO: re-enable musl once we have refactored to pre-built containers.
          # Right now we have to build node from source, which is too expensive.
          # - musl
        arch:
          - x86_64
          - aarch64
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Change owner to root (for npm)
        # The docker container is run as root, so we need the files to be owned by root.
        # Otherwise npm is a nightmare: https://github.com/npm/cli/issues/3773
        run: sudo chown -R root:root .
      - name: Set up QEMU
        if: ${{ matrix.arch == 'aarch64' }}
        uses: docker/setup-qemu-action@v2
        with:
          platforms: arm64
      - name: Build Linux GNU native node modules
        if: ${{ matrix.libc == 'gnu' }}
        run: |
          docker run \
            -v $(pwd):/io -w /io \
            quay.io/pypa/manylinux2014_${{ matrix.arch }} \
            bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-gnu
      - name: Build musl Linux native node modules
        if: ${{ matrix.libc == 'musl' }}
        run: |
          docker run --platform linux/arm64/v8 \
            -v $(pwd):/io -w /io \
            quay.io/pypa/musllinux_1_1_${{ matrix.arch }} \
            bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-musl
      - uses: softprops/action-gh-release@v1
        with:
          draft: true
          files: node/dist/vectordb-linux*.tgz
          fail_on_unmatched_files: true
  release:
    needs: [rust, node, node-macos, node-linux]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v3
      - name: Publish to PyPI
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: |
          python -m twine upload --non-interactive \
            --skip-existing \
            --repository testpypi python/dist/*
      - name: Publish to NPM
        run: |
          for filename in node/dist/*.tgz; do
            npm publish --dry-run $filename
          done
      - name: Publish to crates.io
        env:
          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        run: |
          cargo publish --dry-run --no-verify rust/target/vectordb-*.crate
      # - uses: softprops/action-gh-release@v1
      #   with:
      #     draft: false
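
For reference, this pipeline is triggered by any `v*` tag push (normally created by the bump workflow above); a manual sketch:

```shell
# Assumes the version bump has already landed on main.
git tag v0.1.2
git push origin v0.1.2  # the v* tag push starts "Prepare Release"
```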

.gitignore vendored

@@ -4,6 +4,8 @@
**/__pycache__
.DS_Store
.vscode
rust/target
rust/Cargo.lock

Cargo.lock generated

@@ -1052,7 +1052,6 @@ dependencies = [
"paste",
"petgraph",
"rand",
"regex",
"uuid",
]
@@ -1646,9 +1645,9 @@ dependencies = [
[[package]]
name = "lance"
version = "0.4.17"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86dda8185bd1ffae7b910c1f68035af23be9b717c52e9cc4de176cd30b47f772"
checksum = "fc96cf89139af6f439a0e28ccd04ddf81be795b79fda3105b7a8952fadeb778e"
dependencies = [
"accelerate-src",
"arrow",
@@ -1685,7 +1684,6 @@ dependencies = [
"rand",
"reqwest",
"shellexpand",
"snafu",
"sqlparser-lance",
"tokio",
"url",
@@ -3358,13 +3356,12 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vectordb"
version = "0.0.1"
version = "0.1.2"
dependencies = [
"arrow-array",
"arrow-data",
"arrow-schema",
"lance",
"object_store",
"rand",
"tempfile",
"tokio",
@@ -3372,7 +3369,7 @@ dependencies = [
[[package]]
name = "vectordb-node"
version = "0.1.0"
version = "0.1.2"
dependencies = [
"arrow-array",
"arrow-ipc",


@@ -0,0 +1,86 @@
#!/bin/bash
# Builds the Linux artifacts (node binaries).
# Usage: ./build_linux_artifacts.sh [target]
# Targets supported:
# - x86_64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-musl
# - x86_64-unknown-linux-musl
# TODO: refactor this into a Docker container we can pull
set -e

setup_dependencies() {
    echo "Installing system dependencies..."
    if [[ $1 == *musl ]]; then
        # musllinux
        apk add openssl-dev
    else
        # manylinux2014
        yum install -y openssl-devel unzip
    fi

    if [[ $1 == x86_64* ]]; then
        ARCH=x86_64
    else
        # aarch64 target (protoc releases name it "aarch_64")
        ARCH=aarch_64
    fi

    # Install new enough protobuf (yum-provided is old)
    PB_REL=https://github.com/protocolbuffers/protobuf/releases
    PB_VERSION=23.1
    curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
    unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
}

install_node() {
    echo "Installing node..."
    curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
    source "$HOME"/.bashrc

    if [[ $1 == *musl ]]; then
        # The apk-provided node (`apk add nodejs-current npm`) is version 15,
        # and we need 16 or higher. nvm doesn't provide binaries for musl,
        # so we install from source:
        nvm install -s --no-progress 17
    else
        nvm install --no-progress 17 # latest that supports glibc 2.17
    fi
}

install_rust() {
    echo "Installing rust..."
    curl https://sh.rustup.rs -sSf | bash -s -- -y
    export PATH="$PATH:/root/.cargo/bin"
}

build_node_binary() {
    echo "Building node library for $1..."
    pushd node
    npm ci

    if [[ $1 == *musl ]]; then
        # This is needed for cargo to allow building cdylibs with musl
        export RUSTFLAGS="-C target-feature=-crt-static"
    fi

    # We don't pass in target, since the native target here already matches
    # and openblas-src doesn't do well with cross-compilation.
    npm run build-release
    npm run pack-build
    popd
}

TARGET=${1:-x86_64-unknown-linux-gnu}
# Others:
# aarch64-unknown-linux-gnu
# x86_64-unknown-linux-musl
# aarch64-unknown-linux-musl

setup_dependencies $TARGET
install_node $TARGET
install_rust
build_node_binary $TARGET
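
This script is meant to run inside a manylinux/musllinux container, as the release workflow above does; a minimal local invocation might look like:

```shell
# Build x86_64 glibc artifacts locally (mirrors the node-linux CI job).
docker run \
    -v $(pwd):/io -w /io \
    quay.io/pypa/manylinux2014_x86_64 \
    bash ci/build_linux_artifacts.sh x86_64-unknown-linux-gnu
```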


@@ -0,0 +1,38 @@
# Builds the macOS artifacts (node binaries).
# Usage: ./build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin

prebuild_rust() {
    # Building here for the sake of easier debugging.
    pushd rust/ffi/node
    for target in $1
    do
        echo "Building rust library for $target"
        export RUST_BACKTRACE=1
        cargo build --release --target $target
    done
    popd
}

build_node_binaries() {
    pushd node
    for target in $1
    do
        echo "Building node library for $target"
        npm run build-release -- --target $target
        npm run pack-build -- --target $target
    done
    popd
}

if [ -n "$1" ]; then
    targets=$1
else
    targets="x86_64-apple-darwin aarch64-apple-darwin"
fi

prebuild_rust $targets
build_node_binaries $targets
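
A typical local invocation, assuming the Rust targets were added beforehand (see ci/release_process.md):

```shell
rustup target add x86_64-apple-darwin aarch64-apple-darwin  # one-time setup
bash ci/build_macos_artifacts.sh aarch64-apple-darwin       # omit the arg to build both
```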

ci/release_process.md Normal file

@@ -0,0 +1,90 @@
# How to release

This is for the Rust crate and Node module. For now, the Python module is
released separately.

The release is started by bumping the versions and pushing a new tag. To do this
automatically, use the `make_release_commit` GitHub action.

When the tag is pushed, GitHub actions will start building the libraries and
will upload them to a draft release.

While those jobs are running, edit the release notes as needed. For example,
bring relevant new features and bugfixes to the top of the notes and move the
testing and CI changes to the bottom.

Once the jobs have finished, the release will be taken out of draft and the
artifacts will be published to crates.io, NPM, and PyPI.
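
Before starting, a pre-flight sanity check you can run locally (a sketch; it
mirrors the version check in the `make_release_commit` workflow):

```shell
# All three should print the same version.
(cd rust/vectordb && cargo get version)
(cd rust/ffi/node && cargo get version)
jq -r '.version' node/package.json
```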
## Manual process

You can also build the artifacts locally on a MacOS machine.

### Build the MacOS release libraries

One-time setup:

```shell
rustup target add x86_64-apple-darwin aarch64-apple-darwin
```

To build:

```shell
bash ci/build_macos_artifacts.sh
```
### Build the Linux release libraries

To build a Linux library, we need to use Docker with a different build script:

```shell
ARCH=aarch64
docker run \
    -v $(pwd):/io -w /io \
    quay.io/pypa/manylinux2014_$ARCH \
    bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-gnu
```

You can change `ARCH` to `x86_64`.

A similar command works for musl binaries (not yet working):

```shell
ARCH=aarch64
docker run \
    --user $(id -u) \
    -v $(pwd):/io -w /io \
    quay.io/pypa/musllinux_1_1_$ARCH \
    bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-musl
```
<!--
For debugging, use these snippets:

```shell
ARCH=aarch64
docker run -it \
    -v $(pwd):/io -w /io \
    quay.io/pypa/manylinux2014_$ARCH \
    bash
```

```shell
ARCH=aarch64
docker run -it \
    -v $(pwd):/io -w /io \
    quay.io/pypa/musllinux_1_1_$ARCH \
    bash
```

Note: musllinux_1_1 is Alpine Linux 3.12

To reproduce the musl build issue:

```shell
docker run \
    -v $(pwd):/io -w /io \
    quay.io/pypa/musllinux_1_1_aarch64 \
    bash alpine_repro.sh
```
-->


@@ -6,10 +6,9 @@ to make this available for JS as well.
## Installation
To use full text search, you must install optional dependency tantivy-py:
To use full text search, you must install the fts optional dependencies:
# tantivy 0.19.2
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
`pip install lancedb[fts]`
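
A quick way to verify the extra installed correctly (a sketch; `tantivy` is the
module the `fts` extra provides):

```shell
python -c "import tantivy"  # exits silently when the fts extra is installed
```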
## Quickstart

node/.npmignore Normal file

@@ -0,0 +1,2 @@
gen_test_data.py
index.node


@@ -8,6 +8,10 @@ A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb)
npm install vectordb
```
This will download the appropriate native library for your platform. We currently
support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
yet support Windows or musl-based Linux (such as Alpine Linux).
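
A quick smoke test that the right native package was pulled in (a sketch; `connect` is the entry point shown in the example below):

```bash
node -e "const lancedb = require('vectordb'); console.log(typeof lancedb.connect)"
# prints "function" when the platform-specific native module loads
```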
## Usage
### Basic Example
@@ -24,6 +28,19 @@ The [examples](./examples) folder contains complete examples.
## Development
Build and install the rust library with:
```bash
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
```
`npm run build` builds the Rust library, `npm run pack-build` packages the Rust
binary into an npm module called `@vectordb/<platform>` (for example,
`@vectordb/darwin-arm64`), and then `npm install --no-save ./dist/vectordb-*.tgz`
installs that module.
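
For illustration, after `npm run pack-build` the platform package lands in `dist/`; the exact file name below is illustrative:

```bash
ls dist/
# e.g. vectordb-darwin-arm64-0.1.2.tgz  (illustrative name)
```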
The LanceDB JavaScript library itself is built with npm:
```bash


@@ -12,29 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
const { currentTarget } = require('@neon-rs/load');
let nativeLib;
function getPlatformLibrary() {
if (process.platform === "darwin" && process.arch == "arm64") {
return require('./aarch64-apple-darwin.node');
} else if (process.platform === "darwin" && process.arch == "x64") {
return require('./x86_64-apple-darwin.node');
} else if (process.platform === "linux" && process.arch == "x64") {
return require('./x86_64-unknown-linux-gnu.node');
} else {
throw new Error(`vectordb: unsupported platform ${process.platform}_${process.arch}. Please file a bug report at https://github.com/lancedb/lancedb/issues`)
}
}
try {
nativeLib = require('./index.node')
nativeLib = require(`@vectordb/${currentTarget()}`);
} catch (e) {
if (e.code === "MODULE_NOT_FOUND") {
nativeLib = getPlatformLibrary();
} else {
throw new Error('vectordb: failed to load native library. Please file a bug report at https://github.com/lancedb/lancedb/issues');
}
throw new Error(`vectordb: failed to load native library.
You may need to run \`npm install @vectordb/${currentTarget()}\`.
If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
Source error: ${e}`);
}
module.exports = nativeLib
// Dynamic require for runtime.
module.exports = nativeLib;

node/package-lock.json generated

@@ -7,12 +7,26 @@
"": {
"name": "vectordb",
"version": "0.1.1",
"cpu": [
"x64",
"arm64"
],
"license": "Apache-2.0",
"os": [
"darwin",
"linux"
],
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
"@neon-rs/load": "^0.0.74",
"@vectordb/darwin-arm64": "0.1.1",
"@vectordb/darwin-x64": "0.1.1",
"@vectordb/linux-x64-gnu": "0.1.1",
"@vectordb/linux-x64-musl": "0.1.1",
"apache-arrow": "^12.0.0"
},
"devDependencies": {
"@neon-rs/cli": "^0.0.74",
"@types/chai": "^4.3.4",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
@@ -30,6 +44,12 @@
"ts-node": "^10.9.1",
"ts-node-dev": "^2.0.0",
"typescript": "*"
},
"optionalDependencies": {
"@vectordb/darwin-arm64": "0.1.1",
"@vectordb/darwin-x64": "0.1.1",
"@vectordb/linux-x64-gnu": "0.1.1",
"@vectordb/linux-x64-musl": "0.1.1"
}
},
"node_modules/@apache-arrow/ts": {
@@ -197,6 +217,20 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@neon-rs/cli": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
"integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
"dev": true,
"bin": {
"neon": "index.js"
}
},
"node_modules/@neon-rs/load": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
"integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
},
"node_modules/@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -4191,6 +4225,17 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"@neon-rs/cli": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
"integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
"dev": true
},
"@neon-rs/load": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
"integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
},
"@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",


@@ -1,15 +1,18 @@
{
"name": "vectordb",
"version": "0.1.1",
"version": "0.1.2",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json-render-diagnostics",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release",
"cross-release": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cross build --message-format=json --release -p vectordb-node",
"test": "mocha -recursive dist/test",
"lint": "eslint src --ext .js,.ts"
"lint": "eslint src --ext .js,.ts",
"pack-build": "neon pack-build",
"check-npm": "printenv && which node && which npm && npm --version"
},
"repository": {
"type": "git",
@@ -24,6 +27,7 @@
"author": "Lance Devs",
"license": "Apache-2.0",
"devDependencies": {
"@neon-rs/cli": "^0.0.74",
"@types/chai": "^4.3.4",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
@@ -44,6 +48,33 @@
},
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
"@neon-rs/load": "^0.0.74",
"apache-arrow": "^12.0.0"
},
"os": [
"darwin",
"linux"
],
"cpu": [
"x64",
"arm64"
],
"neon": {
"targets": {
"x86_64-apple-darwin": "@vectordb/darwin-x64",
"aarch64-apple-darwin": "@vectordb/darwin-arm64",
"x86_64-unknown-linux-gnu": "@vectordb/linux-x64-gnu",
"x86_64-unknown-linux-musl": "@vectordb/linux-x64-musl",
"aarch64-unknown-linux-gnu": "@vectordb/linux-arm64-gnu",
"aarch64-unknown-linux-musl": "@vectordb/linux-arm64-musl"
}
},
"optionalDependencies": {
"@vectordb/darwin-arm64": "0.1.2",
"@vectordb/darwin-x64": "0.1.2",
"@vectordb/linux-x64-gnu": "0.1.2",
"@vectordb/linux-x64-musl": "0.1.2",
"@vectordb/linux-arm64-gnu": "0.1.2",
"@vectordb/linux-arm64-musl": "0.1.2"
}
}


@@ -15,16 +15,15 @@
import {
Field,
Float32,
List, type ListBuilder,
List,
makeBuilder,
RecordBatchFileWriter,
Table, Utf8,
type Vector,
vectorFromArray
} from 'apache-arrow'
import { type EmbeddingFunction } from './index'
export function convertToTable<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Table {
export function convertToTable (data: Array<Record<string, unknown>>): Table {
if (data.length === 0) {
throw new Error('At least one record needs to be provided')
}
@@ -34,7 +33,11 @@ export function convertToTable<T> (data: Array<Record<string, unknown>>, embeddi
for (const columnsKey of columns) {
if (columnsKey === 'vector') {
const listBuilder = newVectorListBuilder()
const children = new Field<Float32>('item', new Float32())
const list = new List(children)
const listBuilder = makeBuilder({
type: list
})
const vectorSize = (data[0].vector as any[]).length
for (const datum of data) {
if ((datum[columnsKey] as any[]).length !== vectorSize) {
@@ -49,14 +52,6 @@ export function convertToTable<T> (data: Array<Record<string, unknown>>, embeddi
for (const datum of data) {
values.push(datum[columnsKey])
}
if (columnsKey === embeddings?.sourceColumn) {
const vectors = embeddings.embed(values as T[])
const listBuilder = newVectorListBuilder()
vectors.map(v => listBuilder.append(v))
records.vector = listBuilder.finish().toVector()
}
if (typeof values[0] === 'string') {
// `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column
records[columnsKey] = vectorFromArray(values, new Utf8())
@@ -69,17 +64,8 @@ export function convertToTable<T> (data: Array<Record<string, unknown>>, embeddi
return new Table(records)
}
// Creates a new Arrow ListBuilder that stores a Vector column
function newVectorListBuilder (): ListBuilder<Float32, any> {
const children = new Field<Float32>('item', new Float32())
const list = new List(children)
return makeBuilder({
type: list
})
}
export async function fromRecordsToBuffer<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Buffer> {
const table = convertToTable(data, embeddings)
export async function fromRecordsToBuffer (data: Array<Record<string, unknown>>): Promise<Buffer> {
const table = convertToTable(data)
const writer = RecordBatchFileWriter.writeAll(table)
return Buffer.from(await writer.toUint8Array())
}


@@ -28,8 +28,7 @@ const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSe
* @param uri The uri of the database.
*/
export async function connect (uri: string): Promise<Connection> {
const db = await databaseNew(uri)
return new Connection(db, uri)
return new Connection(uri)
}
/**
@@ -39,9 +38,9 @@ export class Connection {
private readonly _uri: string
private readonly _db: any
constructor (db: any, uri: string) {
constructor (uri: string) {
this._uri = uri
this._db = db
this._db = databaseNew(uri)
}
get uri (): string {
@@ -56,50 +55,17 @@ export class Connection {
}
/**
* Open a table in the database.
*
* @param name The name of the table.
*/
async openTable (name: string): Promise<Table>
/**
* Open a table in the database.
*
* @param name The name of the table.
* @param embeddings An embedding function to use on this Table
*/
async openTable<T> (name: string, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
async openTable<T> (name: string, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
* Open a table in the database.
* @param name The name of the table.
*/
async openTable (name: string): Promise<Table> {
const tbl = await databaseOpenTable.call(this._db, name)
if (embeddings !== undefined) {
return new Table(tbl, name, embeddings)
} else {
return new Table(tbl, name)
}
return new Table(tbl, name)
}
/**
* Creates a new Table and initialize it with new data.
*
* @param name The name of the table.
* @param data Non-empty Array of Records to be inserted into the Table
*/
async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table>
/**
* Creates a new Table and initialize it with new data.
*
* @param name The name of the table.
* @param data Non-empty Array of Records to be inserted into the Table
* @param embeddings An embedding function to use on this Table
*/
async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
const tbl = await tableCreate.call(this._db, name, await fromRecordsToBuffer(data, embeddings))
if (embeddings !== undefined) {
return new Table(tbl, name, embeddings)
} else {
return new Table(tbl, name)
}
async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table> {
await tableCreate.call(this._db, name, await fromRecordsToBuffer(data))
return await this.openTable(name)
}
async createTableArrow (name: string, table: ArrowTable): Promise<Table> {
@@ -109,22 +75,16 @@ export class Connection {
}
}
export class Table<T = number[]> {
/**
* A table in a LanceDB database.
*/
export class Table {
private readonly _tbl: any
private readonly _name: string
private readonly _embeddings?: EmbeddingFunction<T>
constructor (tbl: any, name: string)
/**
* @param tbl
* @param name
* @param embeddings An embedding function to use when interacting with this table
*/
constructor (tbl: any, name: string, embeddings: EmbeddingFunction<T>)
constructor (tbl: any, name: string, embeddings?: EmbeddingFunction<T>) {
constructor (tbl: any, name: string) {
this._tbl = tbl
this._name = name
this._embeddings = embeddings
}
get name (): string {
@@ -132,16 +92,10 @@ export class Table<T = number[]> {
}
/**
* Creates a search query to find the nearest neighbors of the given search term
* @param query The query search term
*/
search (query: T): Query {
let queryVector: number[]
if (this._embeddings !== undefined) {
queryVector = this._embeddings.embed([query])[0]
} else {
queryVector = query as number[]
}
* Create a search query to find the nearest neighbors of the given query vector.
* @param queryVector The query vector.
*/
search (queryVector: number[]): Query {
return new Query(this._tbl, queryVector)
}
@@ -152,7 +106,7 @@ export class Table<T = number[]> {
* @return The number of rows added to the table
*/
async add (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Append.toString())
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Append.toString())
}
/**
@@ -162,14 +116,9 @@ export class Table<T = number[]> {
* @return The number of rows added to the table
*/
async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Overwrite.toString())
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Overwrite.toString())
}
/**
* Create an ANN index on this Table vector index.
*
* @param indexParams The parameters of this Index, @see VectorIndexParams.
*/
async create_index (indexParams: VectorIndexParams): Promise<any> {
return tableCreateVectorIndex.call(this._tbl, indexParams)
}
@@ -319,21 +268,6 @@ export enum WriteMode {
Append = 'append'
}
/**
* An embedding function that automatically creates vector representation for a given column.
*/
export interface EmbeddingFunction<T> {
/**
* The name of the column that will be used as input for the Embedding Function.
*/
sourceColumn: string
/**
* Creates a vector representation for the given values.
*/
embed: (data: T[]) => number[][]
}
/**
* Distance metrics type.
*/


@@ -1,52 +0,0 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// IO tests
import { describe } from 'mocha'
import { assert } from 'chai'
import * as lancedb from '../index'
describe('LanceDB S3 client', function () {
if (process.env.TEST_S3_BASE_URL != null) {
const baseUri = process.env.TEST_S3_BASE_URL
it('should have a valid url', async function () {
const uri = `${baseUri}/valid_url`
const table = await createTestDB(uri, 2, 20)
const con = await lancedb.connect(uri)
assert.equal(con.uri, uri)
const results = await table.search([0.1, 0.3]).limit(5).execute()
assert.equal(results.length, 5)
})
} else {
describe.skip('Skip S3 test', function () {})
}
})
async function createTestDB (uri: string, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
const con = await lancedb.connect(uri)
const data = []
for (let i = 0; i < numRows; i++) {
const vector = []
for (let j = 0; j < numDimensions; j++) {
vector.push(i + (j * 0.1))
}
data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
}
return await con.createTable('vectors', data)
}


@@ -17,7 +17,7 @@ import { assert } from 'chai'
import { track } from 'temp'
import * as lancedb from '../index'
import { type EmbeddingFunction, MetricType, Query } from '../index'
import { MetricType, Query } from '../index'
describe('LanceDB client', function () {
describe('when creating a connection to lancedb', function () {
@@ -140,39 +140,6 @@ describe('LanceDB client', function () {
await table.create_index({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2 })
}).timeout(10_000) // Timeout is high partially because GH macos runner is pretty slow
})
describe('when using a custom embedding function', function () {
class TextEmbedding implements EmbeddingFunction<string> {
sourceColumn: string
constructor (targetColumn: string) {
this.sourceColumn = targetColumn
}
_embedding_map = new Map<string, number[]>([
['foo', [2.1, 2.2]],
['bar', [3.1, 3.2]]
])
embed (data: string[]): number[][] {
return data.map(datum => this._embedding_map.get(datum) ?? [0.0, 0.0])
}
}
it('should encode the original data into embeddings', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const embeddings = new TextEmbedding('name')
const data = [
{ price: 10, name: 'foo' },
{ price: 50, name: 'bar' }
]
const table = await con.createTable('vectors', data, embeddings)
const results = await table.search('foo').execute()
assert.equal(results.length, 2)
})
})
})
describe('Query object', function () {


@@ -16,13 +16,7 @@ import os
from typing import List, Tuple
import pyarrow as pa
try:
import tantivy
except ImportError:
raise ImportError(
"Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
)
import tantivy
from .table import LanceTable


@@ -153,7 +153,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
import tantivy
except ImportError:
raise ImportError(
"Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
"You need to install the `lancedb[fts]` extra to use this method."
)
from .fts import search_index


@@ -1,7 +1,7 @@
[project]
name = "lancedb"
version = "0.1.4"
dependencies = ["pylance>=0.4.17", "ratelimiter", "retry", "tqdm"]
version = "0.1.2"
dependencies = ["pylance>=0.4.6", "ratelimiter", "retry", "tqdm"]
description = "lancedb"
authors = [
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
@@ -45,6 +45,10 @@ dev = [
docs = [
"mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"
]
fts = [
# tantivy 0.19.2
"tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985"
]
[build-system]
requires = [


@@ -14,6 +14,7 @@ import sys
import numpy as np
import pyarrow as pa
from lancedb.embeddings import with_embeddings


@@ -13,13 +13,13 @@
import os
import random
import lancedb.fts
import numpy as np
import pandas as pd
import pytest
import tantivy
import lancedb as ldb
import lancedb.fts
@pytest.fixture


@@ -17,6 +17,7 @@ import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest
from lancedb.query import LanceQueryBuilder


@@ -16,6 +16,7 @@ from pathlib import Path
import pandas as pd
import pyarrow as pa
import pytest
from lancedb.table import LanceTable


@@ -1,6 +1,6 @@
[package]
name = "vectordb-node"
version = "0.1.0"
version = "0.1.2"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
edition = "2018"
@@ -15,7 +15,7 @@ arrow-ipc = "37.0"
arrow-schema = "37.0"
once_cell = "1"
futures = "0.3"
lance = "0.4.17"
lance = "0.4.3"
vectordb = { path = "../../vectordb" }
tokio = { version = "1.23", features = ["rt-multi-thread"] }
neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }


@@ -39,7 +39,7 @@ pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsP
let add_result = table
.lock()
.unwrap()
.create_index(&index_params_builder)
.create_idx(&index_params_builder)
.await;
deferred.settle_with(&channel, move |mut cx| {


@@ -56,46 +56,23 @@ fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
RUNTIME.get_or_try_init(|| Runtime::new().or_else(|err| cx.throw_error(err.to_string())))
}
fn database_new(mut cx: FunctionContext) -> JsResult<JsPromise> {
fn database_new(mut cx: FunctionContext) -> JsResult<JsBox<JsDatabase>> {
let path = cx.argument::<JsString>(0)?.value(&mut cx);
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let (deferred, promise) = cx.promise();
rt.spawn(async move {
let database = Database::connect(&path).await;
deferred.settle_with(&channel, move |mut cx| {
let db = JsDatabase {
database: Arc::new(database.or_else(|err| cx.throw_error(err.to_string()))?),
};
Ok(cx.boxed(db))
});
});
Ok(promise)
let db = JsDatabase {
database: Arc::new(Database::connect(path).or_else(|err| cx.throw_error(err.to_string()))?),
};
Ok(cx.boxed(db))
}
fn database_table_names(mut cx: FunctionContext) -> JsResult<JsPromise> {
fn database_table_names(mut cx: FunctionContext) -> JsResult<JsArray> {
let db = cx
.this()
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let database = db.database.clone();
rt.spawn(async move {
let tables_rst = database.table_names().await;
deferred.settle_with(&channel, move |mut cx| {
let tables = tables_rst.or_else(|err| cx.throw_error(err.to_string()))?;
let table_names = convert::vec_str_to_array(&tables, &mut cx);
table_names
});
});
Ok(promise)
let tables = db
.database
.table_names()
.or_else(|err| cx.throw_error(err.to_string()))?;
convert::vec_str_to_array(&tables, &mut cx)
}
fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
@@ -110,7 +87,7 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
let (deferred, promise) = cx.promise();
rt.spawn(async move {
let table_rst = database.open_table(&table_name).await;
let table_rst = database.open_table(table_name).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(
@@ -209,7 +186,7 @@ fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
rt.block_on(async move {
let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(batches));
let table_rst = database.create_table(&table_name, batch_reader).await;
let table_rst = database.create_table(table_name, batch_reader).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(


@@ -1,6 +1,6 @@
[package]
name = "vectordb"
version = "0.0.1"
version = "0.1.2"
edition = "2021"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
@@ -12,9 +12,7 @@ repository = "https://github.com/lancedb/lancedb"
arrow-array = "37.0"
arrow-data = "37.0"
arrow-schema = "37.0"
object_store = "0.5.6"
lance = "0.4.17"
lance = "0.4.3"
tokio = { version = "1.23", features = ["rt-multi-thread"] }
[dev-dependencies]


@@ -12,19 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fs::create_dir_all;
use std::path::Path;
use arrow_array::RecordBatchReader;
use lance::io::object_store::ObjectStore;
use std::fs::create_dir_all;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::error::Result;
use crate::table::Table;
pub struct Database {
object_store: ObjectStore,
pub(crate) uri: String,
pub(crate) path: Arc<PathBuf>,
}
const LANCE_EXTENSION: &str = "lance";
@@ -40,17 +37,12 @@ impl Database {
/// # Returns
///
/// * A [Database] object.
pub async fn connect(uri: &str) -> Result<Database> {
let object_store = ObjectStore::new(uri).await?;
if object_store.is_local() {
let path = Path::new(uri);
if !path.try_exists()? {
create_dir_all(&path)?;
}
pub fn connect<P: AsRef<Path>>(path: P) -> Result<Database> {
if !path.as_ref().try_exists()? {
create_dir_all(&path)?;
}
Ok(Database {
uri: uri.to_string(),
object_store,
path: Arc::new(path.as_ref().to_path_buf()),
})
}
@@ -59,13 +51,12 @@ impl Database {
/// # Returns
///
/// * A [Vec<String>] with all table names.
pub async fn table_names(&self) -> Result<Vec<String>> {
pub fn table_names(&self) -> Result<Vec<String>> {
let f = self
.object_store
.read_dir("/")
.await?
.iter()
.map(|fname| Path::new(fname))
.path
.read_dir()?
.flatten()
.map(|dir_entry| dir_entry.path())
.filter(|path| {
let is_lance = path
.extension()
@@ -85,10 +76,10 @@ impl Database {
pub async fn create_table(
&self,
name: &str,
name: String,
batches: Box<dyn RecordBatchReader>,
) -> Result<Table> {
Table::create(&self.uri, name, batches).await
Table::create(self.path.clone(), name, batches).await
}
/// Open a table in the database.
@@ -99,8 +90,8 @@ impl Database {
/// # Returns
///
/// * A [Table] object.
pub async fn open_table(&self, name: &str) -> Result<Table> {
Table::open(&self.uri, name).await
pub async fn open_table(&self, name: String) -> Result<Table> {
Table::open(self.path.clone(), name).await
}
}
@@ -114,10 +105,10 @@ mod tests {
#[tokio::test]
async fn test_connect() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let db = Database::connect(uri).await.unwrap();
let path_buf = tmp_dir.into_path();
let db = Database::connect(&path_buf);
assert_eq!(db.uri, uri);
assert_eq!(db.unwrap().path.as_path(), path_buf.as_path())
}
#[tokio::test]
@@ -127,16 +118,10 @@ mod tests {
create_dir_all(tmp_dir.path().join("table2.lance")).unwrap();
create_dir_all(tmp_dir.path().join("invalidlance")).unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let db = Database::connect(uri).await.unwrap();
let tables = db.table_names().await.unwrap();
let db = Database::connect(&tmp_dir.into_path()).unwrap();
let tables = db.table_names().unwrap();
assert_eq!(tables.len(), 2);
assert!(tables.contains(&String::from("table1")));
assert!(tables.contains(&String::from("table2")));
}
#[tokio::test]
async fn test_connect_s3() {
// let db = Database::connect("s3://bucket/path/to/database").await.unwrap();
}
}


@@ -41,15 +41,3 @@ impl From<lance::Error> for Error {
Self::Lance(e.to_string())
}
}
impl From<object_store::Error> for Error {
fn from(e: object_store::Error) -> Self {
Self::IO(e.to_string())
}
}
impl From<object_store::path::Error> for Error {
fn from(e: object_store::path::Error) -> Self {
Self::IO(e.to_string())
}
}


@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
@@ -24,21 +24,16 @@ use crate::index::vector::VectorIndexBuilder;
use crate::query::Query;
pub const VECTOR_COLUMN_NAME: &str = "vector";
pub const LANCE_FILE_EXTENSION: &str = "lance";
/// A table in a LanceDB database.
pub struct Table {
name: String,
uri: String,
path: String,
dataset: Arc<Dataset>,
}
impl std::fmt::Display for Table {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Table({})", self.name)
}
}
impl Table {
/// Opens an existing Table
///
@@ -50,21 +45,18 @@ impl Table {
/// # Returns
///
/// * A [Table] object.
pub async fn open(base_uri: &str, name: &str) -> Result<Self> {
let path = Path::new(base_uri);
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let uri = table_uri
.as_path()
pub async fn open(base_path: Arc<PathBuf>, name: String) -> Result<Self> {
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let ds_uri = ds_path
.to_str()
.ok_or(Error::IO(format!("Invalid table name: {}", name)))?;
let dataset = Dataset::open(&uri).await?;
Ok(Table {
name: name.to_string(),
uri: uri.to_string(),
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
let dataset = Dataset::open(ds_uri).await?;
let table = Table {
name,
path: ds_uri.to_string(),
dataset: Arc::new(dataset),
})
};
Ok(table)
}
/// Creates a new Table
@@ -79,28 +71,25 @@ impl Table {
///
/// * A [Table] object.
pub async fn create(
base_uri: &str,
name: &str,
base_path: Arc<PathBuf>,
name: String,
mut batches: Box<dyn RecordBatchReader>,
) -> Result<Self> {
let base_path = Path::new(base_uri);
let table_uri = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let uri = table_uri
.as_path()
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let path = ds_path
.to_str()
.ok_or(Error::IO(format!("Invalid table name: {}", name)))?
.to_string();
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
let dataset =
Arc::new(Dataset::write(&mut batches, &uri, Some(WriteParams::default())).await?);
Arc::new(Dataset::write(&mut batches, path, Some(WriteParams::default())).await?);
Ok(Table {
name: name.to_string(),
uri,
name,
path: path.to_string(),
dataset,
})
}
/// Create index on the table.
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
pub async fn create_idx(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
let dataset = self
@@ -136,7 +125,8 @@ impl Table {
let mut params = WriteParams::default();
params.mode = write_mode.unwrap_or(WriteMode::Append);
self.dataset = Arc::new(Dataset::write(&mut batches, &self.uri, Some(params)).await?);
self.dataset =
Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
Ok(batches.count())
}
@@ -161,8 +151,6 @@ impl Table {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_array::{
Array, FixedSizeListArray, Float32Array, Int32Array, RecordBatch, RecordBatchReader,
};
@@ -173,52 +161,53 @@ mod tests {
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use rand::Rng;
use std::sync::Arc;
use tempfile::tempdir;
use super::*;
use crate::error::Result;
use crate::index::vector::IvfPQIndexBuilder;
use crate::table::Table;
#[tokio::test]
async fn test_new_table_not_exists() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let path_buf = tmp_dir.into_path();
let table = Table::open(&uri, "test").await;
let table = Table::open(Arc::new(path_buf), "test".to_string()).await;
assert!(table.is_err());
}
#[tokio::test]
async fn test_open() {
let tmp_dir = tempdir().unwrap();
let dataset_path = tmp_dir.path().join("test.lance");
let uri = tmp_dir.path().to_str().unwrap();
let path_buf = tmp_dir.into_path();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
Dataset::write(
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await
.unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string())
.await
.unwrap();
let table = Table::open(uri, "test").await.unwrap();
assert_eq!(table.name, "test")
}
#[test]
fn test_object_store_path() {
use std::path::Path as StdPath;
let p = StdPath::new("s3://bucket/path/to/file");
let c = p.join("subfile");
assert_eq!(c.to_str().unwrap(), "s3://bucket/path/to/file/subfile");
}
#[tokio::test]
async fn test_add() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let path_buf = tmp_dir.into_path();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(&uri, "test", batches).await.unwrap();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
.await
.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> =
@@ -236,11 +225,13 @@ mod tests {
#[tokio::test]
async fn test_add_overwrite() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let path_buf = tmp_dir.into_path();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(uri, "test", batches).await.unwrap();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
.await
.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> =
@@ -261,16 +252,21 @@ mod tests {
#[tokio::test]
async fn test_search() {
let tmp_dir = tempdir().unwrap();
let dataset_path = tmp_dir.path().join("test.lance");
let uri = tmp_dir.path().to_str().unwrap();
let path_buf = tmp_dir.into_path();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
Dataset::write(
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await
.unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string())
.await
.unwrap();
let table = Table::open(uri, "test").await.unwrap();
let vector = Float32Array::from_iter_values([0.1, 0.2]);
let query = table.search(vector.clone());
assert_eq!(vector, query.query_vector);
@@ -295,7 +291,7 @@ mod tests {
use arrow_array::Float32Array;
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let path_buf = tmp_dir.into_path();
let dimension = 16;
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
@@ -322,7 +318,9 @@ mod tests {
.unwrap()]);
let reader: Box<dyn RecordBatchReader + Send> = Box::new(batches);
let mut table = Table::create(uri, "test", reader).await.unwrap();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), reader)
.await
.unwrap();
let mut i = IvfPQIndexBuilder::new();
@@ -332,7 +330,7 @@ mod tests {
.ivf_params(IvfBuildParams::new(256))
.pq_params(PQBuildParams::default());
table.create_index(index_builder).await.unwrap();
table.create_idx(index_builder).await.unwrap();
assert_eq!(table.dataset.load_indices().await.unwrap().len(), 1);
assert_eq!(table.count_rows().await.unwrap(), 512);