Compare commits

..

5 Commits

Author SHA1 Message Date
Chang She
2b26775ed1 python v0.1.4 2023-05-31 20:11:25 -07:00
Lei Xu
306ada5cb8 Support S3 and GCS from typescript SDK (#106) 2023-05-30 21:32:17 -07:00
gsilvestrin
d3aa8bfbc5 add embedding functions to the nodejs client (#95) 2023-05-26 18:09:20 -07:00
Chang She
04d97347d7 move tantivy-py installation to be separate from wheel (#97)
pypi does not allow packages to be uploaded that has a direct reference

for now we'll just ask the user to install tantivy separately

---------

Co-authored-by: Chang She <chang@lancedb.com>
2023-05-25 17:57:26 -06:00
Chang She
22aa8a93c2 bump version for v0.1.3 2023-05-25 17:01:52 -06:00
33 changed files with 406 additions and 743 deletions

View File

@@ -67,10 +67,8 @@ jobs:
- name: Build
run: |
npm ci
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
npm run tsc
- name: Test
run: npm run test
macos:
@@ -96,10 +94,8 @@ jobs:
- name: Build
run: |
npm ci
npm run tsc
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
npm run tsc
- name: Test
run: |
npm run test

View File

@@ -30,7 +30,8 @@ jobs:
python-version: 3.${{ matrix.python-minor-version }}
- name: Install lancedb
run: |
pip install -e ".[fts]"
pip install -e .
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install pytest
- name: Run tests
run: pytest -x -v --durations=30 tests
@@ -52,7 +53,8 @@ jobs:
python-version: "3.11"
- name: Install lancedb
run: |
pip install -e ".[fts]"
pip install -e .
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install pytest
- name: Run tests
run: pytest -x -v --durations=30 tests

View File

@@ -1,194 +0,0 @@
name: Prepare Release
# Based on https://github.com/dherman/neon-prebuild-example/blob/eaa4d33d682e5eb7abbc3da7aed153a1b1acb1b3/.github/workflows/publish.yml
on:
push:
tags:
- v*
jobs:
draft-release:
runs-on: ubuntu-latest
steps:
- uses: softprops/action-gh-release@v1
with:
draft: true
prerelease: true # hardcoded on for now
generate_release_notes: true
rust:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: rust/vectordb
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Package Rust
run: cargo package --all-features
- uses: softprops/action-gh-release@v1
with:
draft: true
files: target/package/vectordb-*.crate
fail_on_unmatched_files: true
python:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Build wheel
run: |
pip install wheel
python setup.py sdist bdist_wheel
- uses: softprops/action-gh-release@v1
with:
draft: true
files: |
python/dist/lancedb-*.tar.gz
python/dist/lancedb-*.whl
fail_on_unmatched_files: true
node:
runs-on: ubuntu-latest
needs: draft-release
defaults:
run:
shell: bash
working-directory: node
steps:
- name: Checkout
uses: actions/checkout@v2
- uses: actions/setup-node@v3
with:
node-version: 20
cache: 'npm'
cache-dependency-path: node/package-lock.json
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Build
run: |
npm ci
npm run tsc
npm pack
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/vectordb-*.tgz
fail_on_unmatched_files: true
node-macos:
runs-on: macos-12
needs: draft-release
strategy:
fail-fast: false
matrix:
target: [x86_64-apple-darwin, aarch64-apple-darwin]
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install system dependencies
run: brew install protobuf
- name: Install npm dependencies
run: |
cd node
npm ci
- name: Build MacOS native node modules
run: bash ci/build_macos_artifacts.sh ${{ matrix.target }}
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/dist/vectordb-darwin*.tgz
fail_on_unmatched_files: true
node-linux:
name: node-linux (${{ matrix.arch}}-unknown-linux-${{ matrix.libc }})
runs-on: ubuntu-latest
needs: draft-release
strategy:
fail-fast: false
matrix:
libc:
- gnu
# TODO: re-enable musl once we have refactored to pre-built containers
# Right now we have to build node from source which is too expensive.
# - musl
arch:
- x86_64
- aarch64
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up QEMU
if: ${{ matrix.arch == 'aarch64' }}
uses: docker/setup-qemu-action@v2
with:
platforms: arm64
- name: Build Linux GNU native node modules
if: ${{ matrix.libc == 'gnu' }}
run: |
docker run \
-v $(pwd):/io -w /io \
quay.io/pypa/manylinux2014_${{ matrix.arch }} \
bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-gnu
- name: Build musl Linux native node modules
if: ${{ matrix.libc == 'musl' }}
run: |
docker run --platform linux/arm64/v8 \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_${{ matrix.arch }} \
bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-musl
- uses: softprops/action-gh-release@v1
with:
draft: true
files: node/dist/vectordb-linux*.tgz
fail_on_unmatched_files: true
release:
needs: [python, node, node-macos, node-linux, rust]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- name: Publish to PyPI
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
run: |
python -m twine upload --non-interactive \
--skip-existing \
--repository testpypi python/dist/*
- name: Publish to NPM
run: |
for filename in node/dist/*.tgz; do
npm publish --dry-run $filename
done
- name: Publish to crates.io
env:
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
run: |
cargo publish --dry-run --no-verify rust/target/vectordb-*.crate
# - uses: softprops/action-gh-release@v1
# with:
# draft: false

2
.gitignore vendored
View File

@@ -4,8 +4,6 @@
**/__pycache__
.DS_Store
.vscode
rust/target
rust/Cargo.lock

11
Cargo.lock generated
View File

@@ -1052,6 +1052,7 @@ dependencies = [
"paste",
"petgraph",
"rand",
"regex",
"uuid",
]
@@ -1645,9 +1646,9 @@ dependencies = [
[[package]]
name = "lance"
version = "0.4.12"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc96cf89139af6f439a0e28ccd04ddf81be795b79fda3105b7a8952fadeb778e"
checksum = "86dda8185bd1ffae7b910c1f68035af23be9b717c52e9cc4de176cd30b47f772"
dependencies = [
"accelerate-src",
"arrow",
@@ -1684,6 +1685,7 @@ dependencies = [
"rand",
"reqwest",
"shellexpand",
"snafu",
"sqlparser-lance",
"tokio",
"url",
@@ -3356,12 +3358,13 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vectordb"
version = "0.1.2"
version = "0.0.1"
dependencies = [
"arrow-array",
"arrow-data",
"arrow-schema",
"lance",
"object_store",
"rand",
"tempfile",
"tokio",
@@ -3369,7 +3372,7 @@ dependencies = [
[[package]]
name = "vectordb-node"
version = "0.1.2"
version = "0.1.0"
dependencies = [
"arrow-array",
"arrow-ipc",

View File

@@ -1,39 +0,0 @@
# These make sure our builds are compatible with old glibc versions.
[target.x86_64-unknown-linux-gnu]
pre-build = [
# Install newer gfortran
"yum install -y openssl-devel unzip gcc-gfortran",
"scl enable devtoolset-11 bash",
# protobuf is too old, so we directly download binaries
"PB_REL=https://github.com/protocolbuffers/protobuf/releases",
"PB_VERSION=23.1",
"curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-x86_64.zip",
"unzip protoc-$PB_VERSION-linux-x86_64.zip -d /usr/local",
]
image = "ghcr.io/cross-rs/x86_64-unknown-linux-gnu:main-centos"
[target.aarch64-unknown-linux-gnu]
pre-build = [
"yum install -y openssl-devel unzip",
# protobuf is too old, so we directly download binaries
"PB_REL=https://github.com/protocolbuffers/protobuf/releases",
"PB_VERSION=23.1",
"curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-x86_64.zip",
"unzip protoc-$PB_VERSION-linux-x86_64.zip -d /usr/local",
]
# https://github.com/cross-rs/cross/blob/main/docker/Dockerfile.aarch64-unknown-linux-gnu.centos
image = "ghcr.io/cross-rs/aarch64-unknown-linux-gnu:main-centos"
[target.x86_64-unknown-linux-musl]
# https://github.com/cross-rs/cross/blob/main/docker/Dockerfile.x86_64-unknown-linux-musl
pre-build = [
"dpkg --add-architecture $CROSS_DEB_ARCH",
"apt-get update && apt-get install --assume-yes libssl-dev:$CROSS_DEB_ARCH",
]
[target.aarch64-unknown-linux-musl]
# https://github.com/cross-rs/cross/blob/main/docker/Dockerfile.aarch64-unknown-linux-musl
pre-build = [
"dpkg --add-architecture $CROSS_DEB_ARCH",
"apt-get update && apt-get install --assume-yes libssl-dev:$CROSS_DEB_ARCH",
]

View File

@@ -1,95 +0,0 @@
#!/bin/bash
# Builds the Linux artifacts (node binaries).
# Usage: ./build_linux_artifacts.sh [target]
# Targets supported:
# - x86_64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-gnu:centos
# - aarch64-unknown-linux-musl
# - x86_64-unknown-linux-musl
# TODO: refactor this into a Docker container we can pull
set -e
setup_dependencies() {
echo "Installing system dependencies..."
if [[ $1 == *musl ]]; then
# musllinux
apk add openssl-dev
else
# manylinux2014
yum install -y openssl-devel unzip
fi
if [[ $1 == x86_64* ]]; then
ARCH=x86_64
else
# gnu target
ARCH=aarch_64
fi
# Install new enough protobuf (yum-provided is old)
PB_REL=https://github.com/protocolbuffers/protobuf/releases
PB_VERSION=23.1
curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
}
install_node() {
echo "Installing node..."
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
source "$HOME"/.bashrc
if [[ $1 == *musl ]]; then
# This node version is 15, we need 16 or higher:
# apk add nodejs-current npm
# So instead we install from source (nvm doesn't provide binaries for musl):
nvm install -s 17
else
nvm install 17 # latest that supports glibc 2.17
fi
printenv
echo "Node version:"
npm --version
which npm
which node
}
install_rust() {
echo "Installing rust..."
curl https://sh.rustup.rs -sSf | bash -s -- -y
printenv
export PATH="$PATH:/root/.cargo/bin"
printenv
}
build_node_binary() {
echo "Building node library for $1..."
pushd node
if [[ $1 == *musl ]]; then
# This is needed for cargo to allow build cdylibs with musl
export RUSTFLAGS="-C target-feature=-crt-static"
fi
# We don't pass in target, since the native target here already matches
# and openblas-src doesn't do well with cross-compilation.
npm run build-release
npm run pack-build
popd
}
TARGET=${1:-x86_64-unknown-linux-gnu}
# Others:
# aarch64-unknown-linux-gnu
# x86_64-unknown-linux-musl
# aarch64-unknown-linux-musl
setup_dependencies $TARGET
install_node $TARGET
install_rust
build_node_binary $TARGET

View File

@@ -1,22 +0,0 @@
# Builds the macOS artifacts (node binaries).
# Usage: ./build_macos_artifacts.sh [target]
# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
build_node_binaries() {
pushd node
for target in $1
do
echo "Building node library for $target"
npm run build-release -- --target $target
npm run pack-build -- --target $target
done
popd
}
if [ -n "$1" ]; then
targets=$1
else
targets="x86_64-apple-darwin aarch64-apple-darwin"
fi
build_node_binaries $targets

View File

@@ -1,121 +0,0 @@
How to release the node module
### 1. Bump the versions
<!-- TODO: we also need to bump the optional dependencies for node! -->
```shell
pushd rust/vectordb
cargo bump minor
popd
pushd rust/ffi/node
cargo bump minor
popd
pushd python
cargo bump minor
popd
pushd node
npm version minor
popd
git add -u
git commit -m "Bump versions"
git push
```
### 2. Push a new tag
```shell
git tag vX.X.X
git push --tag vX.X.X
```
When the tag is pushed, GitHub actions will start building the libraries and
will upload them to a draft release. Wait for those jobs to complete.
### 3. Publish the release
Once the jobs are complete, you can edit the
2. Push a tag, such as vX.X.X. Once the tag is pushrf, GitHub actions will start
building the native libraries and uploading them to a draft release. Wait for
those jobs to complete.
3. If the libraries are successful, edit the changelog and then publish the
release. Once you publish, a new action will start and upload all the
release artifacts to npm.
## Manual process
You can build the artifacts locally on a MacOS machine.
### Build the MacOS release libraries
One-time setup:
```shell
rustup target add x86_64-apple-darwin aarch64-apple-darwin
```
To build:
```shell
bash ci/build_macos_artifacts.sh
```
### Build the Linux release libraries
To build a Linux library, we need to use docker with a different build script:
```shell
ARCH=aarch64
docker run \
-v $(pwd):/io -w /io \
quay.io/pypa/manylinux2014_$ARCH \
bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-gnu
```
You can change `ARCH` to `x86_64`.
Similar script for musl binaries:
```shell
ARCH=aarch64
docker run \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_$ARCH \
bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-musl
```
<!--
For debugging, use these snippets:
```shell
ARCH=aarch64
docker run -it \
-v $(pwd):/io -w /io \
quay.io/pypa/manylinux2014_$ARCH \
bash
```
```shell
ARCH=aarch64
docker run -it \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_$ARCH \
bash
```
Note: musllinux_1_1 is Alpine Linux 3.12
-->
```
docker run \
-v $(pwd):/io -w /io \
quay.io/pypa/musllinux_1_1_aarch64 \
bash alpine_repro.sh
```

View File

@@ -6,9 +6,10 @@ to make this available for JS as well.
## Installation
To use full text search, you must install the fts optional dependencies:
To use full text search, you must install optional dependency tantivy-py:
`pip install lancedb[fts]`
# tantivy 0.19.2
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
## Quickstart

View File

@@ -1,2 +0,0 @@
gen_test_data.py
index.node

View File

@@ -8,9 +8,6 @@ A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb)
npm install vectordb
```
This will download the appropriate native library for your platform. We currently
support x86_64 Linux, Intel MacOS, and ARM (M1/M2) MacOS.
## Usage
### Basic Example
@@ -27,19 +24,6 @@ The [examples](./examples) folder contains complete examples.
## Development
Build and install the rust library with:
```bash
npm run build
npm run pack-build
npm install --no-save ./dist/vectordb-*.tgz
```
`npm run build` builds the Rust library, `npm run pack-build` packages the Rust
binary into an npm module called `@vectordb/<platform>` (for example,
`@vectordb/darwin-arm64.node`), and then `npm run install ...` installs that
module.
The LanceDB javascript is built with npm:
```bash

View File

@@ -12,20 +12,29 @@
// See the License for the specific language governing permissions and
// limitations under the License.
const { currentTarget } = require('@neon-rs/load');
let nativeLib;
try {
nativeLib = require(`@vectordb/${currentTarget()}`);
} catch (e) {
throw new Error(`vectordb: failed to load native library.
You may need to run \`npm install @vectordb/${currentTarget()}\`.
If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
Source error: ${e}`);
function getPlatformLibrary() {
if (process.platform === "darwin" && process.arch == "arm64") {
return require('./aarch64-apple-darwin.node');
} else if (process.platform === "darwin" && process.arch == "x64") {
return require('./x86_64-apple-darwin.node');
} else if (process.platform === "linux" && process.arch == "x64") {
return require('./x86_64-unknown-linux-gnu.node');
} else {
throw new Error(`vectordb: unsupported platform ${process.platform}_${process.arch}. Please file a bug report at https://github.com/lancedb/lancedb/issues`)
}
}
// Dynamic require for runtime.
module.exports = nativeLib;
try {
nativeLib = require('./index.node')
} catch (e) {
if (e.code === "MODULE_NOT_FOUND") {
nativeLib = getPlatformLibrary();
} else {
throw new Error('vectordb: failed to load native library. Please file a bug report at https://github.com/lancedb/lancedb/issues');
}
}
module.exports = nativeLib

45
node/package-lock.json generated
View File

@@ -7,26 +7,12 @@
"": {
"name": "vectordb",
"version": "0.1.1",
"cpu": [
"x64",
"arm64"
],
"license": "Apache-2.0",
"os": [
"darwin",
"linux"
],
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
"@neon-rs/load": "^0.0.74",
"@vectordb/darwin-arm64": "0.1.1",
"@vectordb/darwin-x64": "0.1.1",
"@vectordb/linux-x64-gnu": "0.1.1",
"@vectordb/linux-x64-musl": "0.1.1",
"apache-arrow": "^12.0.0"
},
"devDependencies": {
"@neon-rs/cli": "^0.0.74",
"@types/chai": "^4.3.4",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
@@ -44,12 +30,6 @@
"ts-node": "^10.9.1",
"ts-node-dev": "^2.0.0",
"typescript": "*"
},
"optionalDependencies": {
"@vectordb/darwin-arm64": "0.1.1",
"@vectordb/darwin-x64": "0.1.1",
"@vectordb/linux-x64-gnu": "0.1.1",
"@vectordb/linux-x64-musl": "0.1.1"
}
},
"node_modules/@apache-arrow/ts": {
@@ -217,20 +197,6 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@neon-rs/cli": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
"integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
"dev": true,
"bin": {
"neon": "index.js"
}
},
"node_modules/@neon-rs/load": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
"integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
},
"node_modules/@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -4225,17 +4191,6 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"@neon-rs/cli": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
"integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
"dev": true
},
"@neon-rs/load": {
"version": "0.0.74",
"resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
"integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
},
"@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",

View File

@@ -1,17 +1,15 @@
{
"name": "vectordb",
"version": "0.1.2",
"version": "0.1.1",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"tsc": "tsc -b",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json-render-diagnostics",
"build-release": "npm run build -- --release",
"cross-release": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cross build --message-format=json --release -p vectordb-node",
"test": "mocha -recursive dist/test",
"lint": "eslint src --ext .js,.ts",
"pack-build": "neon pack-build"
"lint": "eslint src --ext .js,.ts"
},
"repository": {
"type": "git",
@@ -26,7 +24,6 @@
"author": "Lance Devs",
"license": "Apache-2.0",
"devDependencies": {
"@neon-rs/cli": "^0.0.74",
"@types/chai": "^4.3.4",
"@types/mocha": "^10.0.1",
"@types/node": "^18.16.2",
@@ -47,33 +44,6 @@
},
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
"@neon-rs/load": "^0.0.74",
"apache-arrow": "^12.0.0"
},
"os": [
"darwin",
"linux"
],
"cpu": [
"x64",
"arm64"
],
"neon": {
"targets": {
"x86_64-apple-darwin": "@vectordb/darwin-x64",
"aarch64-apple-darwin": "@vectordb/darwin-arm64",
"x86_64-unknown-linux-gnu": "@vectordb/linux-x64-gnu",
"x86_64-unknown-linux-musl": "@vectordb/linux-x64-musl",
"aarch64-unknown-linux-gnu": "@vectordb/linux-arm64-gnu",
"aarch64-unknown-linux-musl": "@vectordb/linux-arm64-musl"
}
},
"optionalDependencies": {
"@vectordb/darwin-arm64": "0.1.2",
"@vectordb/darwin-x64": "0.1.2",
"@vectordb/linux-x64-gnu": "0.1.2",
"@vectordb/linux-x64-musl": "0.1.2",
"@vectordb/linux-arm64-gnu": "0.1.2",
"@vectordb/linux-arm64-musl": "0.1.2"
}
}

View File

@@ -15,15 +15,16 @@
import {
Field,
Float32,
List,
List, type ListBuilder,
makeBuilder,
RecordBatchFileWriter,
Table, Utf8,
type Vector,
vectorFromArray
} from 'apache-arrow'
import { type EmbeddingFunction } from './index'
export function convertToTable (data: Array<Record<string, unknown>>): Table {
export function convertToTable<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Table {
if (data.length === 0) {
throw new Error('At least one record needs to be provided')
}
@@ -33,11 +34,7 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
for (const columnsKey of columns) {
if (columnsKey === 'vector') {
const children = new Field<Float32>('item', new Float32())
const list = new List(children)
const listBuilder = makeBuilder({
type: list
})
const listBuilder = newVectorListBuilder()
const vectorSize = (data[0].vector as any[]).length
for (const datum of data) {
if ((datum[columnsKey] as any[]).length !== vectorSize) {
@@ -52,6 +49,14 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
for (const datum of data) {
values.push(datum[columnsKey])
}
if (columnsKey === embeddings?.sourceColumn) {
const vectors = embeddings.embed(values as T[])
const listBuilder = newVectorListBuilder()
vectors.map(v => listBuilder.append(v))
records.vector = listBuilder.finish().toVector()
}
if (typeof values[0] === 'string') {
// `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column
records[columnsKey] = vectorFromArray(values, new Utf8())
@@ -64,8 +69,17 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
return new Table(records)
}
export async function fromRecordsToBuffer (data: Array<Record<string, unknown>>): Promise<Buffer> {
const table = convertToTable(data)
// Creates a new Arrow ListBuilder that stores a Vector column
function newVectorListBuilder (): ListBuilder<Float32, any> {
const children = new Field<Float32>('item', new Float32())
const list = new List(children)
return makeBuilder({
type: list
})
}
export async function fromRecordsToBuffer<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Buffer> {
const table = convertToTable(data, embeddings)
const writer = RecordBatchFileWriter.writeAll(table)
return Buffer.from(await writer.toUint8Array())
}

View File

@@ -28,7 +28,8 @@ const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSe
* @param uri The uri of the database.
*/
export async function connect (uri: string): Promise<Connection> {
return new Connection(uri)
const db = await databaseNew(uri)
return new Connection(db, uri)
}
/**
@@ -38,9 +39,9 @@ export class Connection {
private readonly _uri: string
private readonly _db: any
constructor (uri: string) {
constructor (db: any, uri: string) {
this._uri = uri
this._db = databaseNew(uri)
this._db = db
}
get uri (): string {
@@ -55,17 +56,50 @@ export class Connection {
}
/**
* Open a table in the database.
* @param name The name of the table.
*/
async openTable (name: string): Promise<Table> {
* Open a table in the database.
*
* @param name The name of the table.
*/
async openTable (name: string): Promise<Table>
/**
* Open a table in the database.
*
* @param name The name of the table.
* @param embeddings An embedding function to use on this Table
*/
async openTable<T> (name: string, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
async openTable<T> (name: string, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
const tbl = await databaseOpenTable.call(this._db, name)
return new Table(tbl, name)
if (embeddings !== undefined) {
return new Table(tbl, name, embeddings)
} else {
return new Table(tbl, name)
}
}
async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table> {
await tableCreate.call(this._db, name, await fromRecordsToBuffer(data))
return await this.openTable(name)
/**
* Creates a new Table and initialize it with new data.
*
* @param name The name of the table.
* @param data Non-empty Array of Records to be inserted into the Table
*/
async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table>
/**
* Creates a new Table and initialize it with new data.
*
* @param name The name of the table.
* @param data Non-empty Array of Records to be inserted into the Table
* @param embeddings An embedding function to use on this Table
*/
async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
const tbl = await tableCreate.call(this._db, name, await fromRecordsToBuffer(data, embeddings))
if (embeddings !== undefined) {
return new Table(tbl, name, embeddings)
} else {
return new Table(tbl, name)
}
}
async createTableArrow (name: string, table: ArrowTable): Promise<Table> {
@@ -75,16 +109,22 @@ export class Connection {
}
}
/**
* A table in a LanceDB database.
*/
export class Table {
export class Table<T = number[]> {
private readonly _tbl: any
private readonly _name: string
private readonly _embeddings?: EmbeddingFunction<T>
constructor (tbl: any, name: string) {
constructor (tbl: any, name: string)
/**
* @param tbl
* @param name
* @param embeddings An embedding function to use when interacting with this table
*/
constructor (tbl: any, name: string, embeddings: EmbeddingFunction<T>)
constructor (tbl: any, name: string, embeddings?: EmbeddingFunction<T>) {
this._tbl = tbl
this._name = name
this._embeddings = embeddings
}
get name (): string {
@@ -92,10 +132,16 @@ export class Table {
}
/**
* Create a search query to find the nearest neighbors of the given query vector.
* @param queryVector The query vector.
*/
search (queryVector: number[]): Query {
* Creates a search query to find the nearest neighbors of the given search term
* @param query The query search term
*/
search (query: T): Query {
let queryVector: number[]
if (this._embeddings !== undefined) {
queryVector = this._embeddings.embed([query])[0]
} else {
queryVector = query as number[]
}
return new Query(this._tbl, queryVector)
}
@@ -106,7 +152,7 @@ export class Table {
* @return The number of rows added to the table
*/
async add (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Append.toString())
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Append.toString())
}
/**
@@ -116,9 +162,14 @@ export class Table {
* @return The number of rows added to the table
*/
async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Overwrite.toString())
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Overwrite.toString())
}
/**
* Create an ANN index on this Table vector index.
*
* @param indexParams The parameters of this Index, @see VectorIndexParams.
*/
async create_index (indexParams: VectorIndexParams): Promise<any> {
return tableCreateVectorIndex.call(this._tbl, indexParams)
}
@@ -268,6 +319,21 @@ export enum WriteMode {
Append = 'append'
}
/**
* An embedding function that automatically creates vector representation for a given column.
*/
export interface EmbeddingFunction<T> {
/**
* The name of the column that will be used as input for the Embedding Function.
*/
sourceColumn: string
/**
* Creates a vector representation for the given values.
*/
embed: (data: T[]) => number[][]
}
/**
* Distance metrics type.
*/

52
node/src/test/io.ts Normal file
View File

@@ -0,0 +1,52 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// IO tests
import { describe } from 'mocha'
import { assert } from 'chai'
import * as lancedb from '../index'
describe('LanceDB S3 client', function () {
if (process.env.TEST_S3_BASE_URL != null) {
const baseUri = process.env.TEST_S3_BASE_URL
it('should have a valid url', async function () {
const uri = `${baseUri}/valid_url`
const table = await createTestDB(uri, 2, 20)
const con = await lancedb.connect(uri)
assert.equal(con.uri, uri)
const results = await table.search([0.1, 0.3]).limit(5).execute()
assert.equal(results.length, 5)
})
} else {
describe.skip('Skip S3 test', function () {})
}
})
async function createTestDB (uri: string, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
const con = await lancedb.connect(uri)
const data = []
for (let i = 0; i < numRows; i++) {
const vector = []
for (let j = 0; j < numDimensions; j++) {
vector.push(i + (j * 0.1))
}
data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
}
return await con.createTable('vectors', data)
}

View File

@@ -17,7 +17,7 @@ import { assert } from 'chai'
import { track } from 'temp'
import * as lancedb from '../index'
import { MetricType, Query } from '../index'
import { type EmbeddingFunction, MetricType, Query } from '../index'
describe('LanceDB client', function () {
describe('when creating a connection to lancedb', function () {
@@ -140,6 +140,39 @@ describe('LanceDB client', function () {
await table.create_index({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2 })
}).timeout(10_000) // Timeout is high partially because GH macos runner is pretty slow
})
describe('when using a custom embedding function', function () {
class TextEmbedding implements EmbeddingFunction<string> {
sourceColumn: string
constructor (targetColumn: string) {
this.sourceColumn = targetColumn
}
_embedding_map = new Map<string, number[]>([
['foo', [2.1, 2.2]],
['bar', [3.1, 3.2]]
])
embed (data: string[]): number[][] {
return data.map(datum => this._embedding_map.get(datum) ?? [0.0, 0.0])
}
}
it('should encode the original data into embeddings', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const embeddings = new TextEmbedding('name')
const data = [
{ price: 10, name: 'foo' },
{ price: 50, name: 'bar' }
]
const table = await con.createTable('vectors', data, embeddings)
const results = await table.search('foo').execute()
assert.equal(results.length, 2)
})
})
})
describe('Query object', function () {

View File

@@ -16,7 +16,13 @@ import os
from typing import List, Tuple
import pyarrow as pa
import tantivy
try:
import tantivy
except ImportError:
raise ImportError(
"Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
)
from .table import LanceTable

View File

@@ -153,7 +153,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
import tantivy
except ImportError:
raise ImportError(
"You need to install the `lancedb[fts]` extra to use this method."
"Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
)
from .fts import search_index

View File

@@ -1,7 +1,7 @@
[project]
name = "lancedb"
version = "0.1.2"
dependencies = ["pylance>=0.4.6", "ratelimiter", "retry", "tqdm"]
version = "0.1.4"
dependencies = ["pylance>=0.4.17", "ratelimiter", "retry", "tqdm"]
description = "lancedb"
authors = [
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
@@ -45,10 +45,6 @@ dev = [
docs = [
"mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"
]
fts = [
# tantivy 0.19.2
"tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985"
]
[build-system]
requires = [

View File

@@ -14,7 +14,6 @@ import sys
import numpy as np
import pyarrow as pa
from lancedb.embeddings import with_embeddings

View File

@@ -13,13 +13,13 @@
import os
import random
import lancedb.fts
import numpy as np
import pandas as pd
import pytest
import tantivy
import lancedb as ldb
import lancedb.fts
@pytest.fixture

View File

@@ -17,7 +17,6 @@ import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest
from lancedb.query import LanceQueryBuilder

View File

@@ -16,7 +16,6 @@ from pathlib import Path
import pandas as pd
import pyarrow as pa
import pytest
from lancedb.table import LanceTable

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb-node"
version = "0.1.2"
version = "0.1.0"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
edition = "2018"
@@ -15,7 +15,7 @@ arrow-ipc = "37.0"
arrow-schema = "37.0"
once_cell = "1"
futures = "0.3"
lance = "0.4.3"
lance = "0.4.17"
vectordb = { path = "../../vectordb" }
tokio = { version = "1.23", features = ["rt-multi-thread"] }
neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }

View File

@@ -39,7 +39,7 @@ pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsP
let add_result = table
.lock()
.unwrap()
.create_idx(&index_params_builder)
.create_index(&index_params_builder)
.await;
deferred.settle_with(&channel, move |mut cx| {

View File

@@ -56,23 +56,46 @@ fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
RUNTIME.get_or_try_init(|| Runtime::new().or_else(|err| cx.throw_error(err.to_string())))
}
fn database_new(mut cx: FunctionContext) -> JsResult<JsBox<JsDatabase>> {
fn database_new(mut cx: FunctionContext) -> JsResult<JsPromise> {
let path = cx.argument::<JsString>(0)?.value(&mut cx);
let db = JsDatabase {
database: Arc::new(Database::connect(path).or_else(|err| cx.throw_error(err.to_string()))?),
};
Ok(cx.boxed(db))
let rt = runtime(&mut cx)?;
let channel = cx.channel();
let (deferred, promise) = cx.promise();
rt.spawn(async move {
let database = Database::connect(&path).await;
deferred.settle_with(&channel, move |mut cx| {
let db = JsDatabase {
database: Arc::new(database.or_else(|err| cx.throw_error(err.to_string()))?),
};
Ok(cx.boxed(db))
});
});
Ok(promise)
}
fn database_table_names(mut cx: FunctionContext) -> JsResult<JsArray> {
fn database_table_names(mut cx: FunctionContext) -> JsResult<JsPromise> {
let db = cx
.this()
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
let tables = db
.database
.table_names()
.or_else(|err| cx.throw_error(err.to_string()))?;
convert::vec_str_to_array(&tables, &mut cx)
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let database = db.database.clone();
rt.spawn(async move {
let tables_rst = database.table_names().await;
deferred.settle_with(&channel, move |mut cx| {
let tables = tables_rst.or_else(|err| cx.throw_error(err.to_string()))?;
let table_names = convert::vec_str_to_array(&tables, &mut cx);
table_names
});
});
Ok(promise)
}
fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
@@ -87,7 +110,7 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
let (deferred, promise) = cx.promise();
rt.spawn(async move {
let table_rst = database.open_table(table_name).await;
let table_rst = database.open_table(&table_name).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(
@@ -186,7 +209,7 @@ fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
rt.block_on(async move {
let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(batches));
let table_rst = database.create_table(table_name, batch_reader).await;
let table_rst = database.create_table(&table_name, batch_reader).await;
deferred.settle_with(&channel, move |mut cx| {
let table = Arc::new(Mutex::new(

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb"
version = "0.1.2"
version = "0.0.1"
edition = "2021"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
@@ -12,7 +12,9 @@ repository = "https://github.com/lancedb/lancedb"
arrow-array = "37.0"
arrow-data = "37.0"
arrow-schema = "37.0"
lance = "0.4.3"
object_store = "0.5.6"
lance = "0.4.17"
tokio = { version = "1.23", features = ["rt-multi-thread"] }
[dev-dependencies]

View File

@@ -12,16 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow_array::RecordBatchReader;
use std::fs::create_dir_all;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::path::Path;
use arrow_array::RecordBatchReader;
use lance::io::object_store::ObjectStore;
use crate::error::Result;
use crate::table::Table;
pub struct Database {
pub(crate) path: Arc<PathBuf>,
object_store: ObjectStore,
pub(crate) uri: String,
}
const LANCE_EXTENSION: &str = "lance";
@@ -37,12 +40,17 @@ impl Database {
/// # Returns
///
/// * A [Database] object.
pub fn connect<P: AsRef<Path>>(path: P) -> Result<Database> {
if !path.as_ref().try_exists()? {
create_dir_all(&path)?;
pub async fn connect(uri: &str) -> Result<Database> {
let object_store = ObjectStore::new(uri).await?;
if object_store.is_local() {
let path = Path::new(uri);
if !path.try_exists()? {
create_dir_all(&path)?;
}
}
Ok(Database {
path: Arc::new(path.as_ref().to_path_buf()),
uri: uri.to_string(),
object_store,
})
}
@@ -51,12 +59,13 @@ impl Database {
/// # Returns
///
/// * A [Vec<String>] with all table names.
pub fn table_names(&self) -> Result<Vec<String>> {
pub async fn table_names(&self) -> Result<Vec<String>> {
let f = self
.path
.read_dir()?
.flatten()
.map(|dir_entry| dir_entry.path())
.object_store
.read_dir("/")
.await?
.iter()
.map(|fname| Path::new(fname))
.filter(|path| {
let is_lance = path
.extension()
@@ -76,10 +85,10 @@ impl Database {
pub async fn create_table(
&self,
name: String,
name: &str,
batches: Box<dyn RecordBatchReader>,
) -> Result<Table> {
Table::create(self.path.clone(), name, batches).await
Table::create(&self.uri, name, batches).await
}
/// Open a table in the database.
@@ -90,8 +99,8 @@ impl Database {
/// # Returns
///
/// * A [Table] object.
pub async fn open_table(&self, name: String) -> Result<Table> {
Table::open(self.path.clone(), name).await
pub async fn open_table(&self, name: &str) -> Result<Table> {
Table::open(&self.uri, name).await
}
}
@@ -105,10 +114,10 @@ mod tests {
#[tokio::test]
async fn test_connect() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let db = Database::connect(&path_buf);
let uri = tmp_dir.path().to_str().unwrap();
let db = Database::connect(uri).await.unwrap();
assert_eq!(db.unwrap().path.as_path(), path_buf.as_path())
assert_eq!(db.uri, uri);
}
#[tokio::test]
@@ -118,10 +127,16 @@ mod tests {
create_dir_all(tmp_dir.path().join("table2.lance")).unwrap();
create_dir_all(tmp_dir.path().join("invalidlance")).unwrap();
let db = Database::connect(&tmp_dir.into_path()).unwrap();
let tables = db.table_names().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let db = Database::connect(uri).await.unwrap();
let tables = db.table_names().await.unwrap();
assert_eq!(tables.len(), 2);
assert!(tables.contains(&String::from("table1")));
assert!(tables.contains(&String::from("table2")));
}
#[tokio::test]
async fn test_connect_s3() {
// let db = Database::connect("s3://bucket/path/to/database").await.unwrap();
}
}

View File

@@ -41,3 +41,15 @@ impl From<lance::Error> for Error {
Self::Lance(e.to_string())
}
}
impl From<object_store::Error> for Error {
fn from(e: object_store::Error) -> Self {
Self::IO(e.to_string())
}
}
impl From<object_store::path::Error> for Error {
fn from(e: object_store::path::Error) -> Self {
Self::IO(e.to_string())
}
}

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::PathBuf;
use std::path::Path;
use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
@@ -24,16 +24,21 @@ use crate::index::vector::VectorIndexBuilder;
use crate::query::Query;
pub const VECTOR_COLUMN_NAME: &str = "vector";
pub const LANCE_FILE_EXTENSION: &str = "lance";
/// A table in a LanceDB database.
pub struct Table {
name: String,
path: String,
uri: String,
dataset: Arc<Dataset>,
}
impl std::fmt::Display for Table {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Table({})", self.name)
}
}
impl Table {
/// Opens an existing Table
///
@@ -45,18 +50,21 @@ impl Table {
/// # Returns
///
/// * A [Table] object.
pub async fn open(base_path: Arc<PathBuf>, name: String) -> Result<Self> {
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let ds_uri = ds_path
pub async fn open(base_uri: &str, name: &str) -> Result<Self> {
let path = Path::new(base_uri);
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let uri = table_uri
.as_path()
.to_str()
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
let dataset = Dataset::open(ds_uri).await?;
let table = Table {
name,
path: ds_uri.to_string(),
.ok_or(Error::IO(format!("Invalid table name: {}", name)))?;
let dataset = Dataset::open(&uri).await?;
Ok(Table {
name: name.to_string(),
uri: uri.to_string(),
dataset: Arc::new(dataset),
};
Ok(table)
})
}
/// Creates a new Table
@@ -71,25 +79,28 @@ impl Table {
///
/// * A [Table] object.
pub async fn create(
base_path: Arc<PathBuf>,
name: String,
base_uri: &str,
name: &str,
mut batches: Box<dyn RecordBatchReader>,
) -> Result<Self> {
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let path = ds_path
let base_path = Path::new(base_uri);
let table_uri = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let uri = table_uri
.as_path()
.to_str()
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
.ok_or(Error::IO(format!("Invalid table name: {}", name)))?
.to_string();
let dataset =
Arc::new(Dataset::write(&mut batches, path, Some(WriteParams::default())).await?);
Arc::new(Dataset::write(&mut batches, &uri, Some(WriteParams::default())).await?);
Ok(Table {
name,
path: path.to_string(),
name: name.to_string(),
uri,
dataset,
})
}
pub async fn create_idx(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
/// Create index on the table.
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
let dataset = self
@@ -125,8 +136,7 @@ impl Table {
let mut params = WriteParams::default();
params.mode = write_mode.unwrap_or(WriteMode::Append);
self.dataset =
Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
self.dataset = Arc::new(Dataset::write(&mut batches, &self.uri, Some(params)).await?);
Ok(batches.count())
}
@@ -151,6 +161,8 @@ impl Table {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_array::{
Array, FixedSizeListArray, Float32Array, Int32Array, RecordBatch, RecordBatchReader,
};
@@ -161,53 +173,52 @@ mod tests {
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use rand::Rng;
use std::sync::Arc;
use tempfile::tempdir;
use crate::error::Result;
use super::*;
use crate::index::vector::IvfPQIndexBuilder;
use crate::table::Table;
#[tokio::test]
async fn test_new_table_not_exists() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let uri = tmp_dir.path().to_str().unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string()).await;
let table = Table::open(&uri, "test").await;
assert!(table.is_err());
}
#[tokio::test]
async fn test_open() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let dataset_path = tmp_dir.path().join("test.lance");
let uri = tmp_dir.path().to_str().unwrap();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write(
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await
.unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string())
Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
.await
.unwrap();
let table = Table::open(uri, "test").await.unwrap();
assert_eq!(table.name, "test")
}
#[test]
fn test_object_store_path() {
use std::path::Path as StdPath;
let p = StdPath::new("s3://bucket/path/to/file");
let c = p.join("subfile");
assert_eq!(c.to_str().unwrap(), "s3://bucket/path/to/file/subfile");
}
#[tokio::test]
async fn test_add() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let uri = tmp_dir.path().to_str().unwrap();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
.await
.unwrap();
let mut table = Table::create(&uri, "test", batches).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> =
@@ -225,13 +236,11 @@ mod tests {
#[tokio::test]
async fn test_add_overwrite() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let uri = tmp_dir.path().to_str().unwrap();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
.await
.unwrap();
let mut table = Table::create(uri, "test", batches).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> =
@@ -252,21 +261,16 @@ mod tests {
#[tokio::test]
async fn test_search() {
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let dataset_path = tmp_dir.path().join("test.lance");
let uri = tmp_dir.path().to_str().unwrap();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write(
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await
.unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string())
Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
.await
.unwrap();
let table = Table::open(uri, "test").await.unwrap();
let vector = Float32Array::from_iter_values([0.1, 0.2]);
let query = table.search(vector.clone());
assert_eq!(vector, query.query_vector);
@@ -291,7 +295,7 @@ mod tests {
use arrow_array::Float32Array;
let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path();
let uri = tmp_dir.path().to_str().unwrap();
let dimension = 16;
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
@@ -318,9 +322,7 @@ mod tests {
.unwrap()]);
let reader: Box<dyn RecordBatchReader + Send> = Box::new(batches);
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), reader)
.await
.unwrap();
let mut table = Table::create(uri, "test", reader).await.unwrap();
let mut i = IvfPQIndexBuilder::new();
@@ -330,7 +332,7 @@ mod tests {
.ivf_params(IvfBuildParams::new(256))
.pq_params(PQBuildParams::default());
table.create_idx(index_builder).await.unwrap();
table.create_index(index_builder).await.unwrap();
assert_eq!(table.dataset.load_indices().await.unwrap().len(), 1);
assert_eq!(table.count_rows().await.unwrap(), 512);